/**
 * Group-level load balancing step.
 *
 * This function implements a strategy similar to the one used in the
 * centralized case in NamdCentLB.
 *
 * What the code below does, in order:
 *  1. Allocates (if not already allocated) the member work arrays
 *     processorArray, patchArray, computeArray and from_procs, then fills
 *     them through buildData(stats), which returns the number of moveable
 *     computes.
 *  2. Computes the total/average/maximum compute load and the average load
 *     per available processor, and sends these figures to element 0 of
 *     thisProxy in a SplitComputesMsg.  On step() == 1 the message also
 *     carries the id and load of every compute for which
 *     computeMap->numPartitions(cid) is non-zero.
 *  3. Unless step() == 1, runs the balancing algorithm selected by
 *     simParams->ldbStrategy.
 *  4. For each compute whose assigned processor differs from its origin,
 *     creates a MigrateInfo record and updates the ComputeMap.
 *  5. Stores the per-processor loads in peLoads, records startPE/endPE,
 *     releases the four work arrays and resets their pointers to NULL.
 *
 * @param stats  Load database for this group; nprocs(), n_migrateobjs and
 *               procs[0].pe are read here, and it is forwarded to buildData().
 * @return The message produced by createMigrateMsg() for the collected
 *         migrations.
 */
CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
  int numProcessors = stats->nprocs();  // number of processors at group level
  int numPatches = PatchMap::Object()->numPatches();
  ComputeMap *computeMap = ComputeMap::Object();
  const int numComputes = computeMap->numComputes();
  const int numGroupComputes = stats->n_migrateobjs;
  const SimParameters* simParams = Node::Object()->simParameters;

  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
  // these data structures are global and need to be distributed
  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
  if ( ! computeArray ) computeArray = new computeInfo[numGroupComputes];
  if ( ! from_procs ) from_procs = new int[numGroupComputes];

  int nMoveableComputes = buildData(stats);
  CmiAssert(nMoveableComputes <= numGroupComputes);

#if LDB_DEBUG
#define DUMP_LDBDATA 1
#define LOAD_LDBDATA 1
#endif

#if DUMP_LDBDATA
  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
#elif LOAD_LDBDATA
  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
  // CkExit();
#endif

  double averageLoad = 0.;
  double avgCompute;
  double maxCompute;
  int maxComputeId;
  int numPesAvailable;
  {
    int i;
    double total = 0.;
    maxCompute = 0.;
    int maxi = 0;
    for (i=0; i<nMoveableComputes; i++) {
      double load = computeArray[i].load;
      total += load;
      if ( load > maxCompute ) { maxCompute = load;  maxi = i; }
    }

    if ( nMoveableComputes > 0 ) {
      avgCompute = total / nMoveableComputes;
      maxComputeId = computeArray[maxi].handle.id.id[0];
    } else {
      // No moveable computes in this group: avoid 0/0 (NaN) and avoid
      // reading computeArray[0], which may not exist or may be unfilled.
      // NOTE(review): -1 is used as a "no compute" id; maxCompute is 0 in
      // this case -- confirm the receiver of SplitComputesMsg ignores the id.
      avgCompute = 0.;
      maxComputeId = -1;
    }

    int P = stats->nprocs();
    numPesAvailable = 0;
    // "total" keeps accumulating: compute load plus the background load of
    // every available processor.
    for (i=0; i<P; i++) {
      if (processorArray[i].available) {
        ++numPesAvailable;
        total += processorArray[i].backgroundLoad;
      }
    }
    if (numPesAvailable == 0)
      NAMD_die("No processors available for load balancing!\n");

    averageLoad = total/numPesAvailable;
  }

  // First pass (step 1 only): count the computes that have partitions and
  // find the largest load among those that have none.
  int i_split = 0;
  double maxUnsplit = 0.;

  if ( step() == 1 ) {
    for (int i=0; i<nMoveableComputes; i++) {
      const int cid = computeArray[i].handle.id.id[0];
      if ( computeMap->numPartitions(cid) == 0 ) {
        const double load = computeArray[i].load;
        if ( load > maxUnsplit ) maxUnsplit = load;
        continue;
      }
      ++i_split;
    }
  }

  {
    // The message carries two variable-length arrays (cid, load), each
    // sized for i_split entries.
    SplitComputesMsg *msg = new(i_split,i_split) SplitComputesMsg;
    msg->maxUnsplit = maxUnsplit;
    msg->averageLoad = averageLoad;
    msg->avgCompute = avgCompute;
    msg->maxCompute = maxCompute;
    msg->maxComputeId = maxComputeId;
    msg->nMoveableComputes = nMoveableComputes;
    msg->numPesAvailable = numPesAvailable;
    msg->n = i_split;

    if ( step() == 1 ) {
      // Second pass: fill the arrays, and keep every compute on its old
      // processor since no balancing algorithm runs on this step.
      i_split = 0;
      for (int i=0; i<nMoveableComputes; i++) {
        computeArray[i].processor = computeArray[i].oldProcessor;
        const int cid = computeArray[i].handle.id.id[0];
        if ( computeMap->numPartitions(cid) == 0 ) {
          continue;
        }
        msg->cid[i_split] = cid;
        msg->load[i_split] = computeArray[i].load;
        ++i_split;
      }
    }

    thisProxy[0].splitComputes(msg);
  }

  if ( step() == 1 ) {
    // compute splitting only
  } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
    if (step() < 4)
      TorusLB(computeArray, patchArray, processorArray,
              nMoveableComputes, numPatches, numProcessors);
    else
      RefineTorusLB(computeArray, patchArray, processorArray,
                    nMoveableComputes, numPatches, numProcessors, 1);
  } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
    TorusLB(computeArray, patchArray, processorArray,
            nMoveableComputes, numPatches, numProcessors);
  } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
    RefineTorusLB(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors, 1);
  } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
    // NOTE(review): NAMD_die presumably does not return, which would make
    // the calls below unreachable -- confirm before removing them.
    NAMD_die("Old load balancer strategy is not compatible with hybrid balancer.");
    if (step() < 4)
      Alg7(computeArray, patchArray, processorArray,
           nMoveableComputes, numPatches, numProcessors);
    else
      RefineOnly(computeArray, patchArray, processorArray,
                 nMoveableComputes, numPatches, numProcessors);
  }

#if LDB_DEBUG && USE_TOPOMAP
  TopoManager tmgr;
  int pe1, pe2, pe3, hops=0;
  /* This is double counting the hops
  for(int i=0; i<nMoveableComputes; i++)
  {
    pe1 = computeArray[i].processor;
    pe2 = patchArray[computeArray[i].patch1].processor;
    pe3 = patchArray[computeArray[i].patch2].processor;
    hops += tmgr.getHopsBetweenRanks(pe1, pe2);
    if(computeArray[i].patch1 != computeArray[i].patch2)
      hops += tmgr.getHopsBetweenRanks(pe1, pe3);
  }*/
  // Sum the hops between each patch's processor and every processor in its
  // proxiesOn set.
  for (int i=0; i<numPatches; i++) {
    //int num = patchArray[i].proxiesOn.numElements();
    pe1 = patchArray[i].processor;
    Iterator nextProc;
    processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
    while (p) {
      pe2 = p->Id;
      hops += tmgr.getHopsBetweenRanks(pe1, pe2);
      p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
    }
  }
  CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
#endif

#if DUMP_LDBDATA
  dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
#elif LOAD_LDBDATA
  dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
  // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
  // CkExit();
#endif

  // For error checking:
  // Count up computes, to see if somebody doesn't have any computes
  int i;
#if 0
  int* computeCount = new int[numProcessors];
  for(i=0; i<numProcessors; i++)
    computeCount[i]=0;
  for(i=0; i<nMoveableComputes; i++)
    computeCount[computeArray[i].processor]++;
  for(i=0; i<numProcessors; i++) {
    if (computeCount[i]==0)
      iout << iINFO <<"Warning: Processor " << i
           << " has NO moveable computes.\n" << endi;
  }
  delete [] computeCount;
#endif

  CkVec<MigrateInfo *> migrateInfo;
  for(i=0;i<nMoveableComputes;i++) {
    // from_procs[] holds group-relative indices; adding procs[0].pe turns
    // them into the same numbering as computeArray[i].processor.
    if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
      /* CkPrintf("[%d] Obj %d migrating from %d (%d) to %d\n",
                  CkMyPe(),computeArray[i].handle.id.id[0],
                  from_procs[i], computeArray[i].oldProcessor, computeArray[i].processor); */
      MigrateInfo *migrateMe = new MigrateInfo;
      migrateMe->obj = computeArray[i].handle;
      //migrateMe->from_pe = computeArray[i].oldProcessor;

      // An index equal to numProcessors is mapped to from_pe == -1
      // (origin not known yet, see below).
      int frompe = from_procs[i];
      if (frompe == numProcessors)
        frompe = -1;
      else
        frompe = frompe + stats->procs[0].pe;
      migrateMe->from_pe = frompe;
      migrateMe->to_pe = computeArray[i].processor;

      if (frompe == -1) {
        // don't know yet which processor this compute belongs to, but
        // inform receiver
        LDObjData obj;
        obj.handle = computeArray[i].handle;
        thisProxy[computeArray[i].processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
      }
      migrateInfo.insertAtEnd(migrateMe);

      // sneak in updates to ComputeMap
      computeMap->setNewNode(computeArray[i].handle.id.id[0],
                             computeArray[i].processor);
    }
  }
  // CkPrintf("LOAD BALANCING READY %d\n",CkMyPe());

  LBMigrateMsg* msg;
  msg = createMigrateMsg(migrateInfo, numProcessors);

  // Keep the resulting per-processor loads and the Id range of this group
  // in member variables.
  peLoads = new double [numProcessors];
  startPE = processorArray[0].Id;
  endPE = processorArray[numProcessors-1].Id;
  // CkPrintf("[%d] numProcessors=%d, %d to %d\n",CkMyPe(),numProcessors,processorArray[0].Id,processorArray[numProcessors-1].Id);
  for (i=0; i<numProcessors; i++) {
    peLoads[i] = processorArray[i].load;
  }

  // Release the work arrays; they are re-allocated on the next call.
  delete [] from_procs;
  delete [] processorArray;
  delete [] patchArray;
  delete [] computeArray;

  from_procs = NULL;
  processorArray = NULL;
  patchArray = NULL;
  computeArray = NULL;

  return msg;
}
Main(CkArgMsg* m) { if ( (m->argc != 3) && (m->argc != 7) ) { CkPrintf("%s [array_size] [block_size]\n", m->argv[0]); CkPrintf("OR %s [array_size_X] [array_size_Y] [array_size_Z] [block_size_X] [block_size_Y] [block_size_Z]\n", m->argv[0]); CkAbort("Abort"); } // set iteration counter to zero iterations = 0; // store the main proxy mainProxy = thisProxy; if(m->argc == 3) { arrayDimX = arrayDimY = arrayDimZ = atoi(m->argv[1]); blockDimX = blockDimY = blockDimZ = atoi(m->argv[2]); } else if (m->argc == 7) { arrayDimX = atoi(m->argv[1]); arrayDimY = atoi(m->argv[2]); arrayDimZ = atoi(m->argv[3]); blockDimX = atoi(m->argv[4]); blockDimY = atoi(m->argv[5]); blockDimZ = atoi(m->argv[6]); } if (arrayDimX < blockDimX || arrayDimX % blockDimX != 0) CkAbort("array_size_X % block_size_X != 0!"); if (arrayDimY < blockDimY || arrayDimY % blockDimY != 0) CkAbort("array_size_Y % block_size_Y != 0!"); if (arrayDimZ < blockDimZ || arrayDimZ % blockDimZ != 0) CkAbort("array_size_Z % block_size_Z != 0!"); num_chare_x = arrayDimX / blockDimX; num_chare_y = arrayDimY / blockDimY; num_chare_z = arrayDimZ / blockDimZ; // print info CkPrintf("\nSTENCIL COMPUTATION WITH NO BARRIERS\n"); CkPrintf("Running Jacobi on %d processors with (%d, %d, %d) chares\n", CkNumPes(), num_chare_x, num_chare_y, num_chare_z); CkPrintf("Array Dimensions: %d %d %d\n", arrayDimX, arrayDimY, arrayDimZ); CkPrintf("Block Dimensions: %d %d %d\n", blockDimX, blockDimY, blockDimZ); // Create new array of worker chares #if USE_TOPOMAP CProxy_JacobiMap map = CProxy_JacobiMap::ckNew(num_chare_x, num_chare_y, num_chare_z); CkPrintf("Topology Mapping is being done ... 
\n"); CkArrayOptions opts(num_chare_x, num_chare_y, num_chare_z); opts.setMap(map); array = CProxy_Jacobi::ckNew(opts); #else array = CProxy_Jacobi::ckNew(num_chare_x, num_chare_y, num_chare_z); #endif TopoManager tmgr; CkArray *jarr = array.ckLocalBranch(); int jmap[num_chare_x][num_chare_y][num_chare_z]; int hops=0, p; for(int i=0; i<num_chare_x; i++) for(int j=0; j<num_chare_y; j++) for(int k=0; k<num_chare_z; k++) { jmap[i][j][k] = jarr->procNum(CkArrayIndex3D(i, j, k)); } for(int i=0; i<num_chare_x; i++) for(int j=0; j<num_chare_y; j++) for(int k=0; k<num_chare_z; k++) { p = jmap[i][j][k]; hops += tmgr.getHopsBetweenRanks(p, jmap[wrap_x(i+1)][j][k]); hops += tmgr.getHopsBetweenRanks(p, jmap[wrap_x(i-1)][j][k]); hops += tmgr.getHopsBetweenRanks(p, jmap[i][wrap_y(j+1)][k]); hops += tmgr.getHopsBetweenRanks(p, jmap[i][wrap_y(j-1)][k]); hops += tmgr.getHopsBetweenRanks(p, jmap[i][j][wrap_z(k+1)]); hops += tmgr.getHopsBetweenRanks(p, jmap[i][j][wrap_z(k-1)]); } CkPrintf("Total Hops: %d\n", hops); #ifdef JACOBI_OPENMP CProxy_OmpInitializer ompInit = CProxy_OmpInitializer::ckNew(4); #else //Start the computation start(); #endif }