void TempAwareRefineLB::work(LDStats* stats) { #ifdef TEMP_LDB //////////////////////////////////////////////////// numProcs=stats->nprocs(); numChips=numProcs/logicalCoresPerChip; avgChipTemp=new float[numChips]; if(procFreq!=NULL) delete [] procFreq; if(procFreqEffect!=NULL) delete [] procFreqEffect; // if(procFreqPtr!=NULL) delete [] procFreqPtr; if(procTemp!=NULL) delete [] procTemp; if(procFreqNew!=NULL) delete [] procFreqNew; if(procFreqNewEffect!=NULL) delete [] procFreqNewEffect; if(avgChipTemp!=NULL) delete [] avgChipTemp; procFreq = new int[numProcs]; procFreqEffect = new int[numProcs]; // procFreqPtr = new int[numProcs]; procTemp = new float[numProcs]; procFreqNew = new int[numProcs]; procFreqNewEffect = new int[numProcs]; avgChipTemp = new float[numChips]; for(int i=0;i<numChips;i++) avgChipTemp[i]=0; for(int i=0;i<numProcs;i++) { procFreq[i] = stats->procs[i].pe_speed; procTemp[i] = stats->procs[i].pe_temp; // procFreqPtr[i] = getProcFreqPtr(freqs,numAvailFreqs,procFreq[i]); avgChipTemp[i/logicalCoresPerChip] += procTemp[i]; } for(int i=0;i<numChips;i++) { avgChipTemp[i]/=logicalCoresPerChip; //CkPrintf("---- CHIP#%d has temp=%f ----------\n",i,avgChipTemp[i]); } for(int i=0;i<numChips;i++) { int over=0,under=0; if(avgChipTemp[i] > MAX_TEMP) { over=1; if(procFreqPtr[i*logicalCoresPerChip]==numAvailFreqs-1) { for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]]; CkPrintf("CHIP#%d RUNNING HOT EVEN WITH MIN FREQUENCY!!\n",i); } else { for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) { if(procFreqPtr[j]<numAvailFreqs-1) procFreqPtr[j]++; #ifdef MAX_MIN /// PLEASE COMMENT OUT .. TESTING ONLY if(i==0) {procFreqPtr[j] = numAvailFreqs-1;/*CkPrintf("C for i:%d\n",j);*/} //if(i<numChips-1) procFreqPtr[j]=0; else procFreqPtr[j]=0; ///////////////////////// #endif procFreqNew[j] = freqs[procFreqPtr[j]]; } #ifndef ORG_VERSION CkPrintf("!!!!! Chip#%d running HOT shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]); #endif } } else // if(avgChipTemp[i] < MAX_TEMP-1) { under=1; if(procFreqPtr[i*logicalCoresPerChip]>0) { for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) { if(procFreqPtr[j]>0) procFreqPtr[j]--; #ifdef MAX_MIN /// PLEASE COMMENT OUT .. TESTING ONLY if(i==0) procFreqPtr[j] = numAvailFreqs-1; //if(i<numChips-1) procFreqPtr[j]=0; else procFreqPtr[j]=0; ///////////////////////// #endif procFreqNew[j] = freqs[procFreqPtr[j]]; } #ifndef ORG_VERSION CkPrintf("!!!!! Chip#%d running COLD shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]); #endif } else { for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]]; } } /* if(under==0 && over==0) { for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]]; } */ //if(i==5) for(int j=i*c(resPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[numAvailFreqs-1]; //else #ifdef ORG_VERSION for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0]; #endif //for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0]; } //for(int x=0;x<numProcs;x+=logicalCoresPerChip) if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]); //for(int x=0;x<numProcs;x++) CkPrintf("Procs#%d freq %d\n",x,procFreqNew[x]); //////////////////////////////////////////////////// #ifndef NO_TEMP_LB int obj; int n_pes = stats->nprocs(); // CkPrintf("[%d] RefineLB strategy\n",CkMyPe()); // RemoveNonMigratable(stats, n_pes); // get original object mapping int* from_procs = RefinerTemp::AllocProcs(n_pes, stats); for(obj=0;obj<stats->n_objs;obj++) { int pe = stats->from_proc[obj]; from_procs[obj] = pe; } // Get a new buffer to refine into populateEffectiveFreq(numProcs); int* to_procs = RefinerTemp::AllocProcs(n_pes, stats); // RefinerTemp refiner(1.03,procFreqEffect,procFreqNewEffect,n_pes); // overload tolerance=1.05 RefinerTemp refiner(1.03,procFreq,procFreqNew,n_pes); refiner.Refine(n_pes, stats, from_procs, to_procs); // Save output int migs=0; int *numMigs = new int[numProcs]; int totE = 0; for(int mm=0;mm<numProcs;mm++) numMigs[mm] = 0; for(obj=0;obj<stats->n_objs;obj++) { int pe = stats->from_proc[obj]; numMigs[to_procs[obj]]++; //stats->objData[obj].objID(); LDObjData &odata = stats->objData[obj]; computeInfo *c1 = new computeInfo(); c1->id = odata.objID(); //if(to_procs[obj]==3) CkPrintf("[%d,%d] going to 3 totE:%d\n",c1->id.getID()[0],c1->id.getID()[1],totE++);//,(stats->objData[obj].objID().getID())[1],totE++); if (to_procs[obj] != pe) { migs++; //if (_lb_args.debug()>=2) { // CkPrintf("[%d,%d] Obj %d migrating from %d to %d\n", // c1->id.getID()[0],c1->id.getID()[1],obj,pe,to_procs[obj]); } stats->to_proc[obj] = to_procs[obj]; } } for(int mm=0;mm<numProcs;mm++) { //CkPrintf("PROC#%d freq:%d objs:%d ----------\n",mm,procFreqNew[mm],numMigs[mm]); } CkPrintf("TEMPLB INFO: Total Objs:%d migrations:%d time:%f \n",stats->n_objs,migs,CmiWallTimer()-starting); fprintf(migFile,"%f %d\n",CmiWallTimer()-starting,migs); // Free the refine buffers RefinerTemp::FreeProcs(from_procs); RefinerTemp::FreeProcs(to_procs); #endif //for(int x=0;x<numProcs;x++) CkPrintf("Procs#%d ------- freq %d\n",x,procFreqNew[x]); /* for(int x=0;x<numProcs;x+=logicalCoresPerChip) { if(procFreq[x]!=procFreqNew[x]) { CkPrintf("Chaning the freq for PROC#%d\n",x); thisProxy[x].changeFreq(procFreqNew[x]); } } */ for(int x=0;x<numProcs;x++) { //CkPrintf("--------- Proc#%d %d numProcs=%d\n",x,procFreqNew[x],numProcs); if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]); } #endif // TEMP_LDB endif }
void TempAwareCommLB::work(LDStats* stats) { /** ========================== INITIALIZATION ============================= */ #ifdef TEMP_LDB ////////////////////////////////////////////////////// // initialize structures for TempLBs initStructs(stats); tempControl(); populateEffectiveFreq(stats->nprocs()); ////////////////////////////////////////////////////// CkPrintf(" ================== in TempAwareCommLB::work() ===========\n"); ProcArrayTemp *parr = new ProcArrayTemp(stats,procFreq,procFreqNew); // Processor Array parr->convertToInsts(stats); ObjGraphTemp *ogr = new ObjGraphTemp(stats,procFreq,procFreqNew); // Object Graph ogr->convertToInsts(stats); double avgload = parr->getAverageLoad(); // Average load of processors // Sets to false if it is overloaded, else to true vector<bool> proc_load_info(parr->procs.size(), false); // Create an array of vectors for each processor mapping to the objects in // that processor std::vector<int>* parr_objs = new std::vector<int>[parr->procs.size()]; upper_threshold_temp = avgload + (avgload * THRESHOLD); //lower_threshold = avgload - (avgload * THRESHOLD * THRESHOLD); lower_threshold_temp = avgload; int less_loaded_counter = 0; srand(time(NULL)); /** ============================= STRATEGY ================================ */ if (_lb_args.debug()>1) CkPrintf("[%d] In TempAwareCommLB strategy\n",CkMyPe()); CkPrintf("Average load %E\n", avgload); int vert, i, j; int curr_pe; // Iterate over all the chares and construct the peid, vector<chareid> array for(vert = 0; vert < ogr->vertices.size(); vert++) { curr_pe = ogr->vertices[vert].getCurrentPe(); parr_objs[curr_pe].push_back(vert); ogr->vertices[vert].setNewPe(curr_pe); } std::vector<int> parr_above_avg; std::vector<int> parr_below_avg; double pe_load; // Insert into parr_above_avg if the processor fits under the criteria of // overloaded processor. // Insert the processor id into parr_below_avg if the processor is underloaded for (vert = 0; vert < parr->procs.size(); vert++) { pe_load = parr->procs[vert].getTotalLoad(); if (pe_load > upper_threshold_temp) { // Pushing ProcInfo into this list parr_above_avg.push_back(vert); } else if (pe_load < lower_threshold_temp) { parr_below_avg.push_back(parr->procs[vert].getProcId()); proc_load_info[parr->procs[vert].getProcId()] = true; less_loaded_counter++; } } std::make_heap(parr_above_avg.begin(), parr_above_avg.end(), ProcLoadGreater(parr)); int random; int randomly_obj_id; bool obj_allocated; int num_tries; // Allow as many swaps as there are chares int total_swaps = ogr->vertices.size() * SWAP_MULTIPLIER; int possible_pe; double obj_load; // Keep on loadbalancing until the number of above avg processors is 0 while (parr_above_avg.size() != 0 && total_swaps > 0 && parr_below_avg.size() != 0) { // CkPrintf("Above avg : %d Below avg : %d Total swaps: %d\n", parr_above_avg.size(), // parr_below_avg.size(), total_swaps); obj_allocated = false; num_tries = 0; // Pop the heaviest processor int p_index = popFromProcHeap(parr_above_avg, parr); ProcInfo& p = parr->procs[p_index]; while (!obj_allocated && num_tries < parr_objs[p.getProcId()].size()) { // It might so happen that due to overhead load, it might not have any // more objects in its list if (parr_objs[p.getProcId()].size() == 0) { // CkPrintf("No obj left to be allocated\n"); obj_allocated = true; break; } int randd = rand(); random = randd % parr_objs[p.getProcId()].size(); randomly_obj_id = parr_objs[p.getProcId()][random]; //need to update the load below .. account for freqs obj_load = ogr->vertices[randomly_obj_id].getVertexLoad(); // CkPrintf("Heavy %d: Parr obj size : %d random : %d random obj id : %d\n", p_index, // parr_objs[p.getProcId()].size(), randd, randomly_obj_id); std::vector<int> possible_pes; getPossiblePes(possible_pes, randomly_obj_id, ogr, parr); for (i = 0; i < possible_pes.size(); i++) { // If the heaviest communicating processor is there in the list, then // assign it to that. possible_pe = possible_pes[i]; if ((parr->procs[possible_pe].getTotalLoad() + obj_load) < upper_threshold_temp) { // CkPrintf("** Transfered %d(Load %lf) from %d:%d(Load %lf) to %d:%d(Load %lf)\n", // randomly_obj_id, obj_load, CkNodeOf(p.getProcId()), p.getProcId(), p.getTotalLoad(), // CkNodeOf(possible_pe), possible_pe, // parr->procs[possible_pe].getTotalLoad()); handleTransfer(randomly_obj_id, p, possible_pe, parr_objs, ogr, parr); obj_allocated = true; total_swaps--; updateLoadInfo(p_index, possible_pe, upper_threshold_temp, lower_threshold_temp, parr_above_avg, parr_below_avg, proc_load_info, parr); break; } } // Since there is no processor in the least loaded list with which this // chare communicates, pick a random least loaded processor. if (!obj_allocated) { //CkPrintf(":( Could not transfer to the nearest communicating ones\n"); for (int x = 0; x < parr_below_avg.size(); x++) { int random_pe = parr_below_avg[x]; if ((parr->procs[random_pe].getTotalLoad() + obj_load) < upper_threshold_temp) { obj_allocated = true; total_swaps--; handleTransfer(randomly_obj_id, p, random_pe, parr_objs, ogr, parr); updateLoadInfo(p_index, random_pe, upper_threshold_temp, lower_threshold_temp, parr_above_avg, parr_below_avg, proc_load_info, parr); break; } num_tries++; } } } if (!obj_allocated) { //CkPrintf("!!!! Could not handle the heavy proc %d so giving up\n", p_index); // parr_above_avg.push_back(p_index); // std::push_heap(parr_above_avg.begin(), parr_above_avg.end(), // ProcLoadGreater(parr)); } } //CkPrintf("CommAwareRefine> After lb max load: %lf avg load: %lf\n", max_load, avg_load/parr->procs.size()); /** ============================== CLEANUP ================================ */ ogr->convertDecisions(stats); // Send decisions back to LDStats delete parr; delete ogr; delete[] parr_objs; #else CmiAbort("TempLBs are not supported without the TEMP_LDB flag\n"); #endif }