/**
 * Entry point of the recursive-bipartition load-balancing strategy.
 *
 * Builds a processor array and an object graph from `stats`, resets the
 * strategy state declared outside this function (level, peno, TOTALLOAD,
 * numparts, parray), registers one Vertex_helper per graph vertex, runs
 * RecursiveBiPart() over the full vertex set and finally writes the
 * resulting decisions back into `stats`.
 *
 * @param stats load-balancing database; updated through
 *              ObjGraph::convertDecisions().
 */
void RecBipartLB::work(LDStats *stats) {
  vector<Vertex *> ptrvector;

  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);       // Processor Array
  ObjGraph *ogr = new ObjGraph(stats);          // Object Graph

  /** ============================= STRATEGY ================================ */
  level = 0;
  peno = 0;
  TOTALLOAD = 0;
  numparts = CkNumPes();
  parray = parr;

  parr->resetTotalLoad();

  // One helper per vertex, plus a pointer to every vertex for the recursion.
  // NOTE(review): the Vertex_helper objects appended to vhelpers are not
  // released in this function -- confirm who owns them.
  for (size_t i = 0; i < ogr->vertices.size(); i++) {
    vhelpers.push_back(new Vertex_helper());
    ptrvector.push_back((Vertex *)&(ogr->vertices[i]));
  }

  RecursiveBiPart(ogr, ptrvector, 1, numparts);

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);         // Send decisions back to LDStats

  // Release the per-invocation structures. parray aliases parr, so clear it
  // first to avoid leaving a dangling pointer behind.
  parray = NULL;
  delete parr;
  delete ogr;
}
void GraphBFTLB::work(LDStats *stats) { /** ========================== INITIALIZATION ============================= */ ProcArray *parr = new ProcArray(stats); // Processor Array ObjGraph *ogr = new ObjGraph(stats); // Object Graph /** ============================= STRATEGY ================================ */ double avgLoad = parr->getAverageLoad(); int numPes = parr->procs.size(); // CkPrintf("Average Load %g\n\n", avgLoad); // for(int i=0; i<numPes; i++) // CkPrintf("PE [%d] %g %g\n", i, parr->procs[i].getTotalLoad(), parr->procs[i].getOverhead()); parr->resetTotalLoad(); int start = 0, nextPe = 0; std::queue<int> vertexq; // start at vertex with id 0 vertexq.push(start); if(parr->procs[nextPe].getTotalLoad() + ogr->vertices[start].getVertexLoad() > avgLoad) { nextPe++; avgLoad += (avgLoad - parr->procs[nextPe].getTotalLoad())/(numPes-nextPe); } ogr->vertices[start].setNewPe(nextPe); // CkPrintf("[%d] %d %d %g %g %g\n", start, ogr->vertices[start].getCurrentPe(), ogr->vertices[start].getNewPe(), parr->procs[nextPe].getTotalLoad(), ogr->vertices[start].getVertexLoad(), parr->procs[nextPe].getTotalLoad() + ogr->vertices[start].getVertexLoad()); parr->procs[nextPe].totalLoad() += ogr->vertices[start].getVertexLoad(); int i, nbr; // breadth first traversal while(!vertexq.empty()) { start = vertexq.front(); vertexq.pop(); for(i = 0; i < ogr->vertices[start].sendToList.size(); i++) { // look at all neighbors of a node in the queue and map them while // inserting them in the queue (so we can look at their neighbors next) nbr = ogr->vertices[start].sendToList[i].getNeighborId(); if(ogr->vertices[nbr].getNewPe() == -1) { vertexq.push(nbr); if(parr->procs[nextPe].getTotalLoad() + ogr->vertices[nbr].getVertexLoad() > avgLoad) { nextPe++; avgLoad += (avgLoad - parr->procs[nextPe].getTotalLoad())/(numPes-nextPe); } ogr->vertices[nbr].setNewPe(nextPe); // CkPrintf("[%d] %d %d %g %g %g\n", nbr, ogr->vertices[nbr].getCurrentPe(), ogr->vertices[nbr].getNewPe(), 
parr->procs[nextPe].getTotalLoad(), ogr->vertices[start].getVertexLoad(), parr->procs[nextPe].getTotalLoad() + ogr->vertices[start].getVertexLoad()); parr->procs[nextPe].totalLoad() += ogr->vertices[nbr].getVertexLoad(); } } // end of for loop } // end of while loop /** ============================== CLEANUP ================================ */ ogr->convertDecisions(stats); // Send decisions back to LDStats }
/**
 * Greedy mapping of objects onto processors.
 *
 * After resetting every processor's total load, the objects are sorted with
 * ObjLoadGreater and the processors are kept in a heap ordered by
 * ProcLoadGreater. Each object in sorted order is assigned to the processor
 * at the top of the heap, whose load is then increased by the object's load
 * before it is reinserted. The decisions are written back into `stats`.
 *
 * @param stats load-balancing database; updated through
 *              ObjGraph::convertDecisions().
 */
void TempAwareGreedyLB::work(LDStats* stats) {
  CkPrintf("----------------- in TempAwareGreedyLB -----------\n");

  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);       // Processor Array
  ObjGraph *ogr = new ObjGraph(stats);          // Object Graph

  /** ============================= STRATEGY ================================ */
  parr->resetTotalLoad();

  if (_lb_args.debug()>1)
    CkPrintf("[%d] In TempAwareGreedyLB strategy\n",CkMyPe());

  // objects ordered by ObjLoadGreater (heaviest first, per the original
  // author's comments)
  std::sort(ogr->vertices.begin(), ogr->vertices.end(), ObjLoadGreater());

  // heap of processors (least loaded on top, per the original author's
  // comments)
  std::make_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());

  // Without processors there is nothing to assign to; skipping the loop also
  // avoids calling front() on an empty vector.
  if (!parr->procs.empty()) {
    for (size_t vert = 0; vert < ogr->vertices.size(); vert++) {
      // Pop the processor at the top of the heap
      ProcInfo p = parr->procs.front();
      std::pop_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
      parr->procs.pop_back();

      // Charge the next unmapped object to that processor
      p.setTotalLoad(p.getTotalLoad() + ogr->vertices[vert].getVertexLoad());
      ogr->vertices[vert].setNewPe(p.getProcId());

      // Reinsert the processor with its updated load
      parr->procs.push_back(p);
      std::push_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
    }
  }

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);         // Send decisions back to LDStats
  delete parr;
  delete ogr;
}
/**
 * Partitions (first step) or repartitions (later steps) the object graph with
 * Scotch and records the resulting object-to-processor mapping.
 *
 * Steps:
 *  1. Fold every recvFrom edge that duplicates a sendTo edge of the same
 *     vertex into the sendTo edge (byte counts are summed).
 *  2. Convert the graph into Scotch's compact adjacency arrays, scaling
 *     vertex loads to integers in [0, 256] and edge byte counts to integers
 *     in [0, 1024].
 *  3. Call SCOTCH_graphPart when step() == 0, otherwise SCOTCH_graphRepart
 *     with the current mapping and a fixed migration cost.
 *  4. Set the new PE of every vertex whose partition differs from its current
 *     PE and write the decisions back into `stats`.
 *
 * @param stats load-balancing database; updated through
 *              ObjGraph::convertDecisions().
 */
void ScotchRefineLB::work(LDStats *stats) {
  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);
  ObjGraph *ogr = new ObjGraph(stats);

  /** ============================= STRATEGY ================================ */
  // convert ObjGraph to the Scotch graph
  SCOTCH_Num baseval = 0;                       // starting index of vertices
  SCOTCH_Num vertnbr = ogr->vertices.size();    // number of vertices
  SCOTCH_Num edgenbr = 0;                       // number of edges

  SCOTCH_Num *oldpemap = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * vertnbr);

  double maxLoad = 0.0;
  long maxBytes = 1;        // starts at 1 so byteRatio never divides by zero
  int i, vert;
  size_t j, k;

  /** remove duplicate edges from recvFrom */
  for (i = baseval; i < vertnbr; i++) {
    for (j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      vert = ogr->vertices[i].sendToList[j].getNeighborId();

      // k only advances when nothing was erased: erase() shifts the
      // following element into slot k, and it must be examined as well.
      for (k = 0; k < ogr->vertices[i].recvFromList.size(); ) {
        if (ogr->vertices[i].recvFromList[k].getNeighborId() == vert) {
          ogr->vertices[i].sendToList[j].setNumBytes(
              ogr->vertices[i].sendToList[j].getNumBytes() +
              ogr->vertices[i].recvFromList[k].getNumBytes());
          ogr->vertices[i].recvFromList.erase(ogr->vertices[i].recvFromList.begin() + k);
        } else {
          k++;
        }
      }
    }
  }

  /** gather the maximum load, the edge count and the current mapping */
  for (i = baseval; i < vertnbr; i++) {
    if (ogr->vertices[i].getVertexLoad() > maxLoad)
      maxLoad = ogr->vertices[i].getVertexLoad();

    edgenbr += ogr->vertices[i].sendToList.size() + ogr->vertices[i].recvFromList.size();
    oldpemap[i] = ogr->vertices[i].getCurrentPe();
  }

  /** find the largest byte count on any edge */
  for (i = baseval; i < vertnbr; i++) {
    for (j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      if (ogr->vertices[i].sendToList[j].getNumBytes() > maxBytes)
        maxBytes = ogr->vertices[i].sendToList[j].getNumBytes();
    }
    for (j = 0; j < ogr->vertices[i].recvFromList.size(); j++) {
      if (ogr->vertices[i].recvFromList[j].getNumBytes() > maxBytes)
        maxBytes = ogr->vertices[i].recvFromList[j].getNumBytes();
    }
  }

  /* adjacency list */
  SCOTCH_Num *verttab = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * (vertnbr+1));
  /* loads of vertices */
  SCOTCH_Num *velotab = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * vertnbr);
  /* id of the neighbors */
  SCOTCH_Num *edgetab = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * edgenbr);
  /* number of bytes exchanged */
  SCOTCH_Num *edlotab = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * edgenbr);

  int edgeNum = 0;

  /** the object load is normalized to an integer between 0 and 256 */
  // When no vertex carries any load, 256.0/maxLoad would be infinite and
  // 0 * inf yields NaN, whose conversion to int is undefined; use 0 instead.
  const double ratio = (maxLoad > 0.0) ? 256.0/maxLoad : 0.0;
  /** the edge weight is normalized to an integer between 0 and 1024 */
  const double byteRatio = 1024.0/maxBytes;

  for (i = baseval; i < vertnbr; i++) {
    verttab[i] = edgeNum;
    velotab[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio);

    for (j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      edgetab[edgeNum] = ogr->vertices[i].sendToList[j].getNeighborId();
      edlotab[edgeNum] = (int)ceil(ogr->vertices[i].sendToList[j].getNumBytes() * byteRatio);
      edgeNum++;
    }
    for (j = 0; j < ogr->vertices[i].recvFromList.size(); j++) {
      edgetab[edgeNum] = ogr->vertices[i].recvFromList[j].getNeighborId();
      edlotab[edgeNum] = (int)ceil(ogr->vertices[i].recvFromList[j].getNumBytes() * byteRatio);
      edgeNum++;
    }
  }
  // closing sentinel: one past the last edge of the last vertex
  verttab[i] = edgeNum;
  CkAssert(edgeNum == edgenbr);

  SCOTCH_Graph graph;           // Graph to partition
  SCOTCH_Strat strat;           // Strategy to achieve partitioning

  /* Initialize data structures */
  SCOTCH_graphInit (&graph);
  SCOTCH_stratInit (&strat);

  SCOTCH_graphBuild (&graph, baseval, vertnbr, verttab, NULL, velotab, NULL,
                     edgenbr, edgetab, edlotab);
  SCOTCH_graphCheck (&graph);

  double migration_cost = 1024.0;

  if (step() == 0) {
    SCOTCH_stratGraphMapBuild (&strat, SCOTCH_STRATBALANCE, parr->procs.size (), 0.01);
  } else {
    SCOTCH_stratGraphMapBuild (&strat, SCOTCH_STRATBALANCE | SCOTCH_STRATREMAP,
                               parr->procs.size (), 0.01);
  }

  SCOTCH_Num *pemap = (SCOTCH_Num *)malloc(sizeof(SCOTCH_Num) * vertnbr);

  // First step: plain partitioning. Later steps: repartition, passing the
  // old mapping and the migration cost so existing placement is considered.
  if (step() == 0) {
    SCOTCH_graphPart(&graph, parr->procs.size(), &strat, pemap);
  } else {
    SCOTCH_graphRepart(&graph, parr->procs.size(), oldpemap, migration_cost,
                       NULL, &strat, pemap);
  }

  SCOTCH_graphExit (&graph);
  SCOTCH_stratExit (&strat);

  free(verttab);
  free(velotab);
  free(edgetab);
  free(edlotab);

  // Only vertices whose partition differs from their current PE are updated.
  for (i = baseval; i < vertnbr; i++) {
    if (pemap[i] != ogr->vertices[i].getCurrentPe())
      ogr->vertices[i].setNewPe(pemap[i]);
  }

  free(pemap);
  free(oldpemap);

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);
  delete parr;
  delete ogr;
}
/**
 * Refinement strategy that falls back to swapping.
 *
 * Processors whose load lies more than 1% above the average are collected in
 * a heap (ordered by ProcLoadGreaterIndex), those more than 1% below the
 * average in a plain list. While both collections are non-empty, the
 * processor returned by getMax() is handed to refine(); if that fails,
 * refineSwap() is tried; if that fails too, the processor is put back on the
 * heap and the loop terminates. Decisions are then written back to `stats`.
 *
 * @param stats load-balancing database; updated through
 *              ObjGraph::convertDecisions().
 */
void RefineSwapLB::work(LDStats* stats) {
  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);       // Processor Array
  ObjGraph *ogr = new ObjGraph(stats);          // Object Graph

  /** ============================= STRATEGY ================================ */
  if (_lb_args.debug()>1)
    CkPrintf("[%d] In RefineSwapLB strategy\n",CkMyPe());

  // Tolerance band of +/- 1% around the average load.
  double avg_load = parr->getAverageLoad();
  double threshold = avg_load * 0.01;
  double lower_bound_load = avg_load - threshold;
  double upper_bound_load = avg_load + threshold;
  cout <<"Average load " << avg_load << endl;

  std::vector<int> min_pe_heap;   // indices of underloaded processors
  std::vector<int> max_pe_heap;   // indices of overloaded processors

  // pe_obj[p] lists the indices of the vertices currently placed on PE p.
  std::vector<int>* pe_obj = new std::vector<int>[parr->procs.size()];
  for (int obj = 0; obj < ogr->vertices.size(); obj++) {
    const int owner = ogr->vertices[obj].getCurrentPe();
    pe_obj[owner].push_back(obj);
  }

  // Classify every processor against the tolerance band; processors inside
  // the band are left alone.
  for (int pe = 0; pe < parr->procs.size(); pe++) {
    const double load = parr->procs[pe].getTotalLoad();
    if (load > upper_bound_load) {
      max_pe_heap.push_back(pe);
    } else if (load < lower_bound_load) {
      min_pe_heap.push_back(pe);
    }
  }

  std::make_heap(max_pe_heap.begin(), max_pe_heap.end(), ProcLoadGreaterIndex(parr));

  while (!max_pe_heap.empty() && !min_pe_heap.empty()) {
    // presumably removes and returns the most loaded PE -- getMax() is
    // defined outside this block
    int p_index = getMax(parr, max_pe_heap);

    // First try to move objects away from the overloaded processor.
    if (refine(parr, ogr, max_pe_heap, min_pe_heap, pe_obj, p_index, avg_load,
               threshold)) {
      continue;
    }

    // Plain refinement failed: swap with something.
    if (refineSwap(parr, ogr, max_pe_heap, min_pe_heap, pe_obj, p_index,
                   avg_load, threshold)) {
      continue;
    }

    // Neither approach helped: restore the processor and give up.
    max_pe_heap.push_back(p_index);
    std::push_heap(max_pe_heap.begin(), max_pe_heap.end(),
                   ProcLoadGreaterIndex(parr));
    break;
  }

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);         // Send decisions back to LDStats
  delete[] pe_obj;
  delete parr;
  delete ogr;
}
/**
 * Partitions the object graph with METIS and records the resulting
 * object-to-processor mapping.
 *
 * Steps:
 *  1. Fold every recvFrom edge that duplicates a sendTo edge of the same
 *     vertex into the sendTo edge (byte counts are summed).
 *  2. Convert the graph into the xadj/adjncy adjacency arrays, scaling the
 *     vertex loads to integers in [0, 256]; edge weights are the raw byte
 *     counts.
 *  3. Call METIS_PartGraphRecursive with one balance constraint and a 10%
 *     imbalance tolerance.
 *  4. Set the new PE of every vertex whose partition differs from its current
 *     PE and write the decisions back into `stats`.
 *
 * @param stats load-balancing database; updated through
 *              ObjGraph::convertDecisions().
 */
void MetisLB::work(LDStats* stats) {
  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);
  ObjGraph *ogr = new ObjGraph(stats);

  /** ============================= STRATEGY ================================ */
  if (_lb_args.debug() >= 2) {
    CkPrintf("[%d] In MetisLB Strategy...\n", CkMyPe());
  }

  // convert ObjGraph to the adjacency structure
  int numVertices = ogr->vertices.size();       // number of vertices
  int numEdges = 0;                             // number of edges

  double maxLoad = 0.0;
  int i, vert;
  size_t j, k;

  /** remove duplicate edges from recvFrom */
  for (i = 0; i < numVertices; i++) {
    for (j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      vert = ogr->vertices[i].sendToList[j].getNeighborId();

      // k only advances when nothing was erased: erase() shifts the
      // following element into slot k, and it must be examined as well.
      for (k = 0; k < ogr->vertices[i].recvFromList.size(); ) {
        if (ogr->vertices[i].recvFromList[k].getNeighborId() == vert) {
          ogr->vertices[i].sendToList[j].setNumBytes(
              ogr->vertices[i].sendToList[j].getNumBytes() +
              ogr->vertices[i].recvFromList[k].getNumBytes());
          ogr->vertices[i].recvFromList.erase(ogr->vertices[i].recvFromList.begin() + k);
        } else {
          k++;
        }
      }
    }
  }

  /** find the maximum load and count the edges */
  for (i = 0; i < numVertices; i++) {
    if (ogr->vertices[i].getVertexLoad() > maxLoad)
      maxLoad = ogr->vertices[i].getVertexLoad();
    numEdges = numEdges + ogr->vertices[i].sendToList.size() +
               ogr->vertices[i].recvFromList.size();
  }

  /* adjacency list */
  idx_t *xadj = new idx_t[numVertices + 1];
  /* id of the neighbors */
  idx_t *adjncy = new idx_t[numEdges];
  /* weights of the vertices */
  idx_t *vwgt = new idx_t[numVertices];
  /* weights of the edges */
  idx_t *adjwgt = new idx_t[numEdges];

  int edgeNum = 0;

  /** the object load is normalized to an integer between 0 and 256 */
  // When no vertex carries any load, 256.0/maxLoad would be infinite and
  // 0 * inf yields NaN, whose conversion to int is undefined; use 0 instead.
  const double ratio = (maxLoad > 0.0) ? 256.0/maxLoad : 0.0;

  for (i = 0; i < numVertices; i++) {
    xadj[i] = edgeNum;
    vwgt[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio);

    for (j = 0; j < ogr->vertices[i].sendToList.size(); j++) {
      adjncy[edgeNum] = ogr->vertices[i].sendToList[j].getNeighborId();
      adjwgt[edgeNum] = ogr->vertices[i].sendToList[j].getNumBytes();
      edgeNum++;
    }
    for (j = 0; j < ogr->vertices[i].recvFromList.size(); j++) {
      adjncy[edgeNum] = ogr->vertices[i].recvFromList[j].getNeighborId();
      adjwgt[edgeNum] = ogr->vertices[i].recvFromList[j].getNumBytes();
      edgeNum++;
    }
  }
  // closing sentinel: one past the last edge of the last vertex
  xadj[i] = edgeNum;
  CkAssert(edgeNum == numEdges);

  idx_t edgecut;                // number of edges cut by the partitioning
  idx_t *pemap;

  idx_t options[METIS_NOPTIONS];
  METIS_SetDefaultOptions(options);
  //options[METIS_OPTION_PTYPE] = METIS_PTYPE_RB;
  // C style numbering
  options[METIS_OPTION_NUMBERING] = 0;

  // number of vertices, in the integer type METIS expects
  idx_t nvtxs = numVertices;
  // number of constrains
  idx_t ncon = 1;
  // number of partitions
  idx_t numPes = parr->procs.size();
  // one tolerance per constraint (ncon == 1); allow 10% imbalance
  real_t ubvec[1];
  ubvec[0] = 1.1;

  // mapping of objs to partitions
  pemap = new idx_t[numVertices];

  // Specifies size of vertices for computing the total communication volume
  idx_t *vsize = NULL;
  // This array of size nparts specifies the desired weight for each partition
  // and setting it to NULL indicates graph should be equally divided among
  // partitions
  real_t *tpwgts = NULL;

  int option = 0;
  if (WEIGHTED == option) {
    // set up the different weights between 0 and 1
    tpwgts = new real_t[numPes];
    for (i = 0; i < numPes; i++) {
      tpwgts[i] = 1.0/(real_t)numPes;
    }
  } else if (MULTI_CONSTRAINT == option) {
    CkAbort("Multiple constraints not implemented.\n");
  }

  // nvtxs: num vertices in the graph; ncon: num balancing constrains
  // xadj, adjncy: of size n+1 and adjncy of 2m, adjncy[xadj[i]] through and
  // including adjncy[xadj[i+1]-1];
  // vwgt: weight of the vertices; vsize: amt of data that needs to be sent
  // for ith vertex is vsize[i]
  // adjwght: the weight of edges; numPes: total parts
  // tpwghts: target partition weight, can pass NULL to equally divide
  // ubvec: of size ncon to indicate allowed load imbalance tolerance (> 1.0)
  // options: array of options; edgecut: stores the edgecut; pemap: mapping
  METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, vsize, adjwgt,
                           &numPes, tpwgts, ubvec, options, &edgecut, pemap);

  delete[] xadj;
  delete[] adjncy;
  delete[] vwgt;
  delete[] adjwgt;
  delete[] vsize;
  delete[] tpwgts;

  if (_lb_args.debug() >= 1) {
    CkPrintf("[%d] MetisLB done! \n", CkMyPe());
  }

  // Only vertices whose partition differs from their current PE are updated.
  for (i = 0; i < numVertices; i++) {
    if (pemap[i] != ogr->vertices[i].getCurrentPe())
      ogr->vertices[i].setNewPe(pemap[i]);
  }

  delete[] pemap;

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);
  delete parr;
  delete ogr;
}
/**
 * Two-phase strategy: greedy load balancing followed by topology-aware
 * remapping.
 *
 * Phase 1 assigns objects (sorted with ObjLoadGreater) one by one to the
 * processor at the top of a heap ordered by ProcLoadGreater and writes the
 * result into `stats`.
 *
 * Phase 2 builds a processor-to-processor communication matrix from the
 * object messages in stats->commData under the phase-1 mapping, derives a
 * processor permutation from the hardware topology via the tree-matching
 * routines, and applies that permutation to the mapping of every object.
 *
 * @param stats load-balancing database; stats->to_proc holds the final
 *              mapping on return.
 */
void TreeMatchLB::work(BaseLB::LDStats* stats) {
  /** ========================= 1st Do Load Balancing =======================*/

  /** ========================== INITIALIZATION ============================= */
  ProcArray *parr = new ProcArray(stats);       // Processor Array
  ObjGraph *ogr = new ObjGraph(stats);          // Object Graph

  /** ============================= STRATEGY ================================ */
  parr->resetTotalLoad();

  if (_lb_args.debug()>1)
    CkPrintf("[%d] In GreedyLB strategy\n",CkMyPe());

  // objects ordered by ObjLoadGreater (heaviest first, per the original
  // author's comments)
  std::sort(ogr->vertices.begin(), ogr->vertices.end(), ObjLoadGreater());

  // heap of processors (least loaded on top, per the original author's
  // comments)
  std::make_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());

  // Without processors there is nothing to assign to; skipping the loop also
  // avoids calling front() on an empty vector.
  if (!parr->procs.empty()) {
    for (size_t vert = 0; vert < ogr->vertices.size(); vert++) {
      // Pop the processor at the top of the heap
      ProcInfo p = parr->procs.front();
      std::pop_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
      parr->procs.pop_back();

      // Charge the next unmapped object to that processor
      p.totalLoad() += ogr->vertices[vert].getVertexLoad();
      ogr->vertices[vert].setNewPe(p.getProcId());

      // Reinsert the processor with its updated load
      parr->procs.push_back(p);
      std::push_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
    }
  }

  /** ============================== CLEANUP ================================ */
  ogr->convertDecisions(stats);         // Send decisions back to LDStats

  // The graph and processor array are not used by the second phase.
  delete parr;
  delete ogr;

  /** ====================== 2nd do Topology aware mapping ====================*/
  int i;

  /* get number of processors and the greedy load balancing */
  const int nb_procs = stats->nprocs();
  int *object_mapping = stats->to_proc.getVec();
  // object_mapping has one entry per object, not per processor.
  // NOTE(review): assumes to_proc.size() is the object count -- confirm.
  const int nb_objs = stats->to_proc.size();

  stats->makeCommHash();

  // allocate the (zero-initialised) communication matrix
  double **comm_mat = (double**)malloc(sizeof(double*)*nb_procs);
  for (i = 0; i < nb_procs; i++) {
    comm_mat[i] = (double*)calloc(nb_procs, sizeof(double));
  }

  /* Build the communication matrix */
  for (i = 0; i < stats->n_comm; i++) {
    LDCommData &commData = stats->commData[i];
    if ((!commData.from_proc()) && (commData.recv_type() == LD_OBJ_MSG)) {
      /* object_mapping[k] is the processor of object k */
      int from = object_mapping[stats->getHash(commData.sender)];
      int to = object_mapping[stats->getHash(commData.receiver.get_destObj())];
      if (from != to) {
        comm_mat[from][to] += commData.bytes;
        comm_mat[to][from] += commData.bytes;
      }
    }
  }

  /* build the topology of the hardware (abe machine here) */
  tm_topology_t *topology = build_abe_topology(nb_procs);
  display_topology(topology);

  /* compute the affinity tree */
  // NOTE(review): comm_tree is never released here; confirm whether the
  // tree-matching library provides a function to free it.
  tree_t *comm_tree = build_tree_from_topology(topology, comm_mat, nb_procs, NULL, NULL);

  /* Compute the processor permutation */
  int *permutation = (int*)malloc(sizeof(int)*nb_procs);
  map_topology_simple(topology, comm_tree, permutation, NULL);

  /*
     Apply this permutation to all objects.
     Side effect: object_mapping points to stats->to_proc.getVec(),
     so these lines also change stats->to_proc.
  */
  for (i = 0; i < nb_objs; i++)
    object_mapping[i] = permutation[object_mapping[i]];

  free(permutation);

  // free communication matrix
  for (i = 0; i < nb_procs; i++) {
    free(comm_mat[i]);
  }
  free(comm_mat);
  free_topology(topology);
}