int main(int argc, char *argv[])
{
  idx_t mype, npes;
  MPI_Comm comm;

  MPI_Init(&argc, &argv);
  MPI_Comm_dup(MPI_COMM_WORLD, &comm);

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  if (argc != 2 && argc != 3) {
    if (mype == 0)
      printf("Usage: %s <graph-file> [coord-file]\n", argv[0]);

    MPI_Finalize();
    exit(0);
  }

  TestParMetis_GPart(argv[1], (argc == 3 ? argv[2] : NULL), comm);

  gkMPI_Comm_free(&comm);

  MPI_Finalize();

  return 0;
}
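/* Example invocation (a sketch; the binary and file names are whatever your
   build produced): run the test on 4 MPI ranks, with or without a matching
   coordinates file:

     mpirun -np 4 ./ptest rotor.graph
     mpirun -np 4 ./ptest rotor.graph rotor.graph.xyz
*/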
/******************************************************************************
* This function takes a graph and its partition vector and creates a new
* graph corresponding to the one after the movement
*******************************************************************************/
void TestMoveGraph(graph_t *ograph, graph_t *omgraph, idx_t *part, MPI_Comm comm)
{
  idx_t npes, mype;
  ctrl_t *ctrl;
  graph_t *graph, *mgraph;
  idx_t options[5] = {0, 0, 1, 0, 0};

  gkMPI_Comm_size(comm, &npes);
  ctrl = SetupCtrl(PARMETIS_OP_KMETIS, NULL, 1, npes, NULL, NULL, comm);
  mype = ctrl->mype;

  ctrl->CoarsenTo = 1;  /* needed by SetUpGraph, otherwise we can get FP errors */
  graph = TestSetUpGraph(ctrl, ograph->vtxdist, ograph->xadj, NULL,
              ograph->adjncy, NULL, 0);
  AllocateWSpace(ctrl, 0);

  CommSetup(ctrl, graph);
  graph->where = part;
  graph->ncon  = 1;
  mgraph = MoveGraph(ctrl, graph);

  omgraph->gnvtxs  = mgraph->gnvtxs;
  omgraph->nvtxs   = mgraph->nvtxs;
  omgraph->nedges  = mgraph->nedges;
  omgraph->vtxdist = mgraph->vtxdist;
  omgraph->xadj    = mgraph->xadj;
  omgraph->adjncy  = mgraph->adjncy;
  mgraph->vtxdist  = NULL;
  mgraph->xadj     = NULL;
  mgraph->adjncy   = NULL;
  FreeGraph(mgraph);

  graph->where = NULL;
  FreeInitialGraphAndRemap(graph);
  FreeCtrl(&ctrl);
}
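/* Usage sketch (hypothetical helper, not part of the test driver): move a
   partitioned graph and report its local size. Assumes `graph` was filled in
   by ParallelReadGraph and `part` holds a valid npes-way partition, as in
   TestParMetis_GPart below. */
#ifdef EXAMPLE_MOVE
static void ExampleMove(graph_t *graph, idx_t *part, MPI_Comm comm)
{
  graph_t mgraph;

  TestMoveGraph(graph, &mgraph, part, comm);  /* mgraph now holds the moved graph */
  printf("moved graph: %"PRIDX" local vertices, %"PRIDX" local edges\n",
      mgraph.nvtxs, mgraph.nedges);
  /* mgraph's vtxdist/xadj/adjncy arrays are owned by the caller from here on */
}
#endif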
/*************************************************************************
* This function implements a simple graph adaption strategy.
**************************************************************************/
void Mc_AdaptGraph(graph_t *graph, idx_t *part, idx_t ncon, idx_t nparts, MPI_Comm comm)
{
  idx_t h, i;
  idx_t nvtxs;
  idx_t npes, mype;
  idx_t *vwgt, *pwgts;

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  nvtxs = graph->nvtxs;
  vwgt  = graph->vwgt;

  pwgts = ismalloc(nparts*ncon, 1, "pwgts");

  if (mype == 0) {
    for (i=0; i<nparts; i++)
      for (h=0; h<ncon; h++)
        pwgts[i*ncon+h] = RandomInRange(20)+1;
  }

  MPI_Bcast((void *)pwgts, nparts*ncon, IDX_T, 0, comm);

  for (i=0; i<nvtxs; i++)
    for (h=0; h<ncon; h++)
      vwgt[i*ncon+h] = pwgts[part[i]*ncon+h];

  free(pwgts);
}
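/* Worked example of the adaption above (a sketch): with ncon = 1, nparts = 2,
   and rank 0 drawing pwgts = {3, 7}, the broadcast makes every process set
   vwgt[i] = 3 for vertices in partition 0 and vwgt[i] = 7 for vertices in
   partition 1, producing a deliberately skewed load for the repartitioning
   tests that follow. */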
/*************************************************************************
* This function implements a simple graph adaption strategy.
**************************************************************************/
void AdaptGraph(graph_t *graph, idx_t afactor, MPI_Comm comm)
{
  idx_t i, nvtxs, nadapt, firstvtx, lastvtx;
  idx_t npes, mype, mypwgt, max, min, sum;
  idx_t *vwgt, *xadj, *adjncy, *adjwgt, *perm;

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  srand(mype*afactor);

  nvtxs  = graph->nvtxs;
  xadj   = graph->xadj;
  adjncy = graph->adjncy;
  if (graph->adjwgt == NULL)
    adjwgt = graph->adjwgt = ismalloc(graph->nedges, 1, "AdaptGraph: adjwgt");
  else
    adjwgt = graph->adjwgt;
  vwgt = graph->vwgt;

  firstvtx = graph->vtxdist[mype];
  lastvtx  = graph->vtxdist[mype+1];

  perm = imalloc(nvtxs, "AdaptGraph: perm");
  FastRandomPermute(nvtxs, perm, 1);

  /* only the last draw is used; the extra calls just advance the RNG state */
  nadapt = RandomInRange(nvtxs);
  nadapt = RandomInRange(nvtxs);
  nadapt = RandomInRange(nvtxs);

  for (i=0; i<nadapt; i++)
    vwgt[perm[i]] = afactor*vwgt[perm[i]];

/*
  for (i=0; i<nvtxs; i++) {
    for (j=xadj[i]; j<xadj[i+1]; j++) {
      k = adjncy[j];
      if (k >= firstvtx && k < lastvtx) {
        adjwgt[j] = (int)pow(1.0*(gk_min(vwgt[i],vwgt[k-firstvtx])), .6667);
        if (adjwgt[j] == 0)
          adjwgt[j] = 1;
      }
    }
  }
*/

  mypwgt = isum(nvtxs, vwgt, 1);

  MPI_Allreduce((void *)&mypwgt, (void *)&max, 1, IDX_T, MPI_MAX, comm);
  MPI_Allreduce((void *)&mypwgt, (void *)&min, 1, IDX_T, MPI_MIN, comm);
  MPI_Allreduce((void *)&mypwgt, (void *)&sum, 1, IDX_T, MPI_SUM, comm);

  if (mype == 0)
    printf("Initial Load Imbalance: %5.4"PRREAL", [%5"PRIDX" %5"PRIDX" %5"PRIDX"] for afactor: %"PRIDX"\n",
        (1.0*max*npes)/(1.0*sum), min, max, sum, afactor);

  free(perm);
}
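/* Worked example of the imbalance metric above (a sketch, not from a real
   run): with npes = 4 and per-process weight sums {100, 100, 100, 300},
   sum = 600 and max = 300, so the reported imbalance is
   max*npes/sum = 300*4/600 = 2.0, i.e. the heaviest process carries twice
   the average load. */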
/*************************************************************************
* This function computes the real cut of the moved graph by assembling
* both partition vectors on processor 0 and re-reading the graph file.
**************************************************************************/
idx_t ComputeRealCutFromMoved(idx_t *vtxdist, idx_t *mvtxdist, idx_t *part,
          idx_t *mpart, char *filename, MPI_Comm comm)
{
  idx_t i, j, nvtxs, mype, npes, cut;
  idx_t *xadj, *adjncy, *gpart, *gmpart, *perm, *sizes;
  MPI_Status status;

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  if (mype != 0) {
    gkMPI_Send((void *)part, vtxdist[mype+1]-vtxdist[mype], IDX_T, 0, 1, comm);
    gkMPI_Send((void *)mpart, mvtxdist[mype+1]-mvtxdist[mype], IDX_T, 0, 1, comm);
  }
  else {  /* processor 0 does all the rest */
    gpart = imalloc(vtxdist[npes], "ComputeRealCut: gpart");
    icopy(vtxdist[1], part, gpart);

    gmpart = imalloc(mvtxdist[npes], "ComputeRealCut: gmpart");
    icopy(mvtxdist[1], mpart, gmpart);

    for (i=1; i<npes; i++) {
      gkMPI_Recv((void *)(gpart+vtxdist[i]), vtxdist[i+1]-vtxdist[i], IDX_T,
          i, 1, comm, &status);
      gkMPI_Recv((void *)(gmpart+mvtxdist[i]), mvtxdist[i+1]-mvtxdist[i], IDX_T,
          i, 1, comm, &status);
    }

    /* reconstruct the permutation that maps the graph to the moved graph */
    perm  = imalloc(vtxdist[npes], "ComputeRealCut: perm");
    sizes = ismalloc(npes+1, 0, "ComputeRealCut: sizes");

    for (i=0; i<vtxdist[npes]; i++)
      sizes[gpart[i]]++;
    MAKECSR(i, npes, sizes);
    for (i=0; i<vtxdist[npes]; i++)
      perm[i] = sizes[gpart[i]]++;

    /* read the graph from the file */
    ReadMetisGraph(filename, &nvtxs, &xadj, &adjncy);

    /* compute the cut */
    for (cut=0, i=0; i<nvtxs; i++) {
      for (j=xadj[i]; j<xadj[i+1]; j++) {
        if (gmpart[perm[i]] != gmpart[perm[adjncy[j]]])
          cut++;
      }
    }
    cut = cut/2;  /* each cut edge was counted from both endpoints */

    gk_free((void **)&gpart, &gmpart, &perm, &sizes, &xadj, &adjncy, LTERM);

    return cut;
  }

  return 0;
}
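/* Worked example of the permutation above (a sketch): with npes = 2 and
   gpart = {1, 0, 1, 0}, the counts are sizes = {2, 2}; MAKECSR turns this
   into offsets {0, 2}, and the loop assigns perm = {2, 0, 3, 1}. Vertex i of
   the original graph becomes vertex perm[i] of the moved graph, with all of
   partition 0's vertices numbered before partition 1's. */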
/*************************************************************************
* This function implements a simple graph adaption strategy.
**************************************************************************/
void AdaptGraph2(graph_t *graph, idx_t afactor, MPI_Comm comm)
{
  idx_t i, j, k, nvtxs, firstvtx, lastvtx;
  idx_t npes, mype, mypwgt, max, min, sum;
  idx_t *vwgt, *xadj, *adjncy, *adjwgt;

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  srand(mype*afactor);

  nvtxs  = graph->nvtxs;
  xadj   = graph->xadj;
  adjncy = graph->adjncy;
  if (graph->adjwgt == NULL)
    adjwgt = graph->adjwgt = ismalloc(graph->nedges, 1, "AdaptGraph: adjwgt");
  else
    adjwgt = graph->adjwgt;
  vwgt = graph->vwgt;

  firstvtx = graph->vtxdist[mype];
  lastvtx  = graph->vtxdist[mype+1];

  /* if (RandomInRange(npes+1) < .05*npes) { */
  if (RandomInRange(npes+1) < 2) {
    printf("[%"PRIDX"] is adapting\n", mype);
    for (i=0; i<nvtxs; i++)
      vwgt[i] = afactor*vwgt[i];
  }

  for (i=0; i<nvtxs; i++) {
    for (j=xadj[i]; j<xadj[i+1]; j++) {
      k = adjncy[j];
      if (k >= firstvtx && k < lastvtx) {  /* only local edges get reweighted */
        adjwgt[j] = (int)pow(1.0*(gk_min(vwgt[i],vwgt[k-firstvtx])), .6667);
        if (adjwgt[j] == 0)
          adjwgt[j] = 1;
      }
    }
  }

  mypwgt = isum(nvtxs, vwgt, 1);

  gkMPI_Allreduce((void *)&mypwgt, (void *)&max, 1, IDX_T, MPI_MAX, comm);
  gkMPI_Allreduce((void *)&mypwgt, (void *)&min, 1, IDX_T, MPI_MIN, comm);
  gkMPI_Allreduce((void *)&mypwgt, (void *)&sum, 1, IDX_T, MPI_SUM, comm);

  if (mype == 0)
    printf("Initial Load Imbalance: %5.4"PRREAL", [%5"PRIDX" %5"PRIDX" %5"PRIDX"]\n",
        (1.0*max*npes)/(1.0*sum), min, max, sum);
}
/*************************************************************************
* This function computes the real cut by assembling the partition vector
* on processor 0 and re-reading the graph from the file.
**************************************************************************/
idx_t ComputeRealCut(idx_t *vtxdist, idx_t *part, char *filename, MPI_Comm comm)
{
  idx_t i, j, nvtxs, mype, npes, cut;
  idx_t *xadj, *adjncy, *gpart;
  MPI_Status status;

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  if (mype != 0) {
    gkMPI_Send((void *)part, vtxdist[mype+1]-vtxdist[mype], IDX_T, 0, 1, comm);
  }
  else {  /* processor 0 does all the rest */
    gpart = imalloc(vtxdist[npes], "ComputeRealCut: gpart");
    icopy(vtxdist[1], part, gpart);

    for (i=1; i<npes; i++)
      gkMPI_Recv((void *)(gpart+vtxdist[i]), vtxdist[i+1]-vtxdist[i], IDX_T,
          i, 1, comm, &status);

    ReadMetisGraph(filename, &nvtxs, &xadj, &adjncy);

    /* compute the cut */
    for (cut=0, i=0; i<nvtxs; i++) {
      for (j=xadj[i]; j<xadj[i+1]; j++) {
        if (gpart[i] != gpart[adjncy[j]])
          cut++;
      }
    }
    cut = cut/2;  /* each cut edge was counted from both endpoints */

    gk_free((void **)&gpart, &xadj, &adjncy, LTERM);

    return cut;
  }

  return 0;
}
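/* Worked example of the cut count above (a sketch): for the path graph
   0-1-2 with part = {0, 0, 1}, the CSR scan sees edge (1,2) once from
   vertex 1 and once from vertex 2, so the loop counts 2 and the reported
   cut is 2/2 = 1. */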
/*************************************************************************
* This function is the entry point of the initial partition algorithm
* that does recursive bisection.
* This algorithm assembles the graph to all the processors and proceeds
* by parallelizing the recursive bisection step.
**************************************************************************/
void InitPartition(ctrl_t *ctrl, graph_t *graph)
{
  idx_t i, j, ncon, mype, npes, gnvtxs, ngroups;
  idx_t *xadj, *adjncy, *adjwgt, *vwgt;
  idx_t *part, *gwhere0, *gwhere1;
  idx_t *tmpwhere, *tmpvwgt, *tmpxadj, *tmpadjncy, *tmpadjwgt;
  graph_t *agraph;
  idx_t lnparts, fpart, fpe, lnpes;
  idx_t twoparts=2, moptions[METIS_NOPTIONS], edgecut, max_cut;
  real_t *tpwgts, *tpwgts2, *lbvec, lbsum, min_lbsum, wsum;
  MPI_Comm ipcomm;
  struct {
    double sum;
    int rank;
  } lpesum, gpesum;

  WCOREPUSH;

  ncon = graph->ncon;

  ngroups = gk_max(gk_min(RIP_SPLIT_FACTOR, ctrl->npes), 1);

  IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->comm));
  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->InitPartTmr));

  lbvec = rwspacemalloc(ctrl, ncon);

  /* assemble the graph to all the processors */
  agraph = AssembleAdaptiveGraph(ctrl, graph);
  gnvtxs = agraph->nvtxs;

  /* make a copy of the graph's structure for later */
  xadj   = icopy(gnvtxs+1, agraph->xadj, iwspacemalloc(ctrl, gnvtxs+1));
  vwgt   = icopy(gnvtxs*ncon, agraph->vwgt, iwspacemalloc(ctrl, gnvtxs*ncon));
  adjncy = icopy(agraph->nedges, agraph->adjncy, iwspacemalloc(ctrl, agraph->nedges));
  adjwgt = icopy(agraph->nedges, agraph->adjwgt, iwspacemalloc(ctrl, agraph->nedges));
  part   = iwspacemalloc(ctrl, gnvtxs);

  /* create different processor groups */
  gkMPI_Comm_split(ctrl->gcomm, ctrl->mype % ngroups, 0, &ipcomm);
  gkMPI_Comm_rank(ipcomm, &mype);
  gkMPI_Comm_size(ipcomm, &npes);

  /* go into the recursive bisection */
  METIS_SetDefaultOptions(moptions);
  moptions[METIS_OPTION_SEED] = ctrl->sync + (ctrl->mype % ngroups) + 1;

  tpwgts  = ctrl->tpwgts;
  tpwgts2 = rwspacemalloc(ctrl, 2*ncon);

  lnparts = ctrl->nparts;
  fpart = fpe = 0;
  lnpes = npes;
  while (lnpes > 1 && lnparts > 1) {
    /* determine the weights of the two partitions as a function of the
       weight of the target partition weights */
    for (j=(lnparts>>1), i=0; i<ncon; i++) {
      tpwgts2[i]      = rsum(j, tpwgts+fpart*ncon+i, ncon);
      tpwgts2[ncon+i] = rsum(lnparts-j, tpwgts+(fpart+j)*ncon+i, ncon);
      wsum            = 1.0/(tpwgts2[i] + tpwgts2[ncon+i]);
      tpwgts2[i]      *= wsum;
      tpwgts2[ncon+i] *= wsum;
    }

    METIS_PartGraphRecursive(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy,
          agraph->vwgt, NULL, agraph->adjwgt, &twoparts, tpwgts2, NULL, moptions,
          &edgecut, part);

    /* pick one of the branches */
    if (mype < fpe+lnpes/2) {
      KeepPart(ctrl, agraph, part, 0);
      lnpes   = lnpes/2;
      lnparts = lnparts/2;
    }
    else {
      KeepPart(ctrl, agraph, part, 1);
      fpart   = fpart + lnparts/2;
      fpe     = fpe + lnpes/2;
      lnpes   = lnpes - lnpes/2;
      lnparts = lnparts - lnparts/2;
    }
  }

  gwhere0 = iset(gnvtxs, 0, iwspacemalloc(ctrl, gnvtxs));
  gwhere1 = iwspacemalloc(ctrl, gnvtxs);

  if (lnparts == 1) { /* case in which npes is greater than or equal to nparts */
    /* only the first process will assign labels (for the reduction to work) */
    if (mype == fpe) {
      for (i=0; i<agraph->nvtxs; i++)
        gwhere0[agraph->label[i]] = fpart;
    }
  }
  else { /* case in which npes is smaller than nparts */
    /* create the normalized tpwgts for the lnparts from ctrl->tpwgts */
    tpwgts = rwspacemalloc(ctrl, lnparts*ncon);
    for (j=0; j<ncon; j++) {
      for (wsum=0.0, i=0; i<lnparts; i++) {
        tpwgts[i*ncon+j] = ctrl->tpwgts[(fpart+i)*ncon+j];
        wsum += tpwgts[i*ncon+j];
      }
      for (wsum=1.0/wsum, i=0; i<lnparts; i++)
        tpwgts[i*ncon+j] *= wsum;
    }

    METIS_PartGraphKway(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy,
          agraph->vwgt, NULL, agraph->adjwgt, &lnparts, tpwgts, NULL, moptions,
          &edgecut, part);

    for (i=0; i<agraph->nvtxs; i++)
      gwhere0[agraph->label[i]] = fpart + part[i];
  }

  gkMPI_Allreduce((void *)gwhere0, (void *)gwhere1, gnvtxs, IDX_T, MPI_SUM, ipcomm);

  if (ngroups > 1) {
    tmpxadj   = agraph->xadj;
    tmpadjncy = agraph->adjncy;
    tmpadjwgt = agraph->adjwgt;
    tmpvwgt   = agraph->vwgt;
    tmpwhere  = agraph->where;

    agraph->xadj   = xadj;
    agraph->adjncy = adjncy;
    agraph->adjwgt = adjwgt;
    agraph->vwgt   = vwgt;
    agraph->where  = gwhere1;
    agraph->nvtxs  = gnvtxs;

    edgecut = ComputeSerialEdgeCut(agraph);
    ComputeSerialBalance(ctrl, agraph, gwhere1, lbvec);
    lbsum = rsum(ncon, lbvec, 1);

    gkMPI_Allreduce((void *)&edgecut, (void *)&max_cut,   1, IDX_T,  MPI_MAX, ctrl->gcomm);
    gkMPI_Allreduce((void *)&lbsum,   (void *)&min_lbsum, 1, REAL_T, MPI_MIN, ctrl->gcomm);

    lpesum.sum = lbsum;
    if (min_lbsum < UNBALANCE_FRACTION*ncon) {
      if (lbsum < UNBALANCE_FRACTION*ncon)
        lpesum.sum = edgecut;
      else
        lpesum.sum = max_cut;
    }
    lpesum.rank = ctrl->mype;

    /* the group with the best partition broadcasts it to everyone else */
    gkMPI_Allreduce((void *)&lpesum, (void *)&gpesum, 1, MPI_DOUBLE_INT,
        MPI_MINLOC, ctrl->gcomm);
    gkMPI_Bcast((void *)gwhere1, gnvtxs, IDX_T, gpesum.rank, ctrl->gcomm);

    agraph->xadj   = tmpxadj;
    agraph->adjncy = tmpadjncy;
    agraph->adjwgt = tmpadjwgt;
    agraph->vwgt   = tmpvwgt;
    agraph->where  = tmpwhere;
  }

  icopy(graph->nvtxs, gwhere1+graph->vtxdist[ctrl->mype], graph->where);

  FreeGraph(agraph);
  gkMPI_Comm_free(&ipcomm);

  IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->comm));
  IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->InitPartTmr));

  WCOREPOP;
}
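/* Worked example of the bisection bookkeeping above (a sketch): with npes = 4
   and nparts = 8 the while-loop runs twice. The process with mype = 3 takes
   the "else" branch both times, ending with fpart = 6, lnpes = 1, and
   lnparts = 2; since lnparts > 1 it calls METIS_PartGraphKway on its remaining
   subgraph to split parts {6, 7}. The MPI_SUM Allreduce then merges the
   disjoint label ranges assigned by the group's processes into gwhere1. */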
/*************************************************************************
* Let the game begin
**************************************************************************/
int main(int argc, char *argv[])
{
  idx_t i, j, npes, mype, optype, nparts, adptf, options[10];
  idx_t *part=NULL, *sizes=NULL;
  graph_t graph;
  real_t ipc2redist, *xyz=NULL, *tpwgts=NULL, ubvec[MAXNCON];
  MPI_Comm comm;
  idx_t numflag=0, wgtflag=0, ndims, edgecut;
  char xyzfilename[8192];

  MPI_Init(&argc, &argv);
  MPI_Comm_dup(MPI_COMM_WORLD, &comm);
  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  if (argc != 8) {
    if (mype == 0)
      printf("Usage: %s <graph-file> <op-type> <nparts> <adapt-factor> <ipc2redist> <dbglvl> <seed>\n", argv[0]);

    MPI_Finalize();
    exit(0);
  }

  optype     = atoi(argv[2]);
  nparts     = atoi(argv[3]);
  adptf      = atoi(argv[4]);
  ipc2redist = atof(argv[5]);

  options[0]                  = 1;
  options[PMV3_OPTION_DBGLVL] = atoi(argv[6]);
  options[PMV3_OPTION_SEED]   = atoi(argv[7]);

  if (mype == 0)
    printf("reading file: %s\n", argv[1]);
  ParallelReadGraph(&graph, argv[1], comm);

  /* Remove the edges for testing */
  /* iset(graph.vtxdist[mype+1]-graph.vtxdist[mype]+1, 0, graph.xadj); */

  rset(graph.ncon, 1.05, ubvec);
  tpwgts = rmalloc(nparts*graph.ncon, "tpwgts");
  rset(nparts*graph.ncon, 1.0/(real_t)nparts, tpwgts);

  /*
  ChangeToFortranNumbering(graph.vtxdist, graph.xadj, graph.adjncy, mype, npes);
  numflag = 1;

  nvtxs  = graph.vtxdist[mype+1]-graph.vtxdist[mype];
  nedges = graph.xadj[nvtxs];
  printf("%"PRIDX" %"PRIDX"\n", isum(nvtxs, graph.xadj, 1), isum(nedges, graph.adjncy, 1));
  */

  if (optype >= 20) {
    sprintf(xyzfilename, "%s.xyz", argv[1]);
    xyz = ReadTestCoordinates(&graph, xyzfilename, &ndims, comm);
  }

  if (mype == 0)
    printf("finished reading file: %s\n", argv[1]);

  part  = ismalloc(graph.nvtxs, mype%nparts, "main: part");
  sizes = imalloc(2*npes, "main: sizes");

  switch (optype) {
    case 1:
      wgtflag = 3;
      ParMETIS_V3_PartKway(graph.vtxdist, graph.xadj, graph.adjncy, graph.vwgt,
          graph.adjwgt, &wgtflag, &numflag, &graph.ncon, &nparts, tpwgts, ubvec,
          options, &edgecut, part, &comm);
      WritePVector(argv[1], graph.vtxdist, part, MPI_COMM_WORLD);
      break;
    case 2:
      wgtflag = 3;
      options[PMV3_OPTION_PSR] = PARMETIS_PSR_COUPLED;
      ParMETIS_V3_RefineKway(graph.vtxdist, graph.xadj, graph.adjncy, graph.vwgt,
          graph.adjwgt, &wgtflag, &numflag, &graph.ncon, &nparts, tpwgts, ubvec,
          options, &edgecut, part, &comm);
      WritePVector(argv[1], graph.vtxdist, part, MPI_COMM_WORLD);
      break;
    case 3:
      options[PMV3_OPTION_PSR] = PARMETIS_PSR_COUPLED;
      graph.vwgt = ismalloc(graph.nvtxs, 1, "main: vwgt");
      if (npes > 1) {
        AdaptGraph(&graph, adptf, comm);
      }
      else {
        wgtflag = 3;
        ParMETIS_V3_PartKway(graph.vtxdist, graph.xadj, graph.adjncy, graph.vwgt,
            graph.adjwgt, &wgtflag, &numflag, &graph.ncon, &nparts, tpwgts, ubvec,
            options, &edgecut, part, &comm);

        printf("Initial partitioning with edgecut of %"PRIDX"\n", edgecut);

        for (i=0; i<graph.ncon; i++) {
          for (j=0; j<graph.nvtxs; j++) {
            if (part[j] == i)
              graph.vwgt[j*graph.ncon+i] = adptf;
            else
              graph.vwgt[j*graph.ncon+i] = 1;
          }
        }
      }

      wgtflag = 3;
      ParMETIS_V3_AdaptiveRepart(graph.vtxdist, graph.xadj, graph.adjncy,
          graph.vwgt, NULL, graph.adjwgt, &wgtflag, &numflag, &graph.ncon,
          &nparts, tpwgts, ubvec, &ipc2redist, options, &edgecut, part, &comm);
      break;
    case 4:
      ParMETIS_V3_NodeND(graph.vtxdist, graph.xadj, graph.adjncy, &numflag,
          options, part, sizes, &comm);
      /* WriteOVector(argv[1], graph.vtxdist, part, comm); */
      break;
    case 5:
      ParMETIS_SerialNodeND(graph.vtxdist, graph.xadj, graph.adjncy, &numflag,
          options, part, sizes, &comm);
      /* WriteOVector(argv[1], graph.vtxdist, part, comm); */
      printf("%"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX"\n",
          sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], sizes[5], sizes[6]);
      break;
    case 11:
      /* TestAdaptiveMETIS(graph.vtxdist, graph.xadj, graph.adjncy, part, options, adptf, comm); */
      break;
    case 20:
      wgtflag = 3;
      ParMETIS_V3_PartGeomKway(graph.vtxdist, graph.xadj, graph.adjncy, graph.vwgt,
          graph.adjwgt, &wgtflag, &numflag, &ndims, xyz, &graph.ncon, &nparts,
          tpwgts, ubvec, options, &edgecut, part, &comm);
      break;
    case 21:
      ParMETIS_V3_PartGeom(graph.vtxdist, &ndims, xyz, part, &comm);
      break;
  }

  /* printf("%"PRIDX" %"PRIDX"\n", isum(nvtxs, graph.xadj, 1), isum(nedges, graph.adjncy, 1)); */

  gk_free((void **)&part, &sizes, &tpwgts, &graph.vtxdist, &graph.xadj,
      &graph.adjncy, &graph.vwgt, &graph.adjwgt, &xyz, LTERM);

  MPI_Comm_free(&comm);

  MPI_Finalize();

  return 0;
}
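/* Example invocation (a sketch; binary and file names are hypothetical):
   partition bricks.graph into 16 parts with op-type 1 (ParMETIS_V3_PartKway),
   debug level 0, and seed 42 on 8 MPI ranks:

     mpirun -np 8 ./parmetis bricks.graph 1 16 4 1000 0 42

   The <adapt-factor> and <ipc2redist> arguments only affect the adaptive
   repartitioning test (op-type 3) but must always be supplied. */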
/*************************************************************************
* This function tests the various graph-partitioning routines
**************************************************************************/
void TestParMetis_GPart(char *filename, char *xyzfile, MPI_Comm comm)
{
  idx_t ncon, nparts, npes, mype, opt2, realcut;
  graph_t graph, mgraph;
  idx_t *part, *mpart, *savepart, *order, *sizes;
  idx_t numflag=0, wgtflag=0, options[10], edgecut, ndims;
  real_t ipc2redist, *xyz=NULL, *tpwgts=NULL, ubvec[MAXNCON];

  gkMPI_Comm_size(comm, &npes);
  gkMPI_Comm_rank(comm, &mype);

  ParallelReadGraph(&graph, filename, comm);
  if (xyzfile)
    xyz = ReadTestCoordinates(&graph, xyzfile, &ndims, comm);
  gkMPI_Barrier(comm);

  part   = imalloc(graph.nvtxs, "TestParMetis_V3: part");
  tpwgts = rmalloc(MAXNCON*npes*2, "TestParMetis_V3: tpwgts");
  rset(MAXNCON, 1.05, ubvec);

  graph.vwgt = ismalloc(graph.nvtxs*5, 1, "TestParMetis_GPart: vwgt");


  /*======================================================================
  / ParMETIS_V3_PartKway
  /=======================================================================*/
  options[0] = 1;
  options[1] = 3;
  options[2] = 1;
  wgtflag = 2;
  numflag = 0;
  edgecut = 0;

  for (nparts=2*npes; nparts>=npes/2 && nparts > 0; nparts = nparts/2) {
    for (ncon=1; ncon<=NCON; ncon++) {
      if (ncon > 1 && nparts > 1)
        Mc_AdaptGraph(&graph, part, ncon, nparts, comm);
      else
        iset(graph.nvtxs, 1, graph.vwgt);

      if (mype == 0)
        printf("\nTesting ParMETIS_V3_PartKway with ncon: %"PRIDX", nparts: %"PRIDX"\n", ncon, nparts);

      rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);
      ParMETIS_V3_PartKway(graph.vtxdist, graph.xadj, graph.adjncy, graph.vwgt,
          NULL, &wgtflag, &numflag, &ncon, &nparts, tpwgts, ubvec, options,
          &edgecut, part, &comm);

      realcut = ComputeRealCut(graph.vtxdist, part, filename, comm);
      if (mype == 0) {
        printf("ParMETIS_V3_PartKway reported a cut of %"PRIDX" [%s:%"PRIDX"]\n",
            edgecut, (edgecut == realcut ? "OK" : "ERROR"), realcut);
      }

      if (mype == 0)
        printf("\nTesting ParMETIS_V3_RefineKway with ncon: %"PRIDX", nparts: %"PRIDX"\n", ncon, nparts);

      options[3] = PARMETIS_PSR_UNCOUPLED;
      ParMETIS_V3_RefineKway(graph.vtxdist, graph.xadj, graph.adjncy, graph.vwgt,
          NULL, &wgtflag, &numflag, &ncon, &nparts, tpwgts, ubvec, options,
          &edgecut, part, &comm);

      realcut = ComputeRealCut(graph.vtxdist, part, filename, comm);
      if (mype == 0) {
        printf("ParMETIS_V3_RefineKway reported a cut of %"PRIDX" [%s:%"PRIDX"]\n",
            edgecut, (edgecut == realcut ? "OK" : "ERROR"), realcut);
      }
    }
  }


  /*======================================================================
  / ParMETIS_V3_PartGeomKway
  /=======================================================================*/
  if (xyzfile != NULL) {
    options[0] = 1;
    options[1] = 3;
    options[2] = 1;
    wgtflag = 2;
    numflag = 0;

    for (nparts=2*npes; nparts>=npes/2 && nparts > 0; nparts = nparts/2) {
      for (ncon=1; ncon<=NCON; ncon++) {
        if (ncon > 1)
          Mc_AdaptGraph(&graph, part, ncon, nparts, comm);
        else
          iset(graph.nvtxs, 1, graph.vwgt);

        if (mype == 0)
          printf("\nTesting ParMETIS_V3_PartGeomKway with ncon: %"PRIDX", nparts: %"PRIDX"\n", ncon, nparts);

        rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);
        ParMETIS_V3_PartGeomKway(graph.vtxdist, graph.xadj, graph.adjncy,
            graph.vwgt, NULL, &wgtflag, &numflag, &ndims, xyz, &ncon, &nparts,
            tpwgts, ubvec, options, &edgecut, part, &comm);

        realcut = ComputeRealCut(graph.vtxdist, part, filename, comm);
        if (mype == 0)
          printf("ParMETIS_V3_PartGeomKway reported a cut of %"PRIDX" [%s:%"PRIDX"]\n",
              edgecut, (edgecut == realcut ? "OK" : "ERROR"), realcut);
      }
    }
  }


  /*======================================================================
  / ParMETIS_V3_PartGeom
  /=======================================================================*/
  if (xyz != NULL) {
    wgtflag = 0;
    numflag = 0;
    if (mype == 0)
      printf("\nTesting ParMETIS_V3_PartGeom\n");
    ParMETIS_V3_PartGeom(graph.vtxdist, &ndims, xyz, part, &comm);

    realcut = ComputeRealCut(graph.vtxdist, part, filename, comm);
    if (mype == 0)
      printf("ParMETIS_V3_PartGeom reported a cut of %"PRIDX"\n", realcut);
  }


  /*======================================================================
  / Coupled ParMETIS_V3_RefineKway
  /=======================================================================*/
  options[0] = 1;
  options[1] = 3;
  options[2] = 1;
  options[3] = PARMETIS_PSR_COUPLED;
  nparts  = npes;
  wgtflag = 0;
  numflag = 0;
  ncon    = 1;

  rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);

  if (mype == 0)
    printf("\nTesting coupled ParMETIS_V3_RefineKway with default options (before move)\n");

  ParMETIS_V3_RefineKway(graph.vtxdist, graph.xadj, graph.adjncy, NULL, NULL,
      &wgtflag, &numflag, &ncon, &nparts, tpwgts, ubvec, options, &edgecut,
      part, &comm);

  /* Compute a good partition and move the graph. Do so quietly! */
  options[0] = 0;
  nparts  = npes;
  wgtflag = 0;
  numflag = 0;
  ncon    = 1;
  rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);
  ParMETIS_V3_PartKway(graph.vtxdist, graph.xadj, graph.adjncy, NULL, NULL,
      &wgtflag, &numflag, &ncon, &npes, tpwgts, ubvec, options, &edgecut,
      part, &comm);
  TestMoveGraph(&graph, &mgraph, part, comm);
  gk_free((void **)&(graph.vwgt), LTERM);
  mpart    = ismalloc(mgraph.nvtxs, mype, "TestParMetis_V3: mpart");
  savepart = imalloc(mgraph.nvtxs, "TestParMetis_V3: savepart");


  /*======================================================================
  / Coupled ParMETIS_V3_RefineKway (after the move)
  /=======================================================================*/
  options[0] = 1;
  options[1] = 3;
  options[2] = 1;
  options[3] = PARMETIS_PSR_COUPLED;
  nparts  = npes;
  wgtflag = 0;
  numflag = 0;

  for (ncon=1; ncon<=NCON; ncon++) {
    if (mype == 0)
      printf("\nTesting coupled ParMETIS_V3_RefineKway with ncon: %"PRIDX", nparts: %"PRIDX"\n", ncon, nparts);

    rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);
    ParMETIS_V3_RefineKway(mgraph.vtxdist, mgraph.xadj, mgraph.adjncy, NULL, NULL,
        &wgtflag, &numflag, &ncon, &nparts, tpwgts, ubvec, options, &edgecut,
        mpart, &comm);

    realcut = ComputeRealCutFromMoved(graph.vtxdist, mgraph.vtxdist, part, mpart,
                  filename, comm);
    if (mype == 0)
      printf("ParMETIS_V3_RefineKway reported a cut of %"PRIDX" [%s:%"PRIDX"]\n",
          edgecut, (edgecut == realcut ? "OK" : "ERROR"), realcut);
  }

  /*ADAPTIVE:*/
  /*======================================================================
  / ParMETIS_V3_AdaptiveRepart
  /=======================================================================*/
  mgraph.vwgt  = ismalloc(mgraph.nvtxs*NCON, 1, "TestParMetis_V3: mgraph.vwgt");
  mgraph.vsize = ismalloc(mgraph.nvtxs, 1, "TestParMetis_V3: mgraph.vsize");
  AdaptGraph(&mgraph, 4, comm);
  options[0] = 1;
  options[1] = 7;
  options[2] = 1;
  options[3] = PARMETIS_PSR_COUPLED;
  wgtflag = 2;
  numflag = 0;

  for (nparts=2*npes; nparts>=npes/2; nparts = nparts/2) {
    options[0] = 0;
    ncon    = 1;
    wgtflag = 0;
    rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);
    ParMETIS_V3_PartKway(mgraph.vtxdist, mgraph.xadj, mgraph.adjncy, NULL, NULL,
        &wgtflag, &numflag, &ncon, &nparts, tpwgts, ubvec, options, &edgecut,
        savepart, &comm);

    options[0] = 1;
    wgtflag = 2;
    for (ncon=1; ncon<=NCON; ncon++) {
      rset(nparts*ncon, 1.0/(real_t)nparts, tpwgts);

      if (ncon > 1)
        Mc_AdaptGraph(&mgraph, savepart, ncon, nparts, comm);
      else
        AdaptGraph(&mgraph, 4, comm);

      for (ipc2redist=1000.0; ipc2redist>=0.001; ipc2redist/=1000.0) {
        icopy(mgraph.nvtxs, savepart, mpart);

        if (mype == 0)
          printf("\nTesting ParMETIS_V3_AdaptiveRepart with ipc2redist: %.3"PRREAL", ncon: %"PRIDX", nparts: %"PRIDX"\n",
              ipc2redist, ncon, nparts);

        ParMETIS_V3_AdaptiveRepart(mgraph.vtxdist, mgraph.xadj, mgraph.adjncy,
            mgraph.vwgt, mgraph.vsize, NULL, &wgtflag, &numflag, &ncon, &nparts,
            tpwgts, ubvec, &ipc2redist, options, &edgecut, mpart, &comm);

        realcut = ComputeRealCutFromMoved(graph.vtxdist, mgraph.vtxdist, part,
                      mpart, filename, comm);
        if (mype == 0)
          printf("ParMETIS_V3_AdaptiveRepart reported a cut of %"PRIDX" [%s:%"PRIDX"]\n",
              edgecut, (edgecut == realcut ? "OK" : "ERROR"), realcut);
      }
    }
  }

  gk_free((void **)&tpwgts, &part, &mpart, &savepart, &xyz, &mgraph.vwgt,
      &mgraph.vsize, LTERM);
}
/*************************************************************************
* This function converts a mesh into a dual graph
**************************************************************************/
int ParMETIS_V3_Mesh2Dual(idx_t *elmdist, idx_t *eptr, idx_t *eind,
        idx_t *numflag, idx_t *ncommon, idx_t **r_xadj, idx_t **r_adjncy,
        MPI_Comm *comm)
{
  idx_t i, j, jj, k, kk, m;
  idx_t npes, mype, pe, count, mask, pass;
  idx_t nelms, lnns, my_nns, node;
  idx_t firstelm, firstnode, lnode, nrecv, nsend;
  idx_t *scounts, *rcounts, *sdispl, *rdispl;
  idx_t *nodedist, *nmap, *auxarray;
  idx_t *gnptr, *gnind, *nptr, *nind, *myxadj=NULL, *myadjncy=NULL;
  idx_t *sbuffer, *rbuffer, *htable;
  ikv_t *nodelist, *recvbuffer;
  idx_t maxcount, *ind, *wgt;
  idx_t gmaxnode, gminnode;
  size_t curmem;

  gk_malloc_init();
  curmem = gk_GetCurMemoryUsed();

  /* get basic comm info */
  gkMPI_Comm_size(*comm, &npes);
  gkMPI_Comm_rank(*comm, &mype);

  nelms = elmdist[mype+1]-elmdist[mype];

  if (*numflag > 0)
    ChangeNumberingMesh(elmdist, eptr, eind, NULL, NULL, NULL, npes, mype, 1);

  mask = (1<<11)-1;

  /*****************************/
  /* Determine number of nodes */
  /*****************************/
  gminnode = GlobalSEMinComm(*comm, imin(eptr[nelms], eind));
  for (i=0; i<eptr[nelms]; i++)
    eind[i] -= gminnode;

  gmaxnode = GlobalSEMaxComm(*comm, imax(eptr[nelms], eind));

  /**************************/
  /* Check for input errors */
  /**************************/
  ASSERT(nelms > 0);

  /* construct node distribution array */
  nodedist = ismalloc(npes+1, 0, "nodedist");
  for (nodedist[0]=0, i=0, j=gmaxnode+1; i<npes; i++) {
    k = j/(npes-i);
    nodedist[i+1] = nodedist[i]+k;
    j -= k;
  }
  my_nns    = nodedist[mype+1]-nodedist[mype];
  firstnode = nodedist[mype];

  nodelist = ikvmalloc(eptr[nelms], "nodelist");
  auxarray = imalloc(eptr[nelms], "auxarray");
  htable   = ismalloc(gk_max(my_nns, mask+1), -1, "htable");
  scounts  = imalloc(npes, "scounts");
  rcounts  = imalloc(npes, "rcounts");
  sdispl   = imalloc(npes+1, "sdispl");
  rdispl   = imalloc(npes+1, "rdispl");

  /*********************************************/
  /* first find a local numbering of the nodes */
  /*********************************************/
  for (i=0; i<nelms; i++) {
    for (j=eptr[i]; j<eptr[i+1]; j++) {
      nodelist[j].key = eind[j];
      nodelist[j].val = j;
      auxarray[j]     = i;  /* remember the local element ID that uses this node */
    }
  }
  ikvsorti(eptr[nelms], nodelist);

  for (count=1, i=1; i<eptr[nelms]; i++) {
    if (nodelist[i].key > nodelist[i-1].key)
      count++;
  }
  lnns = count;
  nmap = imalloc(lnns, "nmap");

  /* renumber the nodes of the elements array */
  count   = 1;
  nmap[0] = nodelist[0].key;
  eind[nodelist[0].val] = 0;
  nodelist[0].val = auxarray[nodelist[0].val];  /* store the local element ID */
  for (i=1; i<eptr[nelms]; i++) {
    if (nodelist[i].key > nodelist[i-1].key) {
      nmap[count] = nodelist[i].key;
      count++;
    }
    eind[nodelist[i].val] = count-1;
    nodelist[i].val = auxarray[nodelist[i].val];  /* store the local element ID */
  }
  gkMPI_Barrier(*comm);

  /**********************************************************/
  /* perform comms necessary to construct node-element list */
  /**********************************************************/
  iset(npes, 0, scounts);
  for (pe=i=0; i<eptr[nelms]; i++) {
    while (nodelist[i].key >= nodedist[pe+1])
      pe++;
    scounts[pe] += 2;
  }
  ASSERT(pe < npes);

  gkMPI_Alltoall((void *)scounts, 1, IDX_T, (void *)rcounts, 1, IDX_T, *comm);

  icopy(npes, scounts, sdispl);
  MAKECSR(i, npes, sdispl);

  icopy(npes, rcounts, rdispl);
  MAKECSR(i, npes, rdispl);

  ASSERT(sdispl[npes] == eptr[nelms]*2);

  nrecv      = rdispl[npes]/2;
  recvbuffer = ikvmalloc(gk_max(1, nrecv), "recvbuffer");

  gkMPI_Alltoallv((void *)nodelist, scounts, sdispl, IDX_T, (void *)recvbuffer,
      rcounts, rdispl, IDX_T, *comm);

  /**************************************/
  /* construct global node-element list */
  /**************************************/
  gnptr = ismalloc(my_nns+1, 0, "gnptr");

  for (i=0; i<npes; i++) {
    for (j=rdispl[i]/2; j<rdispl[i+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      ASSERT(lnode >= 0 && lnode < my_nns);

      gnptr[lnode]++;
    }
  }
  MAKECSR(i, my_nns, gnptr);

  gnind = imalloc(gk_max(1, gnptr[my_nns]), "gnind");
  for (pe=0; pe<npes; pe++) {
    firstelm = elmdist[pe];
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      gnind[gnptr[lnode]++] = recvbuffer[j].val+firstelm;
    }
  }
  SHIFTCSR(i, my_nns, gnptr);

  /*********************************************************/
  /* send the node-element info to the relevant processors */
  /*********************************************************/
  iset(npes, 0, scounts);

  /* use a hash table to ensure that each node is sent to a proc only once */
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      if (htable[lnode] == -1) {
        scounts[pe] += gnptr[lnode+1]-gnptr[lnode];
        htable[lnode] = 1;
      }
    }

    /* now reset the hash table */
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      htable[lnode] = -1;
    }
  }

  gkMPI_Alltoall((void *)scounts, 1, IDX_T, (void *)rcounts, 1, IDX_T, *comm);

  icopy(npes, scounts, sdispl);
  MAKECSR(i, npes, sdispl);

  /* create the send buffer */
  nsend   = sdispl[npes];
  sbuffer = imalloc(gk_max(1, nsend), "sbuffer");

  count = 0;
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      if (htable[lnode] == -1) {
        for (k=gnptr[lnode]; k<gnptr[lnode+1]; k++) {
          if (k == gnptr[lnode])
            sbuffer[count++] = -1*(gnind[k]+1);  /* a negative entry marks the start of a node's list */
          else
            sbuffer[count++] = gnind[k];
        }
        htable[lnode] = 1;
      }
    }
    ASSERT(count == sdispl[pe+1]);

    /* now reset the hash table */
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      htable[lnode] = -1;
    }
  }

  icopy(npes, rcounts, rdispl);
  MAKECSR(i, npes, rdispl);

  nrecv   = rdispl[npes];
  rbuffer = imalloc(gk_max(1, nrecv), "rbuffer");

  gkMPI_Alltoallv((void *)sbuffer, scounts, sdispl, IDX_T, (void *)rbuffer,
      rcounts, rdispl, IDX_T, *comm);

  k = -1;
  nptr = ismalloc(lnns+1, 0, "nptr");
  nind = rbuffer;
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]; j<rdispl[pe+1]; j++) {
      if (nind[j] < 0) {  /* start of a new node's element list */
        k++;
        nind[j] = (-1*nind[j])-1;
      }
      nptr[k]++;
    }
  }
  MAKECSR(i, lnns, nptr);

  ASSERT(k+1 == lnns);
  ASSERT(nptr[lnns] == nrecv);

  myxadj = *r_xadj = (idx_t *)malloc(sizeof(idx_t)*(nelms+1));
  if (myxadj == NULL)
    gk_errexit(SIGMEM, "Failed to allocate memory for the dual graph's xadj array.\n");
  iset(nelms+1, 0, myxadj);

  iset(mask+1, -1, htable);

  firstelm = elmdist[mype];

  /* Two passes -- in the first pass, simply find out the memory requirements */
  maxcount = 200;
  ind = imalloc(maxcount, "ParMETIS_V3_Mesh2Dual: ind");
  wgt = imalloc(maxcount, "ParMETIS_V3_Mesh2Dual: wgt");

  for (pass=0; pass<2; pass++) {
    for (i=0; i<nelms; i++) {
      for (count=0, j=eptr[i]; j<eptr[i+1]; j++) {
        node = eind[j];

        for (k=nptr[node]; k<nptr[node+1]; k++) {
          if ((kk=nind[k]) == firstelm+i)
            continue;

          m = htable[(kk&mask)];
          if (m == -1) {
            ind[count] = kk;
            wgt[count] = 1;
            htable[(kk&mask)] = count++;
          }
          else {
            if (ind[m] == kk) {
              wgt[m]++;
            }
            else {
              for (jj=0; jj<count; jj++) {
                if (ind[jj] == kk) {
                  wgt[jj]++;
                  break;
                }
              }
              if (jj == count) {
                ind[count]   = kk;
                wgt[count++] = 1;
              }
            }
          }

          /* Adjust the memory. This will be replaced by an idxrealloc() once
             GKlib is incorporated */
          if (count == maxcount-1) {
            maxcount *= 2;
            ind = irealloc(ind, maxcount, "ParMETIS_V3_Mesh2Dual: ind");
            wgt = irealloc(wgt, maxcount, "ParMETIS_V3_Mesh2Dual: wgt");
          }
        }
      }

      for (j=0; j<count; j++) {
        htable[(ind[j]&mask)] = -1;
        if (wgt[j] >= *ncommon) {
          if (pass == 0)
            myxadj[i]++;
          else
            myadjncy[myxadj[i]++] = ind[j];
        }
      }
    }

    if (pass == 0) {
      MAKECSR(i, nelms, myxadj);
      myadjncy = *r_adjncy = (idx_t *)malloc(sizeof(idx_t)*myxadj[nelms]);
      if (myadjncy == NULL)
        gk_errexit(SIGMEM, "Failed to allocate memory for dual graph's adjncy array.\n");
    }
    else {
      SHIFTCSR(i, nelms, myxadj);
    }
  }

  /*****************************************/
  /* correctly renumber the elements array */
  /*****************************************/
  for (i=0; i<eptr[nelms]; i++)
    eind[i] = nmap[eind[i]] + gminnode;

  if (*numflag == 1)
    ChangeNumberingMesh(elmdist, eptr, eind, myxadj, myadjncy, NULL, npes, mype, 0);

  /* nind aliases rbuffer, so it is freed via rbuffer below */
  gk_free((void **)&nodedist, &nodelist, &auxarray, &htable, &scounts, &rcounts,
      &sdispl, &rdispl, &nmap, &recvbuffer, &gnptr, &gnind, &sbuffer, &rbuffer,
      &nptr, &ind, &wgt, LTERM);

  if (gk_GetCurMemoryUsed() - curmem > 0) {
    printf("ParMETIS appears to have a memory leak of %zd bytes. Report this.\n",
        (ssize_t)(gk_GetCurMemoryUsed() - curmem));
  }
  gk_malloc_cleanup(0);

  return METIS_OK;
}
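/* Usage sketch (hypothetical, single-process): build the dual graph of two
   triangles that share an edge. With ncommon = 2, elements 0 and 1 become
   adjacent because they share the two nodes 1 and 2. */
#ifdef EXAMPLE_MESH2DUAL
static void ExampleMesh2Dual(MPI_Comm comm)
{
  idx_t elmdist[2] = {0, 2};           /* both elements live on the single rank */
  idx_t eptr[3]    = {0, 3, 6};        /* two triangles, three nodes each */
  idx_t eind[6]    = {0, 1, 2, 1, 2, 3};
  idx_t numflag = 0, ncommon = 2;
  idx_t *xadj, *adjncy;

  ParMETIS_V3_Mesh2Dual(elmdist, eptr, eind, &numflag, &ncommon,
      &xadj, &adjncy, &comm);

  /* expected: xadj = {0, 1, 2}, adjncy = {1, 0} -- each triangle sees the other */
  free(xadj);
  free(adjncy);
}
#endif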
/*************************************************************************
* This function is the entry point of the initial balancing algorithm.
* This algorithm assembles the graph to all the processors and proceeds
* with the balancing step.
**************************************************************************/
void Balance_Partition(ctrl_t *ctrl, graph_t *graph)
{
  idx_t i, j, nvtxs, nedges, ncon;
  idx_t mype, npes, srnpes, srmype;
  idx_t *vtxdist, *xadj, *adjncy, *adjwgt, *vwgt, *vsize;
  idx_t *part, *lwhere, *home;
  idx_t lnparts, fpart, fpe, lnpes, ngroups;
  idx_t *rcounts, *rdispls;
  idx_t twoparts=2, moptions[METIS_NOPTIONS], edgecut, max_cut;
  idx_t sr_pe, gd_pe, sr, gd, who_wins;
  real_t my_cut, my_totalv, my_cost = -1.0, my_balance = -1.0, wsum;
  real_t rating, max_rating, your_cost = -1.0, your_balance = -1.0;
  real_t lbsum, min_lbsum, *lbvec, *tpwgts, *tpwgts2, buffer[2];
  graph_t *agraph, cgraph;
  ctrl_t *myctrl;
  MPI_Status status;
  MPI_Comm ipcomm, srcomm;
  struct {
    double cost;
    int rank;
  } lpecost, gpecost;

  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->InitPartTmr));

  WCOREPUSH;

  vtxdist = graph->vtxdist;
  agraph  = AssembleAdaptiveGraph(ctrl, graph);
  nvtxs   = cgraph.nvtxs  = agraph->nvtxs;
  nedges  = cgraph.nedges = agraph->nedges;
  ncon    = cgraph.ncon   = agraph->ncon;

  xadj   = cgraph.xadj   = icopy(nvtxs+1, agraph->xadj, iwspacemalloc(ctrl, nvtxs+1));
  vwgt   = cgraph.vwgt   = icopy(nvtxs*ncon, agraph->vwgt, iwspacemalloc(ctrl, nvtxs*ncon));
  vsize  = cgraph.vsize  = icopy(nvtxs, agraph->vsize, iwspacemalloc(ctrl, nvtxs));
  adjncy = cgraph.adjncy = icopy(nedges, agraph->adjncy, iwspacemalloc(ctrl, nedges));
  adjwgt = cgraph.adjwgt = icopy(nedges, agraph->adjwgt, iwspacemalloc(ctrl, nedges));
  part   = cgraph.where  = agraph->where = iwspacemalloc(ctrl, nvtxs);

  lwhere = iwspacemalloc(ctrl, nvtxs);
  home   = iwspacemalloc(ctrl, nvtxs);
  lbvec  = rwspacemalloc(ctrl, graph->ncon);

  /****************************************/
  /****************************************/
  if (ctrl->ps_relation == PARMETIS_PSR_UNCOUPLED) {
    WCOREPUSH;
    rcounts = iwspacemalloc(ctrl, ctrl->npes);
    rdispls = iwspacemalloc(ctrl, ctrl->npes+1);

    for (i=0; i<ctrl->npes; i++)
      rdispls[i] = rcounts[i] = vtxdist[i+1]-vtxdist[i];
    MAKECSR(i, ctrl->npes, rdispls);

    gkMPI_Allgatherv((void *)graph->home, graph->nvtxs, IDX_T,
        (void *)part, rcounts, rdispls, IDX_T, ctrl->comm);

    for (i=0; i<agraph->nvtxs; i++)
      home[i] = part[i];

    WCOREPOP;  /* local frees */
  }
  else {
    for (i=0; i<ctrl->npes; i++) {
      for (j=vtxdist[i]; j<vtxdist[i+1]; j++)
        part[j] = home[j] = i;
    }
  }

  /* Ensure that the initial partitioning is legal */
  for (i=0; i<agraph->nvtxs; i++) {
    if (part[i] >= ctrl->nparts)
      part[i] = home[i] = part[i] % ctrl->nparts;
    if (part[i] < 0)
      part[i] = home[i] = (-1*part[i]) % ctrl->nparts;
  }

  /****************************************/
  /****************************************/
  IFSET(ctrl->dbglvl, DBG_REFINEINFO, ComputeSerialBalance(ctrl, agraph, agraph->where, lbvec));
  IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "input cut: %"PRIDX", balance: ", ComputeSerialEdgeCut(agraph)));
  for (i=0; i<agraph->ncon; i++)
    IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "%.3"PRREAL" ", lbvec[i]));
  IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "\n"));

  /****************************************/
  /* Split the processors into two groups */
  /****************************************/
  sr = (ctrl->mype % 2 == 0) ? 1 : 0;
  gd = (ctrl->mype % 2 == 1) ? 1 : 0;

  if (graph->ncon > MAX_NCON_FOR_DIFFUSION || ctrl->npes == 1) {
    sr = 1;
    gd = 0;
  }

  sr_pe = 0;
  gd_pe = 1;

  gkMPI_Comm_split(ctrl->gcomm, sr, 0, &ipcomm);
  gkMPI_Comm_rank(ipcomm, &mype);
  gkMPI_Comm_size(ipcomm, &npes);

  if (sr == 1) { /* half of the processors do scratch-remap */
    ngroups = gk_max(gk_min(RIP_SPLIT_FACTOR, npes), 1);

    gkMPI_Comm_split(ipcomm, mype % ngroups, 0, &srcomm);
    gkMPI_Comm_rank(srcomm, &srmype);
    gkMPI_Comm_size(srcomm, &srnpes);

    METIS_SetDefaultOptions(moptions);
    moptions[METIS_OPTION_SEED] = ctrl->sync + (mype % ngroups) + 1;

    tpwgts  = ctrl->tpwgts;
    tpwgts2 = rwspacemalloc(ctrl, 2*ncon);

    iset(nvtxs, 0, lwhere);
    lnparts = ctrl->nparts;
    fpart = fpe = 0;
    lnpes = srnpes;
    while (lnpes > 1 && lnparts > 1) {
      PASSERT(ctrl, agraph->nvtxs > 1);
      /* determine the weights of the two partitions as a function of the
         weight of the target partition weights */
      for (j=(lnparts>>1), i=0; i<ncon; i++) {
        tpwgts2[i]      = rsum(j, tpwgts+fpart*ncon+i, ncon);
        tpwgts2[ncon+i] = rsum(lnparts-j, tpwgts+(fpart+j)*ncon+i, ncon);
        wsum            = 1.0/(tpwgts2[i] + tpwgts2[ncon+i]);
        tpwgts2[i]      *= wsum;
        tpwgts2[ncon+i] *= wsum;
      }

      METIS_PartGraphRecursive(&agraph->nvtxs, &ncon, agraph->xadj,
            agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt, &twoparts,
            tpwgts2, NULL, moptions, &edgecut, part);

      /* pick one of the branches */
      if (srmype < fpe+lnpes/2) {
        KeepPart(ctrl, agraph, part, 0);
        lnpes   = lnpes/2;
        lnparts = lnparts/2;
      }
      else {
        KeepPart(ctrl, agraph, part, 1);
        fpart   = fpart + lnparts/2;
        fpe     = fpe + lnpes/2;
        lnpes   = lnpes - lnpes/2;
        lnparts = lnparts - lnparts/2;
      }
    }

    if (lnparts == 1) { /* case in which srnpes is greater than or equal to nparts */
      /* only the first process will assign labels (for the reduction to work) */
      if (srmype == fpe) {
        for (i=0; i<agraph->nvtxs; i++)
          lwhere[agraph->label[i]] = fpart;
      }
    }
    else { /* case in which srnpes is smaller than nparts */
      /* create the normalized tpwgts for the lnparts from ctrl->tpwgts */
      tpwgts = rwspacemalloc(ctrl, lnparts*ncon);
      for (j=0; j<ncon; j++) {
        for (wsum=0.0, i=0; i<lnparts; i++) {
          tpwgts[i*ncon+j] = ctrl->tpwgts[(fpart+i)*ncon+j];
          wsum += tpwgts[i*ncon+j];
        }
        for (wsum=1.0/wsum, i=0; i<lnparts; i++)
          tpwgts[i*ncon+j] *= wsum;
      }

      METIS_PartGraphKway(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy,
            agraph->vwgt, NULL, agraph->adjwgt, &lnparts, tpwgts, NULL, moptions,
            &edgecut, part);

      for (i=0; i<agraph->nvtxs; i++)
        lwhere[agraph->label[i]] = fpart + part[i];
    }

    gkMPI_Allreduce((void *)lwhere, (void *)part, nvtxs, IDX_T, MPI_SUM, srcomm);

    edgecut = ComputeSerialEdgeCut(&cgraph);
    ComputeSerialBalance(ctrl, &cgraph, part, lbvec);
    lbsum = rsum(ncon, lbvec, 1);
    gkMPI_Allreduce((void *)&edgecut, (void *)&max_cut,   1, IDX_T,  MPI_MAX, ipcomm);
    gkMPI_Allreduce((void *)&lbsum,   (void *)&min_lbsum, 1, REAL_T, MPI_MIN, ipcomm);

    lpecost.rank = ctrl->mype;
    lpecost.cost = lbsum;
    if (min_lbsum < UNBALANCE_FRACTION * (real_t)(ncon)) {
      if (lbsum < UNBALANCE_FRACTION * (real_t)(ncon))
        lpecost.cost = (double)edgecut;
      else
        lpecost.cost = (double)max_cut + lbsum;
    }
    gkMPI_Allreduce((void *)&lpecost, (void *)&gpecost, 1, MPI_DOUBLE_INT,
        MPI_MINLOC, ipcomm);

    /* the best scratch-remap partition is forwarded to the sr_pe root */
    if (ctrl->mype == gpecost.rank && ctrl->mype != sr_pe)
      gkMPI_Send((void *)part, nvtxs, IDX_T, sr_pe, 1, ctrl->comm);
    if (ctrl->mype != gpecost.rank && ctrl->mype == sr_pe)
      gkMPI_Recv((void *)part, nvtxs, IDX_T, gpecost.rank, 1, ctrl->comm, &status);

    if (ctrl->mype == sr_pe) {
      icopy(nvtxs, part, lwhere);
      SerialRemap(ctrl, &cgraph, ctrl->nparts, home, lwhere, part, ctrl->tpwgts);
    }

    gkMPI_Comm_free(&srcomm);
  }