/*************************************************************************
* This function is the driver for the partition refinement mode of ParMETIS
**************************************************************************/
void Order_Partition(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace)
{
  SetUp(ctrl, graph, wspace);
  graph->ncon = 1;

  IFSET(ctrl->dbglvl, DBG_PROGRESS,
      rprintf(ctrl, "[%6d %8d %5d %5d][%d][%d]\n",
          graph->gnvtxs, GlobalSESum(ctrl, graph->nedges),
          GlobalSEMin(ctrl, graph->nvtxs), GlobalSEMax(ctrl, graph->nvtxs),
          ctrl->CoarsenTo,
          GlobalSEMax(ctrl, graph->vwgt[idxamax(graph->nvtxs, graph->vwgt)])));

  if (graph->gnvtxs < 1.3*ctrl->CoarsenTo ||
      (graph->finer != NULL &&
       graph->gnvtxs > graph->finer->gnvtxs*COARSEN_FRACTION)) {
    /* Compute the initial npart-way multisection */
    InitMultisection(ctrl, graph, wspace);

    if (graph->finer == NULL) { /* Do this only if no coarsening took place */
      ComputeNodePartitionParams(ctrl, graph, wspace);
      KWayNodeRefine(ctrl, graph, wspace, 2*NGR_PASSES, ORDER_UNBALANCE_FRACTION);
    }
  }
  else { /* Coarsen it and then partition it */
    Mc_LocalMatch_HEM(ctrl, graph, wspace);

    Order_Partition(ctrl, graph->coarser, wspace);

    Moc_ProjectPartition(ctrl, graph, wspace);
    ComputeNodePartitionParams(ctrl, graph, wspace);
    KWayNodeRefine(ctrl, graph, wspace, 2*NGR_PASSES, ORDER_UNBALANCE_FRACTION);
  }
}
/*************************************************************************
* This function computes movement statistics for adaptive refinement
* schemes
**************************************************************************/
void ComputeMoveStatistics(CtrlType *ctrl, GraphType *graph, int *nmoved,
         int *maxin, int *maxout)
{
  int i, j, nvtxs;
  idxtype *vwgt, *where;
  idxtype *lpvtxs, *gpvtxs;

  nvtxs = graph->nvtxs;
  vwgt  = graph->vwgt;
  where = graph->where;

  lpvtxs = idxsmalloc(ctrl->nparts, 0, "ComputeMoveStatistics: lpvtxs");
  gpvtxs = idxsmalloc(ctrl->nparts, 0, "ComputeMoveStatistics: gpvtxs");

  for (j=i=0; i<nvtxs; i++) {
    lpvtxs[where[i]]++;
    if (where[i] != ctrl->mype)
      j++;
  }

  /* PrintVector(ctrl, ctrl->npes, 0, lpvtxs, "Lpvtxs: "); */
  MPI_Allreduce((void *)lpvtxs, (void *)gpvtxs, ctrl->nparts, IDX_DATATYPE,
      MPI_SUM, ctrl->comm);

  *nmoved = GlobalSESum(ctrl, j);
  *maxout = GlobalSEMax(ctrl, j);
  *maxin  = GlobalSEMax(ctrl, gpvtxs[ctrl->mype]-(nvtxs-j));

  GKfree((void **)&lpvtxs, (void **)&gpvtxs, LTERM);
}
/*************************************************************************
* This function sets up the GraphType structure
**************************************************************************/
GraphType *Moc_SetUpGraph(CtrlType *ctrl, int ncon, idxtype *vtxdist,
              idxtype *xadj, idxtype *vwgt, idxtype *adjncy, idxtype *adjwgt,
              int *wgtflag)
{
  int i, j;
  GraphType *graph;
  int ltvwgts[MAXNCON];

  graph = CreateGraph();
  graph->level   = 0;
  graph->gnvtxs  = vtxdist[ctrl->npes];
  graph->nvtxs   = vtxdist[ctrl->mype+1]-vtxdist[ctrl->mype];
  graph->ncon    = ncon;
  graph->nedges  = xadj[graph->nvtxs];
  graph->xadj    = xadj;
  graph->vwgt    = vwgt;
  graph->adjncy  = adjncy;
  graph->adjwgt  = adjwgt;
  graph->vtxdist = vtxdist;

  if (((*wgtflag)&2) == 0)
    graph->vwgt = idxsmalloc(graph->nvtxs*ncon, 1, "Par_KMetis: vwgt");

  if (((*wgtflag)&1) == 0)
    graph->adjwgt = idxsmalloc(graph->nedges, 1, "Par_KMetis: adjwgt");

  /* compute tvwgts */
  for (j=0; j<ncon; j++)
    ltvwgts[j] = 0;

  for (i=0; i<graph->nvtxs; i++)
    for (j=0; j<ncon; j++)
      ltvwgts[j] += graph->vwgt[i*ncon+j];

  for (j=0; j<ncon; j++)
    ctrl->tvwgts[j] = GlobalSESum(ctrl, ltvwgts[j]);

  /* check for zero wgt constraints */
  for (i=0; i<ncon; i++) {
    /* ADD: take care of the case in which tvwgts is zero */
    if (ctrl->tvwgts[i] == 0) {
      rprintf(ctrl, "ERROR: sum weight for constraint %d is zero\n", i);
      MPI_Finalize();
      exit(-1);
    }
  }

  /* compute nvwgts */
  graph->nvwgt = fmalloc(graph->nvtxs*ncon, "graph->nvwgt");
  for (i=0; i<graph->nvtxs; i++) {
    for (j=0; j<ncon; j++)
      graph->nvwgt[i*ncon+j] =
          (floattype)(graph->vwgt[i*ncon+j]) / (floattype)(ctrl->tvwgts[j]);
  }

  srand(ctrl->seed);

  return graph;
}
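/*************************************************************************
* Illustrative sketch (not part of the library): the nvwgt normalization
* performed above, isolated into a standalone routine. Each of the ncon
* weights of a vertex is divided by the global total weight of that
* constraint, so the normalized weights of all vertices sum to 1.0 per
* constraint (e.g., weights {1, 2, 1} with tvwgt 4 become {.25, .5, .25}).
* Plain int/float stand in for the library's idxtype/floattype here.
**************************************************************************/
void NormalizeVertexWeights(int nvtxs, int ncon, int *vwgt, int *tvwgts,
         float *nvwgt)
{
  int i, j;

  for (i=0; i<nvtxs; i++)
    for (j=0; j<ncon; j++)
      nvwgt[i*ncon+j] = (float)(vwgt[i*ncon+j]) / (float)(tvwgts[j]);
}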
/*************************************************************************
* This function writes a distributed graph to file
**************************************************************************/
void Moc_ParallelWriteGraph(CtrlType *ctrl, GraphType *graph, char *filename,
         int nparts, int testset)
{
  int h, i, j;
  int npes, mype, penum, gnedges;
  char partfile[256];
  FILE *fpin;
  MPI_Comm comm;

  comm = ctrl->comm;
  MPI_Comm_size(comm, &npes);
  MPI_Comm_rank(comm, &mype);
  gnedges = GlobalSESum(ctrl, graph->nedges);

  sprintf(partfile, "%s.%d.%d.%d", filename, testset, graph->ncon, nparts);
  if (mype == 0) {
    if ((fpin = fopen(partfile, "w")) == NULL)
      errexit("Failed to open file %s", partfile);
    fprintf(fpin, "%d %d %d %d %d\n", graph->gnvtxs, gnedges/2, 11,
        graph->ncon, 1);
    fclose(fpin);
  }

  /* each processor appends its local vertices to the file in turn */
  MPI_Barrier(comm);
  for (penum=0; penum<npes; penum++) {
    if (mype == penum) {
      if ((fpin = fopen(partfile, "a")) == NULL)
        errexit("Failed to open file %s", partfile);

      for (i=0; i<graph->nvtxs; i++) {
        for (h=0; h<graph->ncon; h++)
          fprintf(fpin, "%d ", graph->vwgt[i*graph->ncon+h]);

        for (j=graph->xadj[i]; j<graph->xadj[i+1]; j++) {
          fprintf(fpin, "%d ", graph->adjncy[j]+1);
          fprintf(fpin, "%d ", graph->adjwgt[j]);
        }
        fprintf(fpin, "\n");
      }
      fclose(fpin);
    }
    MPI_Barrier(comm);
  }

  return;
}
/***********************************************************************************
* This function is the entry point of the parallel ordering algorithm.
* It assumes that the graph is already nicely partitioned among the
* processors and then proceeds to perform recursive bisection.
************************************************************************************/
void ParMETIS_V3_NodeND(idxtype *vtxdist, idxtype *xadj, idxtype *adjncy,
         int *numflag, int *options, idxtype *order, idxtype *sizes,
         MPI_Comm *comm)
{
  int i, j;
  int ltvwgts[MAXNCON];
  int nparts, npes, mype, wgtflag = 0, seed = GLOBAL_SEED;
  CtrlType ctrl;
  WorkSpaceType wspace;
  GraphType *graph, *mgraph;
  idxtype *morder;
  int minnvtxs;

  MPI_Comm_size(*comm, &npes);
  MPI_Comm_rank(*comm, &mype);

  nparts = npes;

  if (!ispow2(npes)) {
    if (mype == 0)
      printf("Error: The number of processors must be a power of 2!\n");
    return;
  }

  if (vtxdist[npes] < (int)((float)(npes*npes)*1.2)) {
    if (mype == 0)
      printf("Error: Too many processors for this many vertices.\n");
    return;
  }

  minnvtxs = vtxdist[1]-vtxdist[0];
  for (i=0; i<npes; i++)
    minnvtxs = (minnvtxs < vtxdist[i+1]-vtxdist[i]) ?
        minnvtxs : vtxdist[i+1]-vtxdist[i];

  if (minnvtxs < (int)((float)npes*1.1)) {
    if (mype == 0)
      printf("Error: vertices are not distributed equally.\n");
    return;
  }

  if (*numflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, order, npes, mype, 1);

  SetUpCtrl(&ctrl, nparts, options[PMV3_OPTION_DBGLVL], *comm);
  ctrl.CoarsenTo   = amin(vtxdist[npes]+1, 25*amax(npes, nparts));
  ctrl.seed        = mype;
  ctrl.sync        = seed;
  ctrl.partType    = STATIC_PARTITION;
  ctrl.ps_relation = -1;
  ctrl.tpwgts      = fsmalloc(nparts, 1.0/(float)(nparts), "tpwgts");
  ctrl.ubvec[0]    = 1.03;

  graph = Moc_SetUpGraph(&ctrl, 1, vtxdist, xadj, NULL, adjncy, NULL, &wgtflag);

  PreAllocateMemory(&ctrl, graph, &wspace);

  /*=======================================================
   * Compute the initial k-way partitioning
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  Moc_Global_Partition(&ctrl, graph, &wspace);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));

  /*=======================================================
   * Move the graph according to the partitioning
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.MoveTmr));

  MALLOC_CHECK(NULL);
  graph->ncon = 1;
  mgraph = Moc_MoveGraph(&ctrl, graph, &wspace);
  MALLOC_CHECK(NULL);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.MoveTmr));

  /*=======================================================
   * Now compute an ordering of the moved graph
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  FreeWSpace(&wspace);
  PreAllocateMemory(&ctrl, mgraph, &wspace);

  ctrl.ipart = ISEP_NODE;
  ctrl.CoarsenTo = amin(vtxdist[npes]+1, amax(20*npes, 1000));

  /* compute tvwgts */
  for (j=0; j<mgraph->ncon; j++)
    ltvwgts[j] = 0;

  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<mgraph->ncon; j++)
      ltvwgts[j] += mgraph->vwgt[i*mgraph->ncon+j];

  for (j=0; j<mgraph->ncon; j++)
    ctrl.tvwgts[j] = GlobalSESum(&ctrl, ltvwgts[j]);

  mgraph->nvwgt = fmalloc(mgraph->nvtxs*mgraph->ncon, "mgraph->nvwgt");
  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<mgraph->ncon; j++)
      mgraph->nvwgt[i*mgraph->ncon+j] =
          (float)(mgraph->vwgt[i*mgraph->ncon+j]) / (float)(ctrl.tvwgts[j]);

  morder = idxmalloc(mgraph->nvtxs, "PAROMETIS: morder");
  MultilevelOrder(&ctrl, mgraph, morder, sizes, &wspace);
  MALLOC_CHECK(NULL);

  /* Invert the ordering back to the original graph */
  ProjectInfoBack(&ctrl, graph, order, morder, &wspace);
  MALLOC_CHECK(NULL);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));

  free(ctrl.tpwgts);
  free(morder);
  FreeGraph(mgraph);
  FreeInitialGraphAndRemap(graph, 0);
  FreeWSpace(&wspace);
  FreeCtrl(&ctrl);

  if (*numflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, order, npes, mype, 0);

  MALLOC_CHECK(NULL);
}
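/***********************************************************************************
* Usage sketch (illustrative, not part of the library): one plausible way to
* call ParMETIS_V3_NodeND, assuming ParMETIS 3.x headers. Run with exactly two
* MPI ranks; each rank owns 8 vertices of a 16-vertex cycle, which satisfies
* the power-of-2 and distribution checks at the top of the routine.
************************************************************************************/
#include <mpi.h>
#include <parmetis.h>

int main(int argc, char *argv[])
{
  int i, mype, numflag = 0, options[3] = {0, 0, 0};
  idxtype vtxdist[3] = {0, 8, 16};
  idxtype xadj[9], adjncy[16], order[8], sizes[4];  /* sizes is 2*npes long */
  MPI_Comm comm = MPI_COMM_WORLD;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(comm, &mype);

  /* local vertex i is global vertex mype*8+i; the cycle gives two neighbors */
  for (i=0; i<8; i++) {
    int gv = mype*8 + i;
    xadj[i]       = 2*i;
    adjncy[2*i]   = (gv+15)%16;
    adjncy[2*i+1] = (gv+1)%16;
  }
  xadj[8] = 16;

  ParMETIS_V3_NodeND(vtxdist, xadj, adjncy, &numflag, options, order, sizes,
      &comm);

  MPI_Finalize();
  return 0;
}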
/***********************************************************************************
* This function is the entry point of the parallel kmetis algorithm that uses
* coordinates to compute an initial graph distribution.
************************************************************************************/
void ParMETIS_V3_PartGeomKway(idxtype *vtxdist, idxtype *xadj, idxtype *adjncy,
         idxtype *vwgt, idxtype *adjwgt, int *wgtflag, int *numflag, int *ndims,
         float *xyz, int *ncon, int *nparts, float *tpwgts, float *ubvec,
         int *options, int *edgecut, idxtype *part, MPI_Comm *comm)
{
  int h, i, j;
  int nvtxs = -1, npes, mype;
  int uwgtflag, cut, gcut, maxnvtxs;
  int ltvwgts[MAXNCON];
  int moptions[10];
  CtrlType ctrl;
  idxtype *uvwgt;
  WorkSpaceType wspace;
  GraphType *graph, *mgraph;
  float avg, maximb, balance, *mytpwgts;
  int seed, dbglvl = 0;
  int iwgtflag, inumflag, incon, inparts, ioptions[10];
  float *itpwgts, iubvec[MAXNCON];

  MPI_Comm_size(*comm, &npes);
  MPI_Comm_rank(*comm, &mype);

  /**********************************/
  /* Try to take care of bad inputs */
  /**********************************/
  if (options != NULL && options[0] == 1)
    dbglvl = options[PMV3_OPTION_DBGLVL];
  CheckInputs(STATIC_PARTITION, npes, dbglvl, wgtflag, &iwgtflag, numflag,
      &inumflag, ncon, &incon, nparts, &inparts, tpwgts, &itpwgts, ubvec,
      iubvec, NULL, NULL, options, ioptions, part, comm);

  /************************************/
  /* Take care of the nparts = 1 case */
  /************************************/
  if (inparts <= 1) {
    idxset(vtxdist[mype+1]-vtxdist[mype], 0, part);
    *edgecut = 0;
    return;
  }

  /**********************************/
  /* Take care of the npes = 1 case */
  /**********************************/
  if (npes == 1 && inparts > 1) {
    moptions[0] = 0;
    nvtxs = vtxdist[1];

    if (incon == 1) {
      METIS_WPartGraphKway(&nvtxs, xadj, adjncy, vwgt, adjwgt, &iwgtflag,
          &inumflag, &inparts, itpwgts, moptions, edgecut, part);
    }
    else {
      /* ADD: this is because METIS does not support tpwgts for all constraints */
      mytpwgts = fmalloc(inparts, "mytpwgts");
      for (i=0; i<inparts; i++)
        mytpwgts[i] = itpwgts[i*incon];

      moptions[7] = -1;
      METIS_mCPartGraphRecursive2(&nvtxs, &incon, xadj, adjncy, vwgt, adjwgt,
          &iwgtflag, &inumflag, &inparts, mytpwgts, moptions, edgecut, part);

      free(mytpwgts);
    }

    return;
  }

  if (inumflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1);

  /*****************************/
  /* Set up control structures */
  /*****************************/
  if (ioptions[0] == 1) {
    dbglvl = ioptions[PMV3_OPTION_DBGLVL];
    seed = ioptions[PMV3_OPTION_SEED];
  }
  else {
    dbglvl = GLOBAL_DBGLVL;
    seed = GLOBAL_SEED;
  }
  SetUpCtrl(&ctrl, npes, dbglvl, *comm);
  ctrl.CoarsenTo   = amin(vtxdist[npes]+1, 25*incon*amax(npes, inparts));
  ctrl.seed        = (seed == 0) ? mype : seed*mype;
  ctrl.sync        = GlobalSEMax(&ctrl, seed);
  ctrl.partType    = STATIC_PARTITION;
  ctrl.ps_relation = -1;
  ctrl.tpwgts      = itpwgts;
  scopy(incon, iubvec, ctrl.ubvec);

  uwgtflag = iwgtflag|2;
  uvwgt = idxsmalloc(vtxdist[mype+1]-vtxdist[mype], 1, "uvwgt");
  graph = Moc_SetUpGraph(&ctrl, 1, vtxdist, xadj, uvwgt, adjncy, adjwgt,
              &uwgtflag);
  free(graph->nvwgt);
  graph->nvwgt = NULL;

  PreAllocateMemory(&ctrl, graph, &wspace);

  /*=================================================================
   * Compute the initial npes-way geometric partitioning
   =================================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  Coordinate_Partition(&ctrl, graph, *ndims, xyz, 1, &wspace);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));

  /*=================================================================
   * Move the graph according to the partitioning
   =================================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.MoveTmr));

  free(uvwgt);
  graph->vwgt = ((iwgtflag&2) != 0) ? vwgt :
      idxsmalloc(graph->nvtxs*incon, 1, "vwgt");
  graph->ncon = incon;
  j = ctrl.nparts;
  ctrl.nparts = ctrl.npes;
  mgraph = Moc_MoveGraph(&ctrl, graph, &wspace);
  ctrl.nparts = j;

  /**********************************************************/
  /* Do the same functionality as Moc_SetUpGraph for mgraph */
  /**********************************************************/
  /* compute tvwgts (note: iterate over the moved graph's vertex count,
     not the original graph's) */
  for (j=0; j<incon; j++)
    ltvwgts[j] = 0;

  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<incon; j++)
      ltvwgts[j] += mgraph->vwgt[i*incon+j];

  for (j=0; j<incon; j++)
    ctrl.tvwgts[j] = GlobalSESum(&ctrl, ltvwgts[j]);

  /* check for zero wgt constraints */
  for (i=0; i<incon; i++) {
    /* ADD: take care of the case in which tvwgts is zero */
    if (ctrl.tvwgts[i] == 0) {
      if (ctrl.mype == 0)
        printf("ERROR: sum weight for constraint %d is zero\n", i);
      MPI_Finalize();
      exit(-1);
    }
  }

  /* compute nvwgt */
  mgraph->nvwgt = fmalloc(mgraph->nvtxs*incon, "mgraph->nvwgt");
  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<incon; j++)
      mgraph->nvwgt[i*incon+j] =
          (float)(mgraph->vwgt[i*incon+j]) / (float)(ctrl.tvwgts[j]);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.MoveTmr));

  if (ctrl.dbglvl&DBG_INFO) {
    cut = 0;
    for (i=0; i<graph->nvtxs; i++)
      for (j=graph->xadj[i]; j<graph->xadj[i+1]; j++)
        if (graph->where[i] != graph->where[graph->adjncy[j]])
          cut += graph->adjwgt[j];
    gcut     = GlobalSESum(&ctrl, cut)/2;
    maxnvtxs = GlobalSEMax(&ctrl, mgraph->nvtxs);
    balance  = (float)(maxnvtxs)/((float)(graph->gnvtxs)/(float)(npes));
    rprintf(&ctrl, "XYZ Cut: %6d \tBalance: %6.3f [%d %d %d]\n",
        gcut, balance, maxnvtxs, graph->gnvtxs, npes);
  }

  /*=================================================================
   * Set up the newly moved graph
   =================================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  ctrl.nparts = inparts;
  FreeWSpace(&wspace);
  PreAllocateMemory(&ctrl, mgraph, &wspace);

  /*=======================================================
   * Now compute the partition of the moved graph
   =======================================================*/
  if (vtxdist[npes] < SMALLGRAPH ||
      vtxdist[npes] < npes*20 ||
      GlobalSESum(&ctrl, mgraph->nedges) == 0) {
    IFSET(ctrl.dbglvl, DBG_INFO,
        rprintf(&ctrl, "Partitioning a graph of size %d serially\n",
            vtxdist[npes]));
    PartitionSmallGraph(&ctrl, mgraph, &wspace);
  }
  else {
    Moc_Global_Partition(&ctrl, mgraph, &wspace);
  }
  ParallelReMapGraph(&ctrl, mgraph, &wspace);

  /* Invert the partitioning back to the original graph */
  ctrl.nparts = npes;
  ProjectInfoBack(&ctrl, graph, part, mgraph->where, &wspace);
  *edgecut = mgraph->mincut;

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));

  /*******************/
  /* Print out stats */
  /*******************/
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  if (ctrl.dbglvl&DBG_INFO) {
    rprintf(&ctrl, "Final %d-way CUT: %6d \tBalance: ", inparts, mgraph->mincut);
    avg = 0.0;
    for (h=0; h<incon; h++) {
      maximb = 0.0;
      for (i=0; i<inparts; i++)
        maximb = amax(maximb, mgraph->gnpwgts[i*incon+h]/itpwgts[i*incon+h]);
      avg += maximb;
      rprintf(&ctrl, "%.3f ", maximb);
    }
    rprintf(&ctrl, " avg: %.3f\n", avg/(float)incon);
  }

  GKfree((void **)&itpwgts, LTERM);
  FreeGraph(mgraph);
  FreeInitialGraphAndRemap(graph, iwgtflag);
  FreeWSpace(&wspace);
  FreeCtrl(&ctrl);

  if (inumflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0);
}
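/***********************************************************************************
* Usage sketch (illustrative, not part of the library): calling the V3
* geometric k-way entry point above. Run with exactly two MPI ranks; each
* rank owns 8 vertices of a 16-vertex cycle laid out on the unit circle.
* With wgtflag == 0 the NULL weight arrays are legal, and the routine
* supplies unit weights itself.
************************************************************************************/
#include <math.h>
#include <mpi.h>
#include <parmetis.h>

int main(int argc, char *argv[])
{
  int i, mype;
  int wgtflag = 0, numflag = 0, ndims = 2, ncon = 1, nparts = 4, edgecut;
  int options[3] = {0, 0, 0};
  idxtype vtxdist[3] = {0, 8, 16};
  idxtype xadj[9], adjncy[16], part[8];
  float xyz[16], tpwgts[4] = {0.25, 0.25, 0.25, 0.25}, ubvec[1] = {1.05};
  MPI_Comm comm = MPI_COMM_WORLD;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(comm, &mype);

  for (i=0; i<8; i++) {
    int gv = mype*8 + i;                       /* global vertex id */
    xadj[i]       = 2*i;
    adjncy[2*i]   = (gv+15)%16;                /* cycle neighbors */
    adjncy[2*i+1] = (gv+1)%16;
    xyz[2*i]      = (float)cos(2.0*3.14159265*gv/16.0);
    xyz[2*i+1]    = (float)sin(2.0*3.14159265*gv/16.0);
  }
  xadj[8] = 16;

  ParMETIS_V3_PartGeomKway(vtxdist, xadj, adjncy, NULL, NULL, &wgtflag,
      &numflag, &ndims, xyz, &ncon, &nparts, tpwgts, ubvec, options,
      &edgecut, part, &comm);

  MPI_Finalize();
  return 0;
}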
/*************************************************************************
* This function performs k-way node-based refinement of the separators
**************************************************************************/
void KWayNodeRefine(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace,
         int npasses, float ubfraction)
{
  int i, ii, iii, j, k, pass, nvtxs, firstvtx, lastvtx, otherlastvtx, c,
      nmoves, nlupd, nsupd, nnbrs, nchanged, nsep;
  int npes = ctrl->npes, mype = ctrl->mype, nparts = ctrl->nparts;
  idxtype *xadj, *adjncy, *adjwgt, *vtxdist, *vwgt;
  idxtype *where, *lpwgts, *gpwgts, *sepind;
  idxtype *peind, *recvptr, *sendptr;
  idxtype *update, *supdate, *rupdate, *pe_updates, *htable, *changed;
  idxtype *badminpwgt, *badmaxpwgt;
  KeyValueType *swchanges, *rwchanges;
  int *nupds_pe;
  NRInfoType *rinfo, *myrinfo;
  int from, to, me, other, oldcut;

  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->KWayTmr));

  nvtxs   = graph->nvtxs;
  vtxdist = graph->vtxdist;
  xadj    = graph->xadj;
  adjncy  = graph->adjncy;
  adjwgt  = graph->adjwgt;
  vwgt    = graph->vwgt;

  firstvtx = vtxdist[mype];
  lastvtx  = vtxdist[mype+1];

  where  = graph->where;
  rinfo  = graph->nrinfo;
  lpwgts = graph->lpwgts;
  gpwgts = graph->gpwgts;

  nsep   = graph->nsep;
  sepind = graph->sepind;

  nnbrs   = graph->nnbrs;
  peind   = graph->peind;
  recvptr = graph->recvptr;
  sendptr = graph->sendptr;

  changed   = idxmalloc(nvtxs, "KWayRefine: changed");
  rwchanges = wspace->pairs;
  swchanges = rwchanges + recvptr[nnbrs];

  update   = idxmalloc(nvtxs, "KWayRefine: update");
  supdate  = wspace->indices;
  rupdate  = supdate + recvptr[nnbrs];
  nupds_pe = imalloc(npes, "KWayRefine: nupds_pe");

  htable = idxsmalloc(nvtxs+graph->nrecv, 0, "KWayRefine: lhtable");

  badminpwgt = wspace->pv1;
  badmaxpwgt = wspace->pv2;

  for (i=0; i<nparts; i+=2) {
    badminpwgt[i] = badminpwgt[i+1] = (1.0/ubfraction)*(gpwgts[i]+gpwgts[i+1])/2;
    badmaxpwgt[i] = badmaxpwgt[i+1] = ubfraction*(gpwgts[i]+gpwgts[i+1])/2;
  }
  /* myprintf(ctrl, "%6d %6d %6d %6d %6d %6d %6d\n", lpwgts[0], lpwgts[1],
     lpwgts[2], gpwgts[0], gpwgts[1], gpwgts[2], badmaxpwgt[0]); */

  IFSET(ctrl->dbglvl, DBG_REFINEINFO,
      PrintNodeBalanceInfo(ctrl, nparts, gpwgts, badminpwgt, badmaxpwgt, 1));

  for (pass=0; pass<npasses; pass++) {
    oldcut = graph->mincut;

    for (c=0; c<2; c++) {
      for (i=0; i<nparts; i+=2) {
        badminpwgt[i] = badminpwgt[i+1] = (1.0/ubfraction)*(gpwgts[i]+gpwgts[i+1])/2;
        badmaxpwgt[i] = badmaxpwgt[i+1] = ubfraction*(gpwgts[i]+gpwgts[i+1])/2;
      }

      nlupd = nsupd = nmoves = nchanged = 0;
      for (ii=0; ii<nsep; ii++) {
        i = sepind[ii];
        from = where[i];
        ASSERT(ctrl, from >= nparts);

        /* Go through the loop to see if a gain is possible for the
           separator vertex */
        if (rinfo[i].edegrees[(c+1)%2] <= vwgt[i]) {
          /* It is a one-sided move, so it will go to the other partition.
             Look at the comments in InitMultisection to understand the
             meaning of from%nparts */
          to = from%nparts+c;

          if (gpwgts[to]+vwgt[i] > badmaxpwgt[to]) {
            /* printf("Skip because of weight! %d\n",
               vwgt[i]-rinfo[i].edegrees[(c+1)%2]); */
            continue;  /* We cannot move it there because it gets too heavy */
          }

          /* Update the where information of the vertex you moved */
          where[i] = to;

          /* Remove this vertex from sepind. Note the trick that makes the
             loop look at sepind[ii] again */
          sepind[ii--] = sepind[--nsep];

          /* myprintf(ctrl, "Vertex %d [%d %d] is moving to %d from %d [%d]\n",
             i+firstvtx, vwgt[i], rinfo[i].edegrees[(c+1)%2], to, from, where[i]); */

          lpwgts[from]          -= vwgt[i];
          lpwgts[2*nparts-1]    -= vwgt[i];
          lpwgts[to]            += vwgt[i];
          gpwgts[to]            += vwgt[i];

          /* Put the vertices adjacent to i that belong to either the
             separator or the (c+1)%2 partition into the update array */
          for (j=xadj[i]; j<xadj[i+1]; j++) {
            k = adjncy[j];
            if (htable[k] == 0 && where[k] != to) {
              htable[k] = 1;
              if (k<nvtxs)
                update[nlupd++] = k;
              else
                supdate[nsupd++] = k;
            }
          }
          nmoves++;
          if (graph->pexadj[i+1]-graph->pexadj[i] > 0)
            changed[nchanged++] = i;
        }
      }

      /* myprintf(ctrl, "nmoves: %d, nlupd: %d, nsupd: %d\n", nmoves, nlupd, nsupd); */

      /* Tell everybody interested what the new where[] info is for the
         interface vertices */
      CommChangedInterfaceData(ctrl, graph, nchanged, changed, where,
          swchanges, rwchanges, wspace->pv4);

      IFSET(ctrl->dbglvl, DBG_RMOVEINFO,
          rprintf(ctrl, "\t[%d %d], [%d %d %d]\n", pass, c,
              GlobalSESum(ctrl, nmoves), GlobalSESum(ctrl, nsupd),
              GlobalSESum(ctrl, nlupd)));

      /*-----------------------------------------------------------------------
      / Time to communicate with processors to send the vertices whose degrees
      / need to be updated.
      /-----------------------------------------------------------------------*/
      /* Issue the receives first */
      for (i=0; i<nnbrs; i++) {
        MPI_Irecv((void *)(rupdate+sendptr[i]), sendptr[i+1]-sendptr[i],
            IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->rreq+i);
      }

      /* Issue the sends next. This needs some preprocessing */
      for (i=0; i<nsupd; i++) {
        htable[supdate[i]] = 0;
        supdate[i] = graph->imap[supdate[i]];
      }
      iidxsort(nsupd, supdate);

      for (j=i=0; i<nnbrs; i++) {
        otherlastvtx = vtxdist[peind[i]+1];
        for (k=j; k<nsupd && supdate[k] < otherlastvtx; k++);
        MPI_Isend((void *)(supdate+j), k-j, IDX_DATATYPE, peind[i], 1,
            ctrl->comm, ctrl->sreq+i);
        j = k;
      }

      /* OK, now get into the loop waiting for the send/recv operations to finish */
      MPI_Waitall(nnbrs, ctrl->rreq, ctrl->statuses);
      for (i=0; i<nnbrs; i++)
        MPI_Get_count(ctrl->statuses+i, IDX_DATATYPE, nupds_pe+i);
      MPI_Waitall(nnbrs, ctrl->sreq, ctrl->statuses);

      /*-------------------------------------------------------------
      / Place the received to-be updated vertices into update[]
      /-------------------------------------------------------------*/
      for (i=0; i<nnbrs; i++) {
        pe_updates = rupdate+sendptr[i];
        for (j=0; j<nupds_pe[i]; j++) {
          k = pe_updates[j];
          if (htable[k-firstvtx] == 0) {
            htable[k-firstvtx] = 1;
            update[nlupd++] = k-firstvtx;
          }
        }
      }

      /*-------------------------------------------------------------
      / Update the where information of the vertices that are pulled
      / into the separator.
      /-------------------------------------------------------------*/
      nchanged = 0;
      for (ii=0; ii<nlupd; ii++) {
        i = update[ii];
        me = where[i];
        if (me < nparts && me%2 == (c+1)%2) { /* This vertex is pulled into the separator */
          lpwgts[me] -= vwgt[i];
          where[i] = nparts+me-(me%2);
          sepind[nsep++] = i;  /* Put the vertex into the sepind array */
          if (graph->pexadj[i+1]-graph->pexadj[i] > 0)
            changed[nchanged++] = i;

          lpwgts[where[i]]   += vwgt[i];
          lpwgts[2*nparts-1] += vwgt[i];
          /* myprintf(ctrl, "Vertex %d moves into the separator from %d to %d\n",
             i+firstvtx, me, where[i]); */
        }
      }

      /* Tell everybody interested what the new where[] info is for the
         interface vertices */
      CommChangedInterfaceData(ctrl, graph, nchanged, changed, where,
          swchanges, rwchanges, wspace->pv4);

      /*-------------------------------------------------------------
      / Update the rinfo of the vertices in the update[] array
      /-------------------------------------------------------------*/
      for (ii=0; ii<nlupd; ii++) {
        i = update[ii];
        ASSERT(ctrl, htable[i] == 1);

        htable[i] = 0;

        me = where[i];
        if (me >= nparts) {  /* If it is a separator vertex */
          /* myprintf(ctrl, "Updating %d %d\n", i+firstvtx, me); */
          myrinfo = rinfo+i;
          myrinfo->edegrees[0] = myrinfo->edegrees[1] = 0;

          for (j=xadj[i]; j<xadj[i+1]; j++) {
            other = where[adjncy[j]];
            if (me != other)
              myrinfo->edegrees[other%2] += vwgt[adjncy[j]];
          }
        }
      }

      /* Finally, sum-up the partition weights */
      MPI_Allreduce((void *)lpwgts, (void *)gpwgts, 2*nparts, IDX_DATATYPE,
          MPI_SUM, ctrl->comm);
      graph->mincut = gpwgts[2*nparts-1];

      IFSET(ctrl->dbglvl, DBG_REFINEINFO,
          PrintNodeBalanceInfo(ctrl, nparts, gpwgts, badminpwgt, badmaxpwgt, 0));
    }

    if (graph->mincut == oldcut)
      break;
  }

  GKfree((void **)&update, &nupds_pe, &htable, &changed, LTERM);

  IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->KWayTmr));
}
/*************************************************************************
* This function performs a k-way directed diffusion
**************************************************************************/
real_t WavefrontDiffusion(ctrl_t *ctrl, graph_t *graph, idx_t *home)
{
  idx_t ii, i, j, k, l, nvtxs, nedges, nparts;
  idx_t from, to, edge, done, nswaps, noswaps, totalv, wsize;
  idx_t npasses, first, second, third, mind, maxd;
  idx_t *xadj, *adjncy, *adjwgt, *where, *perm;
  idx_t *rowptr, *colind, *ed, *psize;
  real_t *transfer, *tmpvec;
  real_t balance = -1.0, *load, *solution, *workspace;
  real_t *nvwgt, *npwgts, flowFactor, cost, ubfactor;
  matrix_t matrix;
  ikv_t *cand;
  idx_t ndirty, nclean, dptr, clean;

  nvtxs  = graph->nvtxs;
  nedges = graph->nedges;
  xadj   = graph->xadj;
  nvwgt  = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where  = graph->where;

  nparts   = ctrl->nparts;
  ubfactor = ctrl->ubvec[0];
  matrix.nrows = nparts;

  flowFactor = 0.35;
  flowFactor = (ctrl->mype == 2) ? 0.50 : flowFactor;
  flowFactor = (ctrl->mype == 3) ? 0.75 : flowFactor;
  flowFactor = (ctrl->mype == 4) ? 1.00 : flowFactor;

  /* allocate memory */
  solution = rmalloc(4*nparts+2*nedges, "WavefrontDiffusion: solution");
  tmpvec   = solution + nparts;
  npwgts   = solution + 2*nparts;
  load     = solution + 3*nparts;
  matrix.values = solution + 4*nparts;
  transfer = matrix.transfer = solution + 4*nparts + nedges;

  perm   = imalloc(2*nvtxs+2*nparts+nedges+1, "WavefrontDiffusion: perm");
  ed     = perm + nvtxs;
  psize  = perm + 2*nvtxs;
  rowptr = matrix.rowptr = perm + 2*nvtxs + nparts;
  colind = matrix.colind = perm + 2*nvtxs + 2*nparts + 1;

  /* GKTODO - Potential problem with this malloc */
  wsize = gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1));
  workspace = (real_t *)gk_malloc(wsize, "WavefrontDiffusion: workspace");
  cand = ikvmalloc(nvtxs, "WavefrontDiffusion: cand");

  /*****************************/
  /* Populate empty subdomains */
  /*****************************/
  iset(nparts, 0, psize);
  for (i=0; i<nvtxs; i++)
    psize[where[i]]++;

  mind = iargmin(nparts, psize);
  maxd = iargmax(nparts, psize);
  if (psize[mind] == 0) {
    for (i=0; i<nvtxs; i++) {
      k = (RandomInRange(nvtxs)+i)%nvtxs;
      if (where[k] == maxd) {
        where[k] = mind;
        psize[mind]++;
        psize[maxd]--;
        break;
      }
    }
  }

  iset(nvtxs, 0, ed);
  rset(nparts, 0.0, npwgts);
  for (i=0; i<nvtxs; i++) {
    npwgts[where[i]] += nvwgt[i];
    for (j=xadj[i]; j<xadj[i+1]; j++)
      ed[i] += (where[i] != where[adjncy[j]] ? adjwgt[j] : 0);
  }

  ComputeLoad(graph, nparts, load, ctrl->tpwgts, 0);
  done = 0;

  /* zero out the tmpvec array */
  rset(nparts, 0.0, tmpvec);

  npasses = gk_min(nparts/2, NGD_PASSES);
  for (l=0; l<npasses; l++) {
    /* Set-up and solve the diffusion equation */
    nswaps = 0;

    /************************/
    /* Solve flow equations */
    /************************/
    SetUpConnectGraph(graph, &matrix, (idx_t *)workspace);

    /* check for disconnected subdomains */
    for (i=0; i<matrix.nrows; i++) {
      if (matrix.rowptr[i]+1 == matrix.rowptr[i+1]) {
        cost = (real_t)(ctrl->mype);
        goto CleanUpAndExit;
      }
    }

    ConjGrad2(&matrix, load, solution, 0.001, workspace);
    ComputeTransferVector(1, &matrix, solution, transfer, 0);

    GetThreeMax(nparts, load, &first, &second, &third);

    if (l%3 == 0) {
      FastRandomPermute(nvtxs, perm, 1);
    }
    else {
      /*****************************/
      /* move dirty vertices first */
      /*****************************/
      ndirty = 0;
      for (i=0; i<nvtxs; i++) {
        if (where[i] != home[i])
          ndirty++;
      }

      dptr = 0;
      for (i=0; i<nvtxs; i++) {
        if (where[i] != home[i])
          perm[dptr++] = i;
        else
          perm[ndirty++] = i;
      }

      PASSERT(ctrl, ndirty == nvtxs);
      ndirty = dptr;
      nclean = nvtxs-dptr;
      FastRandomPermute(ndirty, perm, 0);
      FastRandomPermute(nclean, perm+ndirty, 0);
    }

    if (ctrl->mype == 0) {
      for (j=nvtxs, k=0, ii=0; ii<nvtxs; ii++) {
        i = perm[ii];
        if (ed[i] != 0) {
          cand[k].key = -ed[i];
          cand[k++].val = i;
        }
        else {
          cand[--j].key = 0;
          cand[j].val = i;
        }
      }
      ikvsorti(k, cand);
    }

    for (ii=0; ii<nvtxs/3; ii++) {
      i = (ctrl->mype == 0) ? cand[ii].val : perm[ii];
      from = where[i];

      /* don't move out the last vertex in a subdomain */
      if (psize[from] == 1)
        continue;

      clean = (from == home[i]) ? 1 : 0;

      /* only move from top three or dirty vertices */
      if (from != first && from != second && from != third && clean)
        continue;

      /* Scatter the sparse transfer row into the dense tmpvec row */
      for (j=rowptr[from]+1; j<rowptr[from+1]; j++)
        tmpvec[colind[j]] = transfer[j];

      for (j=xadj[i]; j<xadj[i+1]; j++) {
        to = where[adjncy[j]];
        if (from != to) {
          if (tmpvec[to] > (flowFactor * nvwgt[i])) {
            tmpvec[to] -= nvwgt[i];
            INC_DEC(psize[to], psize[from], 1);
            INC_DEC(npwgts[to], npwgts[from], nvwgt[i]);
            INC_DEC(load[to], load[from], nvwgt[i]);
            where[i] = to;
            nswaps++;

            /* Update external degrees */
            ed[i] = 0;
            for (k=xadj[i]; k<xadj[i+1]; k++) {
              edge = adjncy[k];
              ed[i] += (to != where[edge] ? adjwgt[k] : 0);

              if (where[edge] == from)
                ed[edge] += adjwgt[k];
              if (where[edge] == to)
                ed[edge] -= adjwgt[k];
            }
            break;
          }
        }
      }

      /* Gather the dense tmpvec row into the sparse transfer row */
      for (j=rowptr[from]+1; j<rowptr[from+1]; j++) {
        transfer[j] = tmpvec[colind[j]];
        tmpvec[colind[j]] = 0.0;
      }
      ASSERT(fabs(rsum(nparts, tmpvec, 1)) < .0001);
    }

    if (l % 2 == 1) {
      balance = rmax(nparts, npwgts)*nparts;
      if (balance < ubfactor + 0.035)
        done = 1;

      if (GlobalSESum(ctrl, done) > 0)
        break;

      noswaps = (nswaps > 0) ? 0 : 1;
      if (GlobalSESum(ctrl, noswaps) > ctrl->npes/2)
        break;
    }
  }

  graph->mincut = ComputeSerialEdgeCut(graph);
  totalv        = Mc_ComputeSerialTotalV(graph, home);
  cost          = ctrl->ipc_factor * (real_t)graph->mincut +
                  ctrl->redist_factor * (real_t)totalv;

CleanUpAndExit:
  gk_free((void **)&solution, (void **)&perm, (void **)&workspace,
      (void **)&cand, LTERM);

  return cost;
}
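/*************************************************************************
* Illustrative sketch (not library code) of the scatter/update/gather trick
* used above for the transfer rows: a sparse CSR row is expanded into the
* dense all-zero vector tmpvec so that single entries can be tested and
* updated in O(1), then the row is written back and re-zeroed, restoring
* the invariant that the ASSERT above checks. (The real code starts at
* rowptr[row]+1 to skip the diagonal entry.) Assumes the library's
* idx_t/real_t typedefs are in scope; the function name is hypothetical.
**************************************************************************/
void scatter_update_gather(idx_t row, idx_t *rowptr, idx_t *colind,
         real_t *values, real_t *tmpvec)
{
  idx_t j;

  for (j=rowptr[row]; j<rowptr[row+1]; j++)
    tmpvec[colind[j]] = values[j];     /* scatter into the dense row */

  /* ... O(1) reads/writes of tmpvec[k] would go here ... */

  for (j=rowptr[row]; j<rowptr[row+1]; j++) {
    values[j] = tmpvec[colind[j]];     /* gather back into the sparse row */
    tmpvec[colind[j]] = 0.0;           /* restore the all-zero invariant */
  }
}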
/*************************************************************************
* This function is the driver for the adaptive refinement mode of ParMETIS
**************************************************************************/
void Adaptive_Partition(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace)
{
  int i;
  int tewgt, tvsize;
  floattype gtewgt, gtvsize;
  floattype ubavg, lbavg, lbvec[MAXNCON];

  /************************************/
  /* Set up important data structures */
  /************************************/
  SetUp(ctrl, graph, wspace);

  ubavg   = savg(graph->ncon, ctrl->ubvec);
  tewgt   = idxsum(graph->nedges, graph->adjwgt);
  tvsize  = idxsum(graph->nvtxs, graph->vsize);
  gtewgt  = (floattype) GlobalSESum(ctrl, tewgt) + 1.0;  /* the +1.0 avoids any FPE (division by zero) */
  gtvsize = (floattype) GlobalSESum(ctrl, tvsize) + 1.0;
  ctrl->redist_factor = ctrl->redist_base *
      ((gtewgt/gtvsize)/ctrl->edge_size_ratio);

  IFSET(ctrl->dbglvl, DBG_PROGRESS,
      rprintf(ctrl, "[%6d %8d %5d %5d][%d]\n",
          graph->gnvtxs, GlobalSESum(ctrl, graph->nedges),
          GlobalSEMin(ctrl, graph->nvtxs), GlobalSEMax(ctrl, graph->nvtxs),
          ctrl->CoarsenTo));

  if (graph->gnvtxs < 1.3*ctrl->CoarsenTo ||
      (graph->finer != NULL &&
       graph->gnvtxs > graph->finer->gnvtxs*COARSEN_FRACTION)) {
    /***********************************************/
    /* Balance the partition on the coarsest graph */
    /***********************************************/
    graph->where = idxsmalloc(graph->nvtxs+graph->nrecv, -1, "graph->where");
    idxcopy(graph->nvtxs, graph->home, graph->where);

    Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
    lbavg = savg(graph->ncon, lbvec);

    if (lbavg > ubavg + 0.035 && ctrl->partType != REFINE_PARTITION)
      Balance_Partition(ctrl, graph, wspace);

    if (ctrl->dbglvl&DBG_PROGRESS) {
      Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      rprintf(ctrl, "nvtxs: %10d, balance: ", graph->gnvtxs);
      for (i=0; i<graph->ncon; i++)
        rprintf(ctrl, "%.3f ", lbvec[i]);
      rprintf(ctrl, "\n");
    }

    /* check if no coarsening took place */
    if (graph->finer == NULL) {
      Moc_ComputePartitionParams(ctrl, graph, wspace);
      Moc_KWayBalance(ctrl, graph, wspace, graph->ncon);
      Moc_KWayAdaptiveRefine(ctrl, graph, wspace, NGR_PASSES);
    }
  }
  else {
    /*******************************/
    /* Coarsen it and partition it */
    /*******************************/
    switch (ctrl->ps_relation) {
      case COUPLED:
        Mc_LocalMatch_HEM(ctrl, graph, wspace);
        break;
      case DISCOUPLED:
      default:
        Moc_GlobalMatch_Balance(ctrl, graph, wspace);
        break;
    }

    Adaptive_Partition(ctrl, graph->coarser, wspace);

    /********************************/
    /* project partition and refine */
    /********************************/
    Moc_ProjectPartition(ctrl, graph, wspace);
    Moc_ComputePartitionParams(ctrl, graph, wspace);

    if (graph->ncon > 1 && graph->level < 4) {
      Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      lbavg = savg(graph->ncon, lbvec);

      if (lbavg > ubavg + 0.025) {
        Moc_KWayBalance(ctrl, graph, wspace, graph->ncon);
      }
    }

    Moc_KWayAdaptiveRefine(ctrl, graph, wspace, NGR_PASSES);

    if (ctrl->dbglvl&DBG_PROGRESS) {
      Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      rprintf(ctrl, "nvtxs: %10d, cut: %8d, balance: ",
          graph->gnvtxs, graph->mincut);
      for (i=0; i<graph->ncon; i++)
        rprintf(ctrl, "%.3f ", lbvec[i]);
      rprintf(ctrl, "\n");
    }
  }
}
/*************************************************************************
* This function computes the initial id/ed degrees
**************************************************************************/
void Moc_ComputePartitionParams(CtrlType *ctrl, GraphType *graph,
         WorkSpaceType *wspace)
{
  int h, i, j, k;
  int nvtxs, ncon;
  int firstvtx, lastvtx;
  idxtype *xadj, *ladjncy, *adjwgt, *vtxdist;
  floattype *lnpwgts, *gnpwgts;
  idxtype *where, *swhere, *rwhere;
  RInfoType *rinfo, *myrinfo;
  EdgeType *edegrees;
  int me, other;

  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->KWayInitTmr));

  nvtxs   = graph->nvtxs;
  ncon    = graph->ncon;
  vtxdist = graph->vtxdist;
  xadj    = graph->xadj;
  ladjncy = graph->adjncy;
  adjwgt  = graph->adjwgt;
  where   = graph->where;

  rinfo = graph->rinfo =
      (RInfoType *)GKmalloc(sizeof(RInfoType)*nvtxs, "CPP: rinfo");
  lnpwgts = graph->lnpwgts = fmalloc(ctrl->nparts*ncon, "CPP: lnpwgts");
  gnpwgts = graph->gnpwgts = fmalloc(ctrl->nparts*ncon, "CPP: gnpwgts");
  sset(ctrl->nparts*ncon, 0, lnpwgts);

  firstvtx = vtxdist[ctrl->mype];
  lastvtx  = vtxdist[ctrl->mype+1];

  /*------------------------------------------------------------
  / Send/Receive the where information of interface vertices
  /------------------------------------------------------------*/
  swhere = wspace->indices;
  rwhere = where + nvtxs;
  CommInterfaceData(ctrl, graph, where, swhere, rwhere);

#ifdef DEBUG_COMPUTEPPARAM
  PrintVector(ctrl, nvtxs, firstvtx, where, "where");
#endif

  ASSERT(ctrl, wspace->nlarge >= xadj[nvtxs]);

  /*------------------------------------------------------------
  / Compute now the id/ed degrees
  /------------------------------------------------------------*/
  graph->lmincut = 0;
  for (i=0; i<nvtxs; i++) {
    me = where[i];
    myrinfo = rinfo+i;

    for (h=0; h<ncon; h++)
      lnpwgts[me*ncon+h] += graph->nvwgt[i*ncon+h];

    myrinfo->degrees = wspace->degrees + xadj[i];
    myrinfo->ndegrees = myrinfo->id = myrinfo->ed = 0;

    for (j=xadj[i]; j<xadj[i+1]; j++) {
      if (me == where[ladjncy[j]])
        myrinfo->id += adjwgt[j];
      else
        myrinfo->ed += adjwgt[j];
    }

    if (myrinfo->ed > 0) {  /* Time to do some serious work */
      graph->lmincut += myrinfo->ed;
      edegrees = myrinfo->degrees;

      for (j=xadj[i]; j<xadj[i+1]; j++) {
        other = where[ladjncy[j]];
        if (me != other) {
          for (k=0; k<myrinfo->ndegrees; k++) {
            if (edegrees[k].edge == other) {
              edegrees[k].ewgt += adjwgt[j];
              break;
            }
          }
          if (k == myrinfo->ndegrees) {
            edegrees[k].edge = other;
            edegrees[k].ewgt = adjwgt[j];
            myrinfo->ndegrees++;
          }
          ASSERT(ctrl, myrinfo->ndegrees <= xadj[i+1]-xadj[i]);
        }
      }
    }
  }

#ifdef DEBUG_COMPUTEPPARAM
  PrintVector(ctrl, ctrl->nparts*ncon, 0, lnpwgts, "lnpwgts");
#endif

  /* Finally, sum-up the partition weights */
  MPI_Allreduce((void *)lnpwgts, (void *)gnpwgts, ctrl->nparts*ncon,
      MPI_DOUBLE, MPI_SUM, ctrl->comm);

  graph->mincut = GlobalSESum(ctrl, graph->lmincut)/2;

#ifdef DEBUG_COMPUTEPPARAM
  PrintVector(ctrl, ctrl->nparts*ncon, 0, gnpwgts, "gnpwgts");
#endif

  IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->KWayInitTmr));
}
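/*************************************************************************
* Toy illustration (not library code) of the id/ed quantities computed
* above for a single vertex v: id is the total weight of edges to vertices
* in the same partition, ed the total weight of edges crossing into other
* partitions. Summing ed over all vertices counts each cut edge twice,
* which is why the code above divides the global sum by 2. Assumes the
* library's idxtype is in scope; the function name is hypothetical.
**************************************************************************/
void ComputeIdEd(int v, idxtype *xadj, idxtype *adjncy, idxtype *adjwgt,
         idxtype *where, int *id, int *ed)
{
  int j;

  *id = *ed = 0;
  for (j=xadj[v]; j<xadj[v+1]; j++) {
    if (where[v] == where[adjncy[j]])
      *id += adjwgt[j];
    else
      *ed += adjwgt[j];
  }
}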
/***********************************************************************************
* This function is the entry point of the parallel kmetis algorithm that uses
* coordinates to compute an initial graph distribution.
************************************************************************************/
int ParMETIS_V3_PartGeomKway(idx_t *vtxdist, idx_t *xadj, idx_t *adjncy,
        idx_t *vwgt, idx_t *adjwgt, idx_t *wgtflag, idx_t *numflag, idx_t *ndims,
        real_t *xyz, idx_t *ncon, idx_t *nparts, real_t *tpwgts, real_t *ubvec,
        idx_t *options, idx_t *edgecut, idx_t *part, MPI_Comm *comm)
{
  idx_t h, i, j, npes, mype, status, nvtxs, seed, dbglvl;
  idx_t cut, gcut, maxnvtxs;
  idx_t moptions[METIS_NOPTIONS];
  ctrl_t *ctrl;
  graph_t *graph, *mgraph;
  real_t balance;
  size_t curmem;

  /* Check the input parameters and return if an error */
  status = CheckInputsPartGeomKway(vtxdist, xadj, adjncy, vwgt, adjwgt,
               wgtflag, numflag, ndims, xyz, ncon, nparts, tpwgts, ubvec,
               options, edgecut, part, comm);
  if (GlobalSEMinComm(*comm, status) == 0)
    return METIS_ERROR;

  status = METIS_OK;
  gk_malloc_init();
  curmem = gk_GetCurMemoryUsed();

  /* Setup the ctrl */
  ctrl = SetupCtrl(PARMETIS_OP_GKMETIS, options, *ncon, *nparts, tpwgts,
             ubvec, *comm);
  npes = ctrl->npes;
  mype = ctrl->mype;

  /* Take care of the nparts == 1 case */
  if (*nparts == 1) {
    iset(vtxdist[mype+1]-vtxdist[mype], (*numflag == 0 ? 0 : 1), part);
    *edgecut = 0;
    goto DONE;
  }

  /* Take care of the npes == 1 case */
  if (npes == 1) {
    nvtxs = vtxdist[1] - vtxdist[0];  /* subtraction is required when numflag==1 */

    METIS_SetDefaultOptions(moptions);
    moptions[METIS_OPTION_NUMBERING] = *numflag;

    status = METIS_PartGraphKway(&nvtxs, ncon, xadj, adjncy, vwgt, NULL,
                 adjwgt, nparts, tpwgts, ubvec, moptions, edgecut, part);

    goto DONE;
  }

  /* Setup the graph */
  if (*numflag > 0)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1);

  graph = SetupGraph(ctrl, *ncon, vtxdist, xadj, vwgt, NULL, adjncy,
              adjwgt, *wgtflag);
  gk_free((void **)&graph->nvwgt, LTERM);

  /* Allocate the workspace */
  AllocateWSpace(ctrl, 10*graph->nvtxs);

  /* Compute the initial npes-way geometric partitioning */
  STARTTIMER(ctrl, ctrl->TotalTmr);

  Coordinate_Partition(ctrl, graph, *ndims, xyz, 1);

  STOPTIMER(ctrl, ctrl->TotalTmr);

  /* Move the graph according to the partitioning */
  STARTTIMER(ctrl, ctrl->MoveTmr);

  ctrl->nparts = npes;
  mgraph = MoveGraph(ctrl, graph);
  ctrl->nparts = *nparts;

  SetupGraph_nvwgts(ctrl, mgraph);  /* compute nvwgts for the moved graph */

  if (ctrl->dbglvl&DBG_INFO) {
    CommInterfaceData(ctrl, graph, graph->where, graph->where+graph->nvtxs);
    for (cut=0, i=0; i<graph->nvtxs; i++) {
      for (j=graph->xadj[i]; j<graph->xadj[i+1]; j++) {
        if (graph->where[i] != graph->where[graph->adjncy[j]])
          cut += graph->adjwgt[j];
      }
    }
    gcut     = GlobalSESum(ctrl, cut)/2;
    maxnvtxs = GlobalSEMax(ctrl, mgraph->nvtxs);
    balance  = (real_t)(maxnvtxs)/((real_t)(graph->gnvtxs)/(real_t)(npes));
    rprintf(ctrl, "XYZ Cut: %6"PRIDX" \tBalance: %6.3"PRREAL" [%"PRIDX" %"PRIDX" %"PRIDX"]\n",
        gcut, balance, maxnvtxs, graph->gnvtxs, npes);
  }

  STOPTIMER(ctrl, ctrl->MoveTmr);

  /* Compute the partition of the moved graph */
  STARTTIMER(ctrl, ctrl->TotalTmr);

  ctrl->CoarsenTo = gk_min(vtxdist[npes]+1, 25*(*ncon)*gk_max(npes, *nparts));

  if (vtxdist[npes] < SMALLGRAPH ||
      vtxdist[npes] < npes*20 ||
      GlobalSESum(ctrl, mgraph->nedges) == 0) { /* serially */
    IFSET(ctrl->dbglvl, DBG_INFO,
        rprintf(ctrl, "Partitioning a graph of size %"PRIDX" serially\n",
            vtxdist[npes]));
    PartitionSmallGraph(ctrl, mgraph);
  }
  else { /* in parallel */
    Global_Partition(ctrl, mgraph);
  }

  ParallelReMapGraph(ctrl, mgraph);

  /* Invert the partitioning back to the original graph */
  ctrl->nparts = npes;
  ProjectInfoBack(ctrl, graph, part, mgraph->where);
  ctrl->nparts = *nparts;

  *edgecut = mgraph->mincut;

  STOPTIMER(ctrl, ctrl->TotalTmr);

  /* Print some stats */
  IFSET(ctrl->dbglvl, DBG_TIME, PrintTimingInfo(ctrl));
  IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->gcomm));
  IFSET(ctrl->dbglvl, DBG_INFO, PrintPostPartInfo(ctrl, mgraph, 0));

  FreeGraph(mgraph);
  FreeInitialGraphAndRemap(graph);

  if (*numflag > 0)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0);

DONE:
  FreeCtrl(&ctrl);
  if (gk_GetCurMemoryUsed() - curmem > 0) {
    printf("ParMETIS appears to have a memory leak of %zdbytes. Report this.\n",
        (ssize_t)(gk_GetCurMemoryUsed() - curmem));
  }
  gk_malloc_cleanup(0);

  return (int)status;
}
/*************************************************************************
* This function is the driver for the adaptive refinement mode of ParMETIS
**************************************************************************/
void Adaptive_Partition(ctrl_t *ctrl, graph_t *graph)
{
  idx_t i;
  idx_t tewgt, tvsize;
  real_t gtewgt, gtvsize;
  real_t ubavg, lbavg, *lbvec;

  WCOREPUSH;

  lbvec = rwspacemalloc(ctrl, graph->ncon);

  /************************************/
  /* Set up important data structures */
  /************************************/
  CommSetup(ctrl, graph);

  ubavg   = ravg(graph->ncon, ctrl->ubvec);
  tewgt   = isum(graph->nedges, graph->adjwgt, 1);
  tvsize  = isum(graph->nvtxs, graph->vsize, 1);
  gtewgt  = (real_t) GlobalSESum(ctrl, tewgt) + 1.0/graph->gnvtxs;  /* the +1.0/graph->gnvtxs terms avoid any FPE */
  gtvsize = (real_t) GlobalSESum(ctrl, tvsize) + 1.0/graph->gnvtxs;
  ctrl->redist_factor = ctrl->redist_base *
      ((gtewgt/gtvsize)/ctrl->edge_size_ratio);

  IFSET(ctrl->dbglvl, DBG_PROGRESS,
      rprintf(ctrl, "[%6"PRIDX" %8"PRIDX" %5"PRIDX" %5"PRIDX"][%"PRIDX"]\n",
          graph->gnvtxs, GlobalSESum(ctrl, graph->nedges),
          GlobalSEMin(ctrl, graph->nvtxs), GlobalSEMax(ctrl, graph->nvtxs),
          ctrl->CoarsenTo));

  if (graph->gnvtxs < 1.3*ctrl->CoarsenTo ||
      (graph->finer != NULL &&
       graph->gnvtxs > graph->finer->gnvtxs*COARSEN_FRACTION)) {
    AllocateRefinementWorkSpace(ctrl, 2*graph->nedges);

    /***********************************************/
    /* Balance the partition on the coarsest graph */
    /***********************************************/
    graph->where = ismalloc(graph->nvtxs+graph->nrecv, -1, "graph->where");
    icopy(graph->nvtxs, graph->home, graph->where);

    ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
    lbavg = ravg(graph->ncon, lbvec);

    if (lbavg > ubavg + 0.035 && ctrl->partType != REFINE_PARTITION)
      Balance_Partition(ctrl, graph);

    if (ctrl->dbglvl&DBG_PROGRESS) {
      ComputePartitionParams(ctrl, graph);
      ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      rprintf(ctrl, "nvtxs: %10"PRIDX", cut: %8"PRIDX", balance: ",
          graph->gnvtxs, graph->mincut);
      for (i=0; i<graph->ncon; i++)
        rprintf(ctrl, "%.3"PRREAL" ", lbvec[i]);
      rprintf(ctrl, "\n");

      /* free memory allocated by ComputePartitionParams */
      gk_free((void **)&graph->ckrinfo, &graph->lnpwgts, &graph->gnpwgts, LTERM);
    }

    /* check if no coarsening took place */
    if (graph->finer == NULL) {
      ComputePartitionParams(ctrl, graph);
      KWayBalance(ctrl, graph, graph->ncon);
      KWayAdaptiveRefine(ctrl, graph, NGR_PASSES);
    }
  }
  else {
    /*******************************/
    /* Coarsen it and partition it */
    /*******************************/
    switch (ctrl->ps_relation) {
      case PARMETIS_PSR_COUPLED:
        Match_Local(ctrl, graph);
        break;
      case PARMETIS_PSR_UNCOUPLED:
      default:
        Match_Global(ctrl, graph);
        break;
    }

    Adaptive_Partition(ctrl, graph->coarser);

    /********************************/
    /* project partition and refine */
    /********************************/
    ProjectPartition(ctrl, graph);
    ComputePartitionParams(ctrl, graph);

    if (graph->ncon > 1 && graph->level < 4) {
      ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      lbavg = ravg(graph->ncon, lbvec);

      if (lbavg > ubavg + 0.025) {
        KWayBalance(ctrl, graph, graph->ncon);
      }
    }

    KWayAdaptiveRefine(ctrl, graph, NGR_PASSES);

    if (ctrl->dbglvl&DBG_PROGRESS) {
      ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      rprintf(ctrl, "nvtxs: %10"PRIDX", cut: %8"PRIDX", balance: ",
          graph->gnvtxs, graph->mincut);
      for (i=0; i<graph->ncon; i++)
        rprintf(ctrl, "%.3"PRREAL" ", lbvec[i]);
      rprintf(ctrl, "\n");
    }
  }

  WCOREPOP;
}
/*************************************************************************
* This function performs multi-constraint diffusion-based balancing over
* the subdomain connectivity graph
**************************************************************************/
idx_t Mc_Diffusion(ctrl_t *ctrl, graph_t *graph, idx_t *vtxdist, idx_t *where,
          idx_t *home, idx_t npasses)
{
  idx_t h, i, j;
  idx_t nvtxs, nedges, ncon, pass, iter, domain, processor;
  idx_t nparts, mype, npes, nlinks, me, you, wsize;
  idx_t nvisited, nswaps = -1, tnswaps, done, alldone = -1;
  idx_t *rowptr, *colind, *diff_where, *sr_where, *ehome, *map, *rmap;
  idx_t *pack, *unpack, *match, *proc2sub, *sub2proc;
  idx_t *visited, *gvisited;
  real_t *transfer, *npwgts, maxdiff, minflow, maxflow;
  real_t lbavg, oldlbavg, ubavg, *lbvec;
  real_t *diff_flows, *sr_flows;
  real_t diff_lbavg, sr_lbavg, diff_cost, sr_cost;
  idx_t *rbuffer, *sbuffer;
  idx_t *rcount, *rdispl;
  real_t *solution, *load, *workspace;
  matrix_t matrix;
  graph_t *egraph;

  if (graph->ncon > 3)
    return 0;

  WCOREPUSH;

  nvtxs  = graph->nvtxs;
  nedges = graph->nedges;
  ncon   = graph->ncon;

  nparts = ctrl->nparts;
  mype   = ctrl->mype;
  npes   = ctrl->npes;
  ubavg  = ravg(ncon, ctrl->ubvec);

  /* initialize variables and allocate memory */
  lbvec      = rwspacemalloc(ctrl, ncon);
  diff_flows = rwspacemalloc(ctrl, ncon);
  sr_flows   = rwspacemalloc(ctrl, ncon);

  load     = rwspacemalloc(ctrl, nparts);
  solution = rwspacemalloc(ctrl, nparts);
  npwgts   = graph->gnpwgts = rwspacemalloc(ctrl, ncon*nparts);
  matrix.values = rwspacemalloc(ctrl, nedges);
  transfer = matrix.transfer = rwspacemalloc(ctrl, ncon*nedges);

  proc2sub = iwspacemalloc(ctrl, gk_max(nparts, npes*2));
  sub2proc = iwspacemalloc(ctrl, nparts);
  match    = iwspacemalloc(ctrl, nparts);
  rowptr   = matrix.rowptr = iwspacemalloc(ctrl, nparts+1);
  colind   = matrix.colind = iwspacemalloc(ctrl, nedges);

  rcount = iwspacemalloc(ctrl, npes);
  rdispl = iwspacemalloc(ctrl, npes+1);

  pack       = iwspacemalloc(ctrl, nvtxs);
  unpack     = iwspacemalloc(ctrl, nvtxs);
  rbuffer    = iwspacemalloc(ctrl, nvtxs);
  sbuffer    = iwspacemalloc(ctrl, nvtxs);
  map        = iwspacemalloc(ctrl, nvtxs);
  rmap       = iwspacemalloc(ctrl, nvtxs);
  diff_where = iwspacemalloc(ctrl, nvtxs);
  ehome      = iwspacemalloc(ctrl, nvtxs);

  wsize = gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1));
  workspace = (real_t *)gk_malloc(wsize, "Mc_Diffusion: workspace");

  graph->ckrinfo = (ckrinfo_t *)gk_malloc(nvtxs*sizeof(ckrinfo_t),
                       "Mc_Diffusion: rinfo");

  /* construct subdomain connectivity graph */
  matrix.nrows = nparts;
  SetUpConnectGraph(graph, &matrix, (idx_t *)workspace);
  nlinks = (matrix.nnzs-nparts) / 2;

  visited  = iwspacemalloc(ctrl, matrix.nnzs);
  gvisited = iwspacemalloc(ctrl, matrix.nnzs);

  for (pass=0; pass<npasses; pass++) {
    rset(matrix.nnzs*ncon, 0.0, transfer);
    iset(matrix.nnzs, 0, gvisited);
    iset(matrix.nnzs, 0, visited);
    iter = nvisited = 0;

    /* compute ncon flow solutions */
    for (h=0; h<ncon; h++) {
      rset(nparts, 0.0, solution);
      ComputeLoad(graph, nparts, load, ctrl->tpwgts, h);
      lbvec[h] = (rmax(nparts, load)+1.0/nparts) * (real_t)nparts;
      ConjGrad2(&matrix, load, solution, 0.001, workspace);
      ComputeTransferVector(ncon, &matrix, solution, transfer, h);
    }

    oldlbavg = ravg(ncon, lbvec);
    tnswaps = 0;
    maxdiff = 0.0;
    for (i=0; i<nparts; i++) {
      for (j=rowptr[i]; j<rowptr[i+1]; j++) {
        maxflow = rmax(ncon, transfer+j*ncon);
        minflow = rmin(ncon, transfer+j*ncon);
        maxdiff = (maxflow - minflow > maxdiff) ? maxflow - minflow : maxdiff;
      }
    }

    while (nvisited < nlinks) {
      /* compute independent sets of subdomains */
      iset(gk_max(nparts, npes*2), UNMATCHED, proc2sub);
      CSR_Match_SHEM(&matrix, match, proc2sub, gvisited, ncon);

      /* set up the packing arrays */
      iset(nparts, UNMATCHED, sub2proc);
      for (i=0; i<npes*2; i++) {
        if (proc2sub[i] == UNMATCHED)
          break;
        sub2proc[proc2sub[i]] = i/2;
      }

      iset(npes, 0, rcount);
      for (i=0; i<nvtxs; i++) {
        domain = where[i];
        processor = sub2proc[domain];
        if (processor != UNMATCHED)
          rcount[processor]++;
      }

      rdispl[0] = 0;
      for (i=1; i<npes+1; i++)
        rdispl[i] = rdispl[i-1] + rcount[i-1];

      iset(nvtxs, UNMATCHED, unpack);
      for (i=0; i<nvtxs; i++) {
        domain = where[i];
        processor = sub2proc[domain];
        if (processor != UNMATCHED)
          unpack[rdispl[processor]++] = i;
      }

      SHIFTCSR(i, npes, rdispl);

      iset(nvtxs, UNMATCHED, pack);
      for (i=0; i<rdispl[npes]; i++) {
        ASSERT(unpack[i] != UNMATCHED);
        domain = where[unpack[i]];
        processor = sub2proc[domain];
        if (processor != UNMATCHED)
          pack[unpack[i]] = i;
      }

      /* Compute the flows */
      if (proc2sub[mype*2] != UNMATCHED) {
        me  = proc2sub[2*mype];
        you = proc2sub[2*mype+1];
        ASSERT(me != you);

        for (j=rowptr[me]; j<rowptr[me+1]; j++) {
          if (colind[j] == you) {
            visited[j] = 1;
            rcopy(ncon, transfer+j*ncon, diff_flows);
            break;
          }
        }

        for (j=rowptr[you]; j<rowptr[you+1]; j++) {
          if (colind[j] == me) {
            visited[j] = 1;
            for (h=0; h<ncon; h++) {
              if (transfer[j*ncon+h] > 0.0)
                diff_flows[h] = -1.0 * transfer[j*ncon+h];
            }
            break;
          }
        }

        nswaps = 1;
        rcopy(ncon, diff_flows, sr_flows);

        iset(nvtxs, 0, sbuffer);
        for (i=0; i<nvtxs; i++) {
          if (where[i] == me || where[i] == you)
            sbuffer[i] = 1;
        }

        egraph = ExtractGraph(ctrl, graph, sbuffer, map, rmap);

        if (egraph != NULL) {
          icopy(egraph->nvtxs, egraph->where, diff_where);
          for (j=0; j<egraph->nvtxs; j++)
            ehome[j] = home[map[j]];

          RedoMyLink(ctrl, egraph, ehome, me, you, sr_flows, &sr_cost,
              &sr_lbavg);

          if (ncon <= 4) {
            sr_where = egraph->where;
            egraph->where = diff_where;

            nswaps = BalanceMyLink(ctrl, egraph, ehome, me, you, diff_flows,
                         maxdiff, &diff_cost, &diff_lbavg, 1.0/(real_t)nvtxs);

            if ((sr_lbavg < diff_lbavg &&
                 (diff_lbavg >= ubavg-1.0 || sr_cost == diff_cost)) ||
                (sr_lbavg < ubavg-1.0 && sr_cost < diff_cost)) {
              for (i=0; i<egraph->nvtxs; i++)
                where[map[i]] = sr_where[i];
            }
            else {
              for (i=0; i<egraph->nvtxs; i++)
                where[map[i]] = diff_where[i];
            }
          }
          else {
            for (i=0; i<egraph->nvtxs; i++)
              where[map[i]] = egraph->where[i];
          }

          gk_free((void **)&egraph->xadj, &egraph->nvwgt, &egraph->adjncy,
              &egraph, LTERM);
        }

        /* Pack the flow data */
        iset(nvtxs, UNMATCHED, sbuffer);
        for (i=0; i<nvtxs; i++) {
          domain = where[i];
          if (domain == you || domain == me)
            sbuffer[pack[i]] = where[i];
        }
      }

      /* Broadcast the flow data */
      gkMPI_Allgatherv((void *)&sbuffer[rdispl[mype]], rcount[mype], IDX_T,
          (void *)rbuffer, rcount, rdispl, IDX_T, ctrl->comm);

      /* Unpack the flow data */
      for (i=0; i<rdispl[npes]; i++) {
        if (rbuffer[i] != UNMATCHED)
          where[unpack[i]] = rbuffer[i];
      }

      /* Do other stuff */
      gkMPI_Allreduce((void *)visited, (void *)gvisited, matrix.nnzs, IDX_T,
          MPI_MAX, ctrl->comm);
      nvisited = isum(matrix.nnzs, gvisited, 1)/2;
      tnswaps += GlobalSESum(ctrl, nswaps);

      if (iter++ == NGD_PASSES)
        break;
    }

    /* perform serial refinement */
    Mc_ComputeSerialPartitionParams(ctrl, graph, nparts);
    Mc_SerialKWayAdaptRefine(ctrl, graph, nparts, home, ctrl->ubvec, 10);

    /* check for early breakout */
    for (h=0; h<ncon; h++) {
      lbvec[h] = (real_t)(nparts) *
          npwgts[rargmax_strd(nparts,npwgts+h,ncon)*ncon+h];
    }
    lbavg = ravg(ncon, lbvec);

    done = 0;
    if (tnswaps == 0 || lbavg >= oldlbavg || lbavg <= ubavg + 0.035)
      done = 1;

    alldone = GlobalSEMax(ctrl, done);
    if (alldone == 1)
      break;
  }

  /* ensure that all subdomains have at least one vertex */
/*
  iset(nparts, 0, match);
  for (i=0; i<nvtxs; i++)
    match[where[i]]++;

  done = 0;
  while (done == 0) {
    done = 1;

    me = iargmin(nparts, match);
    if (match[me] == 0) {
      if (ctrl->mype == PE)
        printf("WARNING: empty subdomain %"PRIDX" in Mc_Diffusion\n", me);
      you = iargmax(nparts, match);
      for (i=0; i<nvtxs; i++) {
        if (where[i] == you) {
          where[i] = me;
          match[you]--;
          match[me]++;
          done = 0;
          break;
        }
      }
    }
  }
*/

  /* now free memory and return */
  gk_free((void **)&workspace, (void **)&graph->ckrinfo, LTERM);
  graph->gnpwgts = NULL;
  graph->ckrinfo = NULL;

  WCOREPOP;

  return 0;
}
/*************************************************************************
* This function is the driver for the multi-constraint partitioning
* algorithm
**************************************************************************/
void Moc_Global_Partition(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace)
{
  int i, ncon, nparts;
  floattype ftmp, ubavg, lbavg, lbvec[MAXNCON];

  ncon = graph->ncon;
  nparts = ctrl->nparts;
  ubavg = savg(graph->ncon, ctrl->ubvec);

  SetUp(ctrl, graph, wspace);

  if (ctrl->dbglvl&DBG_PROGRESS) {
    rprintf(ctrl, "[%6d %8d %5d %5d] [%d] [", graph->gnvtxs,
        GlobalSESum(ctrl, graph->nedges), GlobalSEMin(ctrl, graph->nvtxs),
        GlobalSEMax(ctrl, graph->nvtxs), ctrl->CoarsenTo);
    for (i=0; i<ncon; i++)
      rprintf(ctrl, " %.3f", GlobalSEMinFloat(ctrl,
          graph->nvwgt[samin_strd(graph->nvtxs, graph->nvwgt+i, ncon)*ncon+i]));
    rprintf(ctrl, "] [");
    for (i=0; i<ncon; i++)
      rprintf(ctrl, " %.3f", GlobalSEMaxFloat(ctrl,
          graph->nvwgt[samax_strd(graph->nvtxs, graph->nvwgt+i, ncon)*ncon+i]));
    rprintf(ctrl, "]\n");
  }

  if (graph->gnvtxs < 1.3*ctrl->CoarsenTo ||
      (graph->finer != NULL &&
       graph->gnvtxs > graph->finer->gnvtxs*COARSEN_FRACTION)) {
    /* Done with coarsening. Find a partition */
    graph->where = idxmalloc(graph->nvtxs+graph->nrecv, "graph->where");
    Moc_InitPartition_RB(ctrl, graph, wspace);

    if (ctrl->dbglvl&DBG_PROGRESS) {
      Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      rprintf(ctrl, "nvtxs: %10d, balance: ", graph->gnvtxs);
      for (i=0; i<graph->ncon; i++)
        rprintf(ctrl, "%.3f ", lbvec[i]);
      rprintf(ctrl, "\n");
    }

    /* In case no coarsening took place */
    if (graph->finer == NULL) {
      Moc_ComputePartitionParams(ctrl, graph, wspace);
      Moc_KWayFM(ctrl, graph, wspace, NGR_PASSES);
    }
  }
  else {
    Moc_GlobalMatch_Balance(ctrl, graph, wspace);

    Moc_Global_Partition(ctrl, graph->coarser, wspace);

    Moc_ProjectPartition(ctrl, graph, wspace);
    Moc_ComputePartitionParams(ctrl, graph, wspace);

    if (graph->ncon > 1 && graph->level < 3) {
      for (i=0; i<ncon; i++) {
        ftmp = ssum_strd(nparts, graph->gnpwgts+i, ncon);
        if (ftmp != 0.0)
          lbvec[i] = (floattype)(nparts) *
              graph->gnpwgts[samax_strd(nparts, graph->gnpwgts+i, ncon)*ncon+i]/ftmp;
        else
          lbvec[i] = 1.0;
      }
      lbavg = savg(graph->ncon, lbvec);

      if (lbavg > ubavg + 0.035) {
        if (ctrl->dbglvl&DBG_PROGRESS) {
          Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
          rprintf(ctrl, "nvtxs: %10d, cut: %8d, balance: ",
              graph->gnvtxs, graph->mincut);
          for (i=0; i<graph->ncon; i++)
            rprintf(ctrl, "%.3f ", lbvec[i]);
          rprintf(ctrl, "\n");
        }

        Moc_KWayBalance(ctrl, graph, wspace, graph->ncon);
      }
    }

    Moc_KWayFM(ctrl, graph, wspace, NGR_PASSES);

    if (ctrl->dbglvl&DBG_PROGRESS) {
      Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec);
      rprintf(ctrl, "nvtxs: %10d, cut: %8d, balance: ",
          graph->gnvtxs, graph->mincut);
      for (i=0; i<graph->ncon; i++)
        rprintf(ctrl, "%.3f ", lbvec[i]);
      rprintf(ctrl, "\n");
    }

    if (graph->level != 0)
      GKfree((void **)&graph->lnpwgts, (void **)&graph->gnpwgts, LTERM);
  }

  return;
}
/*********************************************************************************** * This function is the entry point of the parallel multilevel local diffusion * algorithm. It uses parallel undirected diffusion followed by adaptive k-way * refinement. This function utilizes local coarsening. ************************************************************************************/ void ParMETIS_V3_RefineKway(idxtype *vtxdist, idxtype *xadj, idxtype *adjncy, idxtype *vwgt, idxtype *adjwgt, int *wgtflag, int *numflag, int *ncon, int *nparts, float *tpwgts, float *ubvec, int *options, int *edgecut, idxtype *part, MPI_Comm *comm) { int h, i; int npes, mype; CtrlType ctrl; WorkSpaceType wspace; GraphType *graph; int tewgt, tvsize, nmoved, maxin, maxout; float gtewgt, gtvsize, avg, maximb; int ps_relation, seed, dbglvl = 0; int iwgtflag, inumflag, incon, inparts, ioptions[10]; float *itpwgts, iubvec[MAXNCON]; MPI_Comm_size(*comm, &npes); MPI_Comm_rank(*comm, &mype); /********************************/ /* Try and take care bad inputs */ /********************************/ if (options != NULL && options[0] == 1) dbglvl = options[PMV3_OPTION_DBGLVL]; CheckInputs(REFINE_PARTITION, npes, dbglvl, wgtflag, &iwgtflag, numflag, &inumflag, ncon, &incon, nparts, &inparts, tpwgts, &itpwgts, ubvec, iubvec, NULL, NULL, options, ioptions, part, comm); /* ADD: take care of disconnected graph */ /* ADD: take care of highly unbalanced vtxdist */ /*********************************/ /* Take care the nparts = 1 case */ /*********************************/ if (inparts <= 1) { idxset(vtxdist[mype+1]-vtxdist[mype], 0, part); *edgecut = 0; return; } /**************************/ /* Set up data structures */ /**************************/ if (inumflag == 1) ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1); /*****************************/ /* Set up control structures */ /*****************************/ if (ioptions[0] == 1) { dbglvl = ioptions[PMV3_OPTION_DBGLVL]; seed = ioptions[PMV3_OPTION_SEED]; ps_relation = (npes == inparts) ? ioptions[PMV3_OPTION_PSR] : DISCOUPLED; } else { dbglvl = GLOBAL_DBGLVL; seed = GLOBAL_SEED; ps_relation = (npes == inparts) ? COUPLED : DISCOUPLED; } SetUpCtrl(&ctrl, inparts, dbglvl, *comm); ctrl.CoarsenTo = amin(vtxdist[npes]+1, 50*incon*amax(npes, inparts)); ctrl.ipc_factor = 1000.0; ctrl.redist_factor = 1.0; ctrl.redist_base = 1.0; ctrl.seed = (seed == 0) ? 
mype : seed*mype; ctrl.sync = GlobalSEMax(&ctrl, seed); ctrl.partType = REFINE_PARTITION; ctrl.ps_relation = ps_relation; ctrl.tpwgts = itpwgts; graph = Moc_SetUpGraph(&ctrl, incon, vtxdist, xadj, vwgt, adjncy, adjwgt, &iwgtflag); graph->vsize = idxsmalloc(graph->nvtxs, 1, "vsize"); graph->home = idxmalloc(graph->nvtxs, "home"); if (ctrl.ps_relation == COUPLED) idxset(graph->nvtxs, mype, graph->home); else idxcopy(graph->nvtxs, part, graph->home); tewgt = idxsum(graph->nedges, graph->adjwgt); tvsize = idxsum(graph->nvtxs, graph->vsize); gtewgt = (float) GlobalSESum(&ctrl, tewgt) + 1.0/graph->gnvtxs; gtvsize = (float) GlobalSESum(&ctrl, tvsize) + 1.0/graph->gnvtxs; ctrl.edge_size_ratio = gtewgt/gtvsize; scopy(incon, iubvec, ctrl.ubvec); PreAllocateMemory(&ctrl, graph, &wspace); /***********************/ /* Partition and Remap */ /***********************/ IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl)); IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm)); IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr)); Adaptive_Partition(&ctrl, graph, &wspace); ParallelReMapGraph(&ctrl, graph, &wspace); IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm)); IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr)); idxcopy(graph->nvtxs, graph->where, part); if (edgecut != NULL) *edgecut = graph->mincut; /***********************/ /* Take care of output */ /***********************/ IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl)); IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm)); if (ctrl.dbglvl&DBG_INFO) { Mc_ComputeMoveStatistics(&ctrl, graph, &nmoved, &maxin, &maxout); rprintf(&ctrl, "Final %3d-way Cut: %6d \tBalance: ", inparts, graph->mincut); avg = 0.0; for (h=0; h<incon; h++) { maximb = 0.0; for (i=0; i<inparts; i++) maximb = amax(maximb, graph->gnpwgts[i*incon+h]/itpwgts[i*incon+h]); avg += maximb; rprintf(&ctrl, "%.3f ", maximb); } rprintf(&ctrl, "\nNMoved: %d %d %d %d\n", nmoved, maxin, maxout, maxin+maxout); } /*************************************/ /* Free memory, renumber, and return */ /*************************************/ GKfree((void **)&graph->lnpwgts, (void **)&graph->gnpwgts, (void **)&graph->nvwgt, (void **)(&graph->home), (void **)(&graph->vsize), LTERM); GKfree((void **)&itpwgts, LTERM); FreeInitialGraphAndRemap(graph, iwgtflag); FreeWSpace(&wspace); FreeCtrl(&ctrl); if (inumflag == 1) ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0); return; }
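/*************************************************************************
* Editor's illustrative caller (not part of ParMETIS): a minimal sketch of
* how an application might drive ParMETIS_V3_RefineKway, assuming idxtype
* is int, two processes, and a 4-cycle 0-1-2-3-0 with two vertices per
* process. No vertex or edge weights are supplied (wgtflag = 0), and
* options[0] = 0 selects the default seed/dbglvl path handled above. The
* sketch is fenced with #if 0 so it does not add a second main() to this
* translation unit; extract it into its own program to try it.
**************************************************************************/
#if 0
#include <mpi.h>
#include <parmetis.h>

int main(int argc, char *argv[])
{
  int mype, edgecut;
  idxtype vtxdist[] = {0, 2, 4};      /* two vertices on each process */
  idxtype xadj[]    = {0, 2, 4};
  idxtype adjncy0[] = {3, 1, 0, 2};   /* neighbors of vertices 0 and 1 */
  idxtype adjncy1[] = {1, 3, 2, 0};   /* neighbors of vertices 2 and 3 */
  idxtype part[2];
  int wgtflag = 0, numflag = 0, ncon = 1, nparts = 2;
  float tpwgts[] = {0.5, 0.5}, ubvec[] = {1.05};
  int options[4] = {0};               /* take the library defaults */
  MPI_Comm comm;

  MPI_Init(&argc, &argv);
  MPI_Comm_dup(MPI_COMM_WORLD, &comm);
  MPI_Comm_rank(comm, &mype);

  part[0] = part[1] = mype;           /* current partition to be refined */

  ParMETIS_V3_RefineKway(vtxdist, xadj, (mype == 0 ? adjncy0 : adjncy1),
      NULL, NULL, &wgtflag, &numflag, &ncon, &nparts, tpwgts, ubvec,
      options, &edgecut, part, &comm);

  MPI_Comm_free(&comm);
  MPI_Finalize();
  return 0;
}
#endif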
/************************************************************************* * This function computes the assignment using as objective the * minimization of the total volume of data that needs to be moved **************************************************************************/ void ParallelTotalVReMap(CtrlType *ctrl, idxtype *lpwgts, idxtype *map, WorkSpaceType *wspace, int npasses, int ncon) { int i, ii, j, k, nparts, mype; int pass, maxipwgt, nmapped, oldwgt, newwgt, done; idxtype *rowmap, *mylpwgts; KeyValueType *recv, send; int nsaved, gnsaved; mype = ctrl->mype; nparts = ctrl->nparts; recv = (KeyValueType *)GKmalloc(sizeof(KeyValueType)*nparts, "remap: recv"); mylpwgts = idxmalloc(nparts, "mylpwgts"); done = nmapped = 0; idxset(nparts, -1, map); rowmap = idxset(nparts, -1, wspace->pv3); idxcopy(nparts, lpwgts, mylpwgts); for (pass=0; pass<npasses; pass++) { maxipwgt = idxamax(nparts, mylpwgts); if (mylpwgts[maxipwgt] > 0 && !done) { send.key = -mylpwgts[maxipwgt]; send.val = mype*nparts+maxipwgt; } else { send.key = 0; send.val = -1; } /* each processor sends its selection */ MPI_Allgather((void *)&send, 2, IDX_DATATYPE, (void *)recv, 2, IDX_DATATYPE, ctrl->comm); ikeysort(nparts, recv); if (recv[0].key == 0) break; /* now make as many assignments as possible */ for (ii=0; ii<nparts; ii++) { i = recv[ii].val; if (i == -1) continue; j = i % nparts; k = i / nparts; if (map[j] == -1 && rowmap[k] == -1 && SimilarTpwgts(ctrl->tpwgts, ncon, j, k)) { map[j] = k; rowmap[k] = j; nmapped++; mylpwgts[j] = 0; if (mype == k) done = 1; } if (nmapped == nparts) break; } if (nmapped == nparts) break; } /* Map unmapped partitions */ if (nmapped < nparts) { for (i=j=0; j<nparts && nmapped<nparts; j++) { if (map[j] == -1) { for (; i<nparts; i++) { if (rowmap[i] == -1 && SimilarTpwgts(ctrl->tpwgts, ncon, i, j)) { map[j] = i; rowmap[i] = j; nmapped++; break; } } } } } /* check to see if remapping fails (due to dissimilar tpwgts) */ /* if remapping fails, revert to original mapping */ if (nmapped < nparts) { for (i=0; i<nparts; i++) map[i] = i; IFSET(ctrl->dbglvl, DBG_REMAP, rprintf(ctrl, "Savings from parallel remapping: %d\n", 0)); } else { /* check for a savings */ oldwgt = lpwgts[mype]; newwgt = lpwgts[rowmap[mype]]; nsaved = newwgt - oldwgt; gnsaved = GlobalSESum(ctrl, nsaved); /* undo everything if we don't see a savings */ if (gnsaved <= 0) { for (i=0; i<nparts; i++) map[i] = i; } IFSET(ctrl->dbglvl, DBG_REMAP, rprintf(ctrl, "Savings from parallel remapping: %d\n", amax(0,gnsaved))); } GKfree((void **)&recv, (void **)&mylpwgts, LTERM); }
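/*************************************************************************
* Editor's illustrative sketch (not part of ParMETIS): ParallelTotalVReMap
* packs each processor's preferred (processor, partition) pair into one
* integer so a single MPI_Allgather can collect all selections, and it
* negates the weight stored in .key so the ascending ikeysort visits the
* heaviest requests first. A hypothetical encode/decode pair restating
* the convention used above:
**************************************************************************/
static int EncodeSelection(int mype, int nparts, int part)
{
  return mype*nparts + part;    /* matches send.val = mype*nparts+maxipwgt */
}

static void DecodeSelection(int val, int nparts, int *part, int *pe)
{
  *part = val % nparts;         /* the 'j' recovered in the assignment loop */
  *pe   = val / nparts;         /* the 'k' recovered in the assignment loop */
}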
/************************************************************************* * This function performs k-way refinement **************************************************************************/ void Moc_KWayFM(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace, int npasses) { int h, i, ii, iii, j, k, c; int pass, nvtxs, nedges, ncon; int nmoves, nmoved, nswaps, nzgswaps; /* int gnswaps, gnzgswaps; */ int me, firstvtx, lastvtx, yourlastvtx; int from, to = -1, oldto, oldcut, mydomain, yourdomain, imbalanced, overweight; int npes = ctrl->npes, mype = ctrl->mype, nparts = ctrl->nparts; int nlupd, nsupd, nnbrs, nchanged; idxtype *xadj, *ladjncy, *adjwgt, *vtxdist; idxtype *where, *tmp_where, *moved; floattype *lnpwgts, *gnpwgts, *ognpwgts, *pgnpwgts, *movewgts, *overfill; idxtype *update, *supdate, *rupdate, *pe_updates; idxtype *changed, *perm, *pperm, *htable; idxtype *peind, *recvptr, *sendptr; KeyValueType *swchanges, *rwchanges; RInfoType *rinfo, *myrinfo, *tmp_myrinfo, *tmp_rinfo; EdgeType *tmp_edegrees, *my_edegrees, *your_edegrees; floattype lbvec[MAXNCON], *nvwgt, *badmaxpwgt, *ubvec, *tpwgts, lbavg, ubavg; int *nupds_pe; IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->KWayTmr)); /*************************/ /* set up common aliases */ /*************************/ nvtxs = graph->nvtxs; nedges = graph->nedges; ncon = graph->ncon; vtxdist = graph->vtxdist; xadj = graph->xadj; ladjncy = graph->adjncy; adjwgt = graph->adjwgt; firstvtx = vtxdist[mype]; lastvtx = vtxdist[mype+1]; where = graph->where; rinfo = graph->rinfo; lnpwgts = graph->lnpwgts; gnpwgts = graph->gnpwgts; ubvec = ctrl->ubvec; tpwgts = ctrl->tpwgts; nnbrs = graph->nnbrs; peind = graph->peind; recvptr = graph->recvptr; sendptr = graph->sendptr; changed = idxmalloc(nvtxs, "KWR: changed"); rwchanges = wspace->pairs; swchanges = rwchanges + recvptr[nnbrs]; /************************************/ /* set up important data structures */ /************************************/ perm = idxmalloc(nvtxs, "KWR: perm"); pperm = idxmalloc(nparts, "KWR: pperm"); update = idxmalloc(nvtxs, "KWR: update"); supdate = wspace->indices; rupdate = supdate + recvptr[nnbrs]; nupds_pe = imalloc(npes, "KWR: nupds_pe"); htable = idxsmalloc(nvtxs+graph->nrecv, 0, "KWR: lhtable"); badmaxpwgt = fmalloc(nparts*ncon, "badmaxpwgt"); for (i=0; i<nparts; i++) { for (h=0; h<ncon; h++) { badmaxpwgt[i*ncon+h] = ubvec[h]*tpwgts[i*ncon+h]; } } movewgts = fmalloc(nparts*ncon, "KWR: movewgts"); ognpwgts = fmalloc(nparts*ncon, "KWR: ognpwgts"); pgnpwgts = fmalloc(nparts*ncon, "KWR: pgnpwgts"); overfill = fmalloc(nparts*ncon, "KWR: overfill"); moved = idxmalloc(nvtxs, "KWR: moved"); tmp_where = idxmalloc(nvtxs+graph->nrecv, "KWR: tmp_where"); tmp_rinfo = (RInfoType *)GKmalloc(sizeof(RInfoType)*nvtxs, "KWR: tmp_rinfo"); tmp_edegrees = (EdgeType *)GKmalloc(sizeof(EdgeType)*nedges, "KWR: tmp_edegrees"); idxcopy(nvtxs+graph->nrecv, where, tmp_where); for (i=0; i<nvtxs; i++) { tmp_rinfo[i].id = rinfo[i].id; tmp_rinfo[i].ed = rinfo[i].ed; tmp_rinfo[i].ndegrees = rinfo[i].ndegrees; tmp_rinfo[i].degrees = tmp_edegrees+xadj[i]; for (j=0; j<rinfo[i].ndegrees; j++) { tmp_rinfo[i].degrees[j].edge = rinfo[i].degrees[j].edge; tmp_rinfo[i].degrees[j].ewgt = rinfo[i].degrees[j].ewgt; } } nswaps = nzgswaps = 0; /*********************************************************/ /* perform a small number of passes through the vertices */ /*********************************************************/ for (pass=0; pass<npasses; pass++) { if (mype == 0) RandomPermute(nparts, pperm, 1); MPI_Bcast((void 
*)pperm, nparts, IDX_DATATYPE, 0, ctrl->comm); FastRandomPermute(nvtxs, perm, 1); oldcut = graph->mincut; /* check to see if the partitioning is imbalanced */ Moc_ComputeParallelBalance(ctrl, graph, graph->where, lbvec); ubavg = savg(ncon, ubvec); lbavg = savg(ncon, lbvec); imbalanced = (lbavg > ubavg) ? 1 : 0; for (c=0; c<2; c++) { scopy(ncon*nparts, gnpwgts, ognpwgts); sset(ncon*nparts, 0.0, movewgts); nmoved = 0; /**********************************************/ /* PASS ONE -- record stats for desired moves */ /**********************************************/ for (iii=0; iii<nvtxs; iii++) { i = perm[iii]; from = tmp_where[i]; nvwgt = graph->nvwgt+i*ncon; for (h=0; h<ncon; h++) if (fabs(nvwgt[h]-gnpwgts[from*ncon+h]) < SMALLFLOAT) break; if (h < ncon) { continue; } /* check for a potential improvement */ if (tmp_rinfo[i].ed >= tmp_rinfo[i].id) { my_edegrees = tmp_rinfo[i].degrees; for (k=0; k<tmp_rinfo[i].ndegrees; k++) { to = my_edegrees[k].edge; if (ProperSide(c, pperm[from], pperm[to])) { for (h=0; h<ncon; h++) if (gnpwgts[to*ncon+h]+nvwgt[h] > badmaxpwgt[to*ncon+h] && nvwgt[h] > 0.0) break; if (h == ncon) break; } } oldto = to; /* check if a subdomain was found that fits */ if (k < tmp_rinfo[i].ndegrees) { for (j=k+1; j<tmp_rinfo[i].ndegrees; j++) { to = my_edegrees[j].edge; if (ProperSide(c, pperm[from], pperm[to])) { for (h=0; h<ncon; h++) if (gnpwgts[to*ncon+h]+nvwgt[h] > badmaxpwgt[to*ncon+h] && nvwgt[h] > 0.0) break; if (h == ncon) { if (my_edegrees[j].ewgt > my_edegrees[k].ewgt || (my_edegrees[j].ewgt == my_edegrees[k].ewgt && IsHBalanceBetterTT(ncon,gnpwgts+oldto*ncon,gnpwgts+to*ncon,nvwgt,ubvec))){ k = j; oldto = my_edegrees[k].edge; } } } } to = oldto; if (my_edegrees[k].ewgt > tmp_rinfo[i].id || (my_edegrees[k].ewgt == tmp_rinfo[i].id && (imbalanced || graph->level > 3 || iii % 8 == 0) && IsHBalanceBetterFT(ncon,gnpwgts+from*ncon,gnpwgts+to*ncon,nvwgt,ubvec))){ /****************************************/ /* Update tmp arrays of the moved vertex */ /****************************************/ tmp_where[i] = to; moved[nmoved++] = i; for (h=0; h<ncon; h++) { lnpwgts[to*ncon+h] += nvwgt[h]; lnpwgts[from*ncon+h] -= nvwgt[h]; gnpwgts[to*ncon+h] += nvwgt[h]; gnpwgts[from*ncon+h] -= nvwgt[h]; movewgts[to*ncon+h] += nvwgt[h]; movewgts[from*ncon+h] -= nvwgt[h]; } tmp_rinfo[i].ed += tmp_rinfo[i].id-my_edegrees[k].ewgt; SWAP(tmp_rinfo[i].id, my_edegrees[k].ewgt, j); if (my_edegrees[k].ewgt == 0) { tmp_rinfo[i].ndegrees--; my_edegrees[k].edge = my_edegrees[tmp_rinfo[i].ndegrees].edge; my_edegrees[k].ewgt = my_edegrees[tmp_rinfo[i].ndegrees].ewgt; } else { my_edegrees[k].edge = from; } /* Update the degrees of adjacent vertices */ for (j=xadj[i]; j<xadj[i+1]; j++) { /* no need to bother about vertices on different pe's */ if (ladjncy[j] >= nvtxs) continue; me = ladjncy[j]; mydomain = tmp_where[me]; myrinfo = tmp_rinfo+me; your_edegrees = myrinfo->degrees; if (mydomain == from) { INC_DEC(myrinfo->ed, myrinfo->id, adjwgt[j]); } else { if (mydomain == to) { INC_DEC(myrinfo->id, myrinfo->ed, adjwgt[j]); } } /* Remove contribution from the .ed of 'from' */ if (mydomain != from) { for (k=0; k<myrinfo->ndegrees; k++) { if (your_edegrees[k].edge == from) { if (your_edegrees[k].ewgt == adjwgt[j]) { myrinfo->ndegrees--; your_edegrees[k].edge = your_edegrees[myrinfo->ndegrees].edge; your_edegrees[k].ewgt = your_edegrees[myrinfo->ndegrees].ewgt; } else { your_edegrees[k].ewgt -= adjwgt[j]; } break; } } } /* Add contribution to the .ed of 'to' */ if (mydomain != to) { for (k=0; k<myrinfo->ndegrees; k++) { 
if (your_edegrees[k].edge == to) { your_edegrees[k].ewgt += adjwgt[j]; break; } } if (k == myrinfo->ndegrees) { your_edegrees[myrinfo->ndegrees].edge = to; your_edegrees[myrinfo->ndegrees++].ewgt = adjwgt[j]; } } } } } } } /******************************************/ /* Let processors know the subdomain wgts */ /* if all proposed moves commit. */ /******************************************/ MPI_Allreduce((void *)lnpwgts, (void *)pgnpwgts, nparts*ncon, MPI_DOUBLE, MPI_SUM, ctrl->comm); /**************************/ /* compute overfill array */ /**************************/ overweight = 0; for (j=0; j<nparts; j++) { for (h=0; h<ncon; h++) { if (pgnpwgts[j*ncon+h] > ognpwgts[j*ncon+h]) { overfill[j*ncon+h] = (pgnpwgts[j*ncon+h]-badmaxpwgt[j*ncon+h]) / (pgnpwgts[j*ncon+h]-ognpwgts[j*ncon+h]); } else { overfill[j*ncon+h] = 0.0; } overfill[j*ncon+h] = amax(overfill[j*ncon+h], 0.0); overfill[j*ncon+h] *= movewgts[j*ncon+h]; if (overfill[j*ncon+h] > 0.0) overweight = 1; ASSERTP(ctrl, ognpwgts[j*ncon+h] <= badmaxpwgt[j*ncon+h] || pgnpwgts[j*ncon+h] <= ognpwgts[j*ncon+h], (ctrl, "%.4f %.4f %.4f\n", ognpwgts[j*ncon+h], badmaxpwgt[j*ncon+h], pgnpwgts[j*ncon+h])); } } /****************************************************/ /* select moves to undo according to overfill array */ /****************************************************/ if (overweight == 1) { for (iii=0; iii<nmoved; iii++) { i = moved[iii]; oldto = tmp_where[i]; nvwgt = graph->nvwgt+i*ncon; my_edegrees = tmp_rinfo[i].degrees; for (k=0; k<tmp_rinfo[i].ndegrees; k++) if (my_edegrees[k].edge == where[i]) break; for (h=0; h<ncon; h++) if (nvwgt[h] > 0.0 && overfill[oldto*ncon+h] > nvwgt[h]/4.0) break; /**********************************/ /* nullify this move if necessary */ /**********************************/ if (k != tmp_rinfo[i].ndegrees && h != ncon) { moved[iii] = -1; from = oldto; to = where[i]; for (h=0; h<ncon; h++) { overfill[oldto*ncon+h] = amax(overfill[oldto*ncon+h]-nvwgt[h], 0.0); } tmp_where[i] = to; tmp_rinfo[i].ed += tmp_rinfo[i].id-my_edegrees[k].ewgt; SWAP(tmp_rinfo[i].id, my_edegrees[k].ewgt, j); if (my_edegrees[k].ewgt == 0) { tmp_rinfo[i].ndegrees--; my_edegrees[k].edge = my_edegrees[tmp_rinfo[i].ndegrees].edge; my_edegrees[k].ewgt = my_edegrees[tmp_rinfo[i].ndegrees].ewgt; } else { my_edegrees[k].edge = from; } for (h=0; h<ncon; h++) { lnpwgts[to*ncon+h] += nvwgt[h]; lnpwgts[from*ncon+h] -= nvwgt[h]; } /* Update the degrees of adjacent vertices */ for (j=xadj[i]; j<xadj[i+1]; j++) { /* no need to bother about vertices on different pe's */ if (ladjncy[j] >= nvtxs) continue; me = ladjncy[j]; mydomain = tmp_where[me]; myrinfo = tmp_rinfo+me; your_edegrees = myrinfo->degrees; if (mydomain == from) { INC_DEC(myrinfo->ed, myrinfo->id, adjwgt[j]); } else { if (mydomain == to) { INC_DEC(myrinfo->id, myrinfo->ed, adjwgt[j]); } } /* Remove contribution from the .ed of 'from' */ if (mydomain != from) { for (k=0; k<myrinfo->ndegrees; k++) { if (your_edegrees[k].edge == from) { if (your_edegrees[k].ewgt == adjwgt[j]) { myrinfo->ndegrees--; your_edegrees[k].edge = your_edegrees[myrinfo->ndegrees].edge; your_edegrees[k].ewgt = your_edegrees[myrinfo->ndegrees].ewgt; } else { your_edegrees[k].ewgt -= adjwgt[j]; } break; } } } /* Add contribution to the .ed of 'to' */ if (mydomain != to) { for (k=0; k<myrinfo->ndegrees; k++) { if (your_edegrees[k].edge == to) { your_edegrees[k].ewgt += adjwgt[j]; break; } } if (k == myrinfo->ndegrees) { your_edegrees[myrinfo->ndegrees].edge = to; your_edegrees[myrinfo->ndegrees++].ewgt = adjwgt[j]; } } } } } } 
/*************************************************/ /* PASS TWO -- commit the remainder of the moves */ /*************************************************/ nlupd = nsupd = nmoves = nchanged = 0; for (iii=0; iii<nmoved; iii++) { i = moved[iii]; if (i == -1) continue; where[i] = tmp_where[i]; /* Make sure to update the vertex information */ if (htable[i] == 0) { /* make sure you do the update */ htable[i] = 1; update[nlupd++] = i; } /* Put the vertices adjacent to i into the update array */ for (j=xadj[i]; j<xadj[i+1]; j++) { k = ladjncy[j]; if (htable[k] == 0) { htable[k] = 1; if (k<nvtxs) update[nlupd++] = k; else supdate[nsupd++] = k; } } nmoves++; nswaps++; /* check number of zero-gain moves */ for (k=0; k<rinfo[i].ndegrees; k++) if (rinfo[i].degrees[k].edge == to) break; if (rinfo[i].id == rinfo[i].degrees[k].ewgt) nzgswaps++; if (graph->pexadj[i+1]-graph->pexadj[i] > 0) changed[nchanged++] = i; } /* Tell interested pe's the new where[] info for the interface vertices */ CommChangedInterfaceData(ctrl, graph, nchanged, changed, where, swchanges, rwchanges, wspace->pv4); IFSET(ctrl->dbglvl, DBG_RMOVEINFO, rprintf(ctrl, "\t[%d %d], [%.4f], [%d %d %d]\n", pass, c, badmaxpwgt[0], GlobalSESum(ctrl, nmoves), GlobalSESum(ctrl, nsupd), GlobalSESum(ctrl, nlupd))); /*------------------------------------------------------------- / Time to communicate with processors to send the vertices / whose degrees need to be updated. /-------------------------------------------------------------*/ /* Issue the receives first */ for (i=0; i<nnbrs; i++) { MPI_Irecv((void *)(rupdate+sendptr[i]), sendptr[i+1]-sendptr[i], IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->rreq+i); } /* Issue the sends next. This needs some preprocessing */ for (i=0; i<nsupd; i++) { htable[supdate[i]] = 0; supdate[i] = graph->imap[supdate[i]]; } iidxsort(nsupd, supdate); for (j=i=0; i<nnbrs; i++) { yourlastvtx = vtxdist[peind[i]+1]; for (k=j; k<nsupd && supdate[k] < yourlastvtx; k++); MPI_Isend((void *)(supdate+j), k-j, IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->sreq+i); j = k; } /* OK, now get into the loop waiting for the send/recv operations to finish */ MPI_Waitall(nnbrs, ctrl->rreq, ctrl->statuses); for (i=0; i<nnbrs; i++) MPI_Get_count(ctrl->statuses+i, IDX_DATATYPE, nupds_pe+i); MPI_Waitall(nnbrs, ctrl->sreq, ctrl->statuses); /*------------------------------------------------------------- / Place the received to-be-updated vertices into update[] /-------------------------------------------------------------*/ for (i=0; i<nnbrs; i++) { pe_updates = rupdate+sendptr[i]; for (j=0; j<nupds_pe[i]; j++) { k = pe_updates[j]; if (htable[k-firstvtx] == 0) { htable[k-firstvtx] = 1; update[nlupd++] = k-firstvtx; } } } /*------------------------------------------------------------- / Update the rinfo of the vertices in the update[] array /-------------------------------------------------------------*/ for (ii=0; ii<nlupd; ii++) { i = update[ii]; ASSERT(ctrl, htable[i] == 1); htable[i] = 0; mydomain = where[i]; myrinfo = rinfo+i; tmp_myrinfo = tmp_rinfo+i; my_edegrees = myrinfo->degrees; your_edegrees = tmp_myrinfo->degrees; graph->lmincut -= myrinfo->ed; myrinfo->ndegrees = 0; myrinfo->id = 0; myrinfo->ed = 0; for (j=xadj[i]; j<xadj[i+1]; j++) { yourdomain = where[ladjncy[j]]; if (mydomain != yourdomain) { myrinfo->ed += adjwgt[j]; for (k=0; k<myrinfo->ndegrees; k++) { if (my_edegrees[k].edge == yourdomain) { my_edegrees[k].ewgt += adjwgt[j]; your_edegrees[k].ewgt += adjwgt[j]; break; } } if (k == myrinfo->ndegrees) { my_edegrees[k].edge =
yourdomain; my_edegrees[k].ewgt = adjwgt[j]; your_edegrees[k].edge = yourdomain; your_edegrees[k].ewgt = adjwgt[j]; myrinfo->ndegrees++; } ASSERT(ctrl, myrinfo->ndegrees <= xadj[i+1]-xadj[i]); ASSERT(ctrl, tmp_myrinfo->ndegrees <= xadj[i+1]-xadj[i]); } else { myrinfo->id += adjwgt[j]; } } graph->lmincut += myrinfo->ed; tmp_myrinfo->id = myrinfo->id; tmp_myrinfo->ed = myrinfo->ed; tmp_myrinfo->ndegrees = myrinfo->ndegrees; } /* finally, sum up the partition weights */ MPI_Allreduce((void *)lnpwgts, (void *)gnpwgts, nparts*ncon, MPI_DOUBLE, MPI_SUM, ctrl->comm); } graph->mincut = GlobalSESum(ctrl, graph->lmincut)/2; if (graph->mincut == oldcut) break; } /* gnswaps = GlobalSESum(ctrl, nswaps); gnzgswaps = GlobalSESum(ctrl, nzgswaps); if (mype == 0) printf("niters: %d, nswaps: %d, nzgswaps: %d\n", pass+1, gnswaps, gnzgswaps); */ GKfree((void **)&badmaxpwgt, (void **)&update, (void **)&nupds_pe, (void **)&htable, LTERM); GKfree((void **)&changed, (void **)&pperm, (void **)&perm, (void **)&moved, LTERM); GKfree((void **)&pgnpwgts, (void **)&ognpwgts, (void **)&overfill, (void **)&movewgts, LTERM); GKfree((void **)&tmp_where, (void **)&tmp_rinfo, (void **)&tmp_edegrees, LTERM); IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->KWayTmr)); }
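/*************************************************************************
* Editor's illustrative sketch (not part of ParMETIS): the overfill logic
* in Moc_KWayFM above. Moves are first proposed against local copies of
* the global partition weights, an MPI_Allreduce then computes the weights
* that would result if every proposed move committed, and the fraction of
* each partition's net inflow lying above its allowed maximum decides how
* much weight to undo. A standalone restatement for one (partition,
* constraint) entry, using plain doubles and a hypothetical name:
**************************************************************************/
static double OverfillFraction(double old_wgt,   /* ognpwgts[j*ncon+h]   */
                               double proposed,  /* pgnpwgts[j*ncon+h]   */
                               double max_wgt)   /* badmaxpwgt[j*ncon+h] */
{
  double frac;

  if (proposed <= old_wgt)
    return 0.0;   /* the partition did not grow; nothing to undo */

  /* share of the net inflow that exceeds the allowed maximum; clamped at
     zero and, in the code above, scaled by movewgts to become a weight */
  frac = (proposed - max_wgt)/(proposed - old_wgt);
  return (frac > 0.0) ? frac : 0.0;
}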
/************************************************************************* * This function finds a matching **************************************************************************/ void Moc_GlobalMatch_Balance(CtrlType *ctrl, GraphType *graph, WorkSpaceType *wspace) { int h, i, ii, j, k; int nnbrs, nvtxs, ncon, cnvtxs, firstvtx, lastvtx, maxi, maxidx, nkept; int otherlastvtx, nrequests, nchanged, pass, nmatched, wside; idxtype *xadj, *ladjncy, *adjwgt, *vtxdist, *home, *myhome, *shome, *rhome; idxtype *match, *rmatch, *smatch; idxtype *peind, *sendptr, *recvptr; idxtype *perm, *iperm, *nperm, *changed; floattype *nvwgt, maxnvwgt; int *nreqs_pe; KeyValueType *match_requests, *match_granted, *pe_requests; maxnvwgt = 1.0/((floattype)(ctrl->nparts)*MAXNVWGT_FACTOR); graph->match_type = MATCH_GLOBAL; IFSET(ctrl->dbglvl, DBG_TIME, MPI_Barrier(ctrl->comm)); IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->MatchTmr)); nvtxs = graph->nvtxs; ncon = graph->ncon; xadj = graph->xadj; ladjncy = graph->adjncy; adjwgt = graph->adjwgt; home = graph->home; nvwgt = graph->nvwgt; vtxdist = graph->vtxdist; firstvtx = vtxdist[ctrl->mype]; lastvtx = vtxdist[ctrl->mype+1]; match = graph->match = idxsmalloc(nvtxs+graph->nrecv, UNMATCHED, "HEM_Match: match"); myhome = idxsmalloc(nvtxs+graph->nrecv, UNMATCHED, "HEM_Match: myhome"); /*------------------------------------------------------------ / Send/Receive the home information of interface vertices /------------------------------------------------------------*/ if (ctrl->partType == ADAPTIVE_PARTITION || ctrl->partType == REFINE_PARTITION) { idxcopy(nvtxs, home, myhome); shome = wspace->indices; rhome = myhome + nvtxs; CommInterfaceData(ctrl, graph, myhome, shome, rhome); } nnbrs = graph->nnbrs; peind = graph->peind; sendptr = graph->sendptr; recvptr = graph->recvptr; /* Use wspace->indices as the tmp space for matching info of the boundary * vertices that are sent and received */ rmatch = match + nvtxs; smatch = wspace->indices; changed = smatch+graph->nsend; /* Use wspace->indices as the tmp space for match requests of the boundary * vertices that are sent and received */ match_requests = wspace->pairs; match_granted = match_requests + graph->nsend; nreqs_pe = ismalloc(nnbrs, 0, "Match_HEM: nreqs_pe"); nkept = graph->gnvtxs/ctrl->npes - nvtxs; perm = (idxtype *)wspace->degrees; iperm = perm + nvtxs; FastRandomPermute(nvtxs, perm, 1); for (i=0; i<nvtxs; i++) iperm[perm[i]] = i; nperm = iperm + nvtxs; for (i=0; i<nnbrs; i++) nperm[i] = i; /************************************************************* * Go now and find a matching by doing multiple iterations *************************************************************/ /* First nullify the heavy vertices */ for (nchanged=i=0; i<nvtxs; i++) { for (h=0; h<ncon; h++) if (nvwgt[i*ncon+h] > maxnvwgt) { break; } if (h != ncon) { match[i] = TOO_HEAVY; nchanged++; } } if (GlobalSESum(ctrl, nchanged) > 0) { IFSET(ctrl->dbglvl, DBG_PROGRESS, rprintf(ctrl, "We found %d heavy vertices!\n", GlobalSESum(ctrl, nchanged))); CommInterfaceData(ctrl, graph, match, smatch, rmatch); } for (nmatched=pass=0; pass<NMATCH_PASSES; pass++) { wside = (graph->level+pass)%2; nchanged = nrequests = 0; for (ii=nmatched; ii<nvtxs; ii++) { i = perm[ii]; if (match[i] == UNMATCHED) { /* Unmatched */ maxidx = i; maxi = -1; /* Find a heavy-edge matching */ for (j=xadj[i]; j<xadj[i+1]; j++) { k = ladjncy[j]; if (match[k] == UNMATCHED && myhome[k] == myhome[i] && (maxi == -1 || adjwgt[maxi] < adjwgt[j] || (maxidx < nvtxs && k < nvtxs && adjwgt[maxi] == 
adjwgt[j] && BetterVBalance(ncon,nvwgt+i*ncon,nvwgt+maxidx*ncon,nvwgt+k*ncon) >= 0))) { maxi = j; maxidx = k; } } if (maxi != -1) { k = ladjncy[maxi]; if (k < nvtxs) { /* Take care of the local vertices first */ /* Here we give preference to the local matching by granting it right away */ if (i <= k) { match[i] = firstvtx+k + KEEP_BIT; match[k] = firstvtx+i; } else { match[i] = firstvtx+k; match[k] = firstvtx+i + KEEP_BIT; } changed[nchanged++] = i; changed[nchanged++] = k; } else { /* Take care of any remote boundary vertices */ match[k] = MAYBE_MATCHED; /* Alternate among which vertices will issue the requests */ if ((wside == 0 && firstvtx+i < graph->imap[k]) || (wside == 1 && firstvtx+i > graph->imap[k])) { match[i] = MAYBE_MATCHED; match_requests[nrequests].key = graph->imap[k]; match_requests[nrequests].val = firstvtx+i; nrequests++; } } } } } #ifdef DEBUG_MATCH PrintVector2(ctrl, nvtxs, firstvtx, match, "Match1"); myprintf(ctrl, "[c: %2d] Nlocal: %d, Nrequests: %d\n", c, nlocal, nrequests); #endif /*********************************************************** * Exchange the match_requests, requests for me are stored in * match_granted ************************************************************/ /* Issue the receives first. Note that from each PE we can receive at most as many entries as the interface nodes we would send to it in the case of a mat-vec */ for (i=0; i<nnbrs; i++) { MPI_Irecv((void *)(match_granted+recvptr[i]), 2*(recvptr[i+1]-recvptr[i]), IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->rreq+i); } /* Issue the sends next. This needs some work */ ikeysort(nrequests, match_requests); for (j=i=0; i<nnbrs; i++) { otherlastvtx = vtxdist[peind[i]+1]; for (k=j; k<nrequests && match_requests[k].key < otherlastvtx; k++); MPI_Isend((void *)(match_requests+j), 2*(k-j), IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->sreq+i); j = k; } /* OK, now get into the loop waiting for the operations to finish */ MPI_Waitall(nnbrs, ctrl->rreq, ctrl->statuses); for (i=0; i<nnbrs; i++) { MPI_Get_count(ctrl->statuses+i, IDX_DATATYPE, nreqs_pe+i); nreqs_pe[i] = nreqs_pe[i]/2; /* Adjust for pairs of IDX_DATATYPE */ } MPI_Waitall(nnbrs, ctrl->sreq, ctrl->statuses); /*********************************************************** * Now, go and service the requests that you received in * match_granted ************************************************************/ RandomPermute(nnbrs, nperm, 0); for (ii=0; ii<nnbrs; ii++) { i = nperm[ii]; pe_requests = match_granted+recvptr[i]; for (j=0; j<nreqs_pe[i]; j++) { k = pe_requests[j].key; ASSERTP(ctrl, k >= firstvtx && k < lastvtx, (ctrl, "%d %d %d %d %d\n", firstvtx, lastvtx, k, j, peind[i])); /* myprintf(ctrl, "Requesting a match %d %d\n", pe_requests[j].key, pe_requests[j].val); */ if (match[k-firstvtx] == UNMATCHED) { /* Bingo, let's grant this request */ changed[nchanged++] = k-firstvtx; if (nkept >= 0) { /* Flip a coin for who gets it */ match[k-firstvtx] = pe_requests[j].val + KEEP_BIT; nkept--; } else { match[k-firstvtx] = pe_requests[j].val; pe_requests[j].key += KEEP_BIT; nkept++; } /* myprintf(ctrl, "Request from pe:%d (%d %d) granted!\n", peind[i], pe_requests[j].val, pe_requests[j].key); */ } else { /* We are not granting the request */ /* myprintf(ctrl, "Request from pe:%d (%d %d) not granted!\n", peind[i], pe_requests[j].val, pe_requests[j].key); */ pe_requests[j].key = UNMATCHED; } } } /*********************************************************** * Exchange the match_granted information.
It is stored in * match_requests ************************************************************/ /* Issue the receives first. Note that from each PE we can receive at most as many entries as the interface nodes we would send to it in the case of a mat-vec */ for (i=0; i<nnbrs; i++) { MPI_Irecv((void *)(match_requests+sendptr[i]), 2*(sendptr[i+1]-sendptr[i]), IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->rreq+i); } /* Issue the sends next. */ for (i=0; i<nnbrs; i++) { MPI_Isend((void *)(match_granted+recvptr[i]), 2*nreqs_pe[i], IDX_DATATYPE, peind[i], 1, ctrl->comm, ctrl->sreq+i); } /* OK, now get into the loop waiting for the operations to finish */ MPI_Waitall(nnbrs, ctrl->rreq, ctrl->statuses); for (i=0; i<nnbrs; i++) { MPI_Get_count(ctrl->statuses+i, IDX_DATATYPE, nreqs_pe+i); nreqs_pe[i] = nreqs_pe[i]/2; /* Adjust for pairs of IDX_DATATYPE */ } MPI_Waitall(nnbrs, ctrl->sreq, ctrl->statuses); /*********************************************************** * Now go through the match_requests and update local * match information for the matchings that were granted. ************************************************************/ for (i=0; i<nnbrs; i++) { pe_requests = match_requests+sendptr[i]; for (j=0; j<nreqs_pe[i]; j++) { match[pe_requests[j].val-firstvtx] = pe_requests[j].key; if (pe_requests[j].key != UNMATCHED) changed[nchanged++] = pe_requests[j].val-firstvtx; } } for (i=0; i<nchanged; i++) { ii = iperm[changed[i]]; perm[ii] = perm[nmatched]; iperm[perm[nmatched]] = ii; nmatched++; } CommChangedInterfaceData(ctrl, graph, nchanged, changed, match, match_requests, match_granted, wspace->pv4); } /* Traverse the vertices and match the ones that were left unmatched with themselves */ cnvtxs = 0; for (i=0; i<nvtxs; i++) { if (match[i] == UNMATCHED || match[i] == TOO_HEAVY) { match[i] = (firstvtx+i) + KEEP_BIT; cnvtxs++; } else if (match[i] >= KEEP_BIT) { /* A matched vertex which I get to keep */ cnvtxs++; } } if (ctrl->dbglvl&DBG_MATCHINFO) { PrintVector2(ctrl, nvtxs, firstvtx, match, "Match"); myprintf(ctrl, "Cnvtxs: %d\n", cnvtxs); rprintf(ctrl, "Done with matching...\n"); } GKfree((void **)(&myhome), (void **)(&nreqs_pe), LTERM); IFSET(ctrl->dbglvl, DBG_TIME, MPI_Barrier(ctrl->comm)); IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->MatchTmr)); IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->ContractTmr)); Moc_Global_CreateCoarseGraph(ctrl, graph, wspace, cnvtxs); IFSET(ctrl->dbglvl, DBG_TIME, MPI_Barrier(ctrl->comm)); IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->ContractTmr)); }
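/*************************************************************************
* Editor's illustrative sketch (not part of ParMETIS): the match[] encoding
* used above. Each entry holds the global index of the mate, with KEEP_BIT
* added when the local processor keeps the resulting coarse vertex; the
* unmatched-with-self case above sets both. The helper names below are
* hypothetical restatements of that convention.
**************************************************************************/
static int KeepsCoarseVertex(idxtype match_entry)
{
  /* the same test the cnvtxs count above performs */
  return match_entry >= KEEP_BIT;
}

static idxtype MateGlobalIndex(idxtype match_entry)
{
  /* strip the ownership flag to recover the mate's global number */
  return (match_entry >= KEEP_BIT) ? match_entry - KEEP_BIT : match_entry;
}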