/************************************************************************* * This function implements a simple graph adaption strategy. **************************************************************************/ void AdaptGraph2(graph_t *graph, idx_t afactor, MPI_Comm comm) { idx_t i, j, k, nvtxs, firstvtx, lastvtx; idx_t npes, mype, mypwgt, max, min, sum; idx_t *vwgt, *xadj, *adjncy, *adjwgt; gkMPI_Comm_size(comm, &npes); gkMPI_Comm_rank(comm, &mype); srand(mype*afactor); nvtxs = graph->nvtxs; xadj = graph->xadj; adjncy = graph->adjncy; if (graph->adjwgt == NULL) adjwgt = graph->adjwgt = ismalloc(graph->nedges, 1, "AdaptGraph: adjwgt"); else adjwgt = graph->adjwgt; vwgt = graph->vwgt; firstvtx = graph->vtxdist[mype]; lastvtx = graph->vtxdist[mype+1]; /* if (RandomInRange(npes+1) < .05*npes) { */ if (RandomInRange(npes+1) < 2) { printf("[%"PRIDX"] is adapting\n", mype); for (i=0; i<nvtxs; i++) vwgt[i] = afactor*vwgt[i]; } for (i=0; i<nvtxs; i++) { for (j=xadj[i]; j<xadj[i+1]; j++) { k = adjncy[j]; if (k >= firstvtx && k < lastvtx) { adjwgt[j] = (int)pow(1.0*(gk_min(vwgt[i],vwgt[k-firstvtx])), .6667); if (adjwgt[j] == 0) adjwgt[j] = 1; } } } mypwgt = isum(nvtxs, vwgt, 1); gkMPI_Allreduce((void *)&mypwgt, (void *)&max, 1, IDX_T, MPI_MAX, comm); gkMPI_Allreduce((void *)&mypwgt, (void *)&min, 1, IDX_T, MPI_MIN, comm); gkMPI_Allreduce((void *)&mypwgt, (void *)&sum, 1, IDX_T, MPI_SUM, comm); if (mype == 0) printf("Initial Load Imbalance: %5.4"PRREAL", [%5"PRIDX" %5"PRIDX" %5"PRIDX"]\n", (1.0*max*npes)/(1.0*sum), min, max, sum); }
/************************************************************************* * This function is the entry point of the initial partition algorithm * that does recursive bissection. * This algorithm assembles the graph to all the processors and preceeds * by parallelizing the recursive bisection step. **************************************************************************/ void InitPartition(ctrl_t *ctrl, graph_t *graph) { idx_t i, j, ncon, mype, npes, gnvtxs, ngroups; idx_t *xadj, *adjncy, *adjwgt, *vwgt; idx_t *part, *gwhere0, *gwhere1; idx_t *tmpwhere, *tmpvwgt, *tmpxadj, *tmpadjncy, *tmpadjwgt; graph_t *agraph; idx_t lnparts, fpart, fpe, lnpes; idx_t twoparts=2, moptions[METIS_NOPTIONS], edgecut, max_cut; real_t *tpwgts, *tpwgts2, *lbvec, lbsum, min_lbsum, wsum; MPI_Comm ipcomm; struct { double sum; int rank; } lpesum, gpesum; WCOREPUSH; ncon = graph->ncon; ngroups = gk_max(gk_min(RIP_SPLIT_FACTOR, ctrl->npes), 1); IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->comm)); IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->InitPartTmr)); lbvec = rwspacemalloc(ctrl, ncon); /* assemble the graph to all the processors */ agraph = AssembleAdaptiveGraph(ctrl, graph); gnvtxs = agraph->nvtxs; /* make a copy of the graph's structure for later */ xadj = icopy(gnvtxs+1, agraph->xadj, iwspacemalloc(ctrl, gnvtxs+1)); vwgt = icopy(gnvtxs*ncon, agraph->vwgt, iwspacemalloc(ctrl, gnvtxs*ncon)); adjncy = icopy(agraph->nedges, agraph->adjncy, iwspacemalloc(ctrl, agraph->nedges)); adjwgt = icopy(agraph->nedges, agraph->adjwgt, iwspacemalloc(ctrl, agraph->nedges)); part = iwspacemalloc(ctrl, gnvtxs); /* create different processor groups */ gkMPI_Comm_split(ctrl->gcomm, ctrl->mype % ngroups, 0, &ipcomm); gkMPI_Comm_rank(ipcomm, &mype); gkMPI_Comm_size(ipcomm, &npes); /* Go into the recursive bisection */ METIS_SetDefaultOptions(moptions); moptions[METIS_OPTION_SEED] = ctrl->sync + (ctrl->mype % ngroups) + 1; tpwgts = ctrl->tpwgts; tpwgts2 = rwspacemalloc(ctrl, 2*ncon); lnparts = ctrl->nparts; fpart = fpe = 0; lnpes = npes; while (lnpes > 1 && lnparts > 1) { /* determine the weights of the two partitions as a function of the weight of the target partition weights */ for (j=(lnparts>>1), i=0; i<ncon; i++) { tpwgts2[i] = rsum(j, tpwgts+fpart*ncon+i, ncon); tpwgts2[ncon+i] = rsum(lnparts-j, tpwgts+(fpart+j)*ncon+i, ncon); wsum = 1.0/(tpwgts2[i] + tpwgts2[ncon+i]); tpwgts2[i] *= wsum; tpwgts2[ncon+i] *= wsum; } METIS_PartGraphRecursive(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt, &twoparts, tpwgts2, NULL, moptions, &edgecut, part); /* pick one of the branches */ if (mype < fpe+lnpes/2) { KeepPart(ctrl, agraph, part, 0); lnpes = lnpes/2; lnparts = lnparts/2; } else { KeepPart(ctrl, agraph, part, 1); fpart = fpart + lnparts/2; fpe = fpe + lnpes/2; lnpes = lnpes - lnpes/2; lnparts = lnparts - lnparts/2; } } gwhere0 = iset(gnvtxs, 0, iwspacemalloc(ctrl, gnvtxs)); gwhere1 = iwspacemalloc(ctrl, gnvtxs); if (lnparts == 1) { /* Case npes is greater than or equal to nparts */ /* Only the first process will assign labels (for the reduction to work) */ if (mype == fpe) { for (i=0; i<agraph->nvtxs; i++) gwhere0[agraph->label[i]] = fpart; } } else { /* Case in which npes is smaller than nparts */ /* create the normalized tpwgts for the lnparts from ctrl->tpwgts */ tpwgts = rwspacemalloc(ctrl, lnparts*ncon); for (j=0; j<ncon; j++) { for (wsum=0.0, i=0; i<lnparts; i++) { tpwgts[i*ncon+j] = ctrl->tpwgts[(fpart+i)*ncon+j]; wsum += tpwgts[i*ncon+j]; } for (wsum=1.0/wsum, i=0; i<lnparts; i++) tpwgts[i*ncon+j] *= wsum; } METIS_PartGraphKway(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt, &lnparts, tpwgts, NULL, moptions, &edgecut, part); for (i=0; i<agraph->nvtxs; i++) gwhere0[agraph->label[i]] = fpart + part[i]; } gkMPI_Allreduce((void *)gwhere0, (void *)gwhere1, gnvtxs, IDX_T, MPI_SUM, ipcomm); if (ngroups > 1) { tmpxadj = agraph->xadj; tmpadjncy = agraph->adjncy; tmpadjwgt = agraph->adjwgt; tmpvwgt = agraph->vwgt; tmpwhere = agraph->where; agraph->xadj = xadj; agraph->adjncy = adjncy; agraph->adjwgt = adjwgt; agraph->vwgt = vwgt; agraph->where = gwhere1; agraph->vwgt = vwgt; agraph->nvtxs = gnvtxs; edgecut = ComputeSerialEdgeCut(agraph); ComputeSerialBalance(ctrl, agraph, gwhere1, lbvec); lbsum = rsum(ncon, lbvec, 1); gkMPI_Allreduce((void *)&edgecut, (void *)&max_cut, 1, IDX_T, MPI_MAX, ctrl->gcomm); gkMPI_Allreduce((void *)&lbsum, (void *)&min_lbsum, 1, REAL_T, MPI_MIN, ctrl->gcomm); lpesum.sum = lbsum; if (min_lbsum < UNBALANCE_FRACTION*ncon) { if (lbsum < UNBALANCE_FRACTION*ncon) lpesum.sum = edgecut; else lpesum.sum = max_cut; } lpesum.rank = ctrl->mype; gkMPI_Allreduce((void *)&lpesum, (void *)&gpesum, 1, MPI_DOUBLE_INT, MPI_MINLOC, ctrl->gcomm); gkMPI_Bcast((void *)gwhere1, gnvtxs, IDX_T, gpesum.rank, ctrl->gcomm); agraph->xadj = tmpxadj; agraph->adjncy = tmpadjncy; agraph->adjwgt = tmpadjwgt; agraph->vwgt = tmpvwgt; agraph->where = tmpwhere; } icopy(graph->nvtxs, gwhere1+graph->vtxdist[ctrl->mype], graph->where); FreeGraph(agraph); gkMPI_Comm_free(&ipcomm); IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->comm)); IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->InitPartTmr)); WCOREPOP; }
/************************************************************************* * This function is the entry point of the initial partitioning algorithm. * This algorithm assembles the graph to all the processors and preceed * serially. **************************************************************************/ idx_t Mc_Diffusion(ctrl_t *ctrl, graph_t *graph, idx_t *vtxdist, idx_t *where, idx_t *home, idx_t npasses) { idx_t h, i, j; idx_t nvtxs, nedges, ncon, pass, iter, domain, processor; idx_t nparts, mype, npes, nlinks, me, you, wsize; idx_t nvisited, nswaps = -1, tnswaps, done, alldone = -1; idx_t *rowptr, *colind, *diff_where, *sr_where, *ehome, *map, *rmap; idx_t *pack, *unpack, *match, *proc2sub, *sub2proc; idx_t *visited, *gvisited; real_t *transfer, *npwgts, maxdiff, minflow, maxflow; real_t lbavg, oldlbavg, ubavg, *lbvec; real_t *diff_flows, *sr_flows; real_t diff_lbavg, sr_lbavg, diff_cost, sr_cost; idx_t *rbuffer, *sbuffer; idx_t *rcount, *rdispl; real_t *solution, *load, *workspace; matrix_t matrix; graph_t *egraph; if (graph->ncon > 3) return 0; WCOREPUSH; nvtxs = graph->nvtxs; nedges = graph->nedges; ncon = graph->ncon; nparts = ctrl->nparts; mype = ctrl->mype; npes = ctrl->npes; ubavg = ravg(ncon, ctrl->ubvec); /* initialize variables and allocate memory */ lbvec = rwspacemalloc(ctrl, ncon); diff_flows = rwspacemalloc(ctrl, ncon); sr_flows = rwspacemalloc(ctrl, ncon); load = rwspacemalloc(ctrl, nparts); solution = rwspacemalloc(ctrl, nparts); npwgts = graph->gnpwgts = rwspacemalloc(ctrl, ncon*nparts); matrix.values = rwspacemalloc(ctrl, nedges); transfer = matrix.transfer = rwspacemalloc(ctrl, ncon*nedges); proc2sub = iwspacemalloc(ctrl, gk_max(nparts, npes*2)); sub2proc = iwspacemalloc(ctrl, nparts); match = iwspacemalloc(ctrl, nparts); rowptr = matrix.rowptr = iwspacemalloc(ctrl, nparts+1); colind = matrix.colind = iwspacemalloc(ctrl, nedges); rcount = iwspacemalloc(ctrl, npes); rdispl = iwspacemalloc(ctrl, npes+1); pack = iwspacemalloc(ctrl, nvtxs); unpack = iwspacemalloc(ctrl, nvtxs); rbuffer = iwspacemalloc(ctrl, nvtxs); sbuffer = iwspacemalloc(ctrl, nvtxs); map = iwspacemalloc(ctrl, nvtxs); rmap = iwspacemalloc(ctrl, nvtxs); diff_where = iwspacemalloc(ctrl, nvtxs); ehome = iwspacemalloc(ctrl, nvtxs); wsize = gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1)); workspace = (real_t *)gk_malloc(wsize, "Mc_Diffusion: workspace"); graph->ckrinfo = (ckrinfo_t *)gk_malloc(nvtxs*sizeof(ckrinfo_t), "Mc_Diffusion: rinfo"); /* construct subdomain connectivity graph */ matrix.nrows = nparts; SetUpConnectGraph(graph, &matrix, (idx_t *)workspace); nlinks = (matrix.nnzs-nparts) / 2; visited = iwspacemalloc(ctrl, matrix.nnzs); gvisited = iwspacemalloc(ctrl, matrix.nnzs); for (pass=0; pass<npasses; pass++) { rset(matrix.nnzs*ncon, 0.0, transfer); iset(matrix.nnzs, 0, gvisited); iset(matrix.nnzs, 0, visited); iter = nvisited = 0; /* compute ncon flow solutions */ for (h=0; h<ncon; h++) { rset(nparts, 0.0, solution); ComputeLoad(graph, nparts, load, ctrl->tpwgts, h); lbvec[h] = (rmax(nparts, load)+1.0/nparts) * (real_t)nparts; ConjGrad2(&matrix, load, solution, 0.001, workspace); ComputeTransferVector(ncon, &matrix, solution, transfer, h); } oldlbavg = ravg(ncon, lbvec); tnswaps = 0; maxdiff = 0.0; for (i=0; i<nparts; i++) { for (j=rowptr[i]; j<rowptr[i+1]; j++) { maxflow = rmax(ncon, transfer+j*ncon); minflow = rmin(ncon, transfer+j*ncon); maxdiff = (maxflow - minflow > maxdiff) ? maxflow - minflow : maxdiff; } } while (nvisited < nlinks) { /* compute independent sets of subdomains */ iset(gk_max(nparts, npes*2), UNMATCHED, proc2sub); CSR_Match_SHEM(&matrix, match, proc2sub, gvisited, ncon); /* set up the packing arrays */ iset(nparts, UNMATCHED, sub2proc); for (i=0; i<npes*2; i++) { if (proc2sub[i] == UNMATCHED) break; sub2proc[proc2sub[i]] = i/2; } iset(npes, 0, rcount); for (i=0; i<nvtxs; i++) { domain = where[i]; processor = sub2proc[domain]; if (processor != UNMATCHED) rcount[processor]++; } rdispl[0] = 0; for (i=1; i<npes+1; i++) rdispl[i] = rdispl[i-1] + rcount[i-1]; iset(nvtxs, UNMATCHED, unpack); for (i=0; i<nvtxs; i++) { domain = where[i]; processor = sub2proc[domain]; if (processor != UNMATCHED) unpack[rdispl[processor]++] = i; } SHIFTCSR(i, npes, rdispl); iset(nvtxs, UNMATCHED, pack); for (i=0; i<rdispl[npes]; i++) { ASSERT(unpack[i] != UNMATCHED); domain = where[unpack[i]]; processor = sub2proc[domain]; if (processor != UNMATCHED) pack[unpack[i]] = i; } /* Compute the flows */ if (proc2sub[mype*2] != UNMATCHED) { me = proc2sub[2*mype]; you = proc2sub[2*mype+1]; ASSERT(me != you); for (j=rowptr[me]; j<rowptr[me+1]; j++) { if (colind[j] == you) { visited[j] = 1; rcopy(ncon, transfer+j*ncon, diff_flows); break; } } for (j=rowptr[you]; j<rowptr[you+1]; j++) { if (colind[j] == me) { visited[j] = 1; for (h=0; h<ncon; h++) { if (transfer[j*ncon+h] > 0.0) diff_flows[h] = -1.0 * transfer[j*ncon+h]; } break; } } nswaps = 1; rcopy(ncon, diff_flows, sr_flows); iset(nvtxs, 0, sbuffer); for (i=0; i<nvtxs; i++) { if (where[i] == me || where[i] == you) sbuffer[i] = 1; } egraph = ExtractGraph(ctrl, graph, sbuffer, map, rmap); if (egraph != NULL) { icopy(egraph->nvtxs, egraph->where, diff_where); for (j=0; j<egraph->nvtxs; j++) ehome[j] = home[map[j]]; RedoMyLink(ctrl, egraph, ehome, me, you, sr_flows, &sr_cost, &sr_lbavg); if (ncon <= 4) { sr_where = egraph->where; egraph->where = diff_where; nswaps = BalanceMyLink(ctrl, egraph, ehome, me, you, diff_flows, maxdiff, &diff_cost, &diff_lbavg, 1.0/(real_t)nvtxs); if ((sr_lbavg < diff_lbavg && (diff_lbavg >= ubavg-1.0 || sr_cost == diff_cost)) || (sr_lbavg < ubavg-1.0 && sr_cost < diff_cost)) { for (i=0; i<egraph->nvtxs; i++) where[map[i]] = sr_where[i]; } else { for (i=0; i<egraph->nvtxs; i++) where[map[i]] = diff_where[i]; } } else { for (i=0; i<egraph->nvtxs; i++) where[map[i]] = egraph->where[i]; } gk_free((void **)&egraph->xadj, &egraph->nvwgt, &egraph->adjncy, &egraph, LTERM); } /* Pack the flow data */ iset(nvtxs, UNMATCHED, sbuffer); for (i=0; i<nvtxs; i++) { domain = where[i]; if (domain == you || domain == me) sbuffer[pack[i]] = where[i]; } } /* Broadcast the flow data */ gkMPI_Allgatherv((void *)&sbuffer[rdispl[mype]], rcount[mype], IDX_T, (void *)rbuffer, rcount, rdispl, IDX_T, ctrl->comm); /* Unpack the flow data */ for (i=0; i<rdispl[npes]; i++) { if (rbuffer[i] != UNMATCHED) where[unpack[i]] = rbuffer[i]; } /* Do other stuff */ gkMPI_Allreduce((void *)visited, (void *)gvisited, matrix.nnzs, IDX_T, MPI_MAX, ctrl->comm); nvisited = isum(matrix.nnzs, gvisited, 1)/2; tnswaps += GlobalSESum(ctrl, nswaps); if (iter++ == NGD_PASSES) break; } /* perform serial refinement */ Mc_ComputeSerialPartitionParams(ctrl, graph, nparts); Mc_SerialKWayAdaptRefine(ctrl, graph, nparts, home, ctrl->ubvec, 10); /* check for early breakout */ for (h=0; h<ncon; h++) { lbvec[h] = (real_t)(nparts) * npwgts[rargmax_strd(nparts,npwgts+h,ncon)*ncon+h]; } lbavg = ravg(ncon, lbvec); done = 0; if (tnswaps == 0 || lbavg >= oldlbavg || lbavg <= ubavg + 0.035) done = 1; alldone = GlobalSEMax(ctrl, done); if (alldone == 1) break; } /* ensure that all subdomains have at least one vertex */ /* iset(nparts, 0, match); for (i=0; i<nvtxs; i++) match[where[i]]++; done = 0; while (done == 0) { done = 1; me = iargmin(nparts, match); if (match[me] == 0) { if (ctrl->mype == PE) printf("WARNING: empty subdomain %"PRIDX" in Mc_Diffusion\n", me); you = iargmax(nparts, match); for (i=0; i<nvtxs; i++) { if (where[i] == you) { where[i] = me; match[you]--; match[me]++; done = 0; break; } } } } */ /* now free memory and return */ gk_free((void **)&workspace, (void **)&graph->ckrinfo, LTERM); graph->gnpwgts = NULL; graph->ckrinfo = NULL; WCOREPOP; return 0; }
/************************************************************************* * This function is the entry point of the initial balancing algorithm. * This algorithm assembles the graph to all the processors and preceeds * with the balancing step. **************************************************************************/ void Balance_Partition(ctrl_t *ctrl, graph_t *graph) { idx_t i, j, nvtxs, nedges, ncon; idx_t mype, npes, srnpes, srmype; idx_t *vtxdist, *xadj, *adjncy, *adjwgt, *vwgt, *vsize; idx_t *part, *lwhere, *home; idx_t lnparts, fpart, fpe, lnpes, ngroups; idx_t *rcounts, *rdispls; idx_t twoparts=2, moptions[METIS_NOPTIONS], edgecut, max_cut; idx_t sr_pe, gd_pe, sr, gd, who_wins; real_t my_cut, my_totalv, my_cost = -1.0, my_balance = -1.0, wsum; real_t rating, max_rating, your_cost = -1.0, your_balance = -1.0; real_t lbsum, min_lbsum, *lbvec, *tpwgts, *tpwgts2, buffer[2]; graph_t *agraph, cgraph; ctrl_t *myctrl; MPI_Status status; MPI_Comm ipcomm, srcomm; struct { double cost; int rank; } lpecost, gpecost; IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->InitPartTmr)); WCOREPUSH; vtxdist = graph->vtxdist; agraph = AssembleAdaptiveGraph(ctrl, graph); nvtxs = cgraph.nvtxs = agraph->nvtxs; nedges = cgraph.nedges = agraph->nedges; ncon = cgraph.ncon = agraph->ncon; xadj = cgraph.xadj = icopy(nvtxs+1, agraph->xadj, iwspacemalloc(ctrl, nvtxs+1)); vwgt = cgraph.vwgt = icopy(nvtxs*ncon, agraph->vwgt, iwspacemalloc(ctrl, nvtxs*ncon)); vsize = cgraph.vsize = icopy(nvtxs, agraph->vsize, iwspacemalloc(ctrl, nvtxs)); adjncy = cgraph.adjncy = icopy(nedges, agraph->adjncy, iwspacemalloc(ctrl, nedges)); adjwgt = cgraph.adjwgt = icopy(nedges, agraph->adjwgt, iwspacemalloc(ctrl, nedges)); part = cgraph.where = agraph->where = iwspacemalloc(ctrl, nvtxs); lwhere = iwspacemalloc(ctrl, nvtxs); home = iwspacemalloc(ctrl, nvtxs); lbvec = rwspacemalloc(ctrl, graph->ncon); /****************************************/ /****************************************/ if (ctrl->ps_relation == PARMETIS_PSR_UNCOUPLED) { WCOREPUSH; rcounts = iwspacemalloc(ctrl, ctrl->npes); rdispls = iwspacemalloc(ctrl, ctrl->npes+1); for (i=0; i<ctrl->npes; i++) rdispls[i] = rcounts[i] = vtxdist[i+1]-vtxdist[i]; MAKECSR(i, ctrl->npes, rdispls); gkMPI_Allgatherv((void *)graph->home, graph->nvtxs, IDX_T, (void *)part, rcounts, rdispls, IDX_T, ctrl->comm); for (i=0; i<agraph->nvtxs; i++) home[i] = part[i]; WCOREPOP; /* local frees */ } else { for (i=0; i<ctrl->npes; i++) { for (j=vtxdist[i]; j<vtxdist[i+1]; j++) part[j] = home[j] = i; } } /* Ensure that the initial partitioning is legal */ for (i=0; i<agraph->nvtxs; i++) { if (part[i] >= ctrl->nparts) part[i] = home[i] = part[i] % ctrl->nparts; if (part[i] < 0) part[i] = home[i] = (-1*part[i]) % ctrl->nparts; } /****************************************/ /****************************************/ IFSET(ctrl->dbglvl, DBG_REFINEINFO, ComputeSerialBalance(ctrl, agraph, agraph->where, lbvec)); IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "input cut: %"PRIDX", balance: ", ComputeSerialEdgeCut(agraph))); for (i=0; i<agraph->ncon; i++) IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "%.3"PRREAL" ", lbvec[i])); IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "\n")); /****************************************/ /* Split the processors into two groups */ /****************************************/ sr = (ctrl->mype % 2 == 0) ? 1 : 0; gd = (ctrl->mype % 2 == 1) ? 1 : 0; if (graph->ncon > MAX_NCON_FOR_DIFFUSION || ctrl->npes == 1) { sr = 1; gd = 0; } sr_pe = 0; gd_pe = 1; gkMPI_Comm_split(ctrl->gcomm, sr, 0, &ipcomm); gkMPI_Comm_rank(ipcomm, &mype); gkMPI_Comm_size(ipcomm, &npes); if (sr == 1) { /* Half of the processors do scratch-remap */ ngroups = gk_max(gk_min(RIP_SPLIT_FACTOR, npes), 1); gkMPI_Comm_split(ipcomm, mype % ngroups, 0, &srcomm); gkMPI_Comm_rank(srcomm, &srmype); gkMPI_Comm_size(srcomm, &srnpes); METIS_SetDefaultOptions(moptions); moptions[METIS_OPTION_SEED] = ctrl->sync + (mype % ngroups) + 1; tpwgts = ctrl->tpwgts; tpwgts2 = rwspacemalloc(ctrl, 2*ncon); iset(nvtxs, 0, lwhere); lnparts = ctrl->nparts; fpart = fpe = 0; lnpes = srnpes; while (lnpes > 1 && lnparts > 1) { PASSERT(ctrl, agraph->nvtxs > 1); /* determine the weights of the two partitions as a function of the weight of the target partition weights */ for (j=(lnparts>>1), i=0; i<ncon; i++) { tpwgts2[i] = rsum(j, tpwgts+fpart*ncon+i, ncon); tpwgts2[ncon+i] = rsum(lnparts-j, tpwgts+(fpart+j)*ncon+i, ncon); wsum = 1.0/(tpwgts2[i] + tpwgts2[ncon+i]); tpwgts2[i] *= wsum; tpwgts2[ncon+i] *= wsum; } METIS_PartGraphRecursive(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt, &twoparts, tpwgts2, NULL, moptions, &edgecut, part); /* pick one of the branches */ if (srmype < fpe+lnpes/2) { KeepPart(ctrl, agraph, part, 0); lnpes = lnpes/2; lnparts = lnparts/2; } else { KeepPart(ctrl, agraph, part, 1); fpart = fpart + lnparts/2; fpe = fpe + lnpes/2; lnpes = lnpes - lnpes/2; lnparts = lnparts - lnparts/2; } } if (lnparts == 1) { /* Case in which srnpes is greater or equal to nparts */ /* Only the first process will assign labels (for the reduction to work) */ if (srmype == fpe) { for (i=0; i<agraph->nvtxs; i++) lwhere[agraph->label[i]] = fpart; } } else { /* Case in which srnpes is smaller than nparts */ /* create the normalized tpwgts for the lnparts from ctrl->tpwgts */ tpwgts = rwspacemalloc(ctrl, lnparts*ncon); for (j=0; j<ncon; j++) { for (wsum=0.0, i=0; i<lnparts; i++) { tpwgts[i*ncon+j] = ctrl->tpwgts[(fpart+i)*ncon+j]; wsum += tpwgts[i*ncon+j]; } for (wsum=1.0/wsum, i=0; i<lnparts; i++) tpwgts[i*ncon+j] *= wsum; } METIS_PartGraphKway(&agraph->nvtxs, &ncon, agraph->xadj, agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt, &lnparts, tpwgts, NULL, moptions, &edgecut, part); for (i=0; i<agraph->nvtxs; i++) lwhere[agraph->label[i]] = fpart + part[i]; } gkMPI_Allreduce((void *)lwhere, (void *)part, nvtxs, IDX_T, MPI_SUM, srcomm); edgecut = ComputeSerialEdgeCut(&cgraph); ComputeSerialBalance(ctrl, &cgraph, part, lbvec); lbsum = rsum(ncon, lbvec, 1); gkMPI_Allreduce((void *)&edgecut, (void *)&max_cut, 1, IDX_T, MPI_MAX, ipcomm); gkMPI_Allreduce((void *)&lbsum, (void *)&min_lbsum, 1, REAL_T, MPI_MIN, ipcomm); lpecost.rank = ctrl->mype; lpecost.cost = lbsum; if (min_lbsum < UNBALANCE_FRACTION * (real_t)(ncon)) { if (lbsum < UNBALANCE_FRACTION * (real_t)(ncon)) lpecost.cost = (double)edgecut; else lpecost.cost = (double)max_cut + lbsum; } gkMPI_Allreduce((void *)&lpecost, (void *)&gpecost, 1, MPI_DOUBLE_INT, MPI_MINLOC, ipcomm); if (ctrl->mype == gpecost.rank && ctrl->mype != sr_pe) gkMPI_Send((void *)part, nvtxs, IDX_T, sr_pe, 1, ctrl->comm); if (ctrl->mype != gpecost.rank && ctrl->mype == sr_pe) gkMPI_Recv((void *)part, nvtxs, IDX_T, gpecost.rank, 1, ctrl->comm, &status); if (ctrl->mype == sr_pe) { icopy(nvtxs, part, lwhere); SerialRemap(ctrl, &cgraph, ctrl->nparts, home, lwhere, part, ctrl->tpwgts); } gkMPI_Comm_free(&srcomm); }