/*************************************************************************
* This function performs a k-way directed diffusion.
*
* For up to gk_min(nparts/2, NGD_PASSES) passes it rebuilds the subdomain
* connectivity matrix, solves for the inter-subdomain flows (ConjGrad2 +
* ComputeTransferVector) and then moves boundary vertices to neighbouring
* subdomains as long as the remaining flow towards that subdomain exceeds
* flowFactor times the vertex weight.  graph->where is updated in place.
*
* ctrl   - control structure; nparts, ubvec[0], tpwgts, mype, npes,
*          ipc_factor and redist_factor are read here.
* graph  - the graph; nvtxs, nedges, xadj, nvwgt, adjncy and adjwgt are
*          read, where[] is modified and mincut is set on the normal exit.
* home   - per-vertex subdomain; a vertex with where[i] != home[i] is
*          treated as "dirty".
*
* Returns ipc_factor*mincut + redist_factor*totalv on the normal path, or
* (real_t)ctrl->mype if some row of the connectivity matrix holds exactly
* one entry (the "disconnected subdomain" early exit).
**************************************************************************/
real_t WavefrontDiffusion(ctrl_t *ctrl, graph_t *graph, idx_t *home)
{
  idx_t ii, i, j, k, l, nvtxs, nedges, nparts;
  idx_t from, to, edge, done, nswaps, noswaps, totalv;
  idx_t npasses, first, second, third, mind, maxd;
  idx_t *xadj, *adjncy, *adjwgt, *where, *perm;
  idx_t *rowptr, *colind, *ed, *psize;
  real_t *transfer, *tmpvec;
  real_t balance = -1.0, *load, *solution, *workspace;
  real_t *nvwgt, *npwgts, flowFactor, cost, ubfactor;
  matrix_t matrix;
  ikv_t *cand;
  idx_t ndirty, nclean, dptr, clean;

  nvtxs        = graph->nvtxs;
  nedges       = graph->nedges;
  xadj         = graph->xadj;
  nvwgt        = graph->nvwgt;
  adjncy       = graph->adjncy;
  adjwgt       = graph->adjwgt;
  where        = graph->where;
  nparts       = ctrl->nparts;
  ubfactor     = ctrl->ubvec[0];
  matrix.nrows = nparts;

  /* The move threshold depends on the PE rank: 0.35 by default and
     0.50 / 0.75 / 1.00 on PEs 2 / 3 / 4. */
  flowFactor = 0.35;
  flowFactor = (ctrl->mype == 2) ? 0.50 : flowFactor;
  flowFactor = (ctrl->mype == 3) ? 0.75 : flowFactor;
  flowFactor = (ctrl->mype == 4) ? 1.00 : flowFactor;

  /* allocate memory */
  /* One real_t slab: solution[nparts] | tmpvec[nparts] | npwgts[nparts] |
     load[nparts] | matrix.values[nedges] | transfer[nedges] */
  solution                   = rmalloc(4*nparts+2*nedges, "WavefrontDiffusion: solution");
  tmpvec                     = solution + nparts;
  npwgts                     = solution + 2*nparts;
  load                       = solution + 3*nparts;
  matrix.values              = solution + 4*nparts;
  transfer = matrix.transfer = solution + 4*nparts + nedges;

  /* One idx_t slab: perm[nvtxs] | ed[nvtxs] | psize[nparts] |
     rowptr[nparts+1] | colind[nedges] */
  perm                   = imalloc(2*nvtxs+2*nparts+nedges+1, "WavefrontDiffusion: perm");
  ed                     = perm + nvtxs;
  psize                  = perm + 2*nvtxs;
  rowptr = matrix.rowptr = perm + 2*nvtxs + nparts;
  colind = matrix.colind = perm + 2*nvtxs + 2*nparts + 1;

  /* The workspace is shared between real_t and idx_t users, hence the max of
     the two byte counts.  The count is handed to gk_malloc directly instead of
     being parked in an idx_t local first, which could truncate it if idx_t is
     narrower than size_t. */
  workspace = (real_t *)gk_malloc(
                  gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1)),
                  "WavefrontDiffusion: workspace");
  cand      = ikvmalloc(nvtxs, "WavefrontDiffusion: cand");


  /*****************************/
  /* Populate empty subdomains */
  /*****************************/
  iset(nparts, 0, psize);
  for (i=0; i<nvtxs; i++)
    psize[where[i]]++;

  mind = iargmin(nparts, psize);
  maxd = iargmax(nparts, psize);
  /* NOTE(review): only the single smallest subdomain is repaired here; any
     additional empty subdomains are left as they are. */
  if (psize[mind] == 0) {
    for (i=0; i<nvtxs; i++) {
      /* probe from a random offset and take the first vertex of the
         largest subdomain that is hit */
      k = (RandomInRange(nvtxs)+i)%nvtxs;
      if (where[k] == maxd) {
        where[k] = mind;
        psize[mind]++;
        psize[maxd]--;
        break;
      }
    }
  }

  /* ed[i]: total weight of the edges of i that cross subdomains;
     npwgts[p]: summed vertex weight of subdomain p */
  iset(nvtxs, 0, ed);
  rset(nparts, 0.0, npwgts);
  for (i=0; i<nvtxs; i++) {
    npwgts[where[i]] += nvwgt[i];
    for (j=xadj[i]; j<xadj[i+1]; j++)
      ed[i] += (where[i] != where[adjncy[j]] ? adjwgt[j] : 0);
  }

  ComputeLoad(graph, nparts, load, ctrl->tpwgts, 0);

  done = 0;

  /* zero out the tmpvec array */
  rset(nparts, 0.0, tmpvec);

  npasses = gk_min(nparts/2, NGD_PASSES);
  for (l=0; l<npasses; l++) {
    /* Set-up and solve the diffusion equation */
    nswaps = 0;

    /************************/
    /* Solve flow equations */
    /************************/
    SetUpConnectGraph(graph, &matrix, (idx_t *)workspace);

    /* check for disconnected subdomains */
    /* a row with exactly one entry has no off-diagonal neighbour
       (assuming the first entry of each row is the diagonal, as the
       rowptr[from]+1 loops below suggest) */
    for (i=0; i<matrix.nrows; i++) {
      if (matrix.rowptr[i]+1 == matrix.rowptr[i+1]) {
        cost = (real_t)(ctrl->mype);
        goto CleanUpAndExit;
      }
    }

    ConjGrad2(&matrix, load, solution, 0.001, workspace);
    ComputeTransferVector(1, &matrix, solution, transfer, 0);

    GetThreeMax(nparts, load, &first, &second, &third);

    if (l%3 == 0) {
      FastRandomPermute(nvtxs, perm, 1);
    }
    else {
      /*****************************/
      /* move dirty vertices first */
      /*****************************/
      ndirty = 0;
      for (i=0; i<nvtxs; i++) {
        if (where[i] != home[i])
          ndirty++;
      }

      /* dirty vertices fill perm[0..ndirty), clean ones follow */
      dptr = 0;
      for (i=0; i<nvtxs; i++) {
        if (where[i] != home[i])
          perm[dptr++] = i;
        else
          perm[ndirty++] = i;
      }

      PASSERT(ctrl, ndirty == nvtxs);
      ndirty = dptr;
      nclean = nvtxs-dptr;
      FastRandomPermute(ndirty, perm, 0);
      FastRandomPermute(nclean, perm+ndirty, 0);
    }

    /* PE 0 alone reorders the candidates by external degree: boundary
       vertices go to the front keyed by -ed[i] and are sorted, vertices
       with ed[i] == 0 are stacked from the back of cand[]. */
    if (ctrl->mype == 0) {
      for (j=nvtxs, k=0, ii=0; ii<nvtxs; ii++) {
        i = perm[ii];
        if (ed[i] != 0) {
          cand[k].key   = -ed[i];
          cand[k++].val = i;
        }
        else {
          cand[--j].key = 0;
          cand[j].val   = i;
        }
      }
      ikvsorti(k, cand);
    }

    /* only the first third of the ordering is considered in each pass */
    for (ii=0; ii<nvtxs/3; ii++) {
      i    = (ctrl->mype == 0) ? cand[ii].val : perm[ii];
      from = where[i];

      /* don't move out the last vertex in a subdomain */
      if (psize[from] == 1)
        continue;

      clean = (from == home[i]) ? 1 : 0;

      /* only move from top three or dirty vertices */
      if (from != first && from != second && from != third && clean)
        continue;

      /* Scatter the sparse transfer row into the dense tmpvec row */
      for (j=rowptr[from]+1; j<rowptr[from+1]; j++)
        tmpvec[colind[j]] = transfer[j];

      for (j=xadj[i]; j<xadj[i+1]; j++) {
        to = where[adjncy[j]];
        if (from != to) {
          if (tmpvec[to] > (flowFactor * nvwgt[i])) {
            tmpvec[to] -= nvwgt[i];
            INC_DEC(psize[to], psize[from], 1);
            INC_DEC(npwgts[to], npwgts[from], nvwgt[i]);
            INC_DEC(load[to], load[from], nvwgt[i]);
            where[i] = to;
            nswaps++;

            /* Update external degrees */
            ed[i] = 0;
            for (k=xadj[i]; k<xadj[i+1]; k++) {
              edge = adjncy[k];
              ed[i] += (to != where[edge] ? adjwgt[k] : 0);

              /* neighbours left behind gain a cut edge, neighbours in
                 the destination lose one */
              if (where[edge] == from)
                ed[edge] += adjwgt[k];
              if (where[edge] == to)
                ed[edge] -= adjwgt[k];
            }
            /* a vertex is moved at most once per visit */
            break;
          }
        }
      }

      /* Gather the dense tmpvec row into the sparse transfer row */
      for (j=rowptr[from]+1; j<rowptr[from+1]; j++) {
        transfer[j]       = tmpvec[colind[j]];
        tmpvec[colind[j]] = 0.0;
      }
      ASSERT(fabs(rsum(nparts, tmpvec, 1)) < .0001)
    }

    /* termination is only tested on every second pass */
    if (l % 2 == 1) {
      balance = rmax(nparts, npwgts)*nparts;
      if (balance < ubfactor + 0.035)
        done = 1;

      /* stop as soon as any PE reports a balanced partition */
      if (GlobalSESum(ctrl, done) > 0)
        break;

      /* stop when more than half of the PEs made no move in this pass */
      noswaps = (nswaps > 0) ? 0 : 1;
      if (GlobalSESum(ctrl, noswaps) > ctrl->npes/2)
        break;
    }
  }

  graph->mincut = ComputeSerialEdgeCut(graph);
  totalv        = Mc_ComputeSerialTotalV(graph, home);
  cost          = ctrl->ipc_factor * (real_t)graph->mincut +
                  ctrl->redist_factor * (real_t)totalv;

CleanUpAndExit:
  gk_free((void **)&solution, (void **)&perm, (void **)&workspace, (void **)&cand, LTERM);

  return cost;
}
/*************************************************************************
* This function is the entry point of the initial partitioning algorithm.
* The algorithm assembles the graph on all the processors and proceeds
* serially.
*
* In each pass the flows between subdomains are solved for every
* constraint, subdomain pairs are matched (CSR_Match_SHEM) and handed to
* the PEs, each matched PE rebalances its pair on an extracted subgraph,
* and the resulting where[] entries are exchanged with an Allgatherv.  A
* serial k-way refinement closes the pass.
*
* ctrl     - control structure (nparts, mype, npes, ubvec, tpwgts, comm).
* graph    - the graph; gnpwgts and ckrinfo are set for the duration of
*            the call and reset to NULL before returning.
* vtxdist  - not referenced in this function.
* where    - partition vector, updated in place.
* home     - per-vertex subdomain passed on to the link-balancing and
*            refinement routines.
* npasses  - maximum number of outer passes.
*
* Always returns 0; graphs with more than 3 constraints are left untouched.
**************************************************************************/
idx_t Mc_Diffusion(ctrl_t *ctrl, graph_t *graph, idx_t *vtxdist, idx_t *where,
          idx_t *home, idx_t npasses)
{
  idx_t h, i, j;
  idx_t nvtxs, nedges, ncon, pass, iter, domain, processor;
  idx_t nparts, mype, npes, nlinks, me, you;
  idx_t nvisited, nswaps = 0, tnswaps, done, alldone = -1;
  idx_t *rowptr, *colind, *diff_where, *sr_where, *ehome, *map, *rmap;
  idx_t *pack, *unpack, *match, *proc2sub, *sub2proc;
  idx_t *visited, *gvisited;
  real_t *transfer, *npwgts, maxdiff, minflow, maxflow;
  real_t lbavg, oldlbavg, ubavg, *lbvec;
  real_t *diff_flows, *sr_flows;
  real_t diff_lbavg, sr_lbavg, diff_cost, sr_cost;
  idx_t *rbuffer, *sbuffer;
  idx_t *rcount, *rdispl;
  real_t *solution, *load, *workspace;
  matrix_t matrix;
  graph_t *egraph;

  if (graph->ncon > 3)
    return 0;

  WCOREPUSH;

  nvtxs  = graph->nvtxs;
  nedges = graph->nedges;
  ncon   = graph->ncon;

  nparts = ctrl->nparts;
  mype   = ctrl->mype;
  npes   = ctrl->npes;
  ubavg  = ravg(ncon, ctrl->ubvec);

  /* initialize variables and allocate memory */
  lbvec                      = rwspacemalloc(ctrl, ncon);
  diff_flows                 = rwspacemalloc(ctrl, ncon);
  sr_flows                   = rwspacemalloc(ctrl, ncon);

  load                       = rwspacemalloc(ctrl, nparts);
  solution                   = rwspacemalloc(ctrl, nparts);
  npwgts = graph->gnpwgts    = rwspacemalloc(ctrl, ncon*nparts);
  matrix.values              = rwspacemalloc(ctrl, nedges);
  transfer = matrix.transfer = rwspacemalloc(ctrl, ncon*nedges);

  proc2sub               = iwspacemalloc(ctrl, gk_max(nparts, npes*2));
  sub2proc               = iwspacemalloc(ctrl, nparts);
  match                  = iwspacemalloc(ctrl, nparts);
  rowptr = matrix.rowptr = iwspacemalloc(ctrl, nparts+1);
  colind = matrix.colind = iwspacemalloc(ctrl, nedges);

  rcount = iwspacemalloc(ctrl, npes);
  rdispl = iwspacemalloc(ctrl, npes+1);

  pack       = iwspacemalloc(ctrl, nvtxs);
  unpack     = iwspacemalloc(ctrl, nvtxs);
  rbuffer    = iwspacemalloc(ctrl, nvtxs);
  sbuffer    = iwspacemalloc(ctrl, nvtxs);
  map        = iwspacemalloc(ctrl, nvtxs);
  rmap       = iwspacemalloc(ctrl, nvtxs);
  diff_where = iwspacemalloc(ctrl, nvtxs);
  ehome      = iwspacemalloc(ctrl, nvtxs);

  /* The workspace is shared between real_t and idx_t users, hence the max of
     the two byte counts.  The count is handed to gk_malloc directly instead of
     being parked in an idx_t local first, which could truncate it if idx_t is
     narrower than size_t. */
  workspace = (real_t *)gk_malloc(
                  gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1)),
                  "Mc_Diffusion: workspace");

  graph->ckrinfo = (ckrinfo_t *)gk_malloc(nvtxs*sizeof(ckrinfo_t), "Mc_Diffusion: rinfo");


  /* construct subdomain connectivity graph */
  matrix.nrows = nparts;
  SetUpConnectGraph(graph, &matrix, (idx_t *)workspace);
  /* nnzs minus one entry per row, halved: each link is stored in both
     directions */
  nlinks = (matrix.nnzs-nparts) / 2;

  visited  = iwspacemalloc(ctrl, matrix.nnzs);
  gvisited = iwspacemalloc(ctrl, matrix.nnzs);

  for (pass=0; pass<npasses; pass++) {
    rset(matrix.nnzs*ncon, 0.0, transfer);
    iset(matrix.nnzs, 0, gvisited);
    iset(matrix.nnzs, 0, visited);
    iter = nvisited = 0;

    /* compute ncon flow solutions */
    for (h=0; h<ncon; h++) {
      rset(nparts, 0.0, solution);
      ComputeLoad(graph, nparts, load, ctrl->tpwgts, h);

      lbvec[h] = (rmax(nparts, load)+1.0/nparts) * (real_t)nparts;

      ConjGrad2(&matrix, load, solution, 0.001, workspace);
      ComputeTransferVector(ncon, &matrix, solution, transfer, h);
    }

    oldlbavg = ravg(ncon, lbvec);
    tnswaps  = 0;

    /* maxdiff: largest spread between the per-constraint flows of any
       single matrix entry */
    maxdiff = 0.0;
    for (i=0; i<nparts; i++) {
      for (j=rowptr[i]; j<rowptr[i+1]; j++) {
        maxflow = rmax(ncon, transfer+j*ncon);
        minflow = rmin(ncon, transfer+j*ncon);
        maxdiff = (maxflow - minflow > maxdiff) ? maxflow - minflow : maxdiff;
      }
    }

    while (nvisited < nlinks) {
      /* A PE that gets no subdomain pair in this round must contribute zero
         swaps to the global sum below; without this reset it would add the
         -1 initial value or a stale count from an earlier round. */
      nswaps = 0;

      /* compute independent sets of subdomains */
      iset(gk_max(nparts, npes*2), UNMATCHED, proc2sub);
      CSR_Match_SHEM(&matrix, match, proc2sub, gvisited, ncon);

      /* set up the packing arrays */
      /* proc2sub[2p] and proc2sub[2p+1] are the two subdomains given to PE p */
      iset(nparts, UNMATCHED, sub2proc);
      for (i=0; i<npes*2; i++) {
        if (proc2sub[i] == UNMATCHED)
          break;

        sub2proc[proc2sub[i]] = i/2;
      }

      /* rcount[p]: number of vertices in the subdomains owned by PE p */
      iset(npes, 0, rcount);
      for (i=0; i<nvtxs; i++) {
        domain    = where[i];
        processor = sub2proc[domain];
        if (processor != UNMATCHED)
          rcount[processor]++;
      }

      rdispl[0] = 0;
      for (i=1; i<npes+1; i++)
        rdispl[i] = rdispl[i-1] + rcount[i-1];

      /* unpack[slot] = vertex, slots grouped by owning PE */
      iset(nvtxs, UNMATCHED, unpack);
      for (i=0; i<nvtxs; i++) {
        domain    = where[i];
        processor = sub2proc[domain];
        if (processor != UNMATCHED)
          unpack[rdispl[processor]++] = i;
      }

      /* the fill above advanced rdispl[]; shift it back into offsets */
      SHIFTCSR(i, npes, rdispl);

      /* pack[vertex] = slot, the inverse of unpack[] */
      iset(nvtxs, UNMATCHED, pack);
      for (i=0; i<rdispl[npes]; i++) {
        ASSERT(unpack[i] != UNMATCHED);
        domain    = where[unpack[i]];
        processor = sub2proc[domain];
        if (processor != UNMATCHED)
          pack[unpack[i]] = i;
      }

      /* Compute the flows */
      if (proc2sub[mype*2] != UNMATCHED) {
        me  = proc2sub[2*mype];
        you = proc2sub[2*mype+1];
        ASSERT(me != you);

        /* flows stored on the me -> you entry */
        for (j=rowptr[me]; j<rowptr[me+1]; j++) {
          if (colind[j] == you) {
            visited[j] = 1;
            rcopy(ncon, transfer+j*ncon, diff_flows);
            break;
          }
        }

        /* positive flows on the you -> me entry override with flipped sign */
        for (j=rowptr[you]; j<rowptr[you+1]; j++) {
          if (colind[j] == me) {
            visited[j] = 1;
            for (h=0; h<ncon; h++) {
              if (transfer[j*ncon+h] > 0.0)
                diff_flows[h] = -1.0 * transfer[j*ncon+h];
            }
            break;
          }
        }

        nswaps = 1;
        rcopy(ncon, diff_flows, sr_flows);

        /* mark the vertices of the two subdomains for extraction */
        iset(nvtxs, 0, sbuffer);
        for (i=0; i<nvtxs; i++) {
          if (where[i] == me || where[i] == you)
            sbuffer[i] = 1;
        }

        egraph = ExtractGraph(ctrl, graph, sbuffer, map, rmap);

        if (egraph != NULL) {
          icopy(egraph->nvtxs, egraph->where, diff_where);
          for (j=0; j<egraph->nvtxs; j++)
            ehome[j] = home[map[j]];

          RedoMyLink(ctrl, egraph, ehome, me, you, sr_flows, &sr_cost, &sr_lbavg);

          /* always true here: ncon > 3 returned at the top of the function */
          if (ncon <= 4) {
            /* keep the RedoMyLink result in sr_where and let BalanceMyLink
               work on the saved copy of the starting partition */
            sr_where      = egraph->where;
            egraph->where = diff_where;

            nswaps = BalanceMyLink(ctrl, egraph, ehome, me, you, diff_flows, maxdiff,
                         &diff_cost, &diff_lbavg, 1.0/(real_t)nvtxs);

            /* choose between the two candidate partitions */
            if ((sr_lbavg < diff_lbavg &&
                 (diff_lbavg >= ubavg-1.0 || sr_cost == diff_cost)) ||
                (sr_lbavg < ubavg-1.0 && sr_cost < diff_cost)) {
              for (i=0; i<egraph->nvtxs; i++)
                where[map[i]] = sr_where[i];
            }
            else {
              for (i=0; i<egraph->nvtxs; i++)
                where[map[i]] = diff_where[i];
            }
          }
          else {
            for (i=0; i<egraph->nvtxs; i++)
              where[map[i]] = egraph->where[i];
          }

          /* NOTE(review): the original egraph->where is not released on its
             own; presumably ExtractGraph carves it out of one of the arrays
             freed here - confirm against ExtractGraph. */
          gk_free((void **)&egraph->xadj, (void **)&egraph->nvwgt,
                  (void **)&egraph->adjncy, (void **)&egraph, LTERM);
        }

        /* Pack the flow data */
        iset(nvtxs, UNMATCHED, sbuffer);
        for (i=0; i<nvtxs; i++) {
          domain = where[i];
          if (domain == you || domain == me)
            sbuffer[pack[i]] = where[i];
        }
      }

      /* Broadcast the flow data */
      gkMPI_Allgatherv((void *)&sbuffer[rdispl[mype]], rcount[mype], IDX_T,
          (void *)rbuffer, rcount, rdispl, IDX_T, ctrl->comm);

      /* Unpack the flow data */
      for (i=0; i<rdispl[npes]; i++) {
        if (rbuffer[i] != UNMATCHED)
          where[unpack[i]] = rbuffer[i];
      }

      /* Do other stuff */
      /* merge the links handled by every PE; each link marks two entries */
      gkMPI_Allreduce((void *)visited, (void *)gvisited, matrix.nnzs,
          IDX_T, MPI_MAX, ctrl->comm);
      nvisited = isum(matrix.nnzs, gvisited, 1)/2;
      tnswaps += GlobalSESum(ctrl, nswaps);

      if (iter++ == NGD_PASSES)
        break;
    }

    /* perform serial refinement */
    Mc_ComputeSerialPartitionParams(ctrl, graph, nparts);
    Mc_SerialKWayAdaptRefine(ctrl, graph, nparts, home, ctrl->ubvec, 10);

    /* check for early breakout */
    for (h=0; h<ncon; h++) {
      lbvec[h] = (real_t)(nparts) *
                 npwgts[rargmax_strd(nparts,npwgts+h,ncon)*ncon+h];
    }
    lbavg = ravg(ncon, lbvec);

    /* stop when nothing moved, the balance did not improve, or it is
       already within 0.035 of the target */
    done = 0;
    if (tnswaps == 0 || lbavg >= oldlbavg || lbavg <= ubavg + 0.035)
      done = 1;

    alldone = GlobalSEMax(ctrl, done);
    if (alldone == 1)
      break;
  }

  /* ensure that all subdomains have at least one vertex */
/*
  iset(nparts, 0, match);
  for (i=0; i<nvtxs; i++)
    match[where[i]]++;

  done = 0;
  while (done == 0) {
    done = 1;

    me = iargmin(nparts, match);
    if (match[me] == 0) {
      if (ctrl->mype == PE) printf("WARNING: empty subdomain %"PRIDX" in Mc_Diffusion\n", me);
      you = iargmax(nparts, match);
      for (i=0; i<nvtxs; i++) {
        if (where[i] == you) {
          where[i] = me;
          match[you]--;
          match[me]++;
          done = 0;
          break;
        }
      }
    }
  }
*/

  /* now free memory and return */
  gk_free((void **)&workspace, (void **)&graph->ckrinfo, LTERM);
  /* gnpwgts pointed into the workspace obtained above, so only the
     reference is dropped here */
  graph->gnpwgts = NULL;
  graph->ckrinfo = NULL;

  WCOREPOP;

  return 0;
}