/************************************************************************* * This function is the entry point of the initial partitioning algorithm. * This algorithm assembles the graph to all the processors and preceed * serially. **************************************************************************/ idx_t Mc_Diffusion(ctrl_t *ctrl, graph_t *graph, idx_t *vtxdist, idx_t *where, idx_t *home, idx_t npasses) { idx_t h, i, j; idx_t nvtxs, nedges, ncon, pass, iter, domain, processor; idx_t nparts, mype, npes, nlinks, me, you, wsize; idx_t nvisited, nswaps = -1, tnswaps, done, alldone = -1; idx_t *rowptr, *colind, *diff_where, *sr_where, *ehome, *map, *rmap; idx_t *pack, *unpack, *match, *proc2sub, *sub2proc; idx_t *visited, *gvisited; real_t *transfer, *npwgts, maxdiff, minflow, maxflow; real_t lbavg, oldlbavg, ubavg, *lbvec; real_t *diff_flows, *sr_flows; real_t diff_lbavg, sr_lbavg, diff_cost, sr_cost; idx_t *rbuffer, *sbuffer; idx_t *rcount, *rdispl; real_t *solution, *load, *workspace; matrix_t matrix; graph_t *egraph; if (graph->ncon > 3) return 0; WCOREPUSH; nvtxs = graph->nvtxs; nedges = graph->nedges; ncon = graph->ncon; nparts = ctrl->nparts; mype = ctrl->mype; npes = ctrl->npes; ubavg = ravg(ncon, ctrl->ubvec); /* initialize variables and allocate memory */ lbvec = rwspacemalloc(ctrl, ncon); diff_flows = rwspacemalloc(ctrl, ncon); sr_flows = rwspacemalloc(ctrl, ncon); load = rwspacemalloc(ctrl, nparts); solution = rwspacemalloc(ctrl, nparts); npwgts = graph->gnpwgts = rwspacemalloc(ctrl, ncon*nparts); matrix.values = rwspacemalloc(ctrl, nedges); transfer = matrix.transfer = rwspacemalloc(ctrl, ncon*nedges); proc2sub = iwspacemalloc(ctrl, gk_max(nparts, npes*2)); sub2proc = iwspacemalloc(ctrl, nparts); match = iwspacemalloc(ctrl, nparts); rowptr = matrix.rowptr = iwspacemalloc(ctrl, nparts+1); colind = matrix.colind = iwspacemalloc(ctrl, nedges); rcount = iwspacemalloc(ctrl, npes); rdispl = iwspacemalloc(ctrl, npes+1); pack = iwspacemalloc(ctrl, nvtxs); unpack = iwspacemalloc(ctrl, nvtxs); rbuffer = iwspacemalloc(ctrl, nvtxs); sbuffer = iwspacemalloc(ctrl, nvtxs); map = iwspacemalloc(ctrl, nvtxs); rmap = iwspacemalloc(ctrl, nvtxs); diff_where = iwspacemalloc(ctrl, nvtxs); ehome = iwspacemalloc(ctrl, nvtxs); wsize = gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1)); workspace = (real_t *)gk_malloc(wsize, "Mc_Diffusion: workspace"); graph->ckrinfo = (ckrinfo_t *)gk_malloc(nvtxs*sizeof(ckrinfo_t), "Mc_Diffusion: rinfo"); /* construct subdomain connectivity graph */ matrix.nrows = nparts; SetUpConnectGraph(graph, &matrix, (idx_t *)workspace); nlinks = (matrix.nnzs-nparts) / 2; visited = iwspacemalloc(ctrl, matrix.nnzs); gvisited = iwspacemalloc(ctrl, matrix.nnzs); for (pass=0; pass<npasses; pass++) { rset(matrix.nnzs*ncon, 0.0, transfer); iset(matrix.nnzs, 0, gvisited); iset(matrix.nnzs, 0, visited); iter = nvisited = 0; /* compute ncon flow solutions */ for (h=0; h<ncon; h++) { rset(nparts, 0.0, solution); ComputeLoad(graph, nparts, load, ctrl->tpwgts, h); lbvec[h] = (rmax(nparts, load)+1.0/nparts) * (real_t)nparts; ConjGrad2(&matrix, load, solution, 0.001, workspace); ComputeTransferVector(ncon, &matrix, solution, transfer, h); } oldlbavg = ravg(ncon, lbvec); tnswaps = 0; maxdiff = 0.0; for (i=0; i<nparts; i++) { for (j=rowptr[i]; j<rowptr[i+1]; j++) { maxflow = rmax(ncon, transfer+j*ncon); minflow = rmin(ncon, transfer+j*ncon); maxdiff = (maxflow - minflow > maxdiff) ? maxflow - minflow : maxdiff; } } while (nvisited < nlinks) { /* compute independent sets of subdomains */ iset(gk_max(nparts, npes*2), UNMATCHED, proc2sub); CSR_Match_SHEM(&matrix, match, proc2sub, gvisited, ncon); /* set up the packing arrays */ iset(nparts, UNMATCHED, sub2proc); for (i=0; i<npes*2; i++) { if (proc2sub[i] == UNMATCHED) break; sub2proc[proc2sub[i]] = i/2; } iset(npes, 0, rcount); for (i=0; i<nvtxs; i++) { domain = where[i]; processor = sub2proc[domain]; if (processor != UNMATCHED) rcount[processor]++; } rdispl[0] = 0; for (i=1; i<npes+1; i++) rdispl[i] = rdispl[i-1] + rcount[i-1]; iset(nvtxs, UNMATCHED, unpack); for (i=0; i<nvtxs; i++) { domain = where[i]; processor = sub2proc[domain]; if (processor != UNMATCHED) unpack[rdispl[processor]++] = i; } SHIFTCSR(i, npes, rdispl); iset(nvtxs, UNMATCHED, pack); for (i=0; i<rdispl[npes]; i++) { ASSERT(unpack[i] != UNMATCHED); domain = where[unpack[i]]; processor = sub2proc[domain]; if (processor != UNMATCHED) pack[unpack[i]] = i; } /* Compute the flows */ if (proc2sub[mype*2] != UNMATCHED) { me = proc2sub[2*mype]; you = proc2sub[2*mype+1]; ASSERT(me != you); for (j=rowptr[me]; j<rowptr[me+1]; j++) { if (colind[j] == you) { visited[j] = 1; rcopy(ncon, transfer+j*ncon, diff_flows); break; } } for (j=rowptr[you]; j<rowptr[you+1]; j++) { if (colind[j] == me) { visited[j] = 1; for (h=0; h<ncon; h++) { if (transfer[j*ncon+h] > 0.0) diff_flows[h] = -1.0 * transfer[j*ncon+h]; } break; } } nswaps = 1; rcopy(ncon, diff_flows, sr_flows); iset(nvtxs, 0, sbuffer); for (i=0; i<nvtxs; i++) { if (where[i] == me || where[i] == you) sbuffer[i] = 1; } egraph = ExtractGraph(ctrl, graph, sbuffer, map, rmap); if (egraph != NULL) { icopy(egraph->nvtxs, egraph->where, diff_where); for (j=0; j<egraph->nvtxs; j++) ehome[j] = home[map[j]]; RedoMyLink(ctrl, egraph, ehome, me, you, sr_flows, &sr_cost, &sr_lbavg); if (ncon <= 4) { sr_where = egraph->where; egraph->where = diff_where; nswaps = BalanceMyLink(ctrl, egraph, ehome, me, you, diff_flows, maxdiff, &diff_cost, &diff_lbavg, 1.0/(real_t)nvtxs); if ((sr_lbavg < diff_lbavg && (diff_lbavg >= ubavg-1.0 || sr_cost == diff_cost)) || (sr_lbavg < ubavg-1.0 && sr_cost < diff_cost)) { for (i=0; i<egraph->nvtxs; i++) where[map[i]] = sr_where[i]; } else { for (i=0; i<egraph->nvtxs; i++) where[map[i]] = diff_where[i]; } } else { for (i=0; i<egraph->nvtxs; i++) where[map[i]] = egraph->where[i]; } gk_free((void **)&egraph->xadj, &egraph->nvwgt, &egraph->adjncy, &egraph, LTERM); } /* Pack the flow data */ iset(nvtxs, UNMATCHED, sbuffer); for (i=0; i<nvtxs; i++) { domain = where[i]; if (domain == you || domain == me) sbuffer[pack[i]] = where[i]; } } /* Broadcast the flow data */ gkMPI_Allgatherv((void *)&sbuffer[rdispl[mype]], rcount[mype], IDX_T, (void *)rbuffer, rcount, rdispl, IDX_T, ctrl->comm); /* Unpack the flow data */ for (i=0; i<rdispl[npes]; i++) { if (rbuffer[i] != UNMATCHED) where[unpack[i]] = rbuffer[i]; } /* Do other stuff */ gkMPI_Allreduce((void *)visited, (void *)gvisited, matrix.nnzs, IDX_T, MPI_MAX, ctrl->comm); nvisited = isum(matrix.nnzs, gvisited, 1)/2; tnswaps += GlobalSESum(ctrl, nswaps); if (iter++ == NGD_PASSES) break; } /* perform serial refinement */ Mc_ComputeSerialPartitionParams(ctrl, graph, nparts); Mc_SerialKWayAdaptRefine(ctrl, graph, nparts, home, ctrl->ubvec, 10); /* check for early breakout */ for (h=0; h<ncon; h++) { lbvec[h] = (real_t)(nparts) * npwgts[rargmax_strd(nparts,npwgts+h,ncon)*ncon+h]; } lbavg = ravg(ncon, lbvec); done = 0; if (tnswaps == 0 || lbavg >= oldlbavg || lbavg <= ubavg + 0.035) done = 1; alldone = GlobalSEMax(ctrl, done); if (alldone == 1) break; } /* ensure that all subdomains have at least one vertex */ /* iset(nparts, 0, match); for (i=0; i<nvtxs; i++) match[where[i]]++; done = 0; while (done == 0) { done = 1; me = iargmin(nparts, match); if (match[me] == 0) { if (ctrl->mype == PE) printf("WARNING: empty subdomain %"PRIDX" in Mc_Diffusion\n", me); you = iargmax(nparts, match); for (i=0; i<nvtxs; i++) { if (where[i] == you) { where[i] = me; match[you]--; match[me]++; done = 0; break; } } } } */ /* now free memory and return */ gk_free((void **)&workspace, (void **)&graph->ckrinfo, LTERM); graph->gnpwgts = NULL; graph->ckrinfo = NULL; WCOREPOP; return 0; }
void Adaptive_Partition(ctrl_t *ctrl, graph_t *graph) { idx_t i; idx_t tewgt, tvsize; real_t gtewgt, gtvsize; real_t ubavg, lbavg, *lbvec; WCOREPUSH; lbvec = rwspacemalloc(ctrl, graph->ncon); /************************************/ /* Set up important data structures */ /************************************/ CommSetup(ctrl, graph); ubavg = ravg(graph->ncon, ctrl->ubvec); tewgt = isum(graph->nedges, graph->adjwgt, 1); tvsize = isum(graph->nvtxs, graph->vsize, 1); gtewgt = (real_t) GlobalSESum(ctrl, tewgt) + 1.0/graph->gnvtxs; /* The +1/graph->gnvtxs were added to remove any FPE */ gtvsize = (real_t) GlobalSESum(ctrl, tvsize) + 1.0/graph->gnvtxs; ctrl->redist_factor = ctrl->redist_base * ((gtewgt/gtvsize)/ ctrl->edge_size_ratio); IFSET(ctrl->dbglvl, DBG_PROGRESS, rprintf(ctrl, "[%6"PRIDX" %8"PRIDX" %5"PRIDX" %5"PRIDX"][%"PRIDX"]\n", graph->gnvtxs, GlobalSESum(ctrl, graph->nedges), GlobalSEMin(ctrl, graph->nvtxs), GlobalSEMax(ctrl, graph->nvtxs), ctrl->CoarsenTo)); if (graph->gnvtxs < 1.3*ctrl->CoarsenTo || (graph->finer != NULL && graph->gnvtxs > graph->finer->gnvtxs*COARSEN_FRACTION)) { AllocateRefinementWorkSpace(ctrl, 2*graph->nedges); /***********************************************/ /* Balance the partition on the coarsest graph */ /***********************************************/ graph->where = ismalloc(graph->nvtxs+graph->nrecv, -1, "graph->where"); icopy(graph->nvtxs, graph->home, graph->where); ComputeParallelBalance(ctrl, graph, graph->where, lbvec); lbavg = ravg(graph->ncon, lbvec); if (lbavg > ubavg + 0.035 && ctrl->partType != REFINE_PARTITION) Balance_Partition(ctrl, graph); if (ctrl->dbglvl&DBG_PROGRESS) { ComputePartitionParams(ctrl, graph); ComputeParallelBalance(ctrl, graph, graph->where, lbvec); rprintf(ctrl, "nvtxs: %10"PRIDX", cut: %8"PRIDX", balance: ", graph->gnvtxs, graph->mincut); for (i=0; i<graph->ncon; i++) rprintf(ctrl, "%.3"PRREAL" ", lbvec[i]); rprintf(ctrl, "\n"); /* free memory allocated by ComputePartitionParams */ gk_free((void **)&graph->ckrinfo, &graph->lnpwgts, &graph->gnpwgts, LTERM); } /* check if no coarsening took place */ if (graph->finer == NULL) { ComputePartitionParams(ctrl, graph); KWayBalance(ctrl, graph, graph->ncon); KWayAdaptiveRefine(ctrl, graph, NGR_PASSES); } } else { /*******************************/ /* Coarsen it and partition it */ /*******************************/ switch (ctrl->ps_relation) { case PARMETIS_PSR_COUPLED: Match_Local(ctrl, graph); break; case PARMETIS_PSR_UNCOUPLED: default: Match_Global(ctrl, graph); break; } Adaptive_Partition(ctrl, graph->coarser); /********************************/ /* project partition and refine */ /********************************/ ProjectPartition(ctrl, graph); ComputePartitionParams(ctrl, graph); if (graph->ncon > 1 && graph->level < 4) { ComputeParallelBalance(ctrl, graph, graph->where, lbvec); lbavg = ravg(graph->ncon, lbvec); if (lbavg > ubavg + 0.025) { KWayBalance(ctrl, graph, graph->ncon); } } KWayAdaptiveRefine(ctrl, graph, NGR_PASSES); if (ctrl->dbglvl&DBG_PROGRESS) { ComputeParallelBalance(ctrl, graph, graph->where, lbvec); rprintf(ctrl, "nvtxs: %10"PRIDX", cut: %8"PRIDX", balance: ", graph->gnvtxs, graph->mincut); for (i=0; i<graph->ncon; i++) rprintf(ctrl, "%.3"PRREAL" ", lbvec[i]); rprintf(ctrl, "\n"); } } WCOREPOP; }
/************************************************************************* * This function performs an edge-based FM refinement **************************************************************************/ void RedoMyLink(ctrl_t *ctrl, graph_t *graph, idx_t *home, idx_t me, idx_t you, real_t *flows, real_t *sr_cost, real_t *sr_lbavg) { idx_t h, i, r; idx_t nvtxs, nedges, ncon; idx_t pass, lastseed, totalv; idx_t *xadj, *adjncy, *adjwgt, *where, *vsize; idx_t *costwhere, *lbwhere, *selectwhere; idx_t *ed, *id, *bndptr, *bndind, *perm; real_t *nvwgt, mycost; real_t lbavg, *lbvec; real_t best_lbavg, other_lbavg = -1.0, bestcost, othercost = -1.0; real_t *npwgts, *pwgts, *tpwgts; real_t ipc_factor, redist_factor, ftmp; idx_t mype; gkMPI_Comm_rank(MPI_COMM_WORLD, &mype); WCOREPUSH; nvtxs = graph->nvtxs; nedges = graph->nedges; ncon = graph->ncon; xadj = graph->xadj; nvwgt = graph->nvwgt; vsize = graph->vsize; adjncy = graph->adjncy; adjwgt = graph->adjwgt; where = graph->where; ipc_factor = ctrl->ipc_factor; redist_factor = ctrl->redist_factor; /* set up data structures */ id = graph->sendind = iwspacemalloc(ctrl, nvtxs); ed = graph->recvind = iwspacemalloc(ctrl, nvtxs); bndptr = graph->sendptr = iwspacemalloc(ctrl, nvtxs); bndind = graph->recvptr = iwspacemalloc(ctrl, nvtxs); costwhere = iwspacemalloc(ctrl, nvtxs); lbwhere = iwspacemalloc(ctrl, nvtxs); perm = iwspacemalloc(ctrl, nvtxs); lbvec = rwspacemalloc(ctrl, ncon); pwgts = rset(2*ncon, 0.0, rwspacemalloc(ctrl, 2*ncon)); npwgts = rwspacemalloc(ctrl, 2*ncon); tpwgts = rwspacemalloc(ctrl, 2*ncon); graph->gnpwgts = npwgts; RandomPermute(nvtxs, perm, 1); icopy(nvtxs, where, costwhere); icopy(nvtxs, where, lbwhere); /* compute target pwgts */ for (h=0; h<ncon; h++) { tpwgts[h] = -1.0*flows[h]; tpwgts[ncon+h] = flows[h]; } for (i=0; i<nvtxs; i++) { if (where[i] == me) { for (h=0; h<ncon; h++) { tpwgts[h] += nvwgt[i*ncon+h]; pwgts[h] += nvwgt[i*ncon+h]; } } else { ASSERT(where[i] == you); for (h=0; h<ncon; h++) { tpwgts[ncon+h] += nvwgt[i*ncon+h]; pwgts[ncon+h] += nvwgt[i*ncon+h]; } } } /* we don't want any weights to be less than zero */ for (h=0; h<ncon; h++) { if (tpwgts[h] < 0.0) { tpwgts[ncon+h] += tpwgts[h]; tpwgts[h] = 0.0; } if (tpwgts[ncon+h] < 0.0) { tpwgts[h] += tpwgts[ncon+h]; tpwgts[ncon+h] = 0.0; } } /* now compute new bisection */ bestcost = (real_t)isum(nedges, adjwgt, 1)*ipc_factor + (real_t)isum(nvtxs, vsize, 1)*redist_factor; best_lbavg = 10.0; lastseed = 0; for (pass=N_MOC_REDO_PASSES; pass>0; pass--) { iset(nvtxs, 1, where); /* find seed vertices */ r = perm[lastseed] % nvtxs; lastseed = (lastseed+1) % nvtxs; where[r] = 0; Mc_Serial_Compute2WayPartitionParams(ctrl, graph); Mc_Serial_Init2WayBalance(ctrl, graph, tpwgts); Mc_Serial_FM_2WayRefine(ctrl, graph, tpwgts, 4); Mc_Serial_Balance2Way(ctrl, graph, tpwgts, 1.02); Mc_Serial_FM_2WayRefine(ctrl, graph, tpwgts, 4); for (i=0; i<nvtxs; i++) where[i] = (where[i] == 0) ? me : you; for (i=0; i<ncon; i++) { ftmp = (pwgts[i]+pwgts[ncon+i])/2.0; if (ftmp != 0.0) lbvec[i] = fabs(npwgts[i]-tpwgts[i])/ftmp; else lbvec[i] = 0.0; } lbavg = ravg(ncon, lbvec); totalv = 0; for (i=0; i<nvtxs; i++) if (where[i] != home[i]) totalv += vsize[i]; mycost = (real_t)(graph->mincut)*ipc_factor + (real_t)totalv*redist_factor; if (bestcost >= mycost) { bestcost = mycost; other_lbavg = lbavg; icopy(nvtxs, where, costwhere); } if (best_lbavg >= lbavg) { best_lbavg = lbavg; othercost = mycost; icopy(nvtxs, where, lbwhere); } } if (other_lbavg <= .05) { selectwhere = costwhere; *sr_cost = bestcost; *sr_lbavg = other_lbavg; } else { selectwhere = lbwhere; *sr_cost = othercost; *sr_lbavg = best_lbavg; } icopy(nvtxs, selectwhere, where); WCOREPOP; }