/*************************************************************************
* This function performs a k-way directed diffusion
**************************************************************************/
real_t WavefrontDiffusion(ctrl_t *ctrl, graph_t *graph, idx_t *home)
{
  idx_t ii, i, j, k, l, nvtxs, nedges, nparts;
  idx_t from, to, edge, done, nswaps, noswaps, totalv, wsize;
  idx_t npasses, first, second, third, mind, maxd;
  idx_t *xadj, *adjncy, *adjwgt, *where, *perm;
  idx_t *rowptr, *colind, *ed, *psize;
  real_t *transfer, *tmpvec;
  real_t balance = -1.0, *load, *solution, *workspace;
  real_t *nvwgt, *npwgts, flowFactor, cost, ubfactor;
  matrix_t matrix;
  ikv_t *cand;
  idx_t ndirty, nclean, dptr, clean;

  nvtxs        = graph->nvtxs;
  nedges       = graph->nedges;
  xadj         = graph->xadj;
  nvwgt        = graph->nvwgt;
  adjncy       = graph->adjncy;
  adjwgt       = graph->adjwgt;
  where        = graph->where;
  nparts       = ctrl->nparts;
  ubfactor     = ctrl->ubvec[0];
  matrix.nrows = nparts;

  flowFactor = 0.35;
  flowFactor = (ctrl->mype == 2) ? 0.50 : flowFactor;
  flowFactor = (ctrl->mype == 3) ? 0.75 : flowFactor;
  flowFactor = (ctrl->mype == 4) ? 1.00 : flowFactor;

  /* allocate memory */
  solution      = rmalloc(4*nparts+2*nedges, "WavefrontDiffusion: solution");
  tmpvec        = solution + nparts;
  npwgts        = solution + 2*nparts;
  load          = solution + 3*nparts;
  matrix.values = solution + 4*nparts;
  transfer      = matrix.transfer = solution + 4*nparts + nedges;

  perm   = imalloc(2*nvtxs+2*nparts+nedges+1, "WavefrontDiffusion: perm");
  ed     = perm + nvtxs;
  psize  = perm + 2*nvtxs;
  rowptr = matrix.rowptr = perm + 2*nvtxs + nparts;
  colind = matrix.colind = perm + 2*nvtxs + 2*nparts + 1;

  /* GKTODO - Potential problem with this malloc */
  wsize     = gk_max(sizeof(real_t)*nparts*6, sizeof(idx_t)*(nvtxs+nparts*2+1));
  workspace = (real_t *)gk_malloc(wsize, "WavefrontDiffusion: workspace");
  cand      = ikvmalloc(nvtxs, "WavefrontDiffusion: cand");

  /*****************************/
  /* Populate empty subdomains */
  /*****************************/
  iset(nparts, 0, psize);
  for (i=0; i<nvtxs; i++)
    psize[where[i]]++;

  mind = iargmin(nparts, psize);
  maxd = iargmax(nparts, psize);
  if (psize[mind] == 0) {
    for (i=0; i<nvtxs; i++) {
      k = (RandomInRange(nvtxs)+i)%nvtxs;
      if (where[k] == maxd) {
        where[k] = mind;
        psize[mind]++;
        psize[maxd]--;
        break;
      }
    }
  }

  iset(nvtxs, 0, ed);
  rset(nparts, 0.0, npwgts);
  for (i=0; i<nvtxs; i++) {
    npwgts[where[i]] += nvwgt[i];
    for (j=xadj[i]; j<xadj[i+1]; j++)
      ed[i] += (where[i] != where[adjncy[j]] ? adjwgt[j] : 0);
  }

  ComputeLoad(graph, nparts, load, ctrl->tpwgts, 0);

  done = 0;
  /* zero out the tmpvec array */
  rset(nparts, 0.0, tmpvec);
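  /* Each pass below rebuilds the subdomain connectivity matrix, solves the
   * diffusion system matrix*solution = load with conjugate gradient
   * (ConjGrad2), and converts the solution into per-edge transfer amounts
   * (ComputeTransferVector). Border vertices are then moved along edges
   * that still have more than flowFactor*nvwgt[i] of flow left to send. */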
  npasses = gk_min(nparts/2, NGD_PASSES);
  for (l=0; l<npasses; l++) {
    /* Set up and solve the diffusion equation */
    nswaps = 0;

    /************************/
    /* Solve flow equations */
    /************************/
    SetUpConnectGraph(graph, &matrix, (idx_t *)workspace);

    /* check for disconnected subdomains */
    for (i=0; i<matrix.nrows; i++) {
      if (matrix.rowptr[i]+1 == matrix.rowptr[i+1]) {
        cost = (real_t)(ctrl->mype);
        goto CleanUpAndExit;
      }
    }

    ConjGrad2(&matrix, load, solution, 0.001, workspace);
    ComputeTransferVector(1, &matrix, solution, transfer, 0);

    GetThreeMax(nparts, load, &first, &second, &third);

    if (l%3 == 0) {
      FastRandomPermute(nvtxs, perm, 1);
    }
    else {
      /*****************************/
      /* move dirty vertices first */
      /*****************************/
      ndirty = 0;
      for (i=0; i<nvtxs; i++) {
        if (where[i] != home[i])
          ndirty++;
      }

      dptr = 0;
      for (i=0; i<nvtxs; i++) {
        if (where[i] != home[i])
          perm[dptr++] = i;
        else
          perm[ndirty++] = i;
      }

      PASSERT(ctrl, ndirty == nvtxs);
      ndirty = dptr;
      nclean = nvtxs-dptr;
      FastRandomPermute(ndirty, perm, 0);
      FastRandomPermute(nclean, perm+ndirty, 0);
    }

    if (ctrl->mype == 0) {
      for (j=nvtxs, k=0, ii=0; ii<nvtxs; ii++) {
        i = perm[ii];
        if (ed[i] != 0) {
          cand[k].key   = -ed[i];
          cand[k++].val = i;
        }
        else {
          cand[--j].key = 0;
          cand[j].val   = i;
        }
      }
      ikvsorti(k, cand);
    }

    for (ii=0; ii<nvtxs/3; ii++) {
      i    = (ctrl->mype == 0) ? cand[ii].val : perm[ii];
      from = where[i];

      /* don't move out the last vertex in a subdomain */
      if (psize[from] == 1)
        continue;

      clean = (from == home[i]) ? 1 : 0;

      /* only move vertices out of the three most-loaded subdomains,
         or dirty vertices from anywhere */
      if (from != first && from != second && from != third && clean)
        continue;

      /* Scatter the sparse transfer row into the dense tmpvec row */
      for (j=rowptr[from]+1; j<rowptr[from+1]; j++)
        tmpvec[colind[j]] = transfer[j];

      for (j=xadj[i]; j<xadj[i+1]; j++) {
        to = where[adjncy[j]];
        if (from != to) {
          if (tmpvec[to] > (flowFactor * nvwgt[i])) {
            tmpvec[to] -= nvwgt[i];
            INC_DEC(psize[to], psize[from], 1);
            INC_DEC(npwgts[to], npwgts[from], nvwgt[i]);
            INC_DEC(load[to], load[from], nvwgt[i]);
            where[i] = to;
            nswaps++;

            /* Update external degrees */
            ed[i] = 0;
            for (k=xadj[i]; k<xadj[i+1]; k++) {
              edge   = adjncy[k];
              ed[i] += (to != where[edge] ? adjwgt[k] : 0);

              if (where[edge] == from)
                ed[edge] += adjwgt[k];
              if (where[edge] == to)
                ed[edge] -= adjwgt[k];
            }
            break;
          }
        }
      }

      /* Gather the dense tmpvec row into the sparse transfer row */
      for (j=rowptr[from]+1; j<rowptr[from+1]; j++) {
        transfer[j]       = tmpvec[colind[j]];
        tmpvec[colind[j]] = 0.0;
      }
      ASSERT(fabs(rsum(nparts, tmpvec, 1)) < .0001);
    }

    if (l % 2 == 1) {
      balance = rmax(nparts, npwgts)*nparts;
      if (balance < ubfactor + 0.035)
        done = 1;

      if (GlobalSESum(ctrl, done) > 0)
        break;

      noswaps = (nswaps > 0) ? 0 : 1;
      if (GlobalSESum(ctrl, noswaps) > ctrl->npes/2)
        break;
    }
  }

  graph->mincut = ComputeSerialEdgeCut(graph);
  totalv        = Mc_ComputeSerialTotalV(graph, home);
  cost          = ctrl->ipc_factor * (real_t)graph->mincut +
                  ctrl->redist_factor * (real_t)totalv;

CleanUpAndExit:
  gk_free((void **)&solution, (void **)&perm, (void **)&workspace,
      (void **)&cand, LTERM);

  return cost;
}
void CreateCoarseGraph_Global(ctrl_t *ctrl, graph_t *graph, idx_t cnvtxs)
{
  idx_t h, i, j, k, l, ii, jj, ll, nnbrs, nvtxs, nedges, ncon;
  idx_t firstvtx, lastvtx, cfirstvtx, clastvtx, otherlastvtx;
  idx_t npes=ctrl->npes, mype=ctrl->mype;
  idx_t cnedges, nsend, nrecv, nkeepsize, nrecvsize, nsendsize, v, u;
  idx_t *xadj, *adjncy, *adjwgt, *vwgt, *vsize, *vtxdist, *home, *where;
  idx_t *match, *cmap;
  idx_t *cxadj, *cadjncy, *cadjwgt, *cvwgt, *cvsize = NULL, *chome = NULL,
        *cwhere = NULL, *cvtxdist;
  idx_t *rsizes, *ssizes, *rlens, *slens, *rgraph, *sgraph, *perm;
  idx_t *peind, *recvptr, *recvind;
  real_t *nvwgt, *cnvwgt;
  graph_t *cgraph;
  ikv_t *scand, *rcand;
  idx_t htable[LHTSIZE], htableidx[LHTSIZE];

  WCOREPUSH;

  nvtxs  = graph->nvtxs;
  ncon   = graph->ncon;
  xadj   = graph->xadj;
  vwgt   = graph->vwgt;
  vsize  = graph->vsize;
  nvwgt  = graph->nvwgt;
  home   = graph->home;
  where  = graph->where;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  match  = graph->match;

  vtxdist  = graph->vtxdist;
  firstvtx = vtxdist[mype];
  lastvtx  = vtxdist[mype+1];

  cmap = graph->cmap = ismalloc(nvtxs+graph->nrecv, -1,
             "Global_CreateCoarseGraph: cmap");

  nnbrs   = graph->nnbrs;
  peind   = graph->peind;
  recvind = graph->recvind;
  recvptr = graph->recvptr;

  /* Initialize the coarser graph */
  cgraph = CreateGraph();
  cgraph->nvtxs = cnvtxs;
  cgraph->ncon  = ncon;
  cgraph->level = graph->level+1;

  cgraph->finer  = graph;
  graph->coarser = cgraph;

  /*************************************************************
  * Obtain the vtxdist of the coarser graph
  **************************************************************/
  cvtxdist = cgraph->vtxdist = imalloc(npes+1, "Global_CreateCoarseGraph: cvtxdist");
  cvtxdist[npes] = cnvtxs;  /* Use the last position in cvtxdist as a temp buffer */

  gkMPI_Allgather((void *)(cvtxdist+npes), 1, IDX_T, (void *)cvtxdist, 1,
      IDX_T, ctrl->comm);

  MAKECSR(i, npes, cvtxdist);

  cgraph->gnvtxs = cvtxdist[npes];

#ifdef DEBUG_CONTRACT
  PrintVector(ctrl, npes+1, 0, cvtxdist, "cvtxdist");
#endif

  /*************************************************************
  * Construct the cmap vector
  **************************************************************/
  cfirstvtx = cvtxdist[mype];
  clastvtx  = cvtxdist[mype+1];

  /* Create the cmap of what you know so far locally. */
  for (cnvtxs=0, i=0; i<nvtxs; i++) {
    if (match[i] >= KEEP_BIT) {
      k = match[i] - KEEP_BIT;
      if (k >= firstvtx && k < firstvtx+i)
        continue;  /* Both (i,k) are local and i has been matched via the (k,i) side */

      cmap[i] = cfirstvtx + cnvtxs++;
      if (k != firstvtx+i && (k >= firstvtx && k < lastvtx)) { /* I'm matched locally */
        cmap[k-firstvtx]   = cmap[i];
        match[k-firstvtx] += KEEP_BIT;  /* Add the KEEP_BIT to simplify coding */
      }
    }
  }
  PASSERT(ctrl, cnvtxs == clastvtx-cfirstvtx);

  CommInterfaceData(ctrl, graph, cmap, cmap+nvtxs);

  /* Update the cmap of the locally stored vertices that will go away.
   * The remote processor assigned the cmap for them. */
  for (i=0; i<nvtxs; i++) {
    if (match[i] < KEEP_BIT) { /* Only vertices that go away satisfy this */
      cmap[i] = cmap[nvtxs+BSearch(graph->nrecv, recvind, match[i])];
    }
  }

  CommInterfaceData(ctrl, graph, cmap, cmap+nvtxs);

#ifndef NDEBUG
  for (i=0; i<nvtxs+graph->nrecv; i++) {
    if (cmap[i] == -1)
      errexit("cmap[%"PRIDX"] == -1\n", i);
  }
#endif

#ifdef DEBUG_CONTRACT
  PrintVector(ctrl, nvtxs, firstvtx, cmap, "Cmap");
#endif
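  /* From this point on, match[] is interpreted as follows: a vertex with
   * match[i] >= KEEP_BIT survives on this processor (match[i]-KEEP_BIT is
   * the global index of its mate), while a vertex with match[i] < KEEP_BIT
   * was claimed by a remote mate, so its adjacency list must be shipped to
   * that mate's processor. */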
  /*************************************************************
  * Determine how many adjacency lists you need to send/receive.
  **************************************************************/
  /* first pass: determine sizes */
  for (nsend=0, nrecv=0, i=0; i<nvtxs; i++) {
    if (match[i] < KEEP_BIT) /* This is going away */
      nsend++;
    else {
      k = match[i]-KEEP_BIT;
      if (k < firstvtx || k >= lastvtx) /* This is coming from afar */
        nrecv++;
    }
  }

  scand = ikvwspacemalloc(ctrl, nsend);
  rcand = graph->rcand = ikvmalloc(nrecv, "CreateCoarseGraph: rcand");

  /* second pass: place them in the appropriate arrays */
  nkeepsize = nsend = nrecv = 0;
  for (i=0; i<nvtxs; i++) {
    if (match[i] < KEEP_BIT) { /* This is going away */
      scand[nsend].key = match[i];
      scand[nsend].val = i;
      nsend++;
    }
    else {
      nkeepsize += (xadj[i+1]-xadj[i]);

      k = match[i]-KEEP_BIT;
      if (k < firstvtx || k >= lastvtx) { /* This is coming from afar */
        rcand[nrecv].key = k;
        rcand[nrecv].val = cmap[i] - cfirstvtx;  /* Set it for use during the partition projection */
        PASSERT(ctrl, rcand[nrecv].val>=0 && rcand[nrecv].val<cnvtxs);
        nrecv++;
      }
    }
  }

#ifdef DEBUG_CONTRACT
  PrintPairs(ctrl, nsend, scand, "scand");
  PrintPairs(ctrl, nrecv, rcand, "rcand");
#endif

  /***************************************************************
  * Determine how many lists, and of what sizes, you will send to
  * and receive from each of the neighboring PEs
  ****************************************************************/
  rlens = graph->rlens = imalloc(nnbrs+1, "CreateCoarseGraph: graph->rlens");
  slens = graph->slens = imalloc(nnbrs+1, "CreateCoarseGraph: graph->slens");

  rsizes = iset(nnbrs, 0, iwspacemalloc(ctrl, nnbrs));
  ssizes = iset(nnbrs, 0, iwspacemalloc(ctrl, nnbrs));

  /* Take care of the sending data first */
  ikvsortii(nsend, scand);
  slens[0] = 0;
  for (k=i=0; i<nnbrs; i++) {
    otherlastvtx = vtxdist[peind[i]+1];
    for (; k<nsend && scand[k].key < otherlastvtx; k++)
      ssizes[i] += (xadj[scand[k].val+1]-xadj[scand[k].val]);
    slens[i+1] = k;
  }

  /* Take care of the receiving data next. You cannot yet determine the rsizes[] */
  ikvsortii(nrecv, rcand);
  rlens[0] = 0;
  for (k=i=0; i<nnbrs; i++) {
    otherlastvtx = vtxdist[peind[i]+1];
    for (; k<nrecv && rcand[k].key < otherlastvtx; k++);
    rlens[i+1] = k;
  }

#ifdef DEBUG_CONTRACT
  PrintVector(ctrl, nnbrs+1, 0, slens, "slens");
  PrintVector(ctrl, nnbrs+1, 0, rlens, "rlens");
#endif

  /***************************************************************
  * Exchange size information
  ****************************************************************/
  /* Issue the receives first. */
  for (i=0; i<nnbrs; i++) {
    if (rlens[i+1]-rlens[i] > 0)  /* Issue a receive only if you are getting something */
      gkMPI_Irecv((void *)(rsizes+i), 1, IDX_T, peind[i], 1, ctrl->comm, ctrl->rreq+i);
  }

  /* Take care of the sending data next */
  for (i=0; i<nnbrs; i++) {
    if (slens[i+1]-slens[i] > 0)  /* Issue a send only if you are sending something */
      gkMPI_Isend((void *)(ssizes+i), 1, IDX_T, peind[i], 1, ctrl->comm, ctrl->sreq+i);
  }

  /* OK, now get into the loop waiting for the operations to finish */
  for (i=0; i<nnbrs; i++) {
    if (rlens[i+1]-rlens[i] > 0)
      gkMPI_Wait(ctrl->rreq+i, &ctrl->status);
  }
  for (i=0; i<nnbrs; i++) {
    if (slens[i+1]-slens[i] > 0)
      gkMPI_Wait(ctrl->sreq+i, &ctrl->status);
  }

#ifdef DEBUG_CONTRACT
  PrintVector(ctrl, nnbrs, 0, rsizes, "rsizes");
  PrintVector(ctrl, nnbrs, 0, ssizes, "ssizes");
#endif

  /*************************************************************
  * Allocate memory for the received/sent graphs and start sending
  * and receiving data. rgraph and sgraph use a different data
  * structure than CSR to facilitate single-message exchange.
  **************************************************************/
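  /* Each vertex record in sgraph/rgraph is laid out as:
   *   [global vertex id, degree, ncon vertex weights, vsize (or -1),
   *    home (or -1)], followed by (cmap[adjncy], adjwgt) pairs;
   * i.e., 4+ncon header words plus 2*degree words per vertex. */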
  nrecvsize = isum(nnbrs, rsizes, 1);
  nsendsize = isum(nnbrs, ssizes, 1);
  rgraph    = iwspacemalloc(ctrl, (4+ncon)*nrecv+2*nrecvsize);

  WCOREPUSH;  /* for freeing sgraph right away */
  sgraph = iwspacemalloc(ctrl, (4+ncon)*nsend+2*nsendsize);

  /* Deal with the received portion first */
  for (l=i=0; i<nnbrs; i++) {
    if (rlens[i+1]-rlens[i] > 0) {  /* Issue a receive only if you are getting something */
      gkMPI_Irecv((void *)(rgraph+l), (4+ncon)*(rlens[i+1]-rlens[i])+2*rsizes[i],
          IDX_T, peind[i], 1, ctrl->comm, ctrl->rreq+i);
      l += (4+ncon)*(rlens[i+1]-rlens[i])+2*rsizes[i];
    }
  }

  /* Deal with the sent portion now */
  for (ll=l=i=0; i<nnbrs; i++) {
    if (slens[i+1]-slens[i] > 0) {  /* Issue a send only if you are sending something */
      for (k=slens[i]; k<slens[i+1]; k++) {
        ii = scand[k].val;
        sgraph[ll++] = firstvtx+ii;
        sgraph[ll++] = xadj[ii+1]-xadj[ii];
        for (h=0; h<ncon; h++)
          sgraph[ll++] = vwgt[ii*ncon+h];
        sgraph[ll++] = (ctrl->partType == STATIC_PARTITION ||
                        ctrl->partType == ORDER_PARTITION ? -1 : vsize[ii]);
        sgraph[ll++] = (ctrl->partType == STATIC_PARTITION ||
                        ctrl->partType == ORDER_PARTITION ? -1 : home[ii]);
        for (jj=xadj[ii]; jj<xadj[ii+1]; jj++) {
          sgraph[ll++] = cmap[adjncy[jj]];
          sgraph[ll++] = adjwgt[jj];
        }
      }

      PASSERT(ctrl, ll-l == (4+ncon)*(slens[i+1]-slens[i])+2*ssizes[i]);

      /*myprintf(ctrl, "Sending to pe:%"PRIDX", %"PRIDX" lists of size %"PRIDX"\n",
                 peind[i], slens[i+1]-slens[i], ssizes[i]); */
      gkMPI_Isend((void *)(sgraph+l), ll-l, IDX_T, peind[i], 1, ctrl->comm, ctrl->sreq+i);
      l = ll;
    }
  }

  /* OK, now get into the loop waiting for the operations to finish */
  for (i=0; i<nnbrs; i++) {
    if (rlens[i+1]-rlens[i] > 0)
      gkMPI_Wait(ctrl->rreq+i, &ctrl->status);
  }
  for (i=0; i<nnbrs; i++) {
    if (slens[i+1]-slens[i] > 0)
      gkMPI_Wait(ctrl->sreq+i, &ctrl->status);
  }

#ifdef DEBUG_CONTRACT
  rprintf(ctrl, "Graphs were sent!\n");
  PrintTransferedGraphs(ctrl, nnbrs, peind, slens, rlens, sgraph, rgraph);
#endif

  WCOREPOP;  /* free sgraph */

  /*************************************************************
  * Set up the mapping from the indices returned by BSearch to
  * those that are actually stored
  **************************************************************/
  perm = iset(graph->nrecv, -1, iwspacemalloc(ctrl, graph->nrecv));
  for (j=i=0; i<nrecv; i++) {
    perm[BSearch(graph->nrecv, recvind, rgraph[j])] = j+1;
    j += (4+ncon)+2*rgraph[j+1];
  }

  /*************************************************************
  * Finally, create the coarser graph
  **************************************************************/
  /* Allocate memory for the coarser graph, and fire up coarsening */
  cxadj  = cgraph->xadj  = imalloc(cnvtxs+1, "CreateCoarserGraph: cxadj");
  cvwgt  = cgraph->vwgt  = imalloc(cnvtxs*ncon, "CreateCoarserGraph: cvwgt");
  cnvwgt = cgraph->nvwgt = rmalloc(cnvtxs*ncon, "CreateCoarserGraph: cnvwgt");

  if (ctrl->partType == ADAPTIVE_PARTITION || ctrl->partType == REFINE_PARTITION) {
    cvsize = cgraph->vsize = imalloc(cnvtxs, "CreateCoarserGraph: cvsize");
    chome  = cgraph->home  = imalloc(cnvtxs, "CreateCoarserGraph: chome");
  }

  if (where != NULL)
    cwhere = cgraph->where = imalloc(cnvtxs, "CreateCoarserGraph: cwhere");

  /* these are just upper bound estimates for now */
  cadjncy = iwspacemalloc(ctrl, nkeepsize+nrecvsize);
  cadjwgt = iwspacemalloc(ctrl, nkeepsize+nrecvsize);

  iset(LHTSIZE, -1, htable);

  cxadj[0] = cnvtxs = cnedges = 0;
  for (i=0; i<nvtxs; i++) {
    if (match[i] >= KEEP_BIT) {
      v = firstvtx+i;
      u = match[i]-KEEP_BIT;

      if (u >= firstvtx && u < lastvtx && v > u)
        continue;  /* I have already collapsed it as (u,v) */
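      /* The htable/htableidx pair below merges the parallel edges created
       * by the contraction: a coarse endpoint k hashes to slot k&MASK; on a
       * hit the edge weight is accumulated at htableidx[l], and on a
       * collision the code falls back to a linear scan of the coarse
       * adjacency list built so far. */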
      /* Collapse the v vertex first, which you know is local */
      for (h=0; h<ncon; h++)
        cvwgt[cnvtxs*ncon+h] = vwgt[i*ncon+h];
      if (ctrl->partType == ADAPTIVE_PARTITION || ctrl->partType == REFINE_PARTITION) {
        cvsize[cnvtxs] = vsize[i];
        chome[cnvtxs]  = home[i];
      }
      if (where != NULL)
        cwhere[cnvtxs] = where[i];
      nedges = 0;

      /* Collapse the v (i) vertex first */
      for (j=xadj[i]; j<xadj[i+1]; j++) {
        k = cmap[adjncy[j]];
        if (k < 0)
          printf("k=%d\n", (int)k);
        if (k != cfirstvtx+cnvtxs) {  /* If this is not an internal edge */
          l = k&MASK;
          if (htable[l] == -1) { /* Seeing this for the first time */
            htable[l]                 = k;
            htableidx[l]              = cnedges+nedges;
            cadjncy[cnedges+nedges]   = k;
            cadjwgt[cnedges+nedges++] = adjwgt[j];
          }
          else if (htable[l] == k) {
            cadjwgt[htableidx[l]] += adjwgt[j];
          }
          else { /* Now you have to go and do a search. Expensive case */
            for (l=0; l<nedges; l++) {
              if (cadjncy[cnedges+l] == k)
                break;
            }
            if (l < nedges) {
              cadjwgt[cnedges+l] += adjwgt[j];
            }
            else {
              cadjncy[cnedges+nedges]   = k;
              cadjwgt[cnedges+nedges++] = adjwgt[j];
            }
          }
        }
      }

      /* Collapse the u vertex next */
      if (v != u) {
        if (u >= firstvtx && u < lastvtx) { /* Local vertex */
          u -= firstvtx;
          for (h=0; h<ncon; h++)
            cvwgt[cnvtxs*ncon+h] += vwgt[u*ncon+h];

          if (ctrl->partType == ADAPTIVE_PARTITION || ctrl->partType == REFINE_PARTITION)
            cvsize[cnvtxs] += vsize[u];

          for (j=xadj[u]; j<xadj[u+1]; j++) {
            k = cmap[adjncy[j]];
            if (k != cfirstvtx+cnvtxs) { /* If this is not an internal edge */
              l = k&MASK;
              if (htable[l] == -1) { /* Seeing this for the first time */
                htable[l]               = k;
                htableidx[l]            = cnedges+nedges;
                cadjncy[cnedges+nedges] = k;
                cadjwgt[cnedges+nedges] = adjwgt[j];
                nedges++;
              }
              else if (htable[l] == k) {
                cadjwgt[htableidx[l]] += adjwgt[j];
              }
              else { /* Now you have to go and do a search. Expensive case */
                for (l=0; l<nedges; l++) {
                  if (cadjncy[cnedges+l] == k)
                    break;
                }
                if (l < nedges) {
                  cadjwgt[cnedges+l] += adjwgt[j];
                }
                else {
                  cadjncy[cnedges+nedges] = k;
                  cadjwgt[cnedges+nedges] = adjwgt[j];
                  nedges++;
                }
              }
            }
          }
        }
        else { /* Remote vertex */
          u = perm[BSearch(graph->nrecv, recvind, u)];

          for (h=0; h<ncon; h++)  /* Remember that the +1 stores the vertex weight */
            cvwgt[cnvtxs*ncon+h] += rgraph[(u+1)+h];

          if (ctrl->partType == ADAPTIVE_PARTITION || ctrl->partType == REFINE_PARTITION) {
            cvsize[cnvtxs] += rgraph[u+1+ncon];
            chome[cnvtxs]   = rgraph[u+2+ncon];
          }

          for (j=0; j<rgraph[u]; j++) {
            k = rgraph[u+3+ncon+2*j];
            if (k != cfirstvtx+cnvtxs) { /* If this is not an internal edge */
              l = k&MASK;
              if (htable[l] == -1) { /* Seeing this for the first time */
                htable[l]               = k;
                htableidx[l]            = cnedges+nedges;
                cadjncy[cnedges+nedges] = k;
                cadjwgt[cnedges+nedges] = rgraph[u+3+ncon+2*j+1];
                nedges++;
              }
              else if (htable[l] == k) {
                cadjwgt[htableidx[l]] += rgraph[u+3+ncon+2*j+1];
              }
              else { /* Now you have to go and do a search. Expensive case */
                for (l=0; l<nedges; l++) {
                  if (cadjncy[cnedges+l] == k)
                    break;
                }
                if (l < nedges) {
                  cadjwgt[cnedges+l] += rgraph[u+3+ncon+2*j+1];
                }
                else {
                  cadjncy[cnedges+nedges] = k;
                  cadjwgt[cnedges+nedges] = rgraph[u+3+ncon+2*j+1];
                  nedges++;
                }
              }
            }
          }
        }
      }

      cnedges += nedges;
      for (j=cxadj[cnvtxs]; j<cnedges; j++)
        htable[cadjncy[j]&MASK] = -1;  /* reset the htable */
      cxadj[++cnvtxs] = cnedges;
    }
  }

  cgraph->nedges = cnedges;

  /* ADD: In order to keep from having to change this too much */
  /* ADD: I kept the vwgt array and recomputed nvwgt for each coarser graph */
  for (j=0; j<cnvtxs; j++) {
    for (h=0; h<ncon; h++)
      cgraph->nvwgt[j*ncon+h] = ctrl->invtvwgts[h]*cvwgt[j*ncon+h];
  }

  cgraph->adjncy = imalloc(cnedges, "CreateCoarserGraph: cadjncy");
  cgraph->adjwgt = imalloc(cnedges, "CreateCoarserGraph: cadjwgt");
  icopy(cnedges, cadjncy, cgraph->adjncy);
  icopy(cnedges, cadjwgt, cgraph->adjwgt);

  WCOREPOP;

  /* Note that graph->where works fine even if it is NULL */
  gk_free((void **)&graph->where, LTERM);
}
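/*************************************************************************
* This function compresses a graph by merging vertices that have identical
* adjacency structure. It returns the compressed graph, or NULL when the
* compression does not reduce the number of vertices below
* COMPRESSION_FRACTION*nvtxs.
**************************************************************************/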
graph_t *CompressGraph(ctrl_t *ctrl, idx_t nvtxs, idx_t *xadj, idx_t *adjncy,
             idx_t *vwgt, idx_t *cptr, idx_t *cind)
{
  idx_t i, ii, iii, j, jj, k, l, cnvtxs, cnedges;
  idx_t *cxadj, *cadjncy, *cvwgt, *mark, *map;
  ikv_t *keys;
  graph_t *graph=NULL;

  mark = ismalloc(nvtxs, -1, "CompressGraph: mark");
  map  = ismalloc(nvtxs, -1, "CompressGraph: map");
  keys = ikvmalloc(nvtxs, "CompressGraph: keys");

  /* Compute a key for each adjacency list */
  for (i=0; i<nvtxs; i++) {
    k = 0;
    for (j=xadj[i]; j<xadj[i+1]; j++)
      k += adjncy[j];
    keys[i].key = k+i;  /* Add the diagonal entry as well */
    keys[i].val = i;
  }

  ikvsorti(nvtxs, keys);

  l = cptr[0] = 0;
  for (cnvtxs=i=0; i<nvtxs; i++) {
    ii = keys[i].val;
    if (map[ii] == -1) {
      mark[ii] = i;  /* Add the diagonal entry */
      for (j=xadj[ii]; j<xadj[ii+1]; j++)
        mark[adjncy[j]] = i;

      map[ii]   = cnvtxs;
      cind[l++] = ii;

      for (j=i+1; j<nvtxs; j++) {
        iii = keys[j].val;

        if (keys[i].key != keys[j].key ||
            xadj[ii+1]-xadj[ii] != xadj[iii+1]-xadj[iii])
          break;  /* Break if keys or degrees are different */

        if (map[iii] == -1) { /* Do a comparison if iii has not been mapped */
          for (jj=xadj[iii]; jj<xadj[iii+1]; jj++) {
            if (mark[adjncy[jj]] != i)
              break;
          }

          if (jj == xadj[iii+1]) { /* Identical adjacency structure */
            map[iii]  = cnvtxs;
            cind[l++] = iii;
          }
        }
      }

      cptr[++cnvtxs] = l;
    }
  }

  IFSET(ctrl->dbglvl, METIS_DBG_INFO,
        printf("  Compression: reduction in # of vertices: %"PRIDX".\n", nvtxs-cnvtxs));

  if (cnvtxs < COMPRESSION_FRACTION*nvtxs) {
    /* Sufficient compression is possible, so go ahead and create the compressed graph */
    graph = CreateGraph();

    cnedges = 0;
    for (i=0; i<cnvtxs; i++) {
      ii = cind[cptr[i]];
      cnedges += xadj[ii+1]-xadj[ii];
    }

    /* Allocate memory for the compressed graph */
    cxadj   = graph->xadj   = imalloc(cnvtxs+1, "CompressGraph: xadj");
    cvwgt   = graph->vwgt   = ismalloc(cnvtxs, 0, "CompressGraph: vwgt");
    cadjncy = graph->adjncy = imalloc(cnedges, "CompressGraph: adjncy");
    graph->adjwgt = ismalloc(cnedges, 1, "CompressGraph: adjwgt");

    /* Now go and compress the graph */
    iset(nvtxs, -1, mark);
    l = cxadj[0] = 0;
    for (i=0; i<cnvtxs; i++) {
      mark[i] = i;  /* Remove any diagonal entries in the compressed graph */
      for (j=cptr[i]; j<cptr[i+1]; j++) {
        ii = cind[j];

        /* accumulate the vertex weights of the constituent vertices */
        cvwgt[i] += (vwgt == NULL ? 1 : vwgt[ii]);

        /* generate the combined adjacency list */
        for (jj=xadj[ii]; jj<xadj[ii+1]; jj++) {
          k = map[adjncy[jj]];
          if (mark[k] != i) {
            mark[k] = i;
            cadjncy[l++] = k;
          }
        }
      }
      cxadj[i+1] = l;
    }

    graph->nvtxs  = cnvtxs;
    graph->nedges = l;
    graph->ncon   = 1;

    SetupGraph_tvwgt(graph);
    SetupGraph_label(graph);
  }

  gk_free((void **)&keys, &map, &mark, LTERM);

  return graph;
}
/*************************************************************************
* This function converts a mesh into a dual graph
**************************************************************************/
int ParMETIS_V3_Mesh2Dual(idx_t *elmdist, idx_t *eptr, idx_t *eind,
        idx_t *numflag, idx_t *ncommon, idx_t **r_xadj, idx_t **r_adjncy,
        MPI_Comm *comm)
{
  idx_t i, j, jj, k, kk, m;
  idx_t npes, mype, pe, count, mask, pass;
  idx_t nelms, lnns, my_nns, node;
  idx_t firstelm, firstnode, lnode, nrecv, nsend;
  idx_t *scounts, *rcounts, *sdispl, *rdispl;
  idx_t *nodedist, *nmap, *auxarray;
  idx_t *gnptr, *gnind, *nptr, *nind, *myxadj=NULL, *myadjncy = NULL;
  idx_t *sbuffer, *rbuffer, *htable;
  ikv_t *nodelist, *recvbuffer;
  idx_t maxcount, *ind, *wgt;
  idx_t gmaxnode, gminnode;
  size_t curmem;

  gk_malloc_init();
  curmem = gk_GetCurMemoryUsed();

  /* Get basic comm info */
  gkMPI_Comm_size(*comm, &npes);
  gkMPI_Comm_rank(*comm, &mype);

  nelms = elmdist[mype+1]-elmdist[mype];

  if (*numflag > 0)
    ChangeNumberingMesh(elmdist, eptr, eind, NULL, NULL, NULL, npes, mype, 1);

  mask = (1<<11)-1;

  /*****************************/
  /* Determine number of nodes */
  /*****************************/
  gminnode = GlobalSEMinComm(*comm, imin(eptr[nelms], eind));
  for (i=0; i<eptr[nelms]; i++)
    eind[i] -= gminnode;

  gmaxnode = GlobalSEMaxComm(*comm, imax(eptr[nelms], eind));

  /**************************/
  /* Check for input errors */
  /**************************/
  ASSERT(nelms > 0);

  /* construct the node distribution array */
  nodedist = ismalloc(npes+1, 0, "nodedist");
  for (nodedist[0]=0, i=0, j=gmaxnode+1; i<npes; i++) {
    k = j/(npes-i);
    nodedist[i+1] = nodedist[i]+k;
    j -= k;
  }
  my_nns    = nodedist[mype+1]-nodedist[mype];
  firstnode = nodedist[mype];

  nodelist = ikvmalloc(eptr[nelms], "nodelist");
  auxarray = imalloc(eptr[nelms], "auxarray");
  htable   = ismalloc(gk_max(my_nns, mask+1), -1, "htable");
  scounts  = imalloc(npes, "scounts");
  rcounts  = imalloc(npes, "rcounts");
  sdispl   = imalloc(npes+1, "sdispl");
  rdispl   = imalloc(npes+1, "rdispl");

  /*********************************************/
  /* first find a local numbering of the nodes */
  /*********************************************/
  for (i=0; i<nelms; i++) {
    for (j=eptr[i]; j<eptr[i+1]; j++) {
      nodelist[j].key = eind[j];
      nodelist[j].val = j;
      auxarray[j]     = i;  /* remember the local element ID that uses this node */
    }
  }
  ikvsorti(eptr[nelms], nodelist);

  for (count=1, i=1; i<eptr[nelms]; i++) {
    if (nodelist[i].key > nodelist[i-1].key)
      count++;
  }
  lnns = count;
  nmap = imalloc(lnns, "nmap");

  /* renumber the nodes of the elements array */
  count   = 1;
  nmap[0] = nodelist[0].key;
  eind[nodelist[0].val] = 0;
  nodelist[0].val = auxarray[nodelist[0].val];  /* Store the local element ID */
  for (i=1; i<eptr[nelms]; i++) {
    if (nodelist[i].key > nodelist[i-1].key) {
      nmap[count] = nodelist[i].key;
      count++;
    }
    eind[nodelist[i].val] = count-1;
    nodelist[i].val = auxarray[nodelist[i].val];  /* Store the local element ID */
  }

  gkMPI_Barrier(*comm);

  /**********************************************************/
  /* perform comms necessary to construct node-element list */
  /**********************************************************/
  iset(npes, 0, scounts);
  for (pe=i=0; i<eptr[nelms]; i++) {
    while (nodelist[i].key >= nodedist[pe+1])
      pe++;
    scounts[pe] += 2;
  }
  ASSERT(pe < npes);
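  /* Route each (node, local element) pair to the processor that owns the
   * node under nodedist, so every processor can assemble the complete list
   * of elements incident on each of its nodes. */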
  gkMPI_Alltoall((void *)scounts, 1, IDX_T, (void *)rcounts, 1, IDX_T, *comm);

  icopy(npes, scounts, sdispl);
  MAKECSR(i, npes, sdispl);

  icopy(npes, rcounts, rdispl);
  MAKECSR(i, npes, rdispl);

  ASSERT(sdispl[npes] == eptr[nelms]*2);

  nrecv      = rdispl[npes]/2;
  recvbuffer = ikvmalloc(gk_max(1, nrecv), "recvbuffer");

  gkMPI_Alltoallv((void *)nodelist, scounts, sdispl, IDX_T, (void *)recvbuffer,
      rcounts, rdispl, IDX_T, *comm);

  /**************************************/
  /* construct global node-element list */
  /**************************************/
  gnptr = ismalloc(my_nns+1, 0, "gnptr");

  for (i=0; i<npes; i++) {
    for (j=rdispl[i]/2; j<rdispl[i+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      ASSERT(lnode >= 0 && lnode < my_nns);

      gnptr[lnode]++;
    }
  }
  MAKECSR(i, my_nns, gnptr);

  gnind = imalloc(gk_max(1, gnptr[my_nns]), "gnind");
  for (pe=0; pe<npes; pe++) {
    firstelm = elmdist[pe];
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      gnind[gnptr[lnode]++] = recvbuffer[j].val+firstelm;
    }
  }
  SHIFTCSR(i, my_nns, gnptr);

  /*********************************************************/
  /* send the node-element info to the relevant processors */
  /*********************************************************/
  iset(npes, 0, scounts);

  /* use a hash table to ensure that each node is sent to a proc only once */
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      if (htable[lnode] == -1) {
        scounts[pe]  += gnptr[lnode+1]-gnptr[lnode];
        htable[lnode] = 1;
      }
    }

    /* now reset the hash table */
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      htable[lnode] = -1;
    }
  }

  gkMPI_Alltoall((void *)scounts, 1, IDX_T, (void *)rcounts, 1, IDX_T, *comm);

  icopy(npes, scounts, sdispl);
  MAKECSR(i, npes, sdispl);

  /* create the send buffer */
  nsend   = sdispl[npes];
  sbuffer = imalloc(gk_max(1, nsend), "sbuffer");

  /* the first element of each node's list is negated (sent as -(id+1)) so
   * the receiver can detect where one node's element list ends and the
   * next one begins */
  count = 0;
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      if (htable[lnode] == -1) {
        for (k=gnptr[lnode]; k<gnptr[lnode+1]; k++) {
          if (k == gnptr[lnode])
            sbuffer[count++] = -1*(gnind[k]+1);
          else
            sbuffer[count++] = gnind[k];
        }
        htable[lnode] = 1;
      }
    }
    ASSERT(count == sdispl[pe+1]);

    /* now reset the hash table */
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      htable[lnode] = -1;
    }
  }

  icopy(npes, rcounts, rdispl);
  MAKECSR(i, npes, rdispl);

  nrecv   = rdispl[npes];
  rbuffer = imalloc(gk_max(1, nrecv), "rbuffer");

  gkMPI_Alltoallv((void *)sbuffer, scounts, sdispl, IDX_T, (void *)rbuffer,
      rcounts, rdispl, IDX_T, *comm);

  k = -1;
  nptr = ismalloc(lnns+1, 0, "nptr");
  nind = rbuffer;
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]; j<rdispl[pe+1]; j++) {
      if (nind[j] < 0) {  /* a negated entry starts a new node's element list */
        k++;
        nind[j] = (-1*nind[j])-1;
      }
      nptr[k]++;
    }
  }
  MAKECSR(i, lnns, nptr);

  ASSERT(k+1 == lnns);
  ASSERT(nptr[lnns] == nrecv);

  myxadj = *r_xadj = (idx_t *)malloc(sizeof(idx_t)*(nelms+1));
  if (myxadj == NULL)
    gk_errexit(SIGMEM, "Failed to allocate memory for the dual graph's xadj array.\n");
  iset(nelms+1, 0, myxadj);

  iset(mask+1, -1, htable);

  firstelm = elmdist[mype];

  /* Two passes -- in the first pass, simply find out the memory requirements */
  maxcount = 200;
  ind = imalloc(maxcount, "ParMETIS_V3_Mesh2Dual: ind");
  wgt = imalloc(maxcount, "ParMETIS_V3_Mesh2Dual: wgt");
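  /* Pass 0 only counts, for each element, how many neighbors share at least
   * *ncommon nodes with it (to size myadjncy); pass 1 fills in the adjacency
   * lists. ind[]/wgt[] accumulate, per candidate neighbor element, the number
   * of shared nodes, with htable[kk&mask] caching each candidate's slot so
   * repeated lookups stay O(1) barring hash collisions. */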
  for (pass=0; pass<2; pass++) {
    for (i=0; i<nelms; i++) {
      for (count=0, j=eptr[i]; j<eptr[i+1]; j++) {
        node = eind[j];

        for (k=nptr[node]; k<nptr[node+1]; k++) {
          if ((kk=nind[k]) == firstelm+i)
            continue;

          m = htable[(kk&mask)];
          if (m == -1) {
            ind[count] = kk;
            wgt[count] = 1;
            htable[(kk&mask)] = count++;
          }
          else {
            if (ind[m] == kk) {
              wgt[m]++;
            }
            else {
              for (jj=0; jj<count; jj++) {
                if (ind[jj] == kk) {
                  wgt[jj]++;
                  break;
                }
              }
              if (jj == count) {
                ind[count]   = kk;
                wgt[count++] = 1;
              }
            }
          }

          /* Adjust the memory. This will be replaced by an idxrealloc()
             when GKlib is incorporated */
          if (count == maxcount-1) {
            maxcount *= 2;
            ind = irealloc(ind, maxcount, "ParMETIS_V3_Mesh2Dual: ind");
            wgt = irealloc(wgt, maxcount, "ParMETIS_V3_Mesh2Dual: wgt");
          }
        }
      }

      for (j=0; j<count; j++) {
        htable[(ind[j]&mask)] = -1;

        if (wgt[j] >= *ncommon) {
          if (pass == 0)
            myxadj[i]++;
          else
            myadjncy[myxadj[i]++] = ind[j];
        }
      }
    }

    if (pass == 0) {
      MAKECSR(i, nelms, myxadj);
      myadjncy = *r_adjncy = (idx_t *)malloc(sizeof(idx_t)*myxadj[nelms]);
      if (myadjncy == NULL)
        gk_errexit(SIGMEM, "Failed to allocate memory for the dual graph's adjncy array.\n");
    }
    else {
      SHIFTCSR(i, nelms, myxadj);
    }
  }

  /*****************************************/
  /* correctly renumber the elements array */
  /*****************************************/
  for (i=0; i<eptr[nelms]; i++)
    eind[i] = nmap[eind[i]] + gminnode;

  if (*numflag == 1)
    ChangeNumberingMesh(elmdist, eptr, eind, myxadj, myadjncy, NULL, npes, mype, 0);

  gk_free((void **)&nodedist, &nodelist, &auxarray, &htable, &scounts, &rcounts,
      &sdispl, &rdispl, &nmap, &recvbuffer, &gnptr, &gnind, &sbuffer, &rbuffer,
      &nptr, &ind, &wgt, LTERM);

  if (gk_GetCurMemoryUsed() - curmem > 0) {
    printf("ParMETIS appears to have a memory leak of %zd bytes. Report this.\n",
        (ssize_t)(gk_GetCurMemoryUsed() - curmem));
  }
  gk_malloc_cleanup(0);

  return METIS_OK;
}
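/*************************************************************************
* Illustrative usage sketch (not part of the library). It assumes elmdist,
* eptr, and eind have already been distributed per the ParMETIS conventions
* and that idx_t matches the caller's integer width.
**************************************************************************/
/*
    MPI_Comm comm = MPI_COMM_WORLD;
    idx_t numflag = 0;      // C-style numbering
    idx_t ncommon = 2;      // elements sharing >= 2 nodes become adjacent
    idx_t *xadj, *adjncy;

    ParMETIS_V3_Mesh2Dual(elmdist, eptr, eind, &numflag, &ncommon,
                          &xadj, &adjncy, &comm);

    // ... use the dual graph ...

    free(xadj);   // both arrays are allocated with malloc() above,
    free(adjncy); // so they are released with free()
*/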