/*************************************************************************
* This function creates the dual graph (element connectivity graph) of a
* mesh: two elements become adjacent when they share at least ncommon nodes.
**************************************************************************/
void CreateGraphDual(idx_t ne, idx_t nn, idx_t *eptr, idx_t *eind,
         idx_t ncommon, idx_t **r_xadj, idx_t **r_adjncy)
{
  idx_t i, j, nnbrs;
  idx_t *nptr, *nind;
  idx_t *xadj, *adjncy;
  idx_t *marker, *nbrs;

  if (ncommon < 1) {
    printf("  Increased ncommon to 1, as it was initially %"PRIDX"\n", ncommon);
    ncommon = 1;
  }

  /* construct the node-element list first */
  nptr = ismalloc(nn+1, 0, "CreateGraphDual: nptr");
  nind = imalloc(eptr[ne], "CreateGraphDual: nind");

  for (i=0; i<ne; i++) {
    for (j=eptr[i]; j<eptr[i+1]; j++)
      nptr[eind[j]]++;
  }
  MAKECSR(i, nn, nptr);

  for (i=0; i<ne; i++) {
    for (j=eptr[i]; j<eptr[i+1]; j++)
      nind[nptr[eind[j]]++] = i;
  }
  SHIFTCSR(i, nn, nptr);

  /* Allocate memory for xadj, since you know its size.
     These are done using standard malloc as they are returned
     to the calling function */
  if ((xadj = (idx_t *)malloc((ne+1)*sizeof(idx_t))) == NULL)
    gk_errexit(SIGMEM, "***Failed to allocate memory for xadj.\n");
  *r_xadj = xadj;
  iset(ne+1, 0, xadj);

  /* allocate memory for working arrays used by FindCommonElements */
  marker = ismalloc(ne, 0, "CreateGraphDual: marker");
  nbrs   = imalloc(ne, "CreateGraphDual: nbrs");

  for (i=0; i<ne; i++) {
    xadj[i] = FindCommonElements(i, eptr[i+1]-eptr[i], eind+eptr[i], nptr,
                  nind, eptr, ncommon, marker, nbrs);
  }
  MAKECSR(i, ne, xadj);

  /* Allocate memory for adjncy, since you now know its size.
     These are done using standard malloc as they are returned
     to the calling function */
  if ((adjncy = (idx_t *)malloc(xadj[ne]*sizeof(idx_t))) == NULL) {
    free(xadj);
    *r_xadj = NULL;
    gk_errexit(SIGMEM, "***Failed to allocate memory for adjncy.\n");
  }
  *r_adjncy = adjncy;

  for (i=0; i<ne; i++) {
    nnbrs = FindCommonElements(i, eptr[i+1]-eptr[i], eind+eptr[i], nptr,
                nind, eptr, ncommon, marker, nbrs);
    for (j=0; j<nnbrs; j++)
      adjncy[xadj[i]++] = nbrs[j];
  }
  SHIFTCSR(i, ne, xadj);

  gk_free((void **)&nptr, &nind, &marker, &nbrs, LTERM);
}
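/* Usage sketch (illustrative, not part of the library): builds the dual
 * graph of a two-triangle mesh that shares one edge. Assumes idx_t and
 * PRIDX come from the METIS headers. With ncommon=2, two triangles are
 * adjacent when they share an edge, so each element should report the
 * other as its single neighbor. The returned arrays are allocated with
 * standard malloc(), as noted above, so the caller frees them with free(). */
static void CreateGraphDual_example(void)
{
  /* element 0 = {0,1,2}, element 1 = {1,3,2}, in CSR (eptr/eind) form */
  idx_t eptr[] = {0, 3, 6};
  idx_t eind[] = {0, 1, 2,  1, 3, 2};
  idx_t *xadj = NULL, *adjncy = NULL;

  CreateGraphDual(2, 4, eptr, eind, 2, &xadj, &adjncy);

  printf("element 0 has %"PRIDX" neighbor(s)\n", xadj[1]-xadj[0]);

  free(xadj);
  free(adjncy);
}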
/*************************************************************************
* This function is the entry point of the initial balancing algorithm.
* This algorithm assembles the graph on all the processors and proceeds
* with the balancing step.
**************************************************************************/
void Balance_Partition(ctrl_t *ctrl, graph_t *graph)
{
  idx_t i, j, nvtxs, nedges, ncon;
  idx_t mype, npes, srnpes, srmype;
  idx_t *vtxdist, *xadj, *adjncy, *adjwgt, *vwgt, *vsize;
  idx_t *part, *lwhere, *home;
  idx_t lnparts, fpart, fpe, lnpes, ngroups;
  idx_t *rcounts, *rdispls;
  idx_t twoparts=2, moptions[METIS_NOPTIONS], edgecut, max_cut;
  idx_t sr_pe, gd_pe, sr, gd, who_wins;
  real_t my_cut, my_totalv, my_cost = -1.0, my_balance = -1.0, wsum;
  real_t rating, max_rating, your_cost = -1.0, your_balance = -1.0;
  real_t lbsum, min_lbsum, *lbvec, *tpwgts, *tpwgts2, buffer[2];
  graph_t *agraph, cgraph;
  ctrl_t *myctrl;
  MPI_Status status;
  MPI_Comm ipcomm, srcomm;
  struct {
    double cost;
    int rank;
  } lpecost, gpecost;

  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->InitPartTmr));

  WCOREPUSH;

  vtxdist = graph->vtxdist;
  agraph  = AssembleAdaptiveGraph(ctrl, graph);
  nvtxs   = cgraph.nvtxs  = agraph->nvtxs;
  nedges  = cgraph.nedges = agraph->nedges;
  ncon    = cgraph.ncon   = agraph->ncon;

  xadj   = cgraph.xadj   = icopy(nvtxs+1, agraph->xadj, iwspacemalloc(ctrl, nvtxs+1));
  vwgt   = cgraph.vwgt   = icopy(nvtxs*ncon, agraph->vwgt, iwspacemalloc(ctrl, nvtxs*ncon));
  vsize  = cgraph.vsize  = icopy(nvtxs, agraph->vsize, iwspacemalloc(ctrl, nvtxs));
  adjncy = cgraph.adjncy = icopy(nedges, agraph->adjncy, iwspacemalloc(ctrl, nedges));
  adjwgt = cgraph.adjwgt = icopy(nedges, agraph->adjwgt, iwspacemalloc(ctrl, nedges));
  part   = cgraph.where  = agraph->where = iwspacemalloc(ctrl, nvtxs);

  lwhere = iwspacemalloc(ctrl, nvtxs);
  home   = iwspacemalloc(ctrl, nvtxs);
  lbvec  = rwspacemalloc(ctrl, graph->ncon);

  /****************************************/
  /****************************************/
  if (ctrl->ps_relation == PARMETIS_PSR_UNCOUPLED) {
    WCOREPUSH;
    rcounts = iwspacemalloc(ctrl, ctrl->npes);
    rdispls = iwspacemalloc(ctrl, ctrl->npes+1);

    for (i=0; i<ctrl->npes; i++)
      rdispls[i] = rcounts[i] = vtxdist[i+1]-vtxdist[i];
    MAKECSR(i, ctrl->npes, rdispls);

    gkMPI_Allgatherv((void *)graph->home, graph->nvtxs, IDX_T,
        (void *)part, rcounts, rdispls, IDX_T, ctrl->comm);

    for (i=0; i<agraph->nvtxs; i++)
      home[i] = part[i];

    WCOREPOP;  /* local frees */
  }
  else {
    for (i=0; i<ctrl->npes; i++) {
      for (j=vtxdist[i]; j<vtxdist[i+1]; j++)
        part[j] = home[j] = i;
    }
  }

  /* Ensure that the initial partitioning is legal */
  for (i=0; i<agraph->nvtxs; i++) {
    if (part[i] >= ctrl->nparts)
      part[i] = home[i] = part[i] % ctrl->nparts;
    if (part[i] < 0)
      part[i] = home[i] = (-1*part[i]) % ctrl->nparts;
  }

  /****************************************/
  /****************************************/
  IFSET(ctrl->dbglvl, DBG_REFINEINFO,
      ComputeSerialBalance(ctrl, agraph, agraph->where, lbvec));
  IFSET(ctrl->dbglvl, DBG_REFINEINFO,
      rprintf(ctrl, "input cut: %"PRIDX", balance: ", ComputeSerialEdgeCut(agraph)));
  for (i=0; i<agraph->ncon; i++)
    IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "%.3"PRREAL" ", lbvec[i]));
  IFSET(ctrl->dbglvl, DBG_REFINEINFO, rprintf(ctrl, "\n"));

  /****************************************/
  /* Split the processors into two groups */
  /****************************************/
  sr = (ctrl->mype % 2 == 0) ? 1 : 0;
  gd = (ctrl->mype % 2 == 1) ? 1 : 0;

  if (graph->ncon > MAX_NCON_FOR_DIFFUSION || ctrl->npes == 1) {
    sr = 1;
    gd = 0;
  }

  sr_pe = 0;
  gd_pe = 1;

  gkMPI_Comm_split(ctrl->gcomm, sr, 0, &ipcomm);
  gkMPI_Comm_rank(ipcomm, &mype);
  gkMPI_Comm_size(ipcomm, &npes);

  if (sr == 1) { /* Half of the processors do scratch-remap */
    ngroups = gk_max(gk_min(RIP_SPLIT_FACTOR, npes), 1);

    gkMPI_Comm_split(ipcomm, mype % ngroups, 0, &srcomm);
    gkMPI_Comm_rank(srcomm, &srmype);
    gkMPI_Comm_size(srcomm, &srnpes);

    METIS_SetDefaultOptions(moptions);
    moptions[METIS_OPTION_SEED] = ctrl->sync + (mype % ngroups) + 1;

    tpwgts  = ctrl->tpwgts;
    tpwgts2 = rwspacemalloc(ctrl, 2*ncon);

    iset(nvtxs, 0, lwhere);
    lnparts = ctrl->nparts;
    fpart = fpe = 0;
    lnpes = srnpes;
    while (lnpes > 1 && lnparts > 1) {
      PASSERT(ctrl, agraph->nvtxs > 1);

      /* determine the weights of the two partitions as a function of the
         weight of the target partition weights */
      for (j=(lnparts>>1), i=0; i<ncon; i++) {
        tpwgts2[i]      = rsum(j, tpwgts+fpart*ncon+i, ncon);
        tpwgts2[ncon+i] = rsum(lnparts-j, tpwgts+(fpart+j)*ncon+i, ncon);
        wsum            = 1.0/(tpwgts2[i] + tpwgts2[ncon+i]);
        tpwgts2[i]      *= wsum;
        tpwgts2[ncon+i] *= wsum;
      }

      METIS_PartGraphRecursive(&agraph->nvtxs, &ncon, agraph->xadj,
            agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt,
            &twoparts, tpwgts2, NULL, moptions, &edgecut, part);

      /* pick one of the branches */
      if (srmype < fpe+lnpes/2) {
        KeepPart(ctrl, agraph, part, 0);
        lnpes   = lnpes/2;
        lnparts = lnparts/2;
      }
      else {
        KeepPart(ctrl, agraph, part, 1);
        fpart   = fpart + lnparts/2;
        fpe     = fpe + lnpes/2;
        lnpes   = lnpes - lnpes/2;
        lnparts = lnparts - lnparts/2;
      }
    }

    if (lnparts == 1) { /* Case in which srnpes is greater or equal to nparts */
      /* Only the first process will assign labels (for the reduction to work) */
      if (srmype == fpe) {
        for (i=0; i<agraph->nvtxs; i++)
          lwhere[agraph->label[i]] = fpart;
      }
    }
    else { /* Case in which srnpes is smaller than nparts */
      /* create the normalized tpwgts for the lnparts from ctrl->tpwgts */
      tpwgts = rwspacemalloc(ctrl, lnparts*ncon);
      for (j=0; j<ncon; j++) {
        for (wsum=0.0, i=0; i<lnparts; i++) {
          tpwgts[i*ncon+j] = ctrl->tpwgts[(fpart+i)*ncon+j];
          wsum += tpwgts[i*ncon+j];
        }
        for (wsum=1.0/wsum, i=0; i<lnparts; i++)
          tpwgts[i*ncon+j] *= wsum;
      }

      METIS_PartGraphKway(&agraph->nvtxs, &ncon, agraph->xadj,
            agraph->adjncy, agraph->vwgt, NULL, agraph->adjwgt,
            &lnparts, tpwgts, NULL, moptions, &edgecut, part);

      for (i=0; i<agraph->nvtxs; i++)
        lwhere[agraph->label[i]] = fpart + part[i];
    }

    gkMPI_Allreduce((void *)lwhere, (void *)part, nvtxs, IDX_T, MPI_SUM, srcomm);

    edgecut = ComputeSerialEdgeCut(&cgraph);
    ComputeSerialBalance(ctrl, &cgraph, part, lbvec);
    lbsum = rsum(ncon, lbvec, 1);
    gkMPI_Allreduce((void *)&edgecut, (void *)&max_cut, 1, IDX_T, MPI_MAX, ipcomm);
    gkMPI_Allreduce((void *)&lbsum, (void *)&min_lbsum, 1, REAL_T, MPI_MIN, ipcomm);
    lpecost.rank = ctrl->mype;
    lpecost.cost = lbsum;
    if (min_lbsum < UNBALANCE_FRACTION * (real_t)(ncon)) {
      if (lbsum < UNBALANCE_FRACTION * (real_t)(ncon))
        lpecost.cost = (double)edgecut;
      else
        lpecost.cost = (double)max_cut + lbsum;
    }
    gkMPI_Allreduce((void *)&lpecost, (void *)&gpecost, 1, MPI_DOUBLE_INT,
        MPI_MINLOC, ipcomm);

    if (ctrl->mype == gpecost.rank && ctrl->mype != sr_pe)
      gkMPI_Send((void *)part, nvtxs, IDX_T, sr_pe, 1, ctrl->comm);

    if (ctrl->mype != gpecost.rank && ctrl->mype == sr_pe)
      gkMPI_Recv((void *)part, nvtxs, IDX_T, gpecost.rank, 1, ctrl->comm, &status);

    if (ctrl->mype == sr_pe) {
      icopy(nvtxs, part, lwhere);
      SerialRemap(ctrl, &cgraph, ctrl->nparts, home, lwhere, part, ctrl->tpwgts);
    }

    gkMPI_Comm_free(&srcomm);
  }
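/* Standalone sketch (hypothetical helper, not library code) of the
 * target-weight splitting performed inside the recursive-bisection loop
 * above: for each constraint, sum the target weights of the parts going
 * to each branch and normalize the two sums so they add up to 1. This is
 * the same computation the strided rsum() calls express more tersely. */
static void split_tpwgts_example(int ncon, int lnparts, int fpart,
                                 const double *tpwgts, double *tpwgts2)
{
  int i, p, j = lnparts/2;  /* number of parts assigned to branch 0 */

  for (i=0; i<ncon; i++) {
    double left = 0.0, right = 0.0;

    for (p=fpart; p<fpart+j; p++)
      left += tpwgts[p*ncon+i];
    for (p=fpart+j; p<fpart+lnparts; p++)
      right += tpwgts[p*ncon+i];

    /* normalize per constraint, as the wsum step does above */
    tpwgts2[i]      = left/(left+right);
    tpwgts2[ncon+i] = right/(left+right);
  }
}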
/*****************************************************************************
* This function creates the nodal graph of a finite element mesh
******************************************************************************/
void HEXNODALMETIS(int nelmnts, int nvtxs, idxtype *elmnts, idxtype *dxadj,
                   idxtype *dadjncy)
{
  int i, j, jj, k, kk, /*kkk, l, m, n,*/ nedges;
  idxtype *nptr, *nind;
  idxtype *mark;
  int table[8][3] = {{1, 3, 4},
                     {0, 2, 5},
                     {1, 3, 6},
                     {0, 2, 7},
                     {0, 5, 7},
                     {1, 4, 6},
                     {2, 5, 7},
                     {3, 4, 6}};

  /* Construct the node-element list first */
  nptr = idxsmalloc(nvtxs+1, 0, "HEXNODALMETIS: nptr");
  for (j=8*nelmnts, i=0; i<j; i++)
    nptr[elmnts[i]]++;
  MAKECSR(i, nvtxs, nptr);

  nind = idxmalloc(nptr[nvtxs], "HEXNODALMETIS: nind");
  for (k=i=0; i<nelmnts; i++) {
    for (j=0; j<8; j++, k++)
      nind[nptr[elmnts[k]]++] = i;
  }

  for (i=nvtxs; i>0; i--)
    nptr[i] = nptr[i-1];
  nptr[0] = 0;

  mark = idxsmalloc(nvtxs, -1, "HEXNODALMETIS: mark");

  nedges = dxadj[0] = 0;
  for (i=0; i<nvtxs; i++) {
    mark[i] = i;
    for (j=nptr[i]; j<nptr[i+1]; j++) {
      jj = 8*nind[j];
      for (k=0; k<8; k++) {
        if (elmnts[jj+k] == i)
          break;
      }
      ASSERT(k != 8);

      /* You found the index, now go and put the 3 neighbors */
      kk = elmnts[jj+table[k][0]];
      if (mark[kk] != i) {
        mark[kk] = i;
        dadjncy[nedges++] = kk;
      }
      kk = elmnts[jj+table[k][1]];
      if (mark[kk] != i) {
        mark[kk] = i;
        dadjncy[nedges++] = kk;
      }
      kk = elmnts[jj+table[k][2]];
      if (mark[kk] != i) {
        mark[kk] = i;
        dadjncy[nedges++] = kk;
      }
    }
    dxadj[i+1] = nedges;
  }

  free(mark);
  free(nptr);
  free(nind);
}
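/* Usage sketch (hypothetical driver, not library code): nodal graph of a
 * single hexahedron with the corner numbering that table[][] encodes
 * (each corner is connected to its 3 edge-adjacent corners). dxadj needs
 * nvtxs+1 entries and dadjncy needs room for the worst case, here
 * 8 nodes x 3 neighbors = 24 entries. */
static void HEXNODALMETIS_example(void)
{
  idxtype elmnts[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  idxtype dxadj[9], dadjncy[24];

  HEXNODALMETIS(1, 8, elmnts, dxadj, dadjncy);
  /* every corner of the cube ends up with exactly 3 neighbors:
     dxadj[i+1]-dxadj[i] == 3 for all i */
}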
/*****************************************************************************
* This function creates the dual of a finite element mesh
******************************************************************************/
void GENDUALMETIS(int nelmnts, int nvtxs, int etype, idxtype *elmnts,
                  idxtype *dxadj, idxtype *dadjncy)
{
  int i, j, jj, k, kk, kkk, l, m, n, /*nedges,*/ mask;
  idxtype *nptr, *nind;
  idxtype *mark, ind[200], wgt[200];
  int esize, esizes[] = {-1, 3, 4, 8, 4},
      mgcnum, mgcnums[] = {-1, 2, 3, 4, 2};

  mask = (1<<11)-1;
  mark = idxsmalloc(mask+1, -1, "GENDUALMETIS: mark");

  /* Get the element size and magic number for the particular element */
  esize  = esizes[etype];
  mgcnum = mgcnums[etype];

  /* Construct the node-element list first */
  nptr = idxsmalloc(nvtxs+1, 0, "GENDUALMETIS: nptr");
  for (j=esize*nelmnts, i=0; i<j; i++)
    nptr[elmnts[i]]++;
  MAKECSR(i, nvtxs, nptr);

  nind = idxmalloc(nptr[nvtxs], "GENDUALMETIS: nind");
  for (k=i=0; i<nelmnts; i++) {
    for (j=0; j<esize; j++, k++)
      nind[nptr[elmnts[k]]++] = i;
  }

  for (i=nvtxs; i>0; i--)
    nptr[i] = nptr[i-1];
  nptr[0] = 0;

  for (i=0; i<nelmnts; i++)
    dxadj[i] = esize*i;

  for (i=0; i<nelmnts; i++) {
    for (m=j=0; j<esize; j++) {
      n = elmnts[esize*i+j];
      for (k=nptr[n+1]-1; k>=nptr[n]; k--) {
        if ((kk = nind[k]) <= i)
          break;

        kkk = kk&mask;
        if ((l = mark[kkk]) == -1) {
          ind[m] = kk;
          wgt[m] = 1;
          mark[kkk] = m++;
        }
        else if (ind[l] == kk) {
          wgt[l]++;
        }
        else {
          for (jj=0; jj<m; jj++) {
            if (ind[jj] == kk) {
              wgt[jj]++;
              break;
            }
          }
          if (jj == m) {
            ind[m] = kk;
            wgt[m++] = 1;
          }
        }
      }
    }
    for (j=0; j<m; j++) {
      if (wgt[j] == mgcnum) {
        k = ind[j];
        dadjncy[dxadj[i]++] = k;
        dadjncy[dxadj[k]++] = i;
      }
      mark[ind[j]&mask] = -1;
    }
  }

  /* Go and consolidate the dxadj and dadjncy */
  for (j=i=0; i<nelmnts; i++) {
    for (k=esize*i; k<dxadj[i]; k++, j++)
      dadjncy[j] = dadjncy[k];
    dxadj[i] = j;
  }

  for (i=nelmnts; i>0; i--)
    dxadj[i] = dxadj[i-1];
  dxadj[0] = 0;

  free(mark);
  free(nptr);
  free(nind);
}
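/* Usage sketch (hypothetical driver, not library code): dual graph of two
 * tetrahedra sharing the face {1,2,3}. etype=2 selects esize=4 nodes per
 * element and mgcnum=3, i.e. two tets are adjacent when they share 3
 * nodes. dadjncy must hold at least esize*nelmnts entries, because the
 * neighbor lists are first scattered at stride esize and consolidated
 * afterwards. */
static void GENDUALMETIS_example(void)
{
  idxtype elmnts[] = {0, 1, 2, 3,  1, 2, 3, 4};
  idxtype dxadj[3], dadjncy[8];

  GENDUALMETIS(2, 5, 2, elmnts, dxadj, dadjncy);
  /* expect dxadj = {0, 1, 2}: each tet lists the other as its neighbor */
}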
/*****************************************************************************
* This function creates a copy of the graph in which self-loops and
* duplicate edges have been removed and each edge is stored symmetrically
******************************************************************************/
graph_t *FixGraph(graph_t *graph)
{
  idx_t i, j, k, l, nvtxs, nedges;
  idx_t *xadj, *adjncy, *adjwgt;
  idx_t *nxadj, *nadjncy, *nadjwgt;
  graph_t *ngraph;
  uvw_t *edges;

  nvtxs  = graph->nvtxs;
  xadj   = graph->xadj;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  ASSERT(adjwgt != NULL);

  ngraph = CreateGraph();

  ngraph->nvtxs = nvtxs;

  /* deal with vertex weights/sizes */
  ngraph->ncon = graph->ncon;
  ngraph->vwgt = icopy(nvtxs*graph->ncon, graph->vwgt,
                     imalloc(nvtxs*graph->ncon, "FixGraph: vwgt"));

  ngraph->vsize = ismalloc(nvtxs, 1, "FixGraph: vsize");
  if (graph->vsize)
    icopy(nvtxs, graph->vsize, ngraph->vsize);

  /* fix graph by sorting the "superset" of edges */
  edges = (uvw_t *)gk_malloc(sizeof(uvw_t)*2*xadj[nvtxs], "FixGraph: edges");

  for (nedges=0, i=0; i<nvtxs; i++) {
    for (j=xadj[i]; j<xadj[i+1]; j++) {
      /* keep only the upper-triangular part of the adjacency matrix */
      if (i < adjncy[j]) {
        edges[nedges].u = i;
        edges[nedges].v = adjncy[j];
        edges[nedges].w = adjwgt[j];
        nedges++;
      }
      else if (i > adjncy[j]) {
        edges[nedges].u = adjncy[j];
        edges[nedges].v = i;
        edges[nedges].w = adjwgt[j];
        nedges++;
      }
    }
  }

  uvwsorti(nedges, edges);

  /* keep the unique subset */
  for (k=0, i=1; i<nedges; i++) {
    if (edges[k].v != edges[i].v || edges[k].u != edges[i].u)
      edges[++k] = edges[i];
  }
  nedges = (nedges > 0 ? k+1 : 0);  /* guard the edgeless-graph case */

  /* allocate memory for the fixed graph */
  nxadj   = ngraph->xadj   = ismalloc(nvtxs+1, 0, "FixGraph: nxadj");
  nadjncy = ngraph->adjncy = imalloc(2*nedges, "FixGraph: nadjncy");
  nadjwgt = ngraph->adjwgt = imalloc(2*nedges, "FixGraph: nadjwgt");

  /* create the adjacency list of the fixed graph from the upper-triangular
     part of the adjacency matrix */
  for (k=0; k<nedges; k++) {
    nxadj[edges[k].u]++;
    nxadj[edges[k].v]++;
  }
  MAKECSR(i, nvtxs, nxadj);

  for (k=0; k<nedges; k++) {
    nadjncy[nxadj[edges[k].u]] = edges[k].v;
    nadjncy[nxadj[edges[k].v]] = edges[k].u;
    nadjwgt[nxadj[edges[k].u]] = edges[k].w;
    nadjwgt[nxadj[edges[k].v]] = edges[k].w;
    nxadj[edges[k].u]++;
    nxadj[edges[k].v]++;
  }
  SHIFTCSR(i, nvtxs, nxadj);

  gk_free((void **)&edges, LTERM);

  return ngraph;
}
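/* Standalone sketch (assumed types, not library code) of the sort-and-
 * compact step that FixGraph performs above: store each undirected edge
 * once as a (min,max) pair, sort, then keep the first element of every
 * run of identical (u,v) pairs. qsort() from <stdlib.h> stands in for
 * the library's uvwsorti(). */
#include <stdlib.h>

typedef struct { int u, v, w; } edge_example_t;

static int edge_example_cmp(const void *a, const void *b)
{
  const edge_example_t *x = a, *y = b;
  if (x->u != y->u)
    return (x->u < y->u ? -1 : 1);
  if (x->v != y->v)
    return (x->v < y->v ? -1 : 1);
  return 0;
}

static int dedup_edges_example(int nedges, edge_example_t *edges)
{
  int i, k;

  if (nedges == 0)
    return 0;

  qsort(edges, nedges, sizeof(edge_example_t), edge_example_cmp);

  /* compact in place: edges[0..k] holds the unique prefix */
  for (k=0, i=1; i<nedges; i++) {
    if (edges[k].u != edges[i].u || edges[k].v != edges[i].v)
      edges[++k] = edges[i];
  }
  return k+1;  /* number of unique edges */
}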