static vl::Error axpy(vl::Context & context, ptrdiff_t n, type alpha, type const *x, ptrdiff_t incx, type *y, ptrdiff_t incy) { saxpy(&n, &alpha, (float*)x, &incx, (float*)y, &incy) ; return vl::vlSuccess ; }
/*************************************************************************
* Computes the initial internal/external degrees (id/ed) of every vertex
* of a 2-way partitioned multi-constraint graph, and from them the
* per-side weights, the boundary list and the cut.
*
* Reads  : graph->where, xadj, adjncy, adjwgt, nvwgt
* Writes : graph->npwgts, id, ed, bndptr, bndind, mincut, nbnd
*
* ctrl is not used by this routine.
**************************************************************************/
void MocCompute2WayPartitionParams(CtrlType *ctrl, GraphType *graph)
{
  int i, edge, side, nvtxs, ncon, nbnd, cut;
  idxtype *xadj, *adjncy, *adjwgt;
  float *nvwgt, *npwgts;
  idxtype *id, *ed, *where;
  idxtype *bndptr, *bndind;

  /* Graph description */
  nvtxs  = graph->nvtxs;
  ncon   = graph->ncon;
  xadj   = graph->xadj;
  nvwgt  = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where  = graph->where;

  /* Output arrays, reset before accumulation */
  npwgts = sset(2*ncon, 0.0, graph->npwgts);
  id     = idxset(nvtxs, 0, graph->id);
  ed     = idxset(nvtxs, 0, graph->ed);
  bndptr = idxset(nvtxs, -1, graph->bndptr);
  bndind = graph->bndind;

  /*------------------------------------------------------------
  / Accumulate side weights and id/ed degrees vertex by vertex
  /------------------------------------------------------------*/
  nbnd = 0;
  cut  = 0;
  for (i=0; i<nvtxs; i++) {
    ASSERT(where[i] >= 0 && where[i] <= 1);
    side = where[i];

    /* Add the ncon weights of vertex i to the weights of its side */
    saxpy(ncon, 1.0, nvwgt+i*ncon, 1, npwgts+side*ncon, 1);

    /* Edges to the same side count as internal, the others as external */
    for (edge=xadj[i]; edge<xadj[i+1]; edge++) {
      if (where[adjncy[edge]] == side)
        id[i] += adjwgt[edge];
      else
        ed[i] += adjwgt[edge];
    }

    /* Only vertices with external degree, or with no edges at all,
       are placed on the boundary list */
    if (ed[i] <= 0 && xadj[i] != xadj[i+1])
      continue;

    cut += ed[i];
    bndptr[i] = nbnd;
    bndind[nbnd] = i;
    nbnd++;
  }

  /* Every cut edge was counted from both of its endpoints */
  graph->mincut = cut/2;
  graph->nbnd = nbnd;
}
//! @{ virtual void interpolate_initial(shared_ptr<ISweeper<time>> dst, shared_ptr<const ISweeper<time>> src) override { auto& fine = as_encap_sweeper(dst); auto& crse = as_encap_sweeper(src); auto crse_factory = crse.get_factory(); auto fine_factory = fine.get_factory(); auto crse_delta = crse_factory->create(solution); this->restrict(crse_delta, fine.get_start_state()); crse_delta->saxpy(-1.0, crse.get_start_state()); auto fine_delta = fine_factory->create(solution); this->interpolate(fine_delta, crse_delta); fine.get_start_state()->saxpy(-1.0, fine_delta); fine.reevaluate(true); }
/**
 * Runs one Saxpy functor per hardware thread and prints each result.
 *
 * Thread i is given a reference to its own result slot S[i], the scalars
 * a and b, and the inputs X[i] = i and Y[i] = 2 * (i + 1). After a thread
 * has been joined its result slot is printed on its own line.
 */
int main()
{
  // hardware_concurrency() may report 0 when the value cannot be determined;
  // fall back to a single worker so the demo still runs.
  uint32_t num_cpus = std::thread::hardware_concurrency();
  if (num_cpus == 0) {
    num_cpus = 1;
  }

  std::vector<std::thread> threads(num_cpus);

  // Vectors own the buffers, so nothing has to be released by hand. They are
  // never resized below, so references handed to the workers stay valid.
  std::vector<float> S(num_cpus);
  std::vector<float> X(num_cpus);
  std::vector<float> Y(num_cpus);
  float a = 1.0f;
  float b = 2.0f;

  for (uint32_t i = 0; i < num_cpus; i++) {
    X[i] = 1.0f * i;
    Y[i] = 2.0f * (i + 1);
    Saxpy saxpy(std::ref(S[i]), a, X[i], b, Y[i]);
    threads[i] = std::thread(saxpy);
  }

  for (uint32_t i = 0; i < num_cpus; i++) {
    threads[i].join();
    std::cout << S[i] << std::endl;
  }

  return 0;
}
/*
 * Self-check for saxpy(x, y, z, a, N): fills three VECTOR_SIZE-aligned
 * buffers, runs saxpy `iters` times and compares every z[i] against
 * a * x[i] + y[i].
 *
 * Prints "SUCCESS!" and returns 0 when every element matches; prints
 * "Error" and returns 1 on the first mismatch. Returns 1 as well when the
 * aligned allocation fails. All buffers are released before returning.
 */
int main (int argc, char * argv[])
{
  const int N = 16;
  const int iters = 1;
  float *x = NULL, *y = NULL, *z = NULL;
  float a = 0.93f;
  int i;
  int status = 0;

  /* posix_memalign reports failure through its return value; the pointer is
     reset so that the cleanup below never frees an unspecified value. */
  if (posix_memalign((void **)&x, VECTOR_SIZE, N*sizeof(float)) != 0)
    x = NULL;
  if (posix_memalign((void **)&y, VECTOR_SIZE, N*sizeof(float)) != 0)
    y = NULL;
  if (posix_memalign((void **)&z, VECTOR_SIZE, N*sizeof(float)) != 0)
    z = NULL;

  if (x == NULL || y == NULL || z == NULL) {
    fprintf(stderr, "Error: aligned allocation failed\n");
    free(x);
    free(y);
    free(z);
    return 1;
  }

  for (i=0; i<N; i++) {
    x[i] = i+1;
    y[i] = i-1;
    z[i] = 0.0f;
  }

  for (i=0; i<iters; i++) {
    saxpy(x, y, z, a, N);
  }

  /* Exact comparison against the reference expression */
  for (i=0; i<N; i++) {
    if (z[i] != (a * x[i] + y[i])) {
      printf("Error\n");
      status = 1;
      break;
    }
  }

  if (status == 0)
    printf("SUCCESS!\n");

  free(x);
  free(y);
  free(z);
  return status;
}
/*
 * Checks saxpy(N, a, x, y) with a = 5, x[i] = i and y[i] = i + 2.
 * After waiting on outstanding OpenMP tasks, every y[i] must equal
 * a*i + (i+2). Returns 0 on success, -1 at the first mismatching position.
 */
int main(int argc, char* argv[])
{
    float x[N], y[N];
    float a = 5;
    int i;

    /* Input vectors */
    for (i = 0; i < N; ++i) {
        x[i] = i;
        y[i] = i+2;
    }

    saxpy(N, a, x, y);
#pragma omp taskwait

    /* Check results */
    i = 0;
    while (i < N) {
        if (y[i] != a*i+(i+2)) {
            printf("Error when checking results, in position %d\n",i);
            return -1;
        }
        ++i;
    }

    printf("Results are correct\n");
    return 0;
}
virtual void interpolate(shared_ptr<ISweeper<time>> dst, shared_ptr<const ISweeper<time>> src, bool interp_initial) override { auto& fine = as_encap_sweeper(dst); auto& crse = as_encap_sweeper(src); if (tmat.rows() == 0) { tmat = pfasst::quadrature::compute_interp<time>(fine.get_nodes(), crse.get_nodes()); } if (interp_initial) { this->interpolate_initial(dst, src); } size_t nfine = fine.get_nodes().size(); size_t ncrse = crse.get_nodes().size(); auto crse_factory = crse.get_factory(); auto fine_factory = fine.get_factory(); EncapVecT fine_state(nfine), fine_delta(ncrse); for (size_t m = 0; m < nfine; m++) { fine_state[m] = fine.get_state(m); } for (size_t m = 0; m < ncrse; m++) { fine_delta[m] = fine_factory->create(solution); } auto crse_delta = crse_factory->create(solution); for (size_t m = 0; m < ncrse; m++) { crse_delta->copy(crse.get_state(m)); crse_delta->saxpy(-1.0, crse.get_saved_state(m)); interpolate(fine_delta[m], crse_delta); } fine.get_state(0)->mat_apply(fine_state, 1.0, tmat, fine_delta, false); fine.reevaluate(); }
/// Function-call operator: hands its four arguments, unchanged and in the
/// same order, to saxpy().
void operator()(float a, float *x, float *y, std::size_t n)
{
  saxpy(a, x, y, n);
}
int sgefa ( float a[], int lda, int n, int ipvt[] )

/*******************************************************************************/
/*
  Purpose:

    SGEFA factors a matrix by gaussian elimination.

  Discussion:

    Matrix references which would, mathematically, be written A(I,J)
    must be written here as:
    * A[I+J*LDA], when the value is needed, or
    * A+I+J*LDA, when the address is needed.

    Indices K, L and the values stored in IPVT are 1-based, as in the
    FORTRAN77 original; they are shifted by one whenever an array is
    addressed.

  Modified:

    07 March 2008

  Author:

    FORTRAN77 original version by Cleve Moler.
    C version by John Burkardt.

  Reference:

    Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart,
    LINPACK User's Guide,
    SIAM, 1979,
    ISBN13: 978-0-898711-72-1,
    LC: QA214.L56.

  Parameters:

    Input/output, float A[LDA*N].
    On input, the matrix to be factored.
    On output, an upper triangular matrix and the multipliers which
    were used to obtain it.  The factorization can be written A = L * U
    where L is a product of permutation and unit lower triangular
    matrices and U is upper triangular.

    Input, int LDA, the leading dimension of the matrix.

    Input, int N, the order of the matrix.  For N <= 0 nothing is
    touched and 0 is returned.

    Output, int IPVT[N], the pivot indices.

    Output, int SGEFA, indicates singularity.
    If 0, this is the normal value, and the algorithm succeeded.
    If K, then on the K-th elimination step, a zero pivot was encountered.
    The matrix is numerically not invertible.
*/
{
  int j;
  int info;
  int k;
  int l;
  float t;

  info = 0;
/*
  Nothing to factor; also keeps ipvt[n-1] below inside the array.
*/
  if ( n <= 0 )
  {
    return info;
  }

  for ( k = 1; k <= n - 1; k++ )
  {
/*
  Find l = pivot index.
*/
    l = isamax ( n-k+1, &a[k-1+(k-1)*lda], 1 ) + k - 1;
    ipvt[k-1] = l;
/*
  Zero pivot implies this column already triangularized.
*/
    if ( a[l-1+(k-1)*lda] != 0.0 )
    {
/*
  Interchange if necessary.
*/
      if ( l != k )
      {
        t = a[l-1+(k-1)*lda];
        a[l-1+(k-1)*lda] = a[k-1+(k-1)*lda];
        a[k-1+(k-1)*lda] = t;
      }
/*
  Compute multipliers.
*/
      t = - 1.0 / a[k-1+(k-1)*lda];
      sscal ( n-k, t, &a[k+(k-1)*lda], 1 );
/*
  Row elimination with column indexing.
*/
      for ( j = k + 1; j <= n; j++ )
      {
        t = a[l-1+(j-1)*lda];
        if (l != k)
        {
          a[l-1+(j-1)*lda] = a[k-1+(j-1)*lda];
          a[k-1+(j-1)*lda] = t;
        }
        saxpy ( n-k, t, &a[k+(k-1)*lda], 1, &a[k+(j-1)*lda], 1 );
      }
    }
    else
    {
      info = k;
    }
  }

  ipvt[n-1] = n;
/*
  The last diagonal entry is the pivot of step n.  Reporting n (1-based,
  like info = k above) keeps the result nonzero even for a singular 1x1
  matrix.
*/
  if (a[n-1+(n-1)*lda] == 0.0 )
  {
    info = n;
  }

  return info;
}
/*
 * Exercises OpenACC host_data / data regions together with cuBLAS.
 *
 * A reference result y_ref = a * x_ref + y_ref is first obtained from
 * saxpy(). The same update is then performed four times on x/y, each time
 * through a different combination of OpenACC data directives, and every
 * result is compared against the reference with validate_results().
 */
int
main()
{
#define N 8
  int i;
  float x_ref[N], y_ref[N];
  float x[N], y[N];
  cublasHandle_t h;
  float a = 2.0;

  for (i = 0; i < N; i++)
    {
      x[i] = x_ref[i] = 4.0 + i;
      y[i] = y_ref[i] = 3.0;
    }

  /* Reference result.  */
  saxpy (N, a, x_ref, y_ref);

  cublasCreate (&h);

  /* Variant 1: x copied in, y copied in and back out.  */
#pragma acc data copyin (x[0:N]) copy (y[0:N])
  {
#pragma acc host_data use_device (x, y)
    {
      cublasSaxpy (h, N, &a, x, 1, y, 1);
    }
  }

  validate_results (N, y, y_ref);

  /* Variant 2: y is initialised on the device and copied out afterwards.
     x is read by cublasSaxpy, so it has to be copied in; a plain create
     clause would leave its device copy uninitialised.  */
#pragma acc data copyin (x[0:N]) copyout (y[0:N])
  {
#pragma acc kernels
    for (i = 0; i < N; i++)
      y[i] = 3.0;

#pragma acc host_data use_device (x, y)
    {
      cublasSaxpy (h, N, &a, x, 1, y, 1);
    }
  }

  cublasDestroy (h);

  validate_results (N, y, y_ref);

  for (i = 0; i < N; i++)
    y[i] = 3.0;

  /* There's no need to use host_data here.  */
#pragma acc data copyin (x[0:N]) copyin (a) copy (y[0:N])
  {
#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
    saxpy (N, a, x, y);
  }

  validate_results (N, y, y_ref);

  /* Exercise host_data with data transferred with acc enter data.  */

  for (i = 0; i < N; i++)
    y[i] = 3.0;

#pragma acc enter data copyin (x, a, y)
#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
  {
    saxpy (N, a, x, y);
  }
#pragma acc exit data delete (x, a) copyout (y)

  validate_results (N, y, y_ref);

  return 0;
}
/*************************************************************************
* This function creates the coarser graph.
*
* ctrl    - control structure; supplies optype, dbglvl, the contraction
*           timer and the workspace (wspace.auxcore, idx workspace).
* graph   - the fine graph; graph->cmap gives the coarse vertex of each
*           fine vertex.
* cnvtxs  - number of coarse vertices on entry; it is reset to 0 below and
*           re-counted while the coarse vertices are built.
* match   - match[v] is the vertex v is merged with (v itself if unmatched,
*           as tested by "v != u" below).
* perm    - order in which the fine vertices are visited.
*
* For every coarse vertex the vertex weight(s), vsize (only when
* ctrl->optype == OP_KVMETIS) and adjwgtsum of the merged fine vertices are
* added up, and their adjacency lists are merged with the help of a hash
* table indexed by coarse vertex number.
**************************************************************************/
void CreateCoarseGraphNoMask(CtrlType *ctrl, GraphType *graph, int cnvtxs, idxtype *match, idxtype *perm)
{
  int i, j, k, m, istart, iend, nvtxs, nedges, ncon, cnedges, v, u, dovsize;
  idxtype *xadj, *vwgt, *vsize, *adjncy, *adjwgt, *adjwgtsum, *auxadj;
  idxtype *cmap, *htable;
  idxtype *cxadj, *cvwgt, *cvsize, *cadjncy, *cadjwgt, *cadjwgtsum;
  float *nvwgt, *cnvwgt;
  GraphType *cgraph;

  /* vsize is carried over only for the OP_KVMETIS operation type */
  dovsize = (ctrl->optype == OP_KVMETIS ? 1 : 0);

  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->ContractTmr));

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  vwgt = graph->vwgt;
  vsize = graph->vsize;
  nvwgt = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  adjwgtsum = graph->adjwgtsum;
  cmap = graph->cmap;

  /* Initialize the coarser graph */
  cgraph = SetUpCoarseGraph(graph, cnvtxs, dovsize);
  cxadj = cgraph->xadj;
  cvwgt = cgraph->vwgt;
  cvsize = cgraph->vsize;
  cnvwgt = cgraph->nvwgt;
  cadjwgtsum = cgraph->adjwgtsum;
  cadjncy = cgraph->adjncy;
  cadjwgt = cgraph->adjwgt;

  /* htable[c] is the position of coarse vertex c in the adjacency list
     currently being built, or -1 when c is not in it */
  htable = idxset(cnvtxs, -1, idxwspacemalloc(ctrl, cnvtxs));

  /* auxadj is a copy of adjncy with every neighbour replaced by its coarse
     vertex number */
  iend = xadj[nvtxs];
  auxadj = ctrl->wspace.auxcore;
  memcpy(auxadj, adjncy, iend*sizeof(idxtype));
  for (i=0; i<iend; i++)
    auxadj[i] = cmap[auxadj[i]];

  cxadj[0] = cnvtxs = cnedges = 0;
  for (i=0; i<nvtxs; i++) {
    v = perm[i];
    /* Only the vertex whose coarse number is the next one to be created is
       processed here; its partner is folded in below */
    if (cmap[v] != cnvtxs)
      continue;

    u = match[v];
    /* Single-constraint graphs use the integer weights, multi-constraint
       graphs the ncon float weights */
    if (ncon == 1)
      cvwgt[cnvtxs] = vwgt[v];
    else
      scopy(ncon, nvwgt+v*ncon, cnvwgt+cnvtxs*ncon);

    if (dovsize)
      cvsize[cnvtxs] = vsize[v];

    cadjwgtsum[cnvtxs] = adjwgtsum[v];
    nedges = 0;

    /* Add the (coarse) neighbours of v, merging duplicates via htable */
    istart = xadj[v];
    iend = xadj[v+1];
    for (j=istart; j<iend; j++) {
      k = auxadj[j];
      if ((m = htable[k]) == -1) {
        cadjncy[nedges] = k;
        cadjwgt[nedges] = adjwgt[j];
        htable[k] = nedges++;
      }
      else {
        cadjwgt[m] += adjwgt[j];
      }
    }

    if (v != u) {
      /* Fold the matched vertex u into the same coarse vertex */
      if (ncon == 1)
        cvwgt[cnvtxs] += vwgt[u];
      else
        saxpy(ncon, 1.0, nvwgt+u*ncon, 1, cnvwgt+cnvtxs*ncon, 1);

      if (dovsize)
        cvsize[cnvtxs] += vsize[u];

      cadjwgtsum[cnvtxs] += adjwgtsum[u];

      istart = xadj[u];
      iend = xadj[u+1];
      for (j=istart; j<iend; j++) {
        k = auxadj[j];
        if ((m = htable[k]) == -1) {
          cadjncy[nedges] = k;
          cadjwgt[nedges] = adjwgt[j];
          htable[k] = nedges++;
        }
        else {
          cadjwgt[m] += adjwgt[j];
        }
      }

      /* Remove the contracted adjacency weight: if the coarse vertex ended
         up adjacent to itself, drop that entry by moving the last entry
         into its place */
      if ((j = htable[cnvtxs]) != -1) {
        ASSERT(cadjncy[j] == cnvtxs);
        cadjwgtsum[cnvtxs] -= cadjwgt[j];
        cadjncy[j] = cadjncy[--nedges];
        cadjwgt[j] = cadjwgt[nedges];
        htable[cnvtxs] = -1;
      }
    }

    ASSERTP(cadjwgtsum[cnvtxs] == idxsum(nedges, cadjwgt), ("%d %d\n", cadjwgtsum[cnvtxs], idxsum(nedges, cadjwgt)));

    for (j=0; j<nedges; j++)
      htable[cadjncy[j]] = -1;  /* Zero out the htable */

    /* Close this coarse vertex and advance the output pointers so that the
       next adjacency list is again indexed from 0 */
    cnedges += nedges;
    cxadj[++cnvtxs] = cnedges;
    cadjncy += nedges;
    cadjwgt += nedges;
  }

  cgraph->nedges = cnedges;

  ReAdjustMemory(graph, cgraph, dovsize);

  IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->ContractTmr));

  /* Releases the htable workspace; cnvtxs was re-counted by the loop above */
  idxwspacefree(ctrl, cnvtxs);
}
/*************************************************************************
* This function performs k-way refinement.
*
* ctrl     - control structure (debug level, workspace).
* graph    - partitioned graph; where, npwgts, rinfo, the boundary arrays
*            and mincut are updated in place.
* nparts   - number of partitions.
* orgubvec - requested load imbalance tolerance, one value per constraint.
* npasses  - maximum number of refinement passes.
*
* Each pass visits the boundary vertices in random order and moves a vertex
* to an adjacent partition when that does not increase the cut and the
* balance tests below allow it. The passes stop early as soon as one of
* them leaves the cut unchanged.
**************************************************************************/
void MCRandom_KWayEdgeRefineHorizontal(CtrlType *ctrl, GraphType *graph, int nparts, float *orgubvec, int npasses)
{
  int i, ii, iii, j, /*jj,*/ k, /*l,*/ pass, nvtxs, ncon, nmoves, nbnd, myndegrees, same;
  int from, me, to, oldcut, gain;
  idxtype *xadj, *adjncy, *adjwgt;
  idxtype *where, *perm, *bndptr, *bndind;
  EDegreeType *myedegrees;
  RInfoType *myrinfo;
  float *npwgts, *nvwgt, *minwgt, *maxwgt, maxlb, minlb, ubvec[MAXNCON], tvec[MAXNCON];

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;

  bndptr = graph->bndptr;
  bndind = graph->bndind;

  where = graph->where;
  npwgts = graph->npwgts;

  /* Setup the weight intervals of the various subdomains */
  minwgt = fwspacemalloc(ctrl, nparts*ncon);
  maxwgt = fwspacemalloc(ctrl, nparts*ncon);

  /* See if the orgubvec consists of identical constraints
     (all tolerances within .01 of each other) */
  maxlb = minlb = orgubvec[0];
  for (i=1; i<ncon; i++) {
    minlb = (orgubvec[i] < minlb ? orgubvec[i] : minlb);
    maxlb = (orgubvec[i] > maxlb ? orgubvec[i] : maxlb);
  }
  same = (fabs(maxlb-minlb) < .01 ? 1 : 0);

  /* Let's not get very optimistic. Let Balancing do the work:
     the tolerance used is never tighter than the current imbalance */
  ComputeHKWayLoadImbalance(ncon, nparts, npwgts, ubvec);
  for (i=0; i<ncon; i++)
    ubvec[i] = amax(ubvec[i], orgubvec[i]);

  if (!same) {
    /* Per-constraint weight interval */
    for (i=0; i<nparts; i++) {
      for (j=0; j<ncon; j++) {
        maxwgt[i*ncon+j] = ubvec[j]/nparts;
        minwgt[i*ncon+j] = 1.0/(ubvec[j]*nparts);
      }
    }
  }
  else {
    /* Identical constraints: use the largest tolerance for all of them */
    maxlb = ubvec[0];
    for (i=1; i<ncon; i++)
      maxlb = (ubvec[i] > maxlb ? ubvec[i] : maxlb);

    for (i=0; i<nparts; i++) {
      for (j=0; j<ncon; j++) {
        maxwgt[i*ncon+j] = maxlb/nparts;
        minwgt[i*ncon+j] = 1.0/(maxlb*nparts);
      }
    }
  }

  perm = idxwspacemalloc(ctrl, nvtxs);

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("Partitions: [%5.4f %5.4f], Nv-Nb[%6d %6d]. Cut: %6d, LB: ",
            npwgts[samin(ncon*nparts, npwgts)], npwgts[samax(ncon*nparts, npwgts)],
            graph->nvtxs, graph->nbnd, graph->mincut);
    ComputeHKWayLoadImbalance(ncon, nparts, npwgts, tvec);
    for (i=0; i<ncon; i++)
      printf("%.3f ", tvec[i]);
    printf("\n");
  }

  for (pass=0; pass<npasses; pass++) {
    ASSERT(ComputeCut(graph, where) == graph->mincut);

    oldcut = graph->mincut;
    nbnd = graph->nbnd;

    /* perm holds a random order of the boundary positions at the start of
       the pass; nbnd is handed to BNDInsert/BNDDelete during the pass, so
       positions that are no longer below it are skipped.
       NOTE(review): assumes BNDInsert/BNDDelete update nbnd -- confirm */
    RandomPermute(nbnd, perm, 1);
    for (nmoves=iii=0; iii<graph->nbnd; iii++) {
      ii = perm[iii];
      if (ii >= nbnd)
        continue;
      i = bndind[ii];

      myrinfo = graph->rinfo+i;

      if (myrinfo->ed >= myrinfo->id) { /* Total ED is too high */
        from = where[i];
        nvwgt = graph->nvwgt+i*ncon;

        if (myrinfo->id > 0 && AreAllHVwgtsBelow(ncon, 1.0, npwgts+from*ncon, -1.0, nvwgt, minwgt+from*ncon))
          continue;   /* This cannot be moved! */

        myedegrees = myrinfo->edegrees;
        myndegrees = myrinfo->ndegrees;

        /* Find the first neighbouring partition with non-negative gain
           that passes one of the two balance tests */
        for (k=0; k<myndegrees; k++) {
          to = myedegrees[k].pid;
          gain = myedegrees[k].ed - myrinfo->id;
          if (gain >= 0 &&
              (AreAllHVwgtsBelow(ncon, 1.0, npwgts+to*ncon, 1.0, nvwgt, maxwgt+to*ncon) ||
               IsHBalanceBetterFT(ncon, nparts, npwgts+from*ncon, npwgts+to*ncon, nvwgt, ubvec)))
            break;
        }
        if (k == myndegrees)
          continue;  /* break out if you did not find a candidate */

        /* Among the remaining partitions prefer a larger external degree,
           and for an equal external degree the one IsHBalanceBetterTT
           selects */
        for (j=k+1; j<myndegrees; j++) {
          to = myedegrees[j].pid;
          if ((myedegrees[j].ed > myedegrees[k].ed &&
               (AreAllHVwgtsBelow(ncon, 1.0, npwgts+to*ncon, 1.0, nvwgt, maxwgt+to*ncon) ||
                IsHBalanceBetterFT(ncon, nparts, npwgts+from*ncon, npwgts+to*ncon, nvwgt, ubvec))) ||
              (myedegrees[j].ed == myedegrees[k].ed &&
               IsHBalanceBetterTT(ncon, nparts, npwgts+myedegrees[k].pid*ncon, npwgts+to*ncon, nvwgt, ubvec)))
            k = j;
        }

        to = myedegrees[k].pid;

        /* A zero-gain move is skipped when it does not pass the
           IsHBalanceBetterFT test and 'from' already passes the maxwgt
           test */
        if (myedegrees[k].ed-myrinfo->id == 0
            && !IsHBalanceBetterFT(ncon, nparts, npwgts+from*ncon, npwgts+to*ncon, nvwgt, ubvec)
            && AreAllHVwgtsBelow(ncon, 1.0, npwgts+from*ncon, 0.0, npwgts+from*ncon, maxwgt+from*ncon))
          continue;

        /*=====================================================================
        * If we got here, we can now move the vertex from 'from' to 'to'
        *======================================================================*/
        graph->mincut -= myedegrees[k].ed-myrinfo->id;

        IFSET(ctrl->dbglvl, DBG_MOVEINFO, printf("\t\tMoving %6d to %3d. Gain: %4d. Cut: %6d\n", i, to, myedegrees[k].ed-myrinfo->id, graph->mincut));

        /* Update where, weight, and ID/ED information of the vertex you moved */
        saxpy(ncon, 1.0, nvwgt, 1, npwgts+to*ncon, 1);
        saxpy(ncon, -1.0, nvwgt, 1, npwgts+from*ncon, 1);
        where[i] = to;
        myrinfo->ed += myrinfo->id-myedegrees[k].ed;
        /* j is only used as the scratch argument of SWAP here */
        SWAP(myrinfo->id, myedegrees[k].ed, j);
        if (myedegrees[k].ed == 0)
          myedegrees[k] = myedegrees[--myrinfo->ndegrees];
        else
          myedegrees[k].pid = from;

        if (myrinfo->ed-myrinfo->id < 0)
          BNDDelete(nbnd, bndind, bndptr, i);

        /* Update the degrees of adjacent vertices */
        for (j=xadj[i]; j<xadj[i+1]; j++) {
          ii = adjncy[j];
          me = where[ii];

          myrinfo = graph->rinfo+ii;
          /* Neighbours without a degree array get one carved out of the
             shared workspace, sized by their number of edges */
          if (myrinfo->edegrees == NULL) {
            myrinfo->edegrees = ctrl->wspace.edegrees+ctrl->wspace.cdegree;
            ctrl->wspace.cdegree += xadj[ii+1]-xadj[ii];
          }
          myedegrees = myrinfo->edegrees;

          ASSERT(CheckRInfo(myrinfo));

          if (me == from) {
            /* presumably ed += adjwgt[j], id -= adjwgt[j] -- confirm INC_DEC */
            INC_DEC(myrinfo->ed, myrinfo->id, adjwgt[j]);

            if (myrinfo->ed-myrinfo->id >= 0 && bndptr[ii] == -1)
              BNDInsert(nbnd, bndind, bndptr, ii);
          }
          else if (me == to) {
            INC_DEC(myrinfo->id, myrinfo->ed, adjwgt[j]);

            if (myrinfo->ed-myrinfo->id < 0 && bndptr[ii] != -1)
              BNDDelete(nbnd, bndind, bndptr, ii);
          }

          /* Remove contribution from the .ed of 'from' */
          if (me != from) {
            for (k=0; k<myrinfo->ndegrees; k++) {
              if (myedegrees[k].pid == from) {
                if (myedegrees[k].ed == adjwgt[j])
                  myedegrees[k] = myedegrees[--myrinfo->ndegrees];
                else
                  myedegrees[k].ed -= adjwgt[j];
                break;
              }
            }
          }

          /* Add contribution to the .ed of 'to' */
          if (me != to) {
            for (k=0; k<myrinfo->ndegrees; k++) {
              if (myedegrees[k].pid == to) {
                myedegrees[k].ed += adjwgt[j];
                break;
              }
            }
            /* 'to' was not yet among the neighbouring partitions: append it */
            if (k == myrinfo->ndegrees) {
              myedegrees[myrinfo->ndegrees].pid = to;
              myedegrees[myrinfo->ndegrees++].ed = adjwgt[j];
            }
          }

          ASSERT(myrinfo->ndegrees <= xadj[ii+1]-xadj[ii]);
          ASSERT(CheckRInfo(myrinfo));

        }
        nmoves++;
      }
    }

    graph->nbnd = nbnd;

    if (ctrl->dbglvl&DBG_REFINE) {
      printf("\t [%5.4f %5.4f], Nb: %6d, Nmoves: %5d, Cut: %6d, LB: ",
              npwgts[samin(ncon*nparts, npwgts)], npwgts[samax(ncon*nparts, npwgts)],
              nbnd, nmoves, graph->mincut);
      ComputeHKWayLoadImbalance(ncon, nparts, npwgts, tvec);
      for (i=0; i<ncon; i++)
        printf("%.3f ", tvec[i]);
      printf("\n");
    }

    /* No improvement in this pass: stop refining */
    if (graph->mincut == oldcut)
      break;
  }

  /* Release maxwgt, minwgt and perm */
  fwspacefree(ctrl, ncon*nparts);
  fwspacefree(ctrl, ncon*nparts);
  idxwspacefree(ctrl, nvtxs);
}
void sgesl ( float a[], int lda, int n, int ipvt[], float b[], int job )

/******************************************************************************/
/*
  Purpose:

    SGESL solves a real general linear system A * X = B or A' * X = B,
    using the factors computed by SGECO or SGEFA.

  Discussion:

    A division by zero will occur if the input factor contains a zero on
    the diagonal.  Technically this indicates singularity but it is often
    caused by improper arguments or improper setting of LDA.  It will not
    occur if the subroutines are called correctly and if SGECO has set
    0.0 < RCOND or SGEFA has set INFO == 0.

    The pivot indices stored in IPVT are 1-based; the loops below run over
    0-based column numbers.

  Modified:

    04 April 2006

  Author:

    FORTRAN77 original by Dongarra, Moler, Bunch and Stewart.
    C translation by John Burkardt.

  Reference:

    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
    LINPACK User's Guide,
    SIAM, (Society for Industrial and Applied Mathematics),
    3600 University City Science Center,
    Philadelphia, PA, 19104-2688.
    ISBN: 0-89871-172-X

  Parameters:

    Input, float A[LDA*N], the output from SGECO or SGEFA.

    Input, int LDA, the leading dimension of A.

    Input, int N, the order of the matrix A.

    Input, int IPVT[N], the pivot vector from SGECO or SGEFA.

    Input/output, float B[N].
    On input, the right hand side vector.
    On output, the solution vector.

    Input, int JOB.
    0, solve A * X = B;
    nonzero, solve A' * X = B.
*/
{
  int col;
  int piv;
  float t;

  if ( job == 0 )
  {
/*
  Solve A * X = B: apply the row interchanges and the multipliers to B ...
*/
    for ( col = 0; col < n - 1; col++ )
    {
      piv = ipvt[col] - 1;
      t = b[piv];
      if ( piv != col )
      {
        b[piv] = b[col];
        b[col] = t;
      }
      saxpy ( n - col - 1, t, a + ( col + 1 ) + col * lda, 1, b + ( col + 1 ), 1 );
    }
/*
  ... then back-substitute with the upper triangular factor.
*/
    for ( col = n - 1; 0 <= col; col-- )
    {
      b[col] = b[col] / a[col + col * lda];
      t = -b[col];
      saxpy ( col, t, a + col * lda, 1, b, 1 );
    }
    return;
  }
/*
  Solve A' * X = B: forward sweep with the transposed upper factor ...
*/
  for ( col = 0; col < n; col++ )
  {
    t = sdot ( col, a + col * lda, 1, b, 1 );
    b[col] = ( b[col] - t ) / a[col + col * lda];
  }
/*
  ... then the transposed multipliers and the interchanges in reverse order.
*/
  for ( col = n - 2; 0 <= col; col-- )
  {
    b[col] = b[col] + sdot ( n - col - 1, a + ( col + 1 ) + col * lda, 1, b + ( col + 1 ), 1 );
    piv = ipvt[col] - 1;
    if ( piv != col )
    {
      t = b[piv];
      b[piv] = b[col];
      b[col] = t;
    }
  }

  return;
}
/*************************************************************************
* Conjugate-gradient solver used during the directed diffusion.
*
* Iterates on A*x = b starting from x = 0, using the reciprocal of the
* first stored value of every row of A as a diagonal scaling (0 when that
* value is 0). At most n iterations are performed; the iteration stops as
* soon as |r| / |b| drops below tol.
*
* workspace must provide room for five vectors of length n (p, r, q, z, M).
* Nothing beyond the initial residual is computed when |b| is not positive
* or when the initial relative residual is not above tol.
**************************************************************************/
void ConjGrad2(MatrixType *A, floattype *b, floattype *x, floattype tol, floattype *workspace)
{
  int i, iter, n;
  floattype *p, *r, *q, *z, *M;
  floattype alpha, beta, rho, rho_prev = -1.0, error, bnrm2, pq;
  idxtype *rowptr;
  floattype *values;

  n = A->nrows;
  rowptr = A->rowptr;
  values = A->values;

  /* Carve the five work vectors out of the workspace */
  p = workspace;
  r = p + n;
  q = r + n;
  z = q + n;
  M = z + n;

  /* Zero initial guess and scaling vector */
  for (i=0; i<n; i++) {
    x[i] = 0.0;
    M[i] = (values[rowptr[i]] != 0.0 ? 1.0/values[rowptr[i]] : 0.0);
  }

  /* r = b - Ax */
  mvMult2(A, x, r);
  for (i=0; i<n; i++)
    r[i] = b[i]-r[i];

  bnrm2 = snorm2(n, b);
  if (!(bnrm2 > 0.0))
    return;

  error = snorm2(n, r) / bnrm2;
  if (!(error > tol))
    return;

  /* Begin Iterations */
  for (iter=0; iter<n; iter++) {
    /* z = M .* r */
    for (i=0; i<n; i++)
      z[i] = r[i]*M[i];

    rho = sdot(n, r, z);

    if (iter == 0) {
      scopy(n, z, p);
    }
    else {
      beta = (rho_prev != 0.0 ? rho/rho_prev : 0.0);
      for (i=0; i<n; i++)
        p[i] = z[i] + beta*p[i];
    }

    mvMult2(A, p, q); /* q = A*p */

    pq = sdot(n, p, q);
    alpha = (pq != 0.0 ? rho/pq : 0.0);

    saxpy(n, alpha, p, x);    /* x = x + alpha*p */
    saxpy(n, -alpha, q, r);   /* r = r - alpha*q */

    error = snorm2(n, r) / bnrm2;
    if (error < tol)
      break;

    rho_prev = rho;
  }
}
void sqrdc (float **x, int n, int p, float *qraux, int *jpvt, float *work, int job)
/*****************************************************************************
Use Householder transformations to compute the QR decomposition of an n by p
matrix x.  Column pivoting based on the 2-norms of the reduced columns may be
performed at the user's option.
******************************************************************************
Input:
x		matrix[p][n] to decompose (see notes below)
n		number of rows in the matrix x
p		number of columns in the matrix x
jpvt		array[p] controlling the pivot columns (see notes below)
job		=0 for no pivoting; =1 for pivoting

Output:
x		matrix[p][n] decomposed (see notes below)
qraux		array[p] containing information required to recover the
		orthogonal part of the decomposition
jpvt		array[p] with jpvt[k] containing the index of the original
		matrix that has been interchanged into the k-th column, if
		pivoting is requested.

Workspace:
work		array[p] of workspace
******************************************************************************
Notes:
This function was adapted from LINPACK FORTRAN.  Because two-dimensional
arrays cannot be declared with variable dimensions in C, the matrix x
is actually a pointer to an array of pointers to floats, as declared
above and used below.

Elements of x are stored as follows:
x[0][0]    x[1][0]    x[2][0]   ... x[p-1][0]
x[0][1]    x[1][1]    x[2][1]   ... x[p-1][1]
x[0][2]    x[1][2]    x[2][2]   ... x[p-1][2]
.                                       .
.                                       .
x[0][n-1]  x[1][n-1]  x[2][n-1] ... x[p-1][n-1]

After decomposition, x contains in its upper triangular matrix R of the QR
decomposition.  Below its diagonal x contains information from which the
orthogonal part of the decomposition can be recovered.  Note that if
pivoting has been requested, the decomposition is not that of the original
matrix x but that of x with its columns permuted as described by jpvt.

The selection of pivot columns is controlled by jpvt as follows.
The k-th column x[k] of x is placed in one of three classes according to
the value of jpvt[k].
	if jpvt[k] >  0, then x[k] is an initial column.
	if jpvt[k] == 0, then x[k] is a free column.
	if jpvt[k] <  0, then x[k] is a final column.
Before the decomposition is computed, initial columns are moved to the
beginning of the array x and final columns to the end.  Both initial and
final columns are frozen in place during the computation and only free
columns are moved.  At the k-th stage of the reduction, if x[k] is occupied
by a free column it is interchanged with the free column of largest reduced
norm.  jpvt is not referenced if job == 0.

While the columns are being rearranged, a final column with original index
j is temporarily marked in jpvt as -(j+1).  Column indices are 0-based
here, so plain negation could not mark column 0.
******************************************************************************
Author:  Dave Hale, Colorado School of Mines, 12/29/89
*****************************************************************************/
{
	int j,jp,l,lup,maxj,pl,pu,negj,swapj;
	float maxnrm,t,tt,ttt,nrmxl;

	/* free columns occupy [pl,pu]; empty range unless pivoting */
	pl = 0;
	pu = -1;

	/* if pivoting has been requested */
	if (job!=0) {

		/* rearrange columns according to jpvt */
		for (j=0; j<p; j++) {
			swapj = jpvt[j]>0;
			negj = jpvt[j]<0;
			jpvt[j] = j;
			/* mark final columns with a strictly negative code */
			if (negj) jpvt[j] = -(j+1);
			if (swapj) {
				/* move initial column to the front */
				if (j!=pl) sswap(n,x[pl],1,x[j],1);
				jpvt[j] = jpvt[pl];
				jpvt[pl] = j;
				pl++;
			}
		}
		pu = p-1;
		for (j=p-1; j>=0; j--) {
			if (jpvt[j]<0) {
				/* decode the original index, move column to the end */
				jpvt[j] = -jpvt[j]-1;
				if (j!=pu) {
					sswap(n,x[pu],1,x[j],1);
					jp = jpvt[pu];
					jpvt[pu] = jpvt[j];
					jpvt[j] = jp;
				}
				pu--;
			}
		}
	}

	/* compute the norms of the free columns */
	for (j=pl; j<=pu; j++) {
		qraux[j] = snrm2(n,x[j],1);
		work[j] = qraux[j];
	}

	/* perform the Householder reduction of x */
	lup = MIN(n,p);
	for (l=0; l<lup; l++) {
		if (l>=pl && l<pu) {

			/*
			 * locate the column of largest norm and
			 * bring it into pivot position.
			 */
			maxnrm = 0.0;
			maxj = l;
			for (j=l; j<=pu; j++) {
				if (qraux[j]>maxnrm) {
					maxnrm = qraux[j];
					maxj = j;
				}
			}
			if (maxj!=l) {
				sswap(n,x[l],1,x[maxj],1);
				qraux[maxj] = qraux[l];
				work[maxj] = work[l];
				jp = jpvt[maxj];
				jpvt[maxj] = jpvt[l];
				jpvt[l] = jp;
			}
		}
		qraux[l] = 0.0;
		if (l!=n-1) {

			/*
			 * compute the Householder transformation
			 * for column l
			 */
			nrmxl = snrm2(n-l,&x[l][l],1);
			if (nrmxl!=0.0) {
				/* give nrmxl the sign of the diagonal element */
				if (x[l][l]!=0.0)
					nrmxl = (x[l][l]>0.0) ?
						ABS(nrmxl) : -ABS(nrmxl);
				sscal(n-l,1.0/nrmxl,&x[l][l],1);
				x[l][l] += 1.0;

				/*
				 * apply the transformation to the remaining
				 * columns, updating the norms
				 */
				for (j=l+1; j<p; j++) {
					t = -sdot(n-l,&x[l][l],1,&x[j][l],1)/
						x[l][l];
					saxpy(n-l,t,&x[l][l],1,&x[j][l],1);
					if (j>=pl && j<=pu && qraux[j]!=0.0) {
						tt = ABS(x[j][l])/qraux[j];
						tt = 1.0-tt*tt;
						tt = MAX(tt,0.0);
						t = tt;
						ttt = qraux[j]/work[j];
						tt = 1.0+0.05*tt*ttt*ttt;
						if (tt!=1.0) {
							/* cheap downdate of the norm */
							qraux[j] *= sqrt(t);
						} else {
							/* downdate lost all accuracy:
							   recompute the norm */
							qraux[j] = snrm2(n-l-1,
								&x[j][l+1],1);
							work[j] = qraux[j];
						}
					}
				}

				/* save the transformation */
				qraux[l] = x[l][l];
				x[l][l] = -nrmxl;
			}
		}
	}
}
/*************************************************************************
* This function performs an edge-based FM refinement whose goal is to
* balance a multi-constraint 2-way partition.
*
* ctrl     - control structure (debug level, workspace).
* graph    - bisected graph; where, id, ed, npwgts, the boundary arrays,
*            mincut and nbnd are updated in place.
* tpwgts   - target partition weights.
* lbfactor - the moves stop as soon as the load imbalance drops below it.
*
* Vertices are kept in one priority queue per (constraint, side) pair,
* keyed by gain ed-id. Vertices are moved one at a time from the queue
* chosen by SelectQueue; the best state seen is remembered and every move
* made after it is rolled back at the end.
**************************************************************************/
void MocGeneral2WayBalance(CtrlType *ctrl, GraphType *graph, float *tpwgts, float lbfactor)
{
  int i, ii, j, k, l, kwgt, nvtxs, ncon, nbnd, nswaps, from, to, pass, me, limit, tmp, cnum;
  idxtype *xadj, *adjncy, *adjwgt, *where, *id, *ed, *bndptr, *bndind;
  idxtype *moved, *swaps, *perm, *qnum;
  float *nvwgt, *npwgts, mindiff[MAXNCON], origbal, minbal, newbal;
  PQueueType parts[MAXNCON][2];
  int higain, oldgain, mincut, newcut, mincutorder;
  int qsizes[MAXNCON][2];

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  nvwgt = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where = graph->where;
  id = graph->id;
  ed = graph->ed;
  npwgts = graph->npwgts;
  bndptr = graph->bndptr;
  bndind = graph->bndind;

  /* moved[v] - swap number at which v was moved, -1 if not moved
     swaps[s] - vertex moved at swap s
     qnum[v]  - constraint queue vertex v belongs to */
  moved = idxwspacemalloc(ctrl, nvtxs);
  swaps = idxwspacemalloc(ctrl, nvtxs);
  perm = idxwspacemalloc(ctrl, nvtxs);
  qnum = idxwspacemalloc(ctrl, nvtxs);

  /* Number of non-improving moves tolerated: 1% of the vertices, clamped
     to [15, 100] */
  limit = amin(amax(0.01*nvtxs, 15), 100);

  /* Initialize the queues */
  for (i=0; i<ncon; i++) {
    PQueueInit(ctrl, &parts[i][0], nvtxs, PLUS_GAINSPAN+1);
    PQueueInit(ctrl, &parts[i][1], nvtxs, PLUS_GAINSPAN+1);
    qsizes[i][0] = qsizes[i][1] = 0;
  }

  /* Assign every vertex to the queue given by samax over its weights
     (presumably the index of its largest weight -- confirm) */
  for (i=0; i<nvtxs; i++) {
    qnum[i] = samax(ncon, nvwgt+i*ncon);
    qsizes[qnum[i]][where[i]]++;
  }

/*
  printf("Weight Distribution: \t");
  for (i=0; i<ncon; i++)
    printf(" [%d %d]", qsizes[i][0], qsizes[i][1]);
  printf("\n");
*/

  /* Try to populate empty queues: a vertex is reassigned to an empty queue
     j when samax2 of its weights is j, its current queue is larger, and its
     weight in the current constraint is less than 1.3 times its weight in
     constraint j */
  for (from=0; from<2; from++) {
    for (j=0; j<ncon; j++) {
      if (qsizes[j][from] == 0) {
        for (i=0; i<nvtxs; i++) {
          if (where[i] != from)
            continue;

          k = samax2(ncon, nvwgt+i*ncon);
          if (k == j && qsizes[qnum[i]][from] > qsizes[j][from] && nvwgt[i*ncon+qnum[i]] < 1.3*nvwgt[i*ncon+j]) {
            qsizes[qnum[i]][from]--;
            qsizes[j][from]++;
            qnum[i] = j;
          }
        }
      }
    }
  }

/*
  printf("Weight Distribution (after):\t ");
  for (i=0; i<ncon; i++)
    printf(" [%d %d]", qsizes[i][0], qsizes[i][1]);
  printf("\n");
*/

  for (i=0; i<ncon; i++)
    mindiff[i] = fabs(tpwgts[0]-npwgts[i]);
  minbal = origbal = Compute2WayHLoadImbalance(ncon, npwgts, tpwgts);
  newcut = mincut = graph->mincut;
  mincutorder = -1;

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("Parts: [");
    for (l=0; l<ncon; l++)
      printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
    printf("] T[%.3f %.3f], Nv-Nb[%5d, %5d]. ICut: %6d, LB: %.3f [B]\n", tpwgts[0], tpwgts[1], graph->nvtxs, graph->nbnd, graph->mincut, origbal);
  }

  idxset(nvtxs, -1, moved);

  ASSERT(ComputeCut(graph, where) == graph->mincut);
  ASSERT(CheckBnd(graph));

  /* Insert all nodes in the priority queues */
  nbnd = graph->nbnd;
  RandomPermute(nvtxs, perm, 1);
  for (ii=0; ii<nvtxs; ii++) {
    i = perm[ii];
    PQueueInsert(&parts[qnum[i]][where[i]], i, ed[i]-id[i]);
  }

  for (nswaps=0; nswaps<nvtxs; nswaps++) {
    /* Balanced enough: stop moving */
    if (minbal < lbfactor)
      break;

    SelectQueue(ncon, npwgts, tpwgts, &from, &cnum, parts);
    to = (from+1)%2;

    /* No queue selected, or the selected queue is empty */
    if (from == -1 || (higain = PQueueGetMax(&parts[cnum][from])) == -1)
      break;

    /* Tentatively move higain: shift its weights and update the cut */
    saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
    saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);
    newcut -= (ed[higain]-id[higain]);
    newbal = Compute2WayHLoadImbalance(ncon, npwgts, tpwgts);

    /* A state is better when its balance is better, or on equal balance
       when its cut is smaller, or on equal cut when BetterBalance says so */
    if (newbal < minbal || (newbal == minbal &&
        (newcut < mincut || (newcut == mincut && BetterBalance(ncon, npwgts, tpwgts, mindiff))))) {
      mincut = newcut;
      minbal = newbal;
      mincutorder = nswaps;
      for (i=0; i<ncon; i++)
        mindiff[i] = fabs(tpwgts[0]-npwgts[i]);
    }
    else if (nswaps-mincutorder > limit) { /* We hit the limit, undo last move */
      newcut += (ed[higain]-id[higain]);
      saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);
      saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
      break;
    }

    /* Commit the move and record it for a possible rollback */
    where[higain] = to;
    moved[higain] = nswaps;
    swaps[nswaps] = higain;

    if (ctrl->dbglvl&DBG_MOVEINFO) {
      printf("Moved %6d from %d(%d). Gain: %5d, Cut: %5d, NPwgts: ", higain, from, cnum, ed[higain]-id[higain], newcut);
      for (l=0; l<ncon; l++)
        printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
      printf(", %.3f LB: %.3f\n", minbal, newbal);
    }


    /**************************************************************
    * Update the id[i]/ed[i] values of the affected nodes
    ***************************************************************/
    /* The moved vertex changed side, so its internal and external degrees
       trade places */
    SWAP(id[higain], ed[higain], tmp);
    /* Vertices without any edge are left on the boundary list */
    if (ed[higain] == 0 && bndptr[higain] != -1 && xadj[higain] < xadj[higain+1])
      BNDDelete(nbnd, bndind, bndptr, higain);
    if (ed[higain] > 0 && bndptr[higain] == -1)
      BNDInsert(nbnd, bndind, bndptr, higain);

    for (j=xadj[higain]; j<xadj[higain+1]; j++) {
      k = adjncy[j];
      oldgain = ed[k]-id[k];

      /* Neighbours now on the same side gain internal degree, the others
         gain external degree (presumably INC_DEC adds to its first and
         subtracts from its second argument -- confirm) */
      kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
      INC_DEC(id[k], ed[k], kwgt);

      /* Update the queue position */
      if (moved[k] == -1)
        PQueueUpdate(&parts[qnum[k]][where[k]], k, oldgain, ed[k]-id[k]);

      /* Update its boundary information */
      if (ed[k] == 0 && bndptr[k] != -1)
        BNDDelete(nbnd, bndind, bndptr, k);
      else if (ed[k] > 0 && bndptr[k] == -1)
        BNDInsert(nbnd, bndind, bndptr, k);
    }

  }


  /****************************************************************
  * Roll back computations: undo, newest first, every move recorded
  * after the best state (swap number mincutorder)
  *****************************************************************/
  for (nswaps--; nswaps>mincutorder; nswaps--) {
    higain = swaps[nswaps];

    to = where[higain] = (where[higain]+1)%2;
    SWAP(id[higain], ed[higain], tmp);
    if (ed[higain] == 0 && bndptr[higain] != -1 && xadj[higain] < xadj[higain+1])
      BNDDelete(nbnd, bndind, bndptr, higain);
    else if (ed[higain] > 0 && bndptr[higain] == -1)
      BNDInsert(nbnd, bndind, bndptr, higain);

    saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
    saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+((to+1)%2)*ncon, 1);
    for (j=xadj[higain]; j<xadj[higain+1]; j++) {
      k = adjncy[j];

      kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
      INC_DEC(id[k], ed[k], kwgt);

      if (bndptr[k] != -1 && ed[k] == 0)
        BNDDelete(nbnd, bndind, bndptr, k);
      if (bndptr[k] == -1 && ed[k] > 0)
        BNDInsert(nbnd, bndind, bndptr, k);
    }
  }

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("\tMincut: %6d at %5d, NBND: %6d, NPwgts: [", mincut, mincutorder, nbnd);
    for (l=0; l<ncon; l++)
      printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
    printf("], LB: %.3f\n", Compute2WayHLoadImbalance(ncon, npwgts, tpwgts));
  }

  graph->mincut = mincut;
  graph->nbnd = nbnd;


  for (i=0; i<ncon; i++) {
    PQueueFree(ctrl, &parts[i][0]);
    PQueueFree(ctrl, &parts[i][1]);
  }

  /* Release qnum, perm, swaps and moved */
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);

}
int main() { const int n = 1000000; const int incx = 2; const int incy = 3; const float a = float(rand()) / RAND_MAX; float * x, * y, * dev_x, * dev_y, * y_verify; x = new float[n * incx]; y = new float[n * incy]; y_verify = new float[n * incy]; cudaMalloc((void **) &dev_x, n * incx * sizeof(float)); cudaMalloc((void **) &dev_y, n * incy * sizeof(float)); for (int i = 0; i < n * incx; ++i) x[i] = float(rand()) / RAND_MAX; for (int i = 0; i < n * incy; ++i) y[i] = float(rand()) / RAND_MAX; cudaMemcpy(dev_x, x, n * incx * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(dev_y, y, n * incy * sizeof(float), cudaMemcpyHostToDevice); clock_t start = clock(); saxpy(n, a, x, incx, y, incy); clock_t finish = clock(); std::cout.setf(std::ios_base::fixed); std::cout.precision(5); double cpu_time = (finish - start) /(double) CLOCKS_PER_SEC; std::cout << "CPU " << cpu_time << " seconds.\n"; start = clock(); saxpy_openmp(n, a, x, incx, y, incy); finish = clock(); double cpu_openmp_time = (finish - start) /(double) CLOCKS_PER_SEC; std::cout << "CPU_openmp " << cpu_openmp_time << " seconds.\n"; start = clock(); saxpy_gpu(n, a, dev_x, incx, dev_y, incy); cudaThreadSynchronize(); finish = clock(); double gpu_time = (float) (finish - start) / (double)CLOCKS_PER_SEC; std::cout << "GPU " << gpu_time << " seconds.\n"; cudaMemcpy(y_verify, dev_y, n * incy * sizeof(float), cudaMemcpyDeviceToHost); double diff = 0; for (int i = 0; i < n * incy; ++i) diff += (y[i] - y_verify[i]) * (y[i] - y_verify[i]); if (diff < 1e-4f) std::cout << "Correct.\n"; else std::cout << "Correct.\n"; //// //// а теперь тоже самое для double //// //double * xd, * yd, * dev_xd, * dev_yd, * y_verifyd; //const float ad = float(rand()) / RAND_MAX; // xd = new double[n * incx]; //yd = new double[n * incy]; //y_verifyd = new double[n * incy]; // //cudaMalloc((void **) &dev_xd, n * incx * sizeof(double)); // cudaMalloc((void **) &dev_yd, n * incy * sizeof(double)); // for (int i = 0; i < n * incx; ++i) // xd[i] = 
double(rand()) / RAND_MAX; // for (int i = 0; i < n * incy; ++i) // yd[i] = double(rand()) / RAND_MAX; // cudaMemcpy(dev_xd, xd, n * incx * sizeof(double), cudaMemcpyHostToDevice); // cudaMemcpy(dev_yd, yd, n * incy * sizeof(double), cudaMemcpyHostToDevice); //start = clock(); // for (int i = 0; i < iterations; ++i) // daxpy(n, ad, xd, incx, yd, incy); // finish = clock(); // cpu_time = (finish - start) / CLOCKS_PER_SEC; // std::cout << "CPU " << cpu_time << " seconds.\n"; //start = clock(); // for (int i = 0; i < iterations; ++i) // daxpy_gpu(n, ad, dev_xd, incx, dev_yd, incy); // // cudaThreadSynchronize(); // finish = clock(); // gpu_time = (float) (finish - start) / CLOCKS_PER_SEC; // std::cout << "GPU " << gpu_time << " seconds.\n"; // cudaMemcpy(y_verifyd, dev_yd, n * incy * sizeof(double), cudaMemcpyDeviceToHost); //diff = 0; // for (int i = 0; i < n * incy; ++i) // diff += (yd[i] - y_verifyd[i]) * (yd[i] - y_verifyd[i]); // if (diff < 1e-1f) // std::cout << "Correct.\n"; // else // std::cout << "Incorrect.\n"; system("pause"); cudaFree(dev_x); cudaFree(dev_y); //cudaFree(dev_xd); // cudaFree(dev_yd); delete [] x; delete [] y; delete [] y_verify; //delete [] xd; //delete [] yd; //delete [] y_verifyd; }
/* Smoke test for the single-precision BLAS-style routines.  Calls isamax,
 * sasum, snrm2, sdot, sscal, sswap, saxpy and scopy on the vectors sx and sy
 * (n = N elements; sx, sy, N and pvec are defined outside this block) with
 * unit, non-unit and negative strides, printing each scalar result or the
 * vectors after each call.  Each scaling/swap/axpy call is followed by its
 * inverse so the vectors are restored before the next group. */
int main()
{
	int n=N;

	printf("isamax = %d\n",isamax(n,sx,1));
	printf("isamax = %d\n",isamax(n/2,sx,2));
	printf("isamax = %d\n",isamax(n,sy,1));

	printf("sasum = %g\n",sasum(n,sx,1));
	printf("sasum = %g\n",sasum(n/2,sx,2));
	printf("sasum = %g\n",sasum(n,sy,1));

	printf("snrm2 = %g\n",snrm2(n,sx,1));
	printf("snrm2 = %g\n",snrm2(n/2,sx,2));
	printf("snrm2 = %g\n",snrm2(n,sy,1));

	printf("sdot = %g\n",sdot(n,sx,1,sy,1));
	printf("sdot = %g\n",sdot(n/2,sx,2,sy,2));
	printf("sdot = %g\n",sdot(n/2,sx,-2,sy,2));
	printf("sdot = %g\n",sdot(n,sy,1,sy,1));

	printf("sscal\n");
	sscal(n,2.0,sx,1);
	pvec(n,sx);
	sscal(n,0.5,sx,1);
	pvec(n,sx);
	sscal(n/2,2.0,sx,2);
	pvec(n,sx);
	sscal(n/2,0.5,sx,2);
	pvec(n,sx);

	printf("sswap\n");
	sswap(n,sx,1,sy,1);
	pvec(n,sx); pvec(n,sy);
	sswap(n,sy,1,sx,1);
	pvec(n,sx); pvec(n,sy);
	sswap(n/2,sx,1,sx+n/2,-1);
	pvec(n,sx);
	sswap(n/2,sx,1,sx+n/2,-1);
	pvec(n,sx);
	sswap(n/2,sx,2,sy,2);
	pvec(n,sx); pvec(n,sy);
	sswap(n/2,sx,2,sy,2);
	pvec(n,sx); pvec(n,sy);

	printf("saxpy\n");
	saxpy(n,2.0,sx,1,sy,1);
	pvec(n,sx); pvec(n,sy);
	saxpy(n,-2.0,sx,1,sy,1);
	pvec(n,sx); pvec(n,sy);
	saxpy(n/2,2.0,sx,2,sy,2);
	pvec(n,sx); pvec(n,sy);
	saxpy(n/2,-2.0,sx,2,sy,2);
	pvec(n,sx); pvec(n,sy);
	saxpy(n/2,2.0,sx,-2,sy,1);
	pvec(n,sx); pvec(n,sy);
	saxpy(n/2,-2.0,sx,-2,sy,1);
	pvec(n,sx); pvec(n,sy);

	printf("scopy\n");
	scopy(n/2,sx,2,sy,2);
	pvec(n,sx); pvec(n,sy);
	scopy(n/2,sx+1,2,sy+1,2);
	pvec(n,sx); pvec(n,sy);
	scopy(n/2,sx,2,sy,1);
	pvec(n,sx); pvec(n,sy);
	scopy(n/2,sx+1,-2,sy+n/2,-1);
	pvec(n,sx); pvec(n,sy);

	return 0;
}
/*************************************************************************
* Sets up the k-way refinement data of a multi-constraint graph from its
* current where[] assignment: zeroed-and-accumulated partition weights
* (npwgts), per-vertex internal/external degrees (rinfo), the per-vertex
* list of neighbouring partitions, the boundary list and the edge cut.
**************************************************************************/
void MocComputeKWayPartitionParams(CtrlType *ctrl, GraphType *graph, int nparts)
{
  int i, e, slot, nvtxs, ncon, nbnd, cut, mypart, nbrpart;
  idxtype *xadj, *adjncy, *adjwgt, *where, *bndind, *bndptr;
  RInfoType *rinfo, *myrinfo;
  EDegreeType *degs;
  float *nvwgt, *npwgts;

  nvtxs  = graph->nvtxs;
  ncon   = graph->ncon;
  xadj   = graph->xadj;
  nvwgt  = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where  = graph->where;
  bndind = graph->bndind;
  rinfo  = graph->rinfo;

  /* Reset the partition weights and mark every vertex as "not on the boundary" */
  npwgts = sset(ncon*nparts, 0.0, graph->npwgts);
  bndptr = idxset(nvtxs, -1, graph->bndptr);

  /* No external-degree workspace handed out yet */
  ctrl->wspace.cdegree = 0;

  nbnd = 0;
  cut = 0;
  for (i=0; i<nvtxs; i++) {
    mypart = where[i];

    /* Add the vertex's weight vector to its partition's weight vector */
    saxpy(ncon, 1.0, nvwgt+i*ncon, 1, npwgts+mypart*ncon, 1);

    myrinfo = rinfo+i;
    myrinfo->ndegrees = 0;
    myrinfo->ed = 0;
    myrinfo->id = 0;
    myrinfo->edegrees = NULL;

    /* External degree: weight of the edges leaving the vertex's partition;
       the internal degree is whatever is left of the total edge weight */
    e = xadj[i];
    while (e < xadj[i+1]) {
      if (where[adjncy[e]] != mypart)
        myrinfo->ed += adjwgt[e];
      e++;
    }
    myrinfo->id = graph->adjwgtsum[i] - myrinfo->ed;

    if (myrinfo->ed-myrinfo->id >= 0)
      BNDInsert(nbnd, bndind, bndptr, i);

    /* Vertices without cut edges contribute nothing further */
    if (myrinfo->ed <= 0)
      continue;

    cut += myrinfo->ed;

    /* Reserve one workspace slot per incident edge and collect the external
       degree towards each distinct neighbouring partition */
    degs = myrinfo->edegrees = ctrl->wspace.edegrees+ctrl->wspace.cdegree;
    ctrl->wspace.cdegree += xadj[i+1]-xadj[i];

    for (e=xadj[i]; e<xadj[i+1]; e++) {
      nbrpart = where[adjncy[e]];
      if (nbrpart == mypart)
        continue;

      /* Look for an existing entry for this partition */
      slot = 0;
      while (slot < myrinfo->ndegrees && degs[slot].pid != nbrpart)
        slot++;

      if (slot < myrinfo->ndegrees) {
        degs[slot].ed += adjwgt[e];
      }
      else {
        degs[slot].pid = nbrpart;
        degs[slot].ed = adjwgt[e];
        myrinfo->ndegrees++;
      }
    }

    ASSERT(myrinfo->ndegrees <= xadj[i+1]-xadj[i]);
  }

  /* Every cut edge was counted from both of its end points */
  graph->mincut = cut/2;
  graph->nbnd = nbnd;
}
void sqrsl (float **x, int n, int k, float *qraux,
	float *y, float *qy, float *qty,
	float *b, float *rsd, float *xb, int job, int *info)
/*****************************************************************************
Use the output of sqrdc to compute coordinate transformations, projections,
and least squares solutions.  For k <= MIN(n,p), let xk be the matrix
	xk = (x[jpvt[0]], x[jpvt[1]], ..., x[jpvt[k-1]])
formed from columns jpvt[0], jpvt[1], ..., jpvt[k-1] of the original
n by p matrix x that was input to sqrdc.  (If no pivoting was done, xk
consists of the first k columns of x in their original order.)  sqrdc
produces a factored orthogonal matrix Q and an upper triangular matrix R
such that
	xk = Q * (R)
	         (0)
This information is contained in coded form in the arrays x and qraux.
******************************************************************************
Input:
x		matrix[p][n] containing output of sqrdc.
n		number of rows in the matrix xk; same as in sqrdc.
k		number of columns in the matrix xk; k must not be greater
		than MIN(n,p), where p is the same as in sqrdc.
qraux		array[p] containing auxiliary output from sqrdc.
y		array[n] to be manipulated by sqrsl.
job		specifies what is to be computed.  job has the decimal
		expansion ABCDE, with the following meaning:
		if A != 0, compute qy.
		if B, C, D, or E != 0, compute qty.
		if C != 0, compute b.
		if D != 0, compute rsd.
		if E != 0, compute xb.
		Note that a request to compute b, rsd, or xb automatically
		triggers the computation of qty, for which an array must
		be provided.

Output:
qy		array[n] containing Qy, if its computation has been
		requested.
qty		array[n] containing Q'y, if its computation has been
		requested.  Here Q' denotes the transpose of Q.
b		array[k] containing solution of the least squares problem:
			minimize norm2(y - xk*b),
		if its computation has been requested.  (Note that if
		pivoting was requested in sqrdc, the j-th component of
		b will be associated with column jpvt[j] of the original
		matrix x that was input into sqrdc.)
rsd		array[n] containing the least squares residual y - xk*b,
		if its computation has been requested.  rsd is also the
		orthogonal projection of y onto the orthogonal complement
		of the column space of xk.
xb		array[n] containing the least squares approximation xk*b,
		if its computation has been requested.  xb is also the
		orthogonal projection of y onto the column space of x.
info		=0 unless the computation of b has been requested and R
		is exactly singular.  In this case, info is the ONE-BASED
		index of the zero diagonal element of R that stopped the
		back substitution (info = j+1 for R[j][j] == 0), so that
		a zero in the very first diagonal element is still
		distinguishable from success, and b does not contain a
		valid solution.
******************************************************************************
Notes:
This function was adapted from LINPACK FORTRAN.  Because two-dimensional
arrays cannot be declared with variable dimensions in C, the matrix x
is actually a pointer to an array of pointers to floats, as declared
above and used below.

Elements of x are stored as follows:
	x[0][0]    x[1][0]    x[2][0]   ... x[k-1][0]
	x[0][1]    x[1][1]    x[2][1]   ... x[k-1][1]
	x[0][2]    x[1][2]    x[2][2]   ... x[k-1][2]
	  .          .          .       .      .
	  .          .          .        .     .
	x[0][n-1]  x[1][n-1]  x[2][n-1] ... x[k-1][n-1]

The parameters qy, qty, b, rsd, and xb are not referenced if their
computation is not requested and in this case can be replaced by NULL
pointers in the calling program.  To save storage, the user may in
some cases use the same array for different parameters in the calling
sequence.  A frequently occurring example is when one wishes to compute
any of b, rsd, or xb and does not need y or qty.  In this case one may
equivalence y, qty, and one of b, rsd, or xb, while providing separate
arrays for anything else that is to be computed.  Thus the calling
sequence
	sqrsl(x,n,k,qraux,y,NULL,y,b,y,NULL,110,&info)
will result in the computation of b and rsd, with rsd overwriting y.
More generally, each item in the following list contains groups of
permissible equivalences for a single calling sequence.
	1. (y,qty,b) (rsd) (xb) (qy)
	2. (y,qty,rsd) (b) (xb) (qy)
	3. (y,qty,xb) (b) (rsd) (qy)
	4. (y,qy) (qty,b) (rsd) (xb)
	5. (y,qy) (qty,rsd) (b) (xb)
	6. (y,qy) (qty,xb) (b) (rsd)
In any group the value returned in the array allocated to the group
corresponds to the last member of the group.
******************************************************************************
Author:  Dave Hale, Colorado School of Mines, 12/29/89
*****************************************************************************/
{
	int i,j,ju,cb,cqy,cqty,cr,cxb;
	float t,temp;

	/* set info flag */
	*info = 0;

	/* determine what is to be computed (decimal digits ABCDE of job) */
	cqy = job/10000!=0;
	cqty = job%10000!=0;
	cb = (job%1000)/100!=0;
	cr = (job%100)/10!=0;
	cxb = job%10!=0;
	ju = MIN(k,n-1);

	/* special action when n=1 (ju is also 0 when k is 0) */
	if (ju==0) {
		if (cqy) qy[0] = y[0];
		if (cqty) qty[0] = y[0];
		if (cxb) xb[0] = y[0];
		if (cb) {
			if (x[0][0]==0.0)
				*info = 1;
			else
				b[0] = y[0]/x[0][0];
		}
		if (cr) rsd[0] = 0.0;
		return;
	}

	/* set up to compute Qy or Q'y */
	if (cqy) scopy(n,y,1,qy,1);
	if (cqty) scopy(n,y,1,qty,1);
	if (cqy) {

		/* compute Qy: apply the transformations in reverse order;  */
		/* x[j][j] temporarily holds qraux[j] and is restored after */
		for (j=ju-1; j>=0; j--) {
			if (qraux[j]!=0.0) {
				temp = x[j][j];
				x[j][j] = qraux[j];
				t = -sdot(n-j,&x[j][j],1,&qy[j],1)/x[j][j];
				saxpy(n-j,t,&x[j][j],1,&qy[j],1);
				x[j][j] = temp;
			}
		}
	}
	if (cqty) {

		/* compute Q'y: same transformations in forward order */
		for (j=0; j<ju; j++) {
			if (qraux[j]!=0.0) {
				temp = x[j][j];
				x[j][j] = qraux[j];
				t = -sdot(n-j,&x[j][j],1,&qty[j],1)/x[j][j];
				saxpy(n-j,t,&x[j][j],1,&qty[j],1);
				x[j][j] = temp;
			}
		}
	}

	/* set up to compute b, rsd, or xb */
	if (cb) scopy(k,qty,1,b,1);
	if (cxb) scopy(k,qty,1,xb,1);
	if (cr && k<n) scopy(n-k,&qty[k],1,&rsd[k],1);
	if (cxb && k<n)
		for (i=k; i<n; i++)
			xb[i] = 0.0;
	if (cr)
		for (i=0; i<k; i++)
			rsd[i] = 0.0;
	if (cb) {

		/* compute b by back substitution with R */
		for (j=k-1; j>=0; j--) {
			if (x[j][j]==0.0) {
				/* one-based, as in LINPACK and in the n=1 case */
				/* above; a plain j would report a zero in     */
				/* R[0][0] as info=0, i.e. as success          */
				*info = j+1;
				break;
			}
			b[j] /= x[j][j];
			if (j!=0) {
				t = -b[j];
				saxpy(j,t,x[j],1,b,1);
			}
		}
	}
	if (cr || cxb) {

		/* compute rsd or xb as requested */
		for (j=ju-1; j>=0; j--) {
			if (qraux[j]!=0.0) {
				temp = x[j][j];
				x[j][j] = qraux[j];
				if (cr) {
					t = -sdot(n-j,&x[j][j],1,&rsd[j],1)/
						x[j][j];
					saxpy(n-j,t,&x[j][j],1,&rsd[j],1);
				}
				if (cxb) {
					t = -sdot(n-j,&x[j][j],1,&xb[j],1)/
						x[j][j];
					saxpy(n-j,t,&x[j][j],1,&xb[j],1);
				}
				x[j][j] = temp;
			}
		}
	}
}
/*************************************************************************
* This function performs an edge-based FM refinement of a two-way
* partition of a graph that has ncon weights per vertex.
*
*   ctrl    - control structure; used for workspace allocation and for
*             creating/freeing the priority queues
*   graph   - the graph; its where, id, ed, npwgts, bndptr and bndind arrays
*             are updated in place, and graph->mincut / graph->nbnd are
*             rewritten at the end of every pass
*   tpwgts  - two-element array (presumably the target weight fractions of
*             partitions 0 and 1 - confirm against the caller)
*   npasses - maximum number of passes to perform
*
* Every pass moves vertices one at a time between the two partitions,
* records the move count at which the best cut/balance was seen
* (mincutorder) and then undoes all the moves made after that point.
* The passes stop early when a pass accepted no improving move or left the
* cut unchanged.
**************************************************************************/
void MocFM_2WayEdgeRefine(CtrlType *ctrl, GraphType *graph, float *tpwgts, int npasses)
{
  int i, ii, j, k, l, kwgt, nvtxs, ncon, nbnd, nswaps, from, to, pass, me, limit, tmp, cnum;
  idxtype *xadj, *adjncy, *adjwgt, *where, *id, *ed, *bndptr, *bndind;
  idxtype *moved, *swaps, *perm, *qnum;
  float *nvwgt, *npwgts, mindiff[MAXNCON], origbal, minbal, newbal;
  PQueueType parts[MAXNCON][2];
  int higain, oldgain, mincut, initcut, newcut, mincutorder;
  float rtpwgts[2];

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  nvwgt = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where = graph->where;
  id = graph->id;
  ed = graph->ed;
  npwgts = graph->npwgts;
  bndptr = graph->bndptr;
  bndind = graph->bndind;

  /* Four nvtxs-sized work arrays; released by the four idxwspacefree calls
     at the end of the function */
  moved = idxwspacemalloc(ctrl, nvtxs);
  swaps = idxwspacemalloc(ctrl, nvtxs);
  perm = idxwspacemalloc(ctrl, nvtxs);
  qnum = idxwspacemalloc(ctrl, nvtxs);

  /* Number of consecutive non-improving moves tolerated in a pass:
     1% of the vertices, clamped to the range [25, 150] */
  limit = amin(amax(0.01*nvtxs, 25), 150);

  /* Initialize the queues */
  for (i=0; i<ncon; i++) {
    PQueueInit(ctrl, &parts[i][0], nvtxs, PLUS_GAINSPAN+1);
    PQueueInit(ctrl, &parts[i][1], nvtxs, PLUS_GAINSPAN+1);
  }
  /* qnum[i] selects the first index of parts[][] used for vertex i */
  for (i=0; i<nvtxs; i++)
    qnum[i] = samax(ncon, nvwgt+i*ncon);

  origbal = Compute2WayHLoadImbalance(ncon, npwgts, tpwgts);

  /* tpwgts scaled by the initial imbalance; handed to SelectQueue below */
  rtpwgts[0] = origbal*tpwgts[0];
  rtpwgts[1] = origbal*tpwgts[1];

  /* Disabled debug output:
  if (ctrl->dbglvl&DBG_REFINE) {
    printf("Parts: [");
    for (l=0; l<ncon; l++)
      printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
    printf("] T[%.3f %.3f], Nv-Nb[%5d, %5d]. ICut: %6d, LB: %.3f\n", tpwgts[0], tpwgts[1], graph->nvtxs, graph->nbnd, graph->mincut, origbal);
  }
  */

  /* moved[v] == -1 means v has not been moved in the current pass */
  idxset(nvtxs, -1, moved);
  for (pass=0; pass<npasses; pass++) { /* Do a number of passes */
    for (i=0; i<ncon; i++) {
      PQueueReset(&parts[i][0]);
      PQueueReset(&parts[i][1]);
    }

    mincutorder = -1;
    newcut = mincut = initcut = graph->mincut;
    for (i=0; i<ncon; i++)
      mindiff[i] = fabs(tpwgts[0]-npwgts[i]);
    minbal = Compute2WayHLoadImbalance(ncon, npwgts, tpwgts);

    ASSERT(ComputeCut(graph, where) == graph->mincut);
    ASSERT(CheckBnd(graph));

    /* Insert boundary nodes in the priority queues */
    nbnd = graph->nbnd;
    RandomPermute(nbnd, perm, 1);
    for (ii=0; ii<nbnd; ii++) {
      i = bndind[perm[ii]];
      ASSERT(ed[i] > 0 || id[i] == 0);
      ASSERT(bndptr[i] != -1);
      /* queue key is the gain ed-id of moving the vertex */
      PQueueInsert(&parts[qnum[i]][where[i]], i, ed[i]-id[i]);
    }

    for (nswaps=0; nswaps<nvtxs; nswaps++) {
      SelectQueue(ncon, npwgts, rtpwgts, &from, &cnum, parts);
      to = (from+1)%2;

      /* Stop when no queue was selected or the selected queue is empty */
      if (from == -1 || (higain = PQueueGetMax(&parts[cnum][from])) == -1)
        break;
      ASSERT(bndptr[higain] != -1);

      /* Tentatively transfer the vertex's weights from 'from' to 'to' */
      saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
      saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);

      newcut -= (ed[higain]-id[higain]);
      newbal = Compute2WayHLoadImbalance(ncon, npwgts, tpwgts);

      /* Accept as the new best state if the cut dropped without the balance
         exceeding the original one (within .00001), or if the cut is equal
         and the balance is better */
      if ((newcut < mincut && newbal-origbal <= .00001) ||
          (newcut == mincut && (newbal < minbal ||
                                (newbal == minbal && BetterBalance(ncon, npwgts, tpwgts, mindiff))))) {
        mincut = newcut;
        minbal = newbal;
        mincutorder = nswaps;
        for (i=0; i<ncon; i++)
          mindiff[i] = fabs(tpwgts[0]-npwgts[i]);
      }
      else if (nswaps-mincutorder > limit) { /* We hit the limit, undo last move */
        newcut += (ed[higain]-id[higain]);
        saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);
        saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
        break;
      }

      /* Commit the move and log it so that it can be rolled back */
      where[higain] = to;
      moved[higain] = nswaps;
      swaps[nswaps] = higain;

      /* Disabled debug output:
      if (ctrl->dbglvl&DBG_MOVEINFO) {
        printf("Moved %6d from %d(%d). Gain: %5d, Cut: %5d, NPwgts: ", higain, from, cnum, ed[higain]-id[higain], newcut);
        for (l=0; l<ncon; l++)
          printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
        printf(", %.3f LB: %.3f\n", minbal, newbal);
      }
      */

      /**************************************************************
      * Update the id[i]/ed[i] values of the affected nodes
      ***************************************************************/
      /* The moved vertex's internal and external degrees trade places */
      SWAP(id[higain], ed[higain], tmp);
      if (ed[higain] == 0 && xadj[higain] < xadj[higain+1])
        BNDDelete(nbnd, bndind, bndptr, higain);

      for (j=xadj[higain]; j<xadj[higain+1]; j++) {
        k = adjncy[j];
        oldgain = ed[k]-id[k];

        /* The edge to k becomes internal if k is in 'to', external otherwise */
        kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
        INC_DEC(id[k], ed[k], kwgt);

        /* Update its boundary information and queue position */
        if (bndptr[k] != -1) { /* If k was a boundary vertex */
          if (ed[k] == 0) { /* Not a boundary vertex any more */
            BNDDelete(nbnd, bndind, bndptr, k);
            if (moved[k] == -1)  /* Remove it if in the queues */
              PQueueDelete(&parts[qnum[k]][where[k]], k, oldgain);
          }
          else { /* If it has not been moved, update its position in the queue */
            if (moved[k] == -1)
              PQueueUpdate(&parts[qnum[k]][where[k]], k, oldgain, ed[k]-id[k]);
          }
        }
        else {
          if (ed[k] > 0) {  /* It will now become a boundary vertex */
            BNDInsert(nbnd, bndind, bndptr, k);
            if (moved[k] == -1)
              PQueueInsert(&parts[qnum[k]][where[k]], k, ed[k]-id[k]);
          }
        }
      }

    }


    /****************************************************************
    * Roll back computations
    *****************************************************************/
    for (i=0; i<nswaps; i++)
      moved[swaps[i]] = -1;  /* reset moved array */
    /* Undo, newest first, every move made after the best state */
    for (nswaps--; nswaps>mincutorder; nswaps--) {
      higain = swaps[nswaps];

      to = where[higain] = (where[higain]+1)%2;
      SWAP(id[higain], ed[higain], tmp);
      if (ed[higain] == 0 && bndptr[higain] != -1 && xadj[higain] < xadj[higain+1])
        BNDDelete(nbnd, bndind, bndptr, higain);
      else if (ed[higain] > 0 && bndptr[higain] == -1)
        BNDInsert(nbnd, bndind, bndptr, higain);

      saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
      saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+((to+1)%2)*ncon, 1);
      for (j=xadj[higain]; j<xadj[higain+1]; j++) {
        k = adjncy[j];

        kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
        INC_DEC(id[k], ed[k], kwgt);

        if (bndptr[k] != -1 && ed[k] == 0)
          BNDDelete(nbnd, bndind, bndptr, k);
        if (bndptr[k] == -1 && ed[k] > 0)
          BNDInsert(nbnd, bndind, bndptr, k);
      }
    }

    /* Disabled debug output:
    if (ctrl->dbglvl&DBG_REFINE) {
      printf("\tMincut: %6d at %5d, NBND: %6d, NPwgts: [", mincut, mincutorder, nbnd);
      for (l=0; l<ncon; l++)
        printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
      printf("], LB: %.3f\n", Compute2WayHLoadImbalance(ncon, npwgts, tpwgts));
    }
    */

    graph->mincut = mincut;
    graph->nbnd = nbnd;

    /* Nothing was gained in this pass, so further passes are skipped */
    if (mincutorder == -1 || mincut == initcut)
      break;
  }

  for (i=0; i<ncon; i++) {
    PQueueFree(ctrl, &parts[i][0]);
    PQueueFree(ctrl, &parts[i][1]);
  }

  /* One free per idxwspacemalloc above */
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);

}
/*************************************************************************
* This function balances two partitions by moving the highest gain
* (including negative gain) vertices to the other domain.
* It is used only when the imbalance is due to non-contiguous
* subdomains. That is, there are no boundary vertices.
* It moves vertices from the domain that is overweight to the one that
* is underweight.
*
*   ctrl   - control structure; used for workspace allocation, the priority
*            queues and the debug-level flags
*   graph  - the graph; where, id, ed, npwgts, bndptr and bndind are updated
*            in place and graph->mincut / graph->nbnd are rewritten on exit
*   tpwgts - two-element array (presumably the target weight fractions of
*            partitions 0 and 1 - confirm against the caller)
*
* Vertices are always taken from partition 1 and moved to partition 0; no
* move is ever rolled back.
**************************************************************************/
void MocInit2WayBalance(CtrlType *ctrl, GraphType *graph, float *tpwgts)
{
  int i, ii, j, k, l, kwgt, nvtxs, nbnd, ncon, nswaps, from, to, pass, me, cnum, tmp;
  idxtype *xadj, *adjncy, *adjwgt, *where, *id, *ed, *bndptr, *bndind;
  idxtype *perm, *qnum;
  float *nvwgt, *npwgts;
  PQueueType parts[MAXNCON][2];
  int higain, oldgain, mincut;

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  adjncy = graph->adjncy;
  nvwgt = graph->nvwgt;
  adjwgt = graph->adjwgt;
  where = graph->where;
  id = graph->id;
  ed = graph->ed;
  npwgts = graph->npwgts;
  bndptr = graph->bndptr;
  bndind = graph->bndind;

  /* Two nvtxs-sized work arrays; released by the two idxwspacefree calls at
     the end of the function */
  perm = idxwspacemalloc(ctrl, nvtxs);
  qnum = idxwspacemalloc(ctrl, nvtxs);

  /* This is called for initial partitioning so we know from where to pick nodes */
  from = 1;
  to = (from+1)%2;

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("Parts: [");
    for (l=0; l<ncon; l++)
      printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
    printf("] T[%.3f %.3f], Nv-Nb[%5d, %5d]. ICut: %6d, LB: %.3f [B]\n", tpwgts[0], tpwgts[1], graph->nvtxs, graph->nbnd, graph->mincut, Compute2WayHLoadImbalance(ncon, npwgts, tpwgts));
  }

  /* Unlike in the refinement routines, the second index of parts[][] is not
     a partition here: [0] holds vertices with ed > 0, [1] holds the rest */
  for (i=0; i<ncon; i++) {
    PQueueInit(ctrl, &parts[i][0], nvtxs, PLUS_GAINSPAN+1);
    PQueueInit(ctrl, &parts[i][1], nvtxs, PLUS_GAINSPAN+1);
  }

  ASSERT(ComputeCut(graph, where) == graph->mincut);
  ASSERT(CheckBnd(graph));
  ASSERT(CheckGraph(graph));

  /* Compute the queues in which each vertex will be assigned to */
  for (i=0; i<nvtxs; i++)
    qnum[i] = samax(ncon, nvwgt+i*ncon);

  /* Insert the nodes of the proper partition in the appropriate priority queue */
  RandomPermute(nvtxs, perm, 1);
  for (ii=0; ii<nvtxs; ii++) {
    i = perm[ii];
    if (where[i] == from) {
      if (ed[i] > 0)
        PQueueInsert(&parts[qnum[i]][0], i, ed[i]-id[i]);
      else
        PQueueInsert(&parts[qnum[i]][1], i, ed[i]-id[i]);
    }
  }


  mincut = graph->mincut;
  nbnd = graph->nbnd;
  for (nswaps=0; nswaps<nvtxs; nswaps++) {
    /* Termination test on the weights of the 'from' partition */
    if (AreAnyVwgtsBelow(ncon, 1.0, npwgts+from*ncon, 0.0, nvwgt, tpwgts[from]))
      break;

    if ((cnum = SelectQueueOneWay(ncon, npwgts, tpwgts, from, parts)) == -1)
      break;

    /* Prefer a vertex with ed > 0; fall back to the other queue when the
       first one is empty */
    if ((higain = PQueueGetMax(&parts[cnum][0])) == -1)
      higain = PQueueGetMax(&parts[cnum][1]);

    mincut -= (ed[higain]-id[higain]);
    /* Transfer the vertex's weights from 'from' to 'to' */
    saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
    saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);

    where[higain] = to;

    if (ctrl->dbglvl&DBG_MOVEINFO) {
      printf("Moved %6d from %d(%d). [%5d] %5d, NPwgts: ", higain, from, cnum, ed[higain]-id[higain], mincut);
      for (l=0; l<ncon; l++)
        printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
      printf(", LB: %.3f\n", Compute2WayHLoadImbalance(ncon, npwgts, tpwgts));
      if (ed[higain] == 0 && id[higain] > 0)
        printf("\t Pulled from the interior!\n");
    }


    /**************************************************************
    * Update the id[i]/ed[i] values of the affected nodes
    ***************************************************************/
    /* The moved vertex's internal and external degrees trade places */
    SWAP(id[higain], ed[higain], tmp);
    if (ed[higain] == 0 && bndptr[higain] != -1 && xadj[higain] < xadj[higain+1])
      BNDDelete(nbnd, bndind, bndptr, higain);
    if (ed[higain] > 0 && bndptr[higain] == -1)
      BNDInsert(nbnd, bndind, bndptr, higain);

    for (j=xadj[higain]; j<xadj[higain+1]; j++) {
      k = adjncy[j];
      oldgain = ed[k]-id[k];

      /* The edge to k becomes internal if k is in 'to', external otherwise */
      kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
      INC_DEC(id[k], ed[k], kwgt);

      /* Update the queue position */
      if (where[k] == from) {
        if (ed[k] > 0 && bndptr[k] == -1) {  /* It moves in boundary */
          PQueueDelete(&parts[qnum[k]][1], k, oldgain);
          PQueueInsert(&parts[qnum[k]][0], k, ed[k]-id[k]);
        }
        else { /* It must be in the boundary already */
          if (bndptr[k] == -1)
            printf("What you thought was wrong!\n");
          PQueueUpdate(&parts[qnum[k]][0], k, oldgain, ed[k]-id[k]);
        }
      }

      /* Update its boundary information */
      if (ed[k] == 0 && bndptr[k] != -1)
        BNDDelete(nbnd, bndind, bndptr, k);
      else if (ed[k] > 0 && bndptr[k] == -1)
        BNDInsert(nbnd, bndind, bndptr, k);
    }

    ASSERTP(ComputeCut(graph, where) == mincut, ("%d != %d\n", ComputeCut(graph, where), mincut));

  }

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("\tMincut: %6d, NBND: %6d, NPwgts: ", mincut, nbnd);
    for (l=0; l<ncon; l++)
      printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
    printf(", LB: %.3f\n", Compute2WayHLoadImbalance(ncon, npwgts, tpwgts));
  }

  graph->mincut = mincut;
  graph->nbnd = nbnd;

  for (i=0; i<ncon; i++) {
    PQueueFree(ctrl, &parts[i][0]);
    PQueueFree(ctrl, &parts[i][1]);
  }

  ASSERT(ComputeCut(graph, where) == graph->mincut);
  ASSERT(CheckBnd(graph));

  /* One free per idxwspacemalloc above */
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
}
void sgeco (float **a, int n, int *ipvt, float *rcond, float *z)
/*****************************************************************************
LU-factor a square matrix by Gaussian elimination (through sgefa) and
estimate the reciprocal of its condition number.
******************************************************************************
Input:
a		matrix[n][n] to be factored (see notes below)
n		dimension of a

Output:
a		matrix[n][n] factored (see notes below)
ipvt		indices of pivot permutations (see notes below)
rcond		reciprocal condition number (see notes below)

Workspace:
z		array[n]
******************************************************************************
Notes:
Adapted from LINPACK FORTRAN.  C cannot declare two-dimensional arrays
with variable dimensions, so a is a pointer to an array of pointers to
floats; a[i] is the i-th column and a[i][j] its j-th row entry:
	a[0][0]    a[1][0]    a[2][0]   ... a[n-1][0]
	a[0][1]    a[1][1]    a[2][1]   ... a[n-1][1]
	a[0][2]    a[1][2]    a[2][2]   ... a[n-1][2]
	  .          .          .       .      .
	  .          .          .        .     .
	a[0][n-1]  a[1][n-1]  a[2][n-1] ... a[n-1][n-1]

The factored a and the pivot indices ipvt are both needed by sgesl to
solve linear systems.

With rcond and the float epsilon FLT_EPSILON, the number of significant
decimal digits nsdd in the solution of a linear system may be estimated by
	nsdd = (int)log10(rcond/FLT_EPSILON)
******************************************************************************
Author:  Dave Hale, Colorado School of Mines, 10/01/89
*****************************************************************************/
{
	int status,idx,step,next,swp;
	float e,tmp,w,wneg,norma,sc,altsum,ynrm;

	/* 1-norm of a: the largest sasum over the columns */
	norma = 0.0;
	for (idx=0; idx<n; idx++) {
		tmp = sasum(n,a[idx],1);
		if (tmp>norma) norma = tmp;
	}

	/* LU factorization */
	sgefa(a,n,ipvt,&status);

	/* rcond = 1/(norm(a)*(estimate of norm(inverse(a)))), where the
	 * estimate is norm(z)/norm(y) with Az = y and A'y = e (A' is the
	 * transpose of A).  The entries of e are picked to make the
	 * elements of w in U'w = e grow as much as possible locally, and
	 * the vectors are rescaled often to stay clear of overflow. */

	/* stage 1: U'w = e */
	e = 1.0;
	for (idx=0; idx<n; idx++)
		z[idx] = 0.0;
	for (step=0; step<n; step++) {

		/* give e the sign opposite to z[step] */
		if (z[step]>0.0)
			e = -ABS(e);
		else if (z[step]!=0.0)
			e = ABS(e);

		if (ABS(e-z[step])>ABS(a[step][step])) {
			sc = ABS(a[step][step])/ABS(e-z[step]);
			sscal(n,sc,z,1);
			e *= sc;
		}

		/* the two candidate values, for +e and for -e */
		w = e-z[step];
		wneg = -e-z[step];
		sc = ABS(w);
		altsum = ABS(wneg);
		if (a[step][step]!=0.0) {
			w = w/a[step][step];
			wneg = wneg/a[step][step];
		} else {
			w = 1.0;
			wneg = 1.0;
		}

		next = step+1;
		if (next<n) {
			for (idx=next; idx<n; idx++) {
				tmp = z[idx]+wneg*a[idx][step];
				altsum += ABS(tmp);
				z[idx] += w*a[idx][step];
				sc += ABS(z[idx]);
			}
			/* switch to the -e candidate when it grows more */
			if (sc<altsum) {
				tmp = wneg-w;
				w = wneg;
				for (idx=next; idx<n; idx++)
					z[idx] += tmp*a[idx][step];
			}
		}
		z[step] = w;
	}
	sc = 1.0/sasum(n,z,1);
	sscal(n,sc,z,1);

	/* stage 2: L'y = w */
	step = n-1;
	while (step>=0) {
		if (step<n-1)
			z[step] += sdot(n-step-1,&a[step][step+1],1,&z[step+1],1);
		if (ABS(z[step])>1.0) {
			sc = 1.0/ABS(z[step]);
			sscal(n,sc,z,1);
		}
		swp = ipvt[step];
		tmp = z[swp];
		z[swp] = z[step];
		z[step] = tmp;
		step--;
	}
	sc = 1.0/sasum(n,z,1);
	sscal(n,sc,z,1);

	ynrm = 1.0;

	/* stage 3: Lv = y */
	for (step=0; step<n; step++) {
		swp = ipvt[step];
		tmp = z[swp];
		z[swp] = z[step];
		z[step] = tmp;
		if (step<n-1)
			saxpy(n-step-1,tmp,&a[step][step+1],1,&z[step+1],1);
		if (ABS(z[step])>1.0) {
			sc = 1.0/ABS(z[step]);
			sscal(n,sc,z,1);
			ynrm *= sc;
		}
	}
	sc = 1.0/sasum(n,z,1);
	sscal(n,sc,z,1);
	ynrm *= sc;

	/* stage 4: Uz = v */
	step = n-1;
	while (step>=0) {
		if (ABS(z[step])>ABS(a[step][step])) {
			sc = ABS(a[step][step])/ABS(z[step]);
			sscal(n,sc,z,1);
			ynrm *= sc;
		}
		if (a[step][step]!=0.0)
			z[step] /= a[step][step];
		else
			z[step] = 1.0;
		tmp = -z[step];
		saxpy(step,tmp,a[step],1,z,1);
		step--;
	}

	/* normalize z so that znorm = 1.0 */
	sc = 1.0/sasum(n,z,1);
	sscal(n,sc,z,1);
	ynrm *= sc;

	if (norma==0.0)
		*rcond = 0.0;
	else
		*rcond = ynrm/norma;
}
/*************************************************************************
* This function balances a two-way partition of a graph that has ncon
* weights per vertex, using FM-style vertex moves.
*
*   ctrl   - control structure; used for workspace allocation, the priority
*            queues and the debug-level flags
*   graph  - the graph; where, id, ed, npwgts, bndptr and bndind are updated
*            in place and graph->mincut / graph->nbnd are rewritten on exit
*   tpwgts - two-element array (presumably the target weight fractions of
*            partitions 0 and 1 - confirm against the caller)
*   ubvec  - ncon-element array (presumably the allowed imbalance per
*            constraint - confirm against the caller)
*
* All vertices, not only boundary ones, are placed in the queues. A single
* sequence of moves is made; it stops as soon as AreAllBelow(ncon, minbal,
* ubvec) holds, when no vertex can be selected, or after more than 'limit'
* consecutive moves that were not accepted as a new best state. Moves made
* after the best recorded state (mincutorder) are then rolled back.
**************************************************************************/
void MocGeneral2WayBalance2(CtrlType *ctrl, GraphType *graph, float *tpwgts, float *ubvec)
{
  int i, ii, j, k, l, kwgt, nvtxs, ncon, nbnd, nswaps, from, to, limit, tmp, cnum;
  idxtype *xadj, *adjncy, *adjwgt, *where, *id, *ed, *bndptr, *bndind;
  idxtype *moved, *swaps, *perm, *qnum;
  float *nvwgt, *npwgts, origbal[MAXNCON], minbal[MAXNCON], newbal[MAXNCON];
  PQueueType parts[MAXNCON][2];
  int higain, oldgain, mincut, newcut, mincutorder;
  float *maxwgt, *minwgt, tvec[MAXNCON];


  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  nvwgt = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  where = graph->where;
  id = graph->id;
  ed = graph->ed;
  npwgts = graph->npwgts;
  bndptr = graph->bndptr;
  bndind = graph->bndind;

  /* Four nvtxs-sized work arrays; released by the four idxwspacefree calls
     at the end of the function */
  moved = idxwspacemalloc(ctrl, nvtxs);
  swaps = idxwspacemalloc(ctrl, nvtxs);
  perm = idxwspacemalloc(ctrl, nvtxs);
  qnum = idxwspacemalloc(ctrl, nvtxs);

  /* Number of consecutive non-accepted moves tolerated:
     1% of the vertices, clamped to the range [15, 100] */
  limit = amin(amax(0.01*nvtxs, 15), 100);

  /* Setup the weight intervals of the two subdomains */
  minwgt = fwspacemalloc(ctrl, 2*ncon);
  maxwgt = fwspacemalloc(ctrl, 2*ncon);

  /* NOTE(review): minwgt is filled in but not read anywhere in this
     function; only maxwgt is passed on (to SelectQueue3) */
  for (i=0; i<2; i++) {
    for (j=0; j<ncon; j++) {
      maxwgt[i*ncon+j] = tpwgts[i]*ubvec[j];
      minwgt[i*ncon+j] = tpwgts[i]*(1.0/ubvec[j]);
    }
  }


  /* Initialize the queues */
  for (i=0; i<ncon; i++) {
    PQueueInit(ctrl, &parts[i][0], nvtxs, PLUS_GAINSPAN+1);
    PQueueInit(ctrl, &parts[i][1], nvtxs, PLUS_GAINSPAN+1);
  }
  /* qnum[i] selects the first index of parts[][] used for vertex i */
  for (i=0; i<nvtxs; i++)
    qnum[i] = samax(ncon, nvwgt+i*ncon);

  Compute2WayHLoadImbalanceVec(ncon, npwgts, tpwgts, origbal);
  for (i=0; i<ncon; i++)
    minbal[i] = origbal[i];

  newcut = mincut = graph->mincut;
  mincutorder = -1;

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("Parts: [");
    for (l=0; l<ncon; l++)
      printf("(%.3f, %.3f) ", npwgts[l], npwgts[ncon+l]);
    printf("] T[%.3f %.3f], Nv-Nb[%5d, %5d]. ICut: %6d, LB: ", tpwgts[0], tpwgts[1], graph->nvtxs, graph->nbnd, graph->mincut);
    for (i=0; i<ncon; i++)
      printf("%.3f ", origbal[i]);
    printf("[B]\n");
  }

  /* moved[v] == -1 means v has not been moved */
  idxset(nvtxs, -1, moved);

  ASSERT(ComputeCut(graph, where) == graph->mincut);
  ASSERT(CheckBnd(graph));

  /* Insert all nodes in the priority queues */
  nbnd = graph->nbnd;
  RandomPermute(nvtxs, perm, 1);
  for (ii=0; ii<nvtxs; ii++) {
    i = perm[ii];
    PQueueInsert(&parts[qnum[i]][where[i]], i, ed[i]-id[i]);
  }


  for (nswaps=0; nswaps<nvtxs; nswaps++) {
    /* Done once the best balance found so far passes the ubvec test */
    if (AreAllBelow(ncon, minbal, ubvec))
      break;

    SelectQueue3(ncon, npwgts, tpwgts, &from, &cnum, parts, maxwgt);
    to = (from+1)%2;

    /* Stop when no queue was selected or the selected queue is empty */
    if (from == -1 || (higain = PQueueGetMax(&parts[cnum][from])) == -1)
      break;

    /* Tentatively transfer the vertex's weights from 'from' to 'to' */
    saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
    saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);

    newcut -= (ed[higain]-id[higain]);
    Compute2WayHLoadImbalanceVec(ncon, npwgts, tpwgts, newbal);

    /* Accept as the new best state if the balance beats the best one so far,
       or beats the original one while also lowering the cut */
    if (IsBetter2wayBalance(ncon, newbal, minbal, ubvec) ||
        (IsBetter2wayBalance(ncon, newbal, origbal, ubvec) && newcut < mincut)) {
      mincut = newcut;
      for (i=0; i<ncon; i++)
        minbal[i] = newbal[i];
      mincutorder = nswaps;
    }
    else if (nswaps-mincutorder > limit) { /* We hit the limit, undo last move */
      newcut += (ed[higain]-id[higain]);
      saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+from*ncon, 1);
      saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
      break;
    }

    /* Commit the move and log it so that it can be rolled back */
    where[higain] = to;
    moved[higain] = nswaps;
    swaps[nswaps] = higain;

    if (ctrl->dbglvl&DBG_MOVEINFO) {
      printf("Moved %6d from %d(%d). Gain: %5d, Cut: %5d, NPwgts: ", higain, from, cnum, ed[higain]-id[higain], newcut);
      for (i=0; i<ncon; i++)
        printf("(%.3f, %.3f) ", npwgts[i], npwgts[ncon+i]);

      Compute2WayHLoadImbalanceVec(ncon, npwgts, tpwgts, tvec);
      printf(", LB: ");
      for (i=0; i<ncon; i++)
        printf("%.3f ", tvec[i]);
      /* a trailing '*' marks a move that became the new best state */
      if (mincutorder == nswaps)
        printf(" *\n");
      else
        printf("\n");
    }


    /**************************************************************
    * Update the id[i]/ed[i] values of the affected nodes
    ***************************************************************/
    /* The moved vertex's internal and external degrees trade places */
    SWAP(id[higain], ed[higain], tmp);
    if (ed[higain] == 0 && bndptr[higain] != -1 && xadj[higain] < xadj[higain+1])
      BNDDelete(nbnd, bndind, bndptr, higain);
    if (ed[higain] > 0 && bndptr[higain] == -1)
      BNDInsert(nbnd, bndind, bndptr, higain);

    for (j=xadj[higain]; j<xadj[higain+1]; j++) {
      k = adjncy[j];
      oldgain = ed[k]-id[k];

      /* The edge to k becomes internal if k is in 'to', external otherwise */
      kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
      INC_DEC(id[k], ed[k], kwgt);

      /* Update the queue position */
      if (moved[k] == -1)
        PQueueUpdate(&parts[qnum[k]][where[k]], k, oldgain, ed[k]-id[k]);

      /* Update its boundary information */
      if (ed[k] == 0 && bndptr[k] != -1)
        BNDDelete(nbnd, bndind, bndptr, k);
      else if (ed[k] > 0 && bndptr[k] == -1)
        BNDInsert(nbnd, bndind, bndptr, k);
    }

  }



  /****************************************************************
  * Roll back computations
  *****************************************************************/
  for (i=0; i<nswaps; i++)
    moved[swaps[i]] = -1;  /* reset moved array */
  /* Undo, newest first, every move made after the best state */
  for (nswaps--; nswaps>mincutorder; nswaps--) {
    higain = swaps[nswaps];

    to = where[higain] = (where[higain]+1)%2;
    SWAP(id[higain], ed[higain], tmp);
    if (ed[higain] == 0 && bndptr[higain] != -1 && xadj[higain] < xadj[higain+1])
      BNDDelete(nbnd, bndind, bndptr, higain);
    else if (ed[higain] > 0 && bndptr[higain] == -1)
      BNDInsert(nbnd, bndind, bndptr, higain);

    saxpy(ncon, 1.0, nvwgt+higain*ncon, 1, npwgts+to*ncon, 1);
    saxpy(ncon, -1.0, nvwgt+higain*ncon, 1, npwgts+((to+1)%2)*ncon, 1);
    for (j=xadj[higain]; j<xadj[higain+1]; j++) {
      k = adjncy[j];

      kwgt = (to == where[k] ? adjwgt[j] : -adjwgt[j]);
      INC_DEC(id[k], ed[k], kwgt);

      if (bndptr[k] != -1 && ed[k] == 0)
        BNDDelete(nbnd, bndind, bndptr, k);
      if (bndptr[k] == -1 && ed[k] > 0)
        BNDInsert(nbnd, bndind, bndptr, k);
    }
  }

  if (ctrl->dbglvl&DBG_REFINE) {
    printf("\tMincut: %6d at %5d, NBND: %6d, NPwgts: [", mincut, mincutorder, nbnd);
    for (i=0; i<ncon; i++)
      printf("(%.3f, %.3f) ", npwgts[i], npwgts[ncon+i]);
    printf("], LB: ");
    Compute2WayHLoadImbalanceVec(ncon, npwgts, tpwgts, tvec);
    for (i=0; i<ncon; i++)
      printf("%.3f ", tvec[i]);
    printf("\n");
  }

  graph->mincut = mincut;
  graph->nbnd = nbnd;


  for (i=0; i<ncon; i++) {
    PQueueFree(ctrl, &parts[i][0]);
    PQueueFree(ctrl, &parts[i][1]);
  }

  /* One free per idxwspacemalloc / fwspacemalloc above */
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
  fwspacefree(ctrl, 2*ncon);
  fwspacefree(ctrl, 2*ncon);

}
void sgesl (float **a, int n, int *ipvt, float *b, int job)
/*****************************************************************************
Solve Ax = b or A'x = b using an LU factorization of A
******************************************************************************
Input:
a       matrix[n][n] holding the LU factors (see notes below)
n       dimension of a
ipvt    pivot permutation indices that accompany the factors
b       right-hand-side vector[n]
job     0 solves Ax = b; any other value takes the A'x = b branch
        (the documented value for that case is 1)

Output:
b       overwritten in place with the solution vector[n]
******************************************************************************
Notes:
Adapted from LINPACK FORTRAN.  C cannot declare two-dimensional arrays
with run-time dimensions, so a is a pointer to an array of pointers to
floats and is indexed as a[i][j] below.
******************************************************************************
Author:     Dave Hale, Colorado School of Mines, 10/01/89
*****************************************************************************/
{
    int step, pivot;
    float t;

    if (job != 0) {

        /* transposed system, first stage: solve U'y = b */
        for (step = 0; step < n; ++step) {
            t = sdot(step, a[step], 1, b, 1);
            b[step] = (b[step] - t) / a[step][step];
        }

        /* transposed system, second stage: solve L'x = y, applying the
           recorded interchanges in reverse order */
        for (step = n - 2; step >= 0; --step) {
            b[step] += sdot(n - step - 1, &a[step][step + 1], 1,
                            &b[step + 1], 1);
            pivot = ipvt[step];
            if (pivot != step) {
                t = b[pivot];
                b[pivot] = b[step];
                b[step] = t;
            }
        }
        return;
    }

    /* direct system, first stage: solve Ly = b */
    for (step = 0; step < n - 1; ++step) {
        pivot = ipvt[step];
        t = b[pivot];   /* read before the interchange; reused as multiplier */
        if (pivot != step) {
            b[pivot] = b[step];
            b[step] = t;
        }
        saxpy(n - step - 1, t, &a[step][step + 1], 1, &b[step + 1], 1);
    }

    /* direct system, second stage: solve Ux = y */
    for (step = n - 1; step >= 0; --step) {
        b[step] /= a[step][step];
        t = -b[step];
        saxpy(step, t, a[step], 1, b, 1);
    }
}
/*************************************************************************
* This function performs greedy k-way balancing of a multi-constraint
* partition by moving boundary vertices between subdomains.
*
* Parameters:
*   ctrl    - control structure; supplies the workspace allocators, the
*             shared edegrees pool (wspace.edegrees / wspace.cdegree) and
*             the debug level
*   graph   - graph being refined; where[], npwgts[], rinfo[], the boundary
*             arrays (bndptr/bndind), mincut and nbnd are updated in place
*   nparts  - number of subdomains
*   ubvec   - imbalance tolerance, one entry per constraint (ncon entries
*             are read)
*   npasses - upper bound on the number of passes
*
* A pass stops the whole routine early when MocIsHBalanced() reports that
* the partition is already within tolerance, or when the pass performed
* no move.
*
* moved[] encodes the state of a vertex within a pass:
*   -1  not in the queue
*    2  currently in the queue
*    1  already extracted from the queue in this pass
*
* NOTE(review): the balance predicates (AreAllHVwgtsBelow/Above,
* IsHBalanceBetterFT/TT, MocIsHBalanced) are defined elsewhere; the
* comments below describe only how their results are used here.
**************************************************************************/
void MCGreedy_KWayEdgeBalanceHorizontal(CtrlType *ctrl, GraphType *graph, int nparts, float *ubvec, int npasses)
{
  int i, ii, /*iii,*/ j, /*jj,*/ k, /*l,*/ pass, nvtxs, ncon, nbnd, myndegrees, oldgain, gain, nmoves;
  int from, me, to;
  idxtype *xadj, *adjncy, *adjwgt;
  idxtype *where, *perm, *bndptr, *bndind, *moved;
  EDegreeType *myedegrees;
  RInfoType *myrinfo;
  PQueueType queue;
  float *npwgts, *nvwgt, *minwgt, *maxwgt, tvec[MAXNCON];

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;

  bndind = graph->bndind;
  bndptr = graph->bndptr;

  where = graph->where;
  npwgts = graph->npwgts;

  /* Setup the weight intervals of the various subdomains */
  minwgt = fwspacemalloc(ctrl, ncon*nparts);
  maxwgt = fwspacemalloc(ctrl, ncon*nparts);

  for (i=0; i<nparts; i++) {
    for (j=0; j<ncon; j++) {
      maxwgt[i*ncon+j] = ubvec[j]/nparts;
      minwgt[i*ncon+j] = 1.0/(ubvec[j]*nparts);
    }
  }

  perm = idxwspacemalloc(ctrl, nvtxs);
  moved = idxwspacemalloc(ctrl, nvtxs);

  PQueueInit(ctrl, &queue, nvtxs, graph->adjwgtsum[idxamax(nvtxs, graph->adjwgtsum)]);

  if (ctrl->dbglvl&DBG_REFINE) {
    /* The format string must stay on one physical line: a string literal
       cannot contain a raw line break. */
    printf("Partitions: [%5.4f %5.4f], Nv-Nb[%6d %6d]. Cut: %6d, LB: ",
           npwgts[samin(ncon*nparts, npwgts)], npwgts[samax(ncon*nparts, npwgts)],
           graph->nvtxs, graph->nbnd, graph->mincut);
    ComputeHKWayLoadImbalance(ncon, nparts, npwgts, tvec);
    for (i=0; i<ncon; i++)
      printf("%.3f ", tvec[i]);
    printf("[B]\n");
  }

  for (pass=0; pass<npasses; pass++) {
    ASSERT(ComputeCut(graph, where) == graph->mincut);

    /* Check to see if things are out of balance, given the tolerance */
    if (MocIsHBalanced(ncon, nparts, npwgts, ubvec))
      break;

    PQueueReset(&queue);
    idxset(nvtxs, -1, moved);

    /* Seed the queue with every boundary vertex, in random order, keyed by
       its external-minus-internal degree */
    nbnd = graph->nbnd;
    RandomPermute(nbnd, perm, 1);
    for (ii=0; ii<nbnd; ii++) {
      i = bndind[perm[ii]];
      PQueueInsert(&queue, i, graph->rinfo[i].ed - graph->rinfo[i].id);
      moved[i] = 2;
    }

    nmoves = 0;
    for (;;) {
      if ((i = PQueueGetMax(&queue)) == -1)
        break;
      moved[i] = 1;

      myrinfo = graph->rinfo+i;
      from = where[i];
      nvwgt = graph->nvwgt+i*ncon;

      if (AreAllHVwgtsBelow(ncon, 1.0, npwgts+from*ncon, -1.0, nvwgt, minwgt+from*ncon))
        continue;   /* This cannot be moved! */

      myedegrees = myrinfo->edegrees;
      myndegrees = myrinfo->ndegrees;

      /* First adjacent subdomain accepted by the from->to balance test */
      for (k=0; k<myndegrees; k++) {
        to = myedegrees[k].pid;
        if (IsHBalanceBetterFT(ncon, nparts, npwgts+from*ncon, npwgts+to*ncon, nvwgt, ubvec))
          break;
      }
      if (k == myndegrees)
        continue;  /* break out if you did not find a candidate */

      /* Let the remaining adjacent subdomains displace the current choice */
      for (j=k+1; j<myndegrees; j++) {
        to = myedegrees[j].pid;
        if (IsHBalanceBetterTT(ncon, nparts, npwgts+myedegrees[k].pid*ncon, npwgts+to*ncon, nvwgt, ubvec))
          k = j;
      }

      to = myedegrees[k].pid;

      /* j counts the independent reasons for making the move; the vertex is
         skipped only when none of the three conditions holds */
      j = 0;
      if (!AreAllHVwgtsBelow(ncon, 1.0, npwgts+from*ncon, 0.0, nvwgt, maxwgt+from*ncon))
        j++;
      if (myedegrees[k].ed-myrinfo->id >= 0)
        j++;
      if (!AreAllHVwgtsAbove(ncon, 1.0, npwgts+to*ncon, 0.0, nvwgt, minwgt+to*ncon) &&
          AreAllHVwgtsBelow(ncon, 1.0, npwgts+to*ncon, 1.0, nvwgt, maxwgt+to*ncon))
        j++;
      if (j == 0)
        continue;

/* DELETE
      if (myedegrees[k].ed-myrinfo->id < 0 &&
          AreAllHVwgtsBelow(ncon, 1.0, npwgts+from*ncon, 0.0, nvwgt, maxwgt+from*ncon) &&
          AreAllHVwgtsAbove(ncon, 1.0, npwgts+to*ncon, 0.0, nvwgt, minwgt+to*ncon) &&
          AreAllHVwgtsBelow(ncon, 1.0, npwgts+to*ncon, 1.0, nvwgt, maxwgt+to*ncon))
        continue;
*/
      /*=====================================================================
      * If we got here, we can now move the vertex from 'from' to 'to'
      *======================================================================*/
      graph->mincut -= myedegrees[k].ed-myrinfo->id;

      IFSET(ctrl->dbglvl, DBG_MOVEINFO, printf("\t\tMoving %6d to %3d. Gain: %4d. Cut: %6d\n", i, to, myedegrees[k].ed-myrinfo->id, graph->mincut));

      /* Update where, weight, and ID/ED information of the vertex you moved */
      saxpy(ncon, 1.0, nvwgt, 1, npwgts+to*ncon, 1);
      saxpy(ncon, -1.0, nvwgt, 1, npwgts+from*ncon, 1);
      where[i] = to;
      myrinfo->ed += myrinfo->id-myedegrees[k].ed;
      SWAP(myrinfo->id, myedegrees[k].ed, j);   /* j is only the swap temporary here */
      if (myedegrees[k].ed == 0)
        myedegrees[k] = myedegrees[--myrinfo->ndegrees];
      else
        myedegrees[k].pid = from;

      if (myrinfo->ed == 0)
        BNDDelete(nbnd, bndind, bndptr, i);

      /* Update the degrees of adjacent vertices */
      for (j=xadj[i]; j<xadj[i+1]; j++) {
        ii = adjncy[j];
        me = where[ii];

        myrinfo = graph->rinfo+ii;
        if (myrinfo->edegrees == NULL) {
          /* Carve this vertex's edegrees array out of the shared pool */
          myrinfo->edegrees = ctrl->wspace.edegrees+ctrl->wspace.cdegree;
          ctrl->wspace.cdegree += xadj[ii+1]-xadj[ii];
        }
        myedegrees = myrinfo->edegrees;

        ASSERT(CheckRInfo(myrinfo));

        oldgain = (myrinfo->ed-myrinfo->id);

        if (me == from) {
          INC_DEC(myrinfo->ed, myrinfo->id, adjwgt[j]);

          if (myrinfo->ed > 0 && bndptr[ii] == -1)
            BNDInsert(nbnd, bndind, bndptr, ii);
        }
        else if (me == to) {
          INC_DEC(myrinfo->id, myrinfo->ed, adjwgt[j]);

          if (myrinfo->ed == 0 && bndptr[ii] != -1)
            BNDDelete(nbnd, bndind, bndptr, ii);
        }

        /* Remove contribution from the .ed of 'from' */
        if (me != from) {
          for (k=0; k<myrinfo->ndegrees; k++) {
            if (myedegrees[k].pid == from) {
              if (myedegrees[k].ed == adjwgt[j])
                myedegrees[k] = myedegrees[--myrinfo->ndegrees];
              else
                myedegrees[k].ed -= adjwgt[j];
              break;
            }
          }
        }

        /* Add contribution to the .ed of 'to' */
        if (me != to) {
          for (k=0; k<myrinfo->ndegrees; k++) {
            if (myedegrees[k].pid == to) {
              myedegrees[k].ed += adjwgt[j];
              break;
            }
          }
          if (k == myrinfo->ndegrees) {
            myedegrees[myrinfo->ndegrees].pid = to;
            myedegrees[myrinfo->ndegrees++].ed = adjwgt[j];
          }
        }

        /* Update the queue */
        if (me == to || me == from) {
          gain = myrinfo->ed-myrinfo->id;
          if (moved[ii] == 2) {
            if (myrinfo->ed > 0)
              PQueueUpdate(&queue, ii, oldgain, gain);
            else {
              PQueueDelete(&queue, ii, oldgain);
              moved[ii] = -1;
            }
          }
          else if (moved[ii] == -1 && myrinfo->ed > 0) {
            PQueueInsert(&queue, ii, gain);
            moved[ii] = 2;
          }
        }

        ASSERT(myrinfo->ndegrees <= xadj[ii+1]-xadj[ii]);
        ASSERT(CheckRInfo(myrinfo));
      }
      nmoves++;
    }

    graph->nbnd = nbnd;

    if (ctrl->dbglvl&DBG_REFINE) {
      printf("\t [%5.4f %5.4f], Nb: %6d, Nmoves: %5d, Cut: %6d, LB: ",
             npwgts[samin(ncon*nparts, npwgts)], npwgts[samax(ncon*nparts, npwgts)],
             nbnd, nmoves, graph->mincut);
      ComputeHKWayLoadImbalance(ncon, nparts, npwgts, tvec);
      for (i=0; i<ncon; i++)
        printf("%.3f ", tvec[i]);
      printf("\n");
    }

    if (nmoves == 0)
      break;
  }

  PQueueFree(ctrl, &queue);

  /* Release the workspace allocations made above */
  fwspacefree(ctrl, ncon*nparts);
  fwspacefree(ctrl, ncon*nparts);
  idxwspacefree(ctrl, nvtxs);
  idxwspacefree(ctrl, nvtxs);
}
void sgefa (float **a, int n, int *ipvt, int *info)
/*****************************************************************************
Gaussian elimination to obtain the LU factorization of a matrix
******************************************************************************
Input:
a       matrix[n][n] to be factored (see notes below)
n       dimension of a

Output:
a       matrix[n][n] factored (see notes below)
ipvt    indices of pivot permutations (see notes below)
info    index of last zero pivot (or -1 if no zero pivots)
******************************************************************************
Notes:
This function was adapted from LINPACK FORTRAN.  Because two-dimensional
arrays cannot be declared with variable dimensions in C, the matrix a
is actually a pointer to an array of pointers to floats, as declared
above and used below.

Elements of a are stored as follows:
a[0][0]    a[1][0]    a[2][0]   ... a[n-1][0]
a[0][1]    a[1][1]    a[2][1]   ... a[n-1][1]
a[0][2]    a[1][2]    a[2][2]   ... a[n-1][2]
.                                       .
.             .                         .
.                        .              .
.                                       .
a[0][n-1]  a[1][n-1]  a[2][n-1] ... a[n-1][n-1]

Both the factored matrix a and the pivot indices ipvt are required
to solve linear systems of equations via sgesl.

When n <= 0 there is nothing to factor: info is set to -1 and neither a
nor ipvt is accessed.
******************************************************************************
Author:     Dave Hale, Colorado School of Mines, 10/01/89
*****************************************************************************/
{
    int j,k,kp1,l,nm1;
    float t;

    *info = -1;

    /* an empty system would otherwise index ipvt[-1] and a[-1][-1] below */
    if (n<=0) return;

    nm1 = n-1;
    for (k=0; k<nm1; k++) {
        kp1 = k+1;

        /* find l = pivot index */
        l = k+isamax(n-k,&a[k][k],1);
        ipvt[k] = l;

        /* zero pivot implies this column already triangularized */
        if (a[k][l]==0.0) {
            *info = k;
            continue;
        }

        /* if necessary, interchange */
        if (l!=k) {
            t = a[k][l];
            a[k][l] = a[k][k];
            a[k][k] = t;
        }

        /* compute multipliers */
        t = -1.0/a[k][k];
        sscal(n-k-1,t,&a[k][k+1],1);

        /* row elimination with column indexing */
        for (j=kp1; j<n; j++) {
            t = a[j][l];
            if (l!=k) {
                a[j][l] = a[j][k];
                a[j][k] = t;
            }
            saxpy(n-k-1,t,&a[k][k+1],1,&a[j][k+1],1);
        }
    }

    /* the last diagonal element needs no elimination, only the pivot check */
    ipvt[n-1] = n-1;
    if (a[n-1][n-1]==0.0) *info = n-1;
}
/*************************************************************************
* This function creates the coarser graph by contracting matched vertex
* pairs, using a small hash table to merge parallel edges.
*
* Parameters:
*   ctrl   - control structure (operation type, workspace, timers, debug)
*   graph  - the finer graph; graph->cmap must already map every fine
*            vertex to its coarse vertex number
*   cnvtxs - number of coarse vertices; used for the size heuristic and
*            for SetUpCoarseGraph(), then recounted while building
*   match  - match[v] is the vertex v is contracted with (v itself when
*            unmatched)
*   perm   - order in which the fine vertices are visited
*
* Small or dense graphs are delegated to CreateCoarseGraphNoMask().
*
* htable has mask+1 slots indexed by (coarse vertex & mask) and stores the
* position of that neighbour in the adjacency list being built, or -1.
* A slot can be shared by different neighbours, in which case the code
* falls back to a linear search of the list.
* NOTE(review): this presumes HTLENGTH is of the form 2^k-1 so that
* "& mask" acts as a hash - confirm against its definition.
**************************************************************************/
void CreateCoarseGraph(CtrlType *ctrl, GraphType *graph, int cnvtxs, idxtype *match, idxtype *perm)
{
  int i, j, jj, k, kk, m, istart, iend, nvtxs, nedges, ncon, cnedges, v, u, mask, dovsize;
  idxtype *xadj, *vwgt, *vsize, *adjncy, *adjwgt, *adjwgtsum, *auxadj;
  idxtype *cmap, *htable;
  idxtype *cxadj, *cvwgt, *cvsize, *cadjncy, *cadjwgt, *cadjwgtsum;
  float *nvwgt, *cnvwgt;
  GraphType *cgraph;

  dovsize = (ctrl->optype == OP_KVMETIS ? 1 : 0);

  mask = HTLENGTH;
  if (cnvtxs < 8*mask || graph->nedges/graph->nvtxs > 15) {
    CreateCoarseGraphNoMask(ctrl, graph, cnvtxs, match, perm);
    return;
  }

  IFSET(ctrl->dbglvl, DBG_TIME, starttimer(ctrl->ContractTmr));

  nvtxs = graph->nvtxs;
  ncon = graph->ncon;
  xadj = graph->xadj;
  vwgt = graph->vwgt;
  vsize = graph->vsize;
  nvwgt = graph->nvwgt;
  adjncy = graph->adjncy;
  adjwgt = graph->adjwgt;
  adjwgtsum = graph->adjwgtsum;
  cmap = graph->cmap;

  /* Initialize the coarser graph */
  cgraph = SetUpCoarseGraph(graph, cnvtxs, dovsize);
  cxadj = cgraph->xadj;
  cvwgt = cgraph->vwgt;
  cvsize = cgraph->vsize;
  cnvwgt = cgraph->nvwgt;
  cadjwgtsum = cgraph->adjwgtsum;
  cadjncy = cgraph->adjncy;
  cadjwgt = cgraph->adjwgt;

  /* auxadj is the fine adjacency list with every endpoint already
     translated to its coarse vertex number */
  iend = xadj[nvtxs];
  auxadj = ctrl->wspace.auxcore;
  memcpy(auxadj, adjncy, iend*sizeof(idxtype));
  for (i=0; i<iend; i++)
    auxadj[i] = cmap[auxadj[i]];

  htable = idxset(mask+1, -1, idxwspacemalloc(ctrl, mask+1));

  cxadj[0] = cnvtxs = cnedges = 0;
  for (i=0; i<nvtxs; i++) {
    v = perm[i];
    if (cmap[v] != cnvtxs)
      continue;   /* v does not start the next coarse vertex */

    u = match[v];
    if (ncon == 1)
      cvwgt[cnvtxs] = vwgt[v];
    else
      scopy(ncon, nvwgt+v*ncon, cnvwgt+cnvtxs*ncon);

    if (dovsize)
      cvsize[cnvtxs] = vsize[v];

    cadjwgtsum[cnvtxs] = adjwgtsum[v];
    nedges = 0;

    /* Add the edges of v, merging duplicates */
    istart = xadj[v];
    iend = xadj[v+1];
    for (j=istart; j<iend; j++) {
      k = auxadj[j];
      kk = k&mask;
      if ((m = htable[kk]) == -1) {
        cadjncy[nedges] = k;
        cadjwgt[nedges] = adjwgt[j];
        htable[kk] = nedges++;
      }
      else if (cadjncy[m] == k) {
        cadjwgt[m] += adjwgt[j];
      }
      else {
        /* Slot taken by a different neighbour: search linearly */
        for (jj=0; jj<nedges; jj++) {
          if (cadjncy[jj] == k) {
            cadjwgt[jj] += adjwgt[j];
            break;
          }
        }
        if (jj == nedges) {
          cadjncy[nedges] = k;
          cadjwgt[nedges++] = adjwgt[j];
        }
      }
    }

    if (v != u) {
      if (ncon == 1)
        cvwgt[cnvtxs] += vwgt[u];
      else
        saxpy(ncon, 1.0, nvwgt+u*ncon, 1, cnvwgt+cnvtxs*ncon, 1);

      if (dovsize)
        cvsize[cnvtxs] += vsize[u];

      cadjwgtsum[cnvtxs] += adjwgtsum[u];

      /* Add the edges of the matched vertex u, merging duplicates */
      istart = xadj[u];
      iend = xadj[u+1];
      for (j=istart; j<iend; j++) {
        k = auxadj[j];
        kk = k&mask;
        if ((m = htable[kk]) == -1) {
          cadjncy[nedges] = k;
          cadjwgt[nedges] = adjwgt[j];
          htable[kk] = nedges++;
        }
        else if (cadjncy[m] == k) {
          cadjwgt[m] += adjwgt[j];
        }
        else {
          for (jj=0; jj<nedges; jj++) {
            if (cadjncy[jj] == k) {
              cadjwgt[jj] += adjwgt[j];
              break;
            }
          }
          if (jj == nedges) {
            cadjncy[nedges] = k;
            cadjwgt[nedges++] = adjwgt[j];
          }
        }
      }

      /* Remove the contracted adjacency weight */
      jj = htable[cnvtxs&mask];
      if (jj >= 0 && cadjncy[jj] != cnvtxs) {
        for (jj=0; jj<nedges; jj++) {
          if (cadjncy[jj] == cnvtxs)
            break;
        }
      }
      /* This 2nd check is needed for non-adjacent matchings.  The
         jj < nedges bound matters when the search above found nothing:
         jj then equals nedges and cadjncy[jj] is not a valid entry. */
      if (jj >= 0 && jj < nedges && cadjncy[jj] == cnvtxs) {
        cadjwgtsum[cnvtxs] -= cadjwgt[jj];
        cadjncy[jj] = cadjncy[--nedges];
        cadjwgt[jj] = cadjwgt[nedges];
      }
    }

    ASSERTP(cadjwgtsum[cnvtxs] == idxsum(nedges, cadjwgt), ("%d %d %d %d %d\n", cnvtxs, cadjwgtsum[cnvtxs], idxsum(nedges, cadjwgt), adjwgtsum[u], adjwgtsum[v]));

    for (j=0; j<nedges; j++)
      htable[cadjncy[j]&mask] = -1;  /* Zero out the htable */
    htable[cnvtxs&mask] = -1;        /* the self-edge slot may have been removed from the list above */

    cnedges += nedges;
    cxadj[++cnvtxs] = cnedges;
    /* advance the output cursors so the next coarse vertex starts at 0 */
    cadjncy += nedges;
    cadjwgt += nedges;
  }

  cgraph->nedges = cnedges;

  ReAdjustMemory(graph, cgraph, dovsize);

  IFSET(ctrl->dbglvl, DBG_TIME, stoptimer(ctrl->ContractTmr));

  idxwspacefree(ctrl, mask+1);
}
int main (int argc, char **argv) { cublasStatus_t s; cublasHandle_t h; CUcontext pctx; CUresult r; int i; const int N = 256; float *h_X, *h_Y1, *h_Y2; float *d_X,*d_Y; float alpha = 2.0f; float error_norm; float ref_norm; /* Test 3 - OpenACC creates, cuBLAS shares. */ acc_set_device_num (0, acc_device_nvidia); r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } h_X = (float *) malloc (N * sizeof (float)); if (h_X == 0) { fprintf (stderr, "malloc failed: for h_X\n"); exit (EXIT_FAILURE); } h_Y1 = (float *) malloc (N * sizeof (float)); if (h_Y1 == 0) { fprintf (stderr, "malloc failed: for h_Y1\n"); exit (EXIT_FAILURE); } h_Y2 = (float *) malloc (N * sizeof (float)); if (h_Y2 == 0) { fprintf (stderr, "malloc failed: for h_Y2\n"); exit (EXIT_FAILURE); } for (i = 0; i < N; i++) { h_X[i] = rand () / (float) RAND_MAX; h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX; } d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float)); if (d_X == NULL) { fprintf (stderr, "copyin error h_X\n"); exit (EXIT_FAILURE); } d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float)); if (d_Y == NULL) { fprintf (stderr, "copyin error h_Y1\n"); exit (EXIT_FAILURE); } context_check (pctx); s = cublasCreate (&h); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasCreate failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasSaxpy failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float)); context_check (pctx); saxpy (N, alpha, h_X, h_Y2); error_norm = 0; ref_norm = 0; for (i = 0; i < N; ++i) { float diff; diff = h_Y1[i] - h_Y2[i]; error_norm += diff * diff; ref_norm += h_Y2[i] * h_Y2[i]; } error_norm = (float) sqrt ((double) error_norm); ref_norm = (float) sqrt ((double) ref_norm); if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) 
>= 1e-6f)) { fprintf (stderr, "math error\n"); exit (EXIT_FAILURE); } free (h_X); free (h_Y1); free (h_Y2); acc_free (d_X); acc_free (d_Y); context_check (pctx); s = cublasDestroy (h); if (s != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "cublasDestroy failed: %d\n", s); exit (EXIT_FAILURE); } context_check (pctx); acc_shutdown (acc_device_nvidia); r = cuCtxGetCurrent (&pctx); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r); exit (EXIT_FAILURE); } if (pctx) { fprintf (stderr, "Unexpected context\n"); exit (EXIT_FAILURE); } return EXIT_SUCCESS; }