/* * Check the inf-norm of the error vector */ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, double xtrue[], int_t ldxtrue, gridinfo_t *grid) { double err, xnorm, temperr, tempxnorm; double *x_work, *xtrue_work; int i, j; for (j = 0; j < nrhs; j++) { x_work = &x[j*ldx]; xtrue_work = &xtrue[j*ldxtrue]; err = xnorm = 0.0; for (i = 0; i < n; i++) { err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); } /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); } }
/*! \brief <pre> Purpose ======= DLANGS_dist returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest absolute value of a real matrix A. Description =========== DLANGE returns the value DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' ( ( norm1(A), NORM = '1', 'O' or 'o' ( ( normI(A), NORM = 'I' or 'i' ( ( normF(A), NORM = 'F', 'f', 'E' or 'e' where norm1 denotes the one norm of a matrix (maximum column sum), normI denotes the infinity norm of a matrix (maximum row sum) and normF denotes the Frobenius norm of a matrix (square root of sum of squares). Note that max(abs(A(i,j))) is not a matrix norm. Arguments ========= NORM (input) CHARACTER*1 Specifies the value to be returned in DLANGE as described above. A (input) SuperMatrix* The M by N sparse matrix A. ===================================================================== </pre> */ double dlangs_dist(char *norm, SuperMatrix *A) { /* Local variables */ NCformat *Astore; double *Aval; int_t i, j, irow; double value=0., sum; double *rwork; Astore = (NCformat *) A->Store; Aval = (double *) Astore->nzval; if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { value = 0.; } else if ( strncmp(norm, "M", 1)==0 ) { /* Find max(abs(A(i,j))). */ value = 0.; for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) value = SUPERLU_MAX( value, fabs( Aval[i]) ); } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { /* Find norm1(A). */ value = 0.; for (j = 0; j < A->ncol; ++j) { sum = 0.; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) sum += fabs(Aval[i]); value = SUPERLU_MAX(value, sum); } } else if ( strncmp(norm, "I", 1)==0 ) { /* Find normI(A). */ if ( !(rwork = (double *) SUPERLU_MALLOC(A->nrow * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for rwork."); for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { irow = Astore->rowind[i]; rwork[irow] += fabs(Aval[i]); } value = 0.; for (i = 0; i < A->nrow; ++i) value = SUPERLU_MAX(value, rwork[i]); SUPERLU_FREE (rwork); } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) { /* Find normF(A). */ ABORT("Not implemented."); } else ABORT("Illegal norm specified."); return (value); } /* dlangs_dist */
/* * Convert a full matrix into a sparse matrix format. */ int sp_dconvert(int m, int n, double *A, int lda, int kl, int ku, double *a, int *asub, int *xa, int *nnz) { int lasta = 0; int i, j, ilow, ihigh; int *row; double *val; for (j = 0; j < n; ++j) { xa[j] = lasta; val = &a[xa[j]]; row = &asub[xa[j]]; ilow = SUPERLU_MAX(0, j - ku); ihigh = SUPERLU_MIN(n-1, j + kl); for (i = ilow; i <= ihigh; ++i) { val[i-ilow] = A[i + j*lda]; row[i-ilow] = i; } lasta += ihigh - ilow + 1; } xa[n] = *nnz = lasta; return 0; }
void log_memory(long long cur_bytes, SuperLUStat_t *stat) { stat->current_buffer += (float) cur_bytes; if (cur_bytes > 0) { stat->peak_buffer = SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); } }
float sqselect(int n, float A[], int k) { register int i, j, p; register float val; k = SUPERLU_MAX(k, 0); k = SUPERLU_MIN(k, n - 1); while (n > 1) { i = 0; j = n-1; p = j; val = A[p]; while (i < j) { for (; A[i] >= val && i < p; i++); if (A[i] < val) { A[p] = A[i]; p = i; } for (; A[j] <= val && j > p; j--); if (A[j] > val) { A[p] = A[j]; p = j; } } A[p] = val; if (p == k) return val; else if (p > k) n = p; else { p++; n -= p; A += p; k -= p; } } return A[0]; }
doublecomplex *doublecomplexMalloc_dist(int_t n) { doublecomplex *buf; buf = (doublecomplex *) SUPERLU_MALLOC(SUPERLU_MAX(1, n) * sizeof(doublecomplex)); return (buf); }
/* * Generate a banded square matrix A, with dimension n and semi-bandwidth b. */ void zband(int n, int b, int nonz, doublecomplex **nzval, int **rowind, int **colptr) { int iseed[] = {1992,1993,1994,1995}; register int i, j, ub, lb, ilow, ihigh, lasta = 0; doublecomplex *a; int *asub, *xa; doublecomplex *val; int *row; extern double dlaran_(); printf("A banded matrix."); zallocateA(n, nonz, nzval, rowind, colptr); /* Allocate storage */ a = *nzval; asub = *rowind; xa = *colptr; ub = lb = b; for (i = 0; i < 4; ++i) iseed[i] = abs( iseed[i] ) % 4096; if ( iseed[3] % 2 != 1 ) ++iseed[3]; for (j = 0; j < n; ++j) { xa[j] = lasta; val = &a[lasta]; row = &asub[lasta]; ilow = SUPERLU_MAX(0, j - ub); ihigh = SUPERLU_MIN(n-1, j + lb); for (i = ilow; i <= ihigh; ++i) { val[i-ilow].r = dlaran_(iseed); row[i-ilow] = i; } lasta += ihigh - ilow + 1; } /* for j ... */ xa[n] = lasta; }
/*! \brief * * <pre> * Purpose * ======= * * GetDiagU extracts the main diagonal of matrix U of the LU factorization. * * Arguments * ========= * * n (input) int * Dimension of the matrix. * * LUstruct (input) LUstruct_t* * The data structures to store the distributed L and U factors. * see superlu_ddefs.h for its definition. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * * diagU (output) double*, dimension (n) * The main diagonal of matrix U. * On exit, it is available on all processes. * * * Note * ==== * * The diagonal blocks of the L and U matrices are stored in the L * data structures, and are on the diagonal processes of the * 2D process grid. * * This routine is modified from gather_diag_to_all() in pzgstrs_Bglobal.c. * </pre> */ void pzGetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, doublecomplex *diagU) { int_t *xsup; int iam, knsupc, pkk; int nsupr; /* number of rows in the block L(:,k) (LDA) */ int_t i, j, jj, k, lk, lwork, nsupers, p; int_t num_diag_procs, *diag_procs, *diag_len; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex *zblock, *zwork, *lusup; iam = grid->iam; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; get_diag_procs(n, Glu_persist, grid, &num_diag_procs, &diag_procs, &diag_len); jj = diag_len[0]; for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); if ( !(zwork = doublecomplexMalloc_dist(jj)) ) ABORT("Malloc fails for zwork[]"); for (p = 0; p < num_diag_procs; ++p) { pkk = diag_procs[p]; if ( iam == pkk ) { /* Copy diagonal into buffer dwork[]. */ lwork = 0; for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBj( k, grid ); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */ zwork[lwork+i] = lusup[i*(nsupr+1)]; lwork += knsupc; } MPI_Bcast( zwork, lwork, SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); } else { MPI_Bcast( zwork, diag_len[p], SuperLU_MPI_DOUBLE_COMPLEX, pkk, grid->comm ); } /* Scatter zwork[] into global diagU vector. */ lwork = 0; for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); zblock = &diagU[FstBlockC( k )]; for (i = 0; i < knsupc; ++i) zblock[i] = zwork[lwork+i]; lwork += knsupc; } } /* for p = ... */ SUPERLU_FREE(diag_procs); SUPERLU_FREE(diag_len); SUPERLU_FREE(zwork); }
double *doubleCalloc_dist(int_t n) { double *buf; register int_t i; double zero = 0.0; buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double)); if ( !buf ) return (buf); for (i = 0; i < n; ++i) buf[i] = zero; return (buf); }
/* * Check the inf-norm of the error vector */ void dinf_norm_error_dist(int_t n, int_t nrhs, double *x, int_t ldx, double *xtrue, int_t ldxtrue, gridinfo_t *grid) { double err, xnorm; double *x_work, *xtrue_work; int i, j; for (j = 0; j < nrhs; j++) { x_work = &x[j*ldx]; xtrue_work = &xtrue[j*ldxtrue]; err = xnorm = 0.0; for (i = 0; i < n; i++) { err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); } err = err / xnorm; printf("(%d) .. ||X-Xtrue||/||X|| = %e\n", grid->iam, err); } }
doublecomplex *doublecomplexCalloc_dist(int_t n) { doublecomplex *buf; register int_t i; doublecomplex zero = {0.0, 0.0}; buf = (doublecomplex *) SUPERLU_MALLOC(SUPERLU_MAX(1, n) * sizeof(doublecomplex)); if ( !buf ) return (buf); for (i = 0; i < n; ++i) buf[i] = zero; return (buf); }
/* * Check the inf-norm of the error vector */ void sinf_norm_error(int nrhs, SuperMatrix *X, float *xtrue) { DNformat *Xstore; float err, xnorm; float *Xmat, *soln_work; int i, j; Xstore = X->Store; Xmat = Xstore->nzval; for (j = 0; j < nrhs; j++) { soln_work = &Xmat[j*Xstore->lda]; err = xnorm = 0.0; for (i = 0; i < X->nrow; i++) { err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i])); xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i])); } err = err / xnorm; printf("||X - Xtrue||/||X|| = %e\n", err); } }
/*! \brief Set up pointers for real working arrays. */ void sSetRWork(int m, int panel_size, float *dworkptr, float **dense, float **tempv) { float zero = 0.0; int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), rowblk = sp_ienv(4); *dense = dworkptr; *tempv = *dense + panel_size*m; sfill (*dense, m * panel_size, zero); sfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); }
/*! \brief Set up pointers for real working arrays. */ void zSetRWork(int m, int panel_size, doublecomplex *dworkptr, doublecomplex **dense, doublecomplex **tempv) { doublecomplex zero = {0.0, 0.0}; int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), rowblk = sp_ienv(4); *dense = dworkptr; *tempv = *dense + panel_size*m; zfill (*dense, m * panel_size, zero); zfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero); }
/* * Check the inf-norm of the error vector */ void zinf_norm_error_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx, doublecomplex *xtrue, int_t ldxtrue, gridinfo_t *grid) { double err, xnorm; doublecomplex *x_work, *xtrue_work; doublecomplex temp; int i, j; for (j = 0; j < nrhs; j++) { x_work = &x[j*ldx]; xtrue_work = &xtrue[j*ldxtrue]; err = xnorm = 0.0; for (i = 0; i < n; i++) { z_sub(&temp, &x_work[i], &xtrue_work[i]); err = SUPERLU_MAX(err, z_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, z_abs(&x_work[i])); } err = err / xnorm; printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err); } }
/*! \brief Check the inf-norm of the error vector */ void cinf_norm_error(int nrhs, SuperMatrix *X, complex *xtrue) { DNformat *Xstore; float err, xnorm; complex *Xmat, *soln_work; complex temp; int i, j; Xstore = X->Store; Xmat = Xstore->nzval; for (j = 0; j < nrhs; j++) { soln_work = &Xmat[j*Xstore->lda]; err = xnorm = 0.0; for (i = 0; i < X->nrow; i++) { c_sub(&temp, &soln_work[i], &xtrue[i]); err = SUPERLU_MAX(err, c_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, c_abs(&soln_work[i])); } err = err / xnorm; printf("||X - Xtrue||/||X|| = %e\n", err); } }
/*! \brief Allocate known working storage. Returns 0 if success, otherwise returns the number of bytes allocated so far when failure occurred. */ int sLUWorkInit(int m, int n, int panel_size, int **iworkptr, float **dworkptr, GlobalLU_t *Glu) { int isize, dsize, extra; float *old_ptr; int maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ), rowblk = sp_ienv(4); isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int); dsize = (m * panel_size + NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(float); if ( Glu->MemModel == SYSTEM ) *iworkptr = (int *) intCalloc(isize/sizeof(int)); else *iworkptr = (int *) suser_malloc(isize, TAIL, Glu); if ( ! *iworkptr ) { fprintf(stderr, "sLUWorkInit: malloc fails for local iworkptr[]\n"); return (isize + n); } if ( Glu->MemModel == SYSTEM ) *dworkptr = (float *) SUPERLU_MALLOC(dsize); else { *dworkptr = (float *) suser_malloc(dsize, TAIL, Glu); if ( NotDoubleAlign(*dworkptr) ) { old_ptr = *dworkptr; *dworkptr = (float*) DoubleAlign(*dworkptr); *dworkptr = (float*) ((double*)*dworkptr - 1); extra = (char*)old_ptr - (char*)*dworkptr; #ifdef DEBUG printf("sLUWorkInit: not aligned, extra %d\n", extra); #endif Glu->stack.top2 -= extra; Glu->stack.used += extra; } } if ( ! *dworkptr ) { fprintf(stderr, "malloc fails for local dworkptr[]."); return (isize + dsize + n); } return 0; }
void StatInit(SuperLUStat_t *stat) { register int i, w, panel_size, relax; panel_size = sp_ienv(1); relax = sp_ienv(2); w = SUPERLU_MAX(panel_size, relax); stat->panel_histo = intCalloc(w+1); stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double)); if (!stat->utime) ABORT("SUPERLU_MALLOC fails for stat->utime"); stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); if (!stat->ops) ABORT("SUPERLU_MALLOC fails for stat->ops"); for (i = 0; i < NPHASES; ++i) { stat->utime[i] = 0.; stat->ops[i] = 0.; } }
/* * Allocate storage for various statistics. */ void StatAlloc(const int n, const int nprocs, const int panel_size, const int relax, Gstat_t *Gstat) { register int w; w = SUPERLU_MAX( panel_size, relax ) + 1; Gstat->panel_histo = intCalloc(w); Gstat->utime = (double *) doubleMalloc(NPHASES); Gstat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); if ( !(Gstat->procstat = (procstat_t *) SUPERLU_MALLOC(nprocs*sizeof(procstat_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for procstat[]" ); #if (PRNTlevel==1) printf(".. StatAlloc(): n %d, nprocs %d, panel_size %d, relax %d\n", n, nprocs, panel_size, relax); #endif #ifdef PROFILE if ( !(Gstat->panstat = (panstat_t*) SUPERLU_MALLOC(n * sizeof(panstat_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for panstat[]" ); Gstat->panhows = intCalloc(3); Gstat->height = intCalloc(n+1); if ( !(Gstat->flops_by_height = (float *) SUPERLU_MALLOC(n * sizeof(float))) ) SUPERLU_ABORT("SUPERLU_MALLOC failed for flops_by_height[]"); #endif #ifdef PREDICT_OPT if ( !(cp_panel = (cp_panel_t *) SUPERLU_MALLOC(n * sizeof(cp_panel_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for cp_panel[]" ); if ( !(desc_eft = (desc_eft_t *) SUPERLU_MALLOC(n * sizeof(desc_eft_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for desc_eft[]" ); cp_firstkid = intMalloc(n+1); cp_nextkid = intMalloc(n+1); #endif }
int pcgstrf_column_bmod( const int pnum, /* process number */ const int jcol, /* current column in the panel */ const int fpanelc,/* first column in the panel */ const int nseg, /* number of s-nodes to update jcol */ int *segrep,/* in */ int *repfnz,/* in */ complex *dense, /* modified */ complex *tempv, /* working array */ pxgstrf_shared_t *pxgstrf_shared, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j]. * * Return value: * ============= * 0 - successful return * > 0 - number of bytes allocated when run out of space * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; complex alpha, beta; #endif GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ complex ukj, ukj1, ukj2; register int lptr, kfnz, isub, irow, i, no_zeros; register int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jsupno, k, ksub, krep, krep_ind, ksupno; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode.*/ int *xsup, *supno; int *lsub, *xlsub, *xlsub_end; complex *lusup; int *xlusup, *xlusup_end; complex *tempv1; int mem_error; register float flopcnt; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; complex comp_temp, comp_temp1; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; #if ( DEBUGlvel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n", pnum, jcol, nseg, krep, jsupno, ksupno); #endif if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;//sj Gstat->procstat[pnum].fcops += flopcnt; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\ fsupc %d, nsupr %d, nsupc %d\n", pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc); #endif /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense[irow], &dense[irow], &comp_temp); luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; } /* Scatter tempv1[] into SPA dense[*] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense[irow], &dense[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } /* else segsze >= 4 */ } /* if jsupno ... */ } /* for each segment... */ /* ------------------------------------------ Process the supernodal portion of L\U[*,j] ------------------------------------------ */ fsupc = SUPER_FSUPC (jsupno); nsupr = xlsub_end[fsupc] - xlsub[fsupc]; if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, pxgstrf_shared)) ) return mem_error; xlusup[jcol] = nextlu; lusup = Glu->lusup; /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; #ifdef DEBUG if (jcol == -1) printf("(%d) pcgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n", pnum, jcol, irow, lusup[nextlu]); #endif ++nextlu; } xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */ #if ( DEBUGlevel>=2 ) if (jcol == -1) { nrow = xlusup_end[jcol] - xlusup[jcol]; print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]], &lusup[xlusup[jcol]]); } #endif /* * For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * (1) fsupc < fpanelc, then fst_col := fpanelc * (2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* distance between the current supernode and the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* points to the beginning of jcol in supernode L\U[*,jsupno] */ ufirst = xlusup[jcol] + d_fsupc; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n", pnum, jcol, fsupc, nsupr, nsupc, nrow); #endif flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[isub], &lusup[isub], &tempv[i]); tempv[i] = zero; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
int cgst02(trans_t trans, int m, int n, int nrhs, SuperMatrix *A, complex *x, int ldx, complex *b, int ldb, float *resid) { /* Purpose ======= CGST02 computes the residual for a solution of a system of linear equations A*x = b or A'*x = b: RESID = norm(B - A*X) / ( norm(A) * norm(X) * EPS ), where EPS is the machine epsilon. Arguments ========= TRANS (input) trans_t Specifies the form of the system of equations: = NOTRANS: A *x = b = TRANS : A'*x = b, where A' is the transpose of A = CONJ : A'*x = b, where A' is the transpose of A M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. NRHS (input) INTEGER The number of columns of B, the matrix of right hand sides. NRHS >= 0. A (input) SuperMatrix*, dimension (LDA,N) The original M x N sparse matrix A. X (input) COMPLEX PRECISION array, dimension (LDX,NRHS) The computed solution vectors for the system of linear equations. LDX (input) INTEGER The leading dimension of the array X. If TRANS = NOTRANS, LDX >= max(1,N); if TRANS = TRANS or CONJ, LDX >= max(1,M). B (input/output) COMPLEX PRECISION array, dimension (LDB,NRHS) On entry, the right hand side vectors for the system of linear equations. On exit, B is overwritten with the difference B - A*X. LDB (input) INTEGER The leading dimension of the array B. IF TRANS = NOTRANS, LDB >= max(1,M); if TRANS = TRANS or CONJ, LDB >= max(1,N). RESID (output) FLOAT PRECISION The maximum over the number of right hand sides of norm(B - A*X) / ( norm(A) * norm(X) * EPS ). ===================================================================== */ /* Table of constant values */ complex alpha = {-1., 0.0}; complex beta = {1., 0.0}; int c__1 = 1; /* System generated locals */ float d__1, d__2; /* Local variables */ int j; int n1, n2; float anorm, bnorm; float xnorm; float eps; char transc[1]; /* Function prototypes */ extern int lsame_(char *, char *); extern float clangs(char *, SuperMatrix *); extern float scasum_(int *, complex *, int *); /* Function Body */ if ( m <= 0 || n <= 0 || nrhs == 0) { *resid = 0.; return 0; } if ( (trans == TRANS) || (trans == CONJ) ) { n1 = n; n2 = m; *transc = 'T'; } else { n1 = m; n2 = n; *transc = 'N'; } /* Exit with RESID = 1/EPS if ANORM = 0. */ eps = slamch_("Epsilon"); anorm = clangs("1", A); if (anorm <= 0.) { *resid = 1. / eps; return 0; } /* Compute B - A*X (or B - A'*X ) and store in B. */ sp_cgemm(transc, "N", n1, nrhs, n2, alpha, A, x, ldx, beta, b, ldb); /* Compute the maximum over the number of right hand sides of norm(B - A*X) / ( norm(A) * norm(X) * EPS ) . */ *resid = 0.; for (j = 0; j < nrhs; ++j) { bnorm = scasum_(&n1, &b[j*ldb], &c__1); xnorm = scasum_(&n2, &x[j*ldx], &c__1); if (xnorm <= 0.) { *resid = 1. / eps; } else { /* Computing MAX */ d__1 = *resid, d__2 = bnorm / anorm / xnorm / eps; *resid = SUPERLU_MAX(d__1, d__2); } } return 0; } /* cgst02 */
int main(int argc, char *argv[]) { void smatvec_mult(float alpha, float x[], float beta, float y[]); void spsolve(int n, float x[], float y[]); extern int sfgmr( int n, void (*matvec_mult)(float, float [], float, float []), void (*psolve)(int n, float [], float[]), float *rhs, float *sol, double tol, int restrt, int *itmax, FILE *fits); extern int sfill_diag(int n, NCformat *Astore); char equed[1] = {'B'}; yes_no_t equil; trans_t trans; SuperMatrix A, L, U; SuperMatrix B, X; NCformat *Astore; NCformat *Ustore; SCformat *Lstore; GlobalLU_t Glu; /* facilitate multiple factorizations with SamePattern_SameRowPerm */ float *a; int *asub, *xa; int *etree; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ int nrhs, ldx, lwork, info, m, n, nnz; float *rhsb, *rhsx, *xact; float *work = NULL; float *R, *C; float u, rpg, rcond; float zero = 0.0; float one = 1.0; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; FILE *fp = stdin; int restrt, iter, maxit, i; double resid; float *x, *b; #ifdef DEBUG extern int num_drop_L, num_drop_U; #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Defaults */ lwork = 0; nrhs = 1; trans = NOTRANS; /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 0.1; //different from complete LU options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; options.RowPerm = LargeDiag; options.ILU_DropTol = 1e-4; options.ILU_FillTol = 1e-2; options.ILU_FillFactor = 10.0; options.ILU_DropRule = DROP_BASIC | DROP_AREA; options.ILU_Norm = INF_NORM; options.ILU_MILU = SILU; */ ilu_set_default_options(&options); /* Modify the defaults. */ options.PivotGrowth = YES; /* Compute reciprocal pivot growth */ options.ConditionNumber = YES;/* Compute reciprocal condition number */ if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) ABORT("Malloc fails for work[]."); } /* Read matrix A from a file in Harwell-Boeing format.*/ if (argc < 2) { printf("Usage:\n%s [OPTION] < [INPUT] > [OUTPUT]\nOPTION:\n" "-h -hb:\n\t[INPUT] is a Harwell-Boeing format matrix.\n" "-r -rb:\n\t[INPUT] is a Rutherford-Boeing format matrix.\n" "-t -triplet:\n\t[INPUT] is a triplet format matrix.\n", argv[0]); return 0; } else { switch (argv[1][1]) { case 'H': case 'h': printf("Input a Harwell-Boeing format matrix:\n"); sreadhb(fp, &m, &n, &nnz, &a, &asub, &xa); break; case 'R': case 'r': printf("Input a Rutherford-Boeing format matrix:\n"); sreadrb(&m, &n, &nnz, &a, &asub, &xa); break; case 'T': case 't': printf("Input a triplet format matrix:\n"); sreadtriple(&m, &n, &nnz, &a, &asub, &xa); break; default: printf("Unrecognized format.\n"); return 0; } } sCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_S, SLU_GE); Astore = A.Store; sfill_diag(n, Astore); printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); fflush(stdout); /* Generate the right-hand side */ if ( !(rhsb = floatMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[]."); if ( !(rhsx = floatMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[]."); sCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_S, SLU_GE); sCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_S, SLU_GE); xact = floatMalloc(n * nrhs); ldx = n; sGenXtrue(n, nrhs, xact, ldx); sFillRHS(trans, nrhs, xact, ldx, &A, &B); if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(R = (float *) SUPERLU_MALLOC(A.nrow * sizeof(float))) ) ABORT("SUPERLU_MALLOC fails for R[]."); if ( !(C = (float *) SUPERLU_MALLOC(A.ncol * sizeof(float))) ) ABORT("SUPERLU_MALLOC fails for C[]."); info = 0; #ifdef DEBUG num_drop_L = 0; num_drop_U = 0; #endif /* Initialize the statistics variables. */ StatInit(&stat); /* Compute the incomplete factorization and compute the condition number and pivot growth using dgsisx. */ B.ncol = 0; /* not to perform triangular solution */ sgsisx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, &Glu, &mem_usage, &stat, &info); /* Set RHS for GMRES. */ if (!(b = floatMalloc(m))) ABORT("Malloc fails for b[]."); if (*equed == 'R' || *equed == 'B') { for (i = 0; i < n; ++i) b[i] = rhsb[i] * R[i]; } else { for (i = 0; i < m; i++) b[i] = rhsb[i]; } printf("sgsisx(): info %d, equed %c\n", info, equed[0]); if (info > 0 || rcond < 1e-8 || rpg > 1e8) printf("WARNING: This preconditioner might be unstable.\n"); if ( info == 0 || info == n+1 ) { if ( options.PivotGrowth == YES ) printf("Recip. pivot growth = %e\n", rpg); if ( options.ConditionNumber == YES ) printf("Recip. condition number = %e\n", rcond); } else if ( info > 0 && lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); } Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("n(A) = %d, nnz(A) = %d\n", n, Astore->nnz); printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("Fill ratio: nnz(F)/nnz(A) = %.3f\n", ((double)(Lstore->nnz) + (double)(Ustore->nnz) - (double)n) / (double)Astore->nnz); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); fflush(stdout); /* Set the global variables. */ GLOBAL_A = &A; GLOBAL_L = &L; GLOBAL_U = &U; GLOBAL_STAT = &stat; GLOBAL_PERM_C = perm_c; GLOBAL_PERM_R = perm_r; GLOBAL_OPTIONS = &options; GLOBAL_R = R; GLOBAL_C = C; GLOBAL_MEM_USAGE = &mem_usage; /* Set the options to do solve-only. */ options.Fact = FACTORED; options.PivotGrowth = NO; options.ConditionNumber = NO; /* Set the variables used by GMRES. */ restrt = SUPERLU_MIN(n / 3 + 1, 50); maxit = 1000; iter = maxit; resid = 1e-8; if (!(x = floatMalloc(n))) ABORT("Malloc fails for x[]."); if (info <= n + 1) { int i_1 = 1; double maxferr = 0.0, nrmA, nrmB, res, t; float temp; extern float snrm2_(int *, float [], int *); extern void saxpy_(int *, float *, float [], int *, float [], int *); /* Initial guess */ for (i = 0; i < n; i++) x[i] = zero; t = SuperLU_timer_(); /* Call GMRES */ sfgmr(n, smatvec_mult, spsolve, b, x, resid, restrt, &iter, stdout); t = SuperLU_timer_() - t; /* Output the result. */ nrmA = snrm2_(&(Astore->nnz), (float *)((DNformat *)A.Store)->nzval, &i_1); nrmB = snrm2_(&m, b, &i_1); sp_sgemv("N", -1.0, &A, x, 1, 1.0, b, 1); res = snrm2_(&m, b, &i_1); resid = res / nrmB; printf("||A||_F = %.1e, ||B||_2 = %.1e, ||B-A*X||_2 = %.1e, " "relres = %.1e\n", nrmA, nrmB, res, resid); if (iter >= maxit) { if (resid >= 1.0) iter = -180; else if (resid > 1e-8) iter = -111; } printf("iteration: %d\nresidual: %.1e\nGMRES time: %.2f seconds.\n", iter, resid, t); /* Scale the solution back if equilibration was performed. */ if (*equed == 'C' || *equed == 'B') for (i = 0; i < n; i++) x[i] *= C[i]; for (i = 0; i < m; i++) { maxferr = SUPERLU_MAX(maxferr, fabs(x[i] - xact[i])); } printf("||X-X_true||_oo = %.1e\n", maxferr); } #ifdef DEBUG printf("%d entries in L and %d entries in U dropped.\n", num_drop_L, num_drop_U); #endif fflush(stdout); if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (rhsb); SUPERLU_FREE (rhsx); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (R); SUPERLU_FREE (C); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); if ( lwork >= 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } SUPERLU_FREE(b); SUPERLU_FREE(x); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif return 0; }
float cPivotGrowth(int ncols, SuperMatrix *A, int *perm_c, SuperMatrix *L, SuperMatrix *U) { /* * Purpose * ======= * * Compute the reciprocal pivot growth factor of the leading ncols columns * of the matrix, using the formula: * min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ) * * Arguments * ========= * * ncols (input) int * The number of columns of matrices A, L and U. * * A (input) SuperMatrix* * Original matrix A, permuted by columns, of dimension * (A->nrow, A->ncol). The type of A can be: * Stype = NC; Dtype = SLU_C; Mtype = GE. * * L (output) SuperMatrix* * The factor L from the factorization Pr*A=L*U; use compressed row * subscripts storage for supernodes, i.e., L has type: * Stype = SC; Dtype = SLU_C; Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise * storage scheme, i.e., U has types: Stype = NC; * Dtype = SLU_C; Mtype = TRU. * */ NCformat *Astore; SCformat *Lstore; NCformat *Ustore; complex *Aval, *Lval, *Uval; int fsupc, nsupr, luptr, nz_in_U; int i, j, k, oldcol; int *inv_perm_c; float rpg, maxaj, maxuj; extern double slamch_(char *); float smlnum; complex *luval; complex temp_comp; /* Get machine constants. */ smlnum = slamch_("S"); rpg = 1. / smlnum; Astore = A->Store; Lstore = L->Store; Ustore = U->Store; Aval = Astore->nzval; Lval = Lstore->nzval; Uval = Ustore->nzval; inv_perm_c = (int *) SUPERLU_MALLOC(A->ncol*sizeof(int)); for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j; for (k = 0; k <= Lstore->nsuper; ++k) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); luptr = L_NZ_START(fsupc); luval = &Lval[luptr]; nz_in_U = 1; for (j = fsupc; j < L_FST_SUPC(k+1) && j < ncols; ++j) { maxaj = 0.; oldcol = inv_perm_c[j]; for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; ++i) maxaj = SUPERLU_MAX( maxaj, slu_c_abs1( &Aval[i]) ); maxuj = 0.; for (i = Ustore->colptr[j]; i < Ustore->colptr[j+1]; i++) maxuj = SUPERLU_MAX( maxuj, slu_c_abs1( &Uval[i]) ); /* Supernode */ for (i = 0; i < nz_in_U; ++i) maxuj = SUPERLU_MAX( maxuj, slu_c_abs1( &luval[i]) ); ++nz_in_U; luval += nsupr; if ( maxuj == 0. ) rpg = SUPERLU_MIN( rpg, 1.); else rpg = SUPERLU_MIN( rpg, maxaj / maxuj ); } if ( j >= ncols ) break; } SUPERLU_FREE(inv_perm_c); return (rpg); }
void dgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, double *colcnd, double *amax, int *info) { /* Purpose ======= dgsequ() computes row and column scalings intended to equilibrate an M-by-N sparse matrix A and reduce its condition number. R returns the row scale factors and C the column scale factors, chosen to try to make the largest element in each row and column of the matrix B with elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. R(i) and C(j) are restricted to be between SMLNUM = smallest safe number and BIGNUM = largest safe number. Use of these scaling factors is not guaranteed to reduce the condition number of A but works well in practice. See supermatrix.h for the definition of 'SuperMatrix' structure. Arguments ========= A (input) SuperMatrix* The matrix of dimension (A->nrow, A->ncol) whose equilibration factors are to be computed. The type of A can be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. R (output) double*, size A->nrow If INFO = 0 or INFO > M, R contains the row scale factors for A. C (output) double*, size A->ncol If INFO = 0, C contains the column scale factors for A. ROWCND (output) double* If INFO = 0 or INFO > M, ROWCND contains the ratio of the smallest R(i) to the largest R(i). If ROWCND >= 0.1 and AMAX is neither too large nor too small, it is not worth scaling by R. COLCND (output) double* If INFO = 0, COLCND contains the ratio of the smallest C(i) to the largest C(i). If COLCND >= 0.1, it is not worth scaling by C. AMAX (output) double* Absolute value of largest matrix element. If AMAX is very close to overflow or very close to underflow, the matrix should be scaled. INFO (output) int* = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, and i is <= M: the i-th row of A is exactly zero > M: the (i-M)-th column of A is exactly zero ===================================================================== */ /* Local variables */ NCformat *Astore; double *Aval; int i, j, irow; double rcmin, rcmax; double bignum, smlnum; extern double dlamch_(char *); /* Test the input parameters. */ *info = 0; if ( A->nrow < 0 || A->ncol < 0 || A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -1; if (*info != 0) { i = -(*info); xerbla_("dgsequ", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || A->ncol == 0 ) { *rowcnd = 1.; *colcnd = 1.; *amax = 0.; return; } Astore = A->Store; Aval = Astore->nzval; /* Get machine constants. */ smlnum = dlamch_("S"); bignum = 1. / smlnum; /* Compute row scale factors. */ for (i = 0; i < A->nrow; ++i) r[i] = 0.; /* Find the maximum element in each row. */ for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); } /* Find the maximum and minimum scale factors. */ rcmin = bignum; rcmax = 0.; for (i = 0; i < A->nrow; ++i) { rcmax = SUPERLU_MAX(rcmax, r[i]); rcmin = SUPERLU_MIN(rcmin, r[i]); } *amax = rcmax; if (rcmin == 0.) { /* Find the first zero scale factor and return an error code. */ for (i = 0; i < A->nrow; ++i) if (r[i] == 0.) { *info = i + 1; return; } } else { /* Invert the scale factors. */ for (i = 0; i < A->nrow; ++i) r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); /* Compute ROWCND = min(R(I)) / max(R(I)) */ *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* Compute column scale factors */ for (j = 0; j < A->ncol; ++j) c[j] = 0.; /* Find the maximum element in each column, assuming the row scalings computed above. */ for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); } /* Find the maximum and minimum scale factors. */ rcmin = bignum; rcmax = 0.; for (j = 0; j < A->ncol; ++j) { rcmax = SUPERLU_MAX(rcmax, c[j]); rcmin = SUPERLU_MIN(rcmin, c[j]); } if (rcmin == 0.) { /* Find the first zero scale factor and return an error code. */ for (j = 0; j < A->ncol; ++j) if ( c[j] == 0. ) { *info = A->nrow + j + 1; return; } } else { /* Invert the scale factors. */ for (j = 0; j < A->ncol; ++j) c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); /* Compute COLCND = min(C(J)) / max(C(J)) */ *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } return; } /* dgsequ */
void cpanel_bmod ( const int m, /* in - number of rows in the matrix */ const int w, /* in */ const int jcol, /* in */ const int nseg, /* in */ complex *dense, /* out, of size n by w */ complex *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in, of size n by w */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; complex alpha, beta; #endif register int k, ksub; int fsupc, nsupc, nsupr, nrow; int krep, krep_ind; complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; int block_nrow; /* no of rows in a block row */ register int lptr; /* Points to the row subscripts of a supernode */ int kfnz, irow, no_zeros; register int isub, isub1, i; register int jj; /* Index through each column in the panel */ int *xsup, *supno; int *lsub, *xlsub; complex *lusup; int *xlusup; int *repfnz_col; /* repfnz[] for a column in the panel */ complex *dense_col; /* dense[] for a column in the panel */ complex *tempv1; /* Used in 1-D update */ complex *TriTmp, *MatvecTmp; /* used in 2-D update */ complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex comp_temp, comp_temp1; register int ldaTmp; register int r_ind, r_hi; static int first = 1, maxsuper, rowblk, colblk; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; if ( first ) { maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } ldaTmp = maxsuper + rowblk; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nrow = nsupr - nsupc; lptr = xlsub[fsupc]; krep_ind = lptr + nsupc - 1; repfnz_col = repfnz; dense_col = dense; if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ TriTmp = tempv; /* Sequence through each column in panel -- triangular solves */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += 4 * segsze * (segsze - 1); ops[GEMV] += 8 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr += nsupr*(nsupc-1) + nsupc-1; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } } else { /* segsze >= 4 */ /* Copy U[*,j] segment from dense[*] to TriTmp[*], which holds the result of triangular solves. */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; TriTmp[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); #endif } /* else ... */ } /* for jj ... end tri-solves */ /* Block row updates; push all the way into dense[*] block */ for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); luptr = xlusup[fsupc] + nsupc + r_ind; isub1 = lptr + nsupc + r_ind; repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; /* Sequence through each column in panel -- matrix-vector */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ /* Perform a block update, and scatter the result of matrix-vector to dense[]. */ no_zeros = kfnz - fsupc; luptr1 = luptr + nsupr * no_zeros; MatvecTmp = &TriTmp[maxsuper]; #ifdef USE_VENDOR_BLAS alpha = one; beta = zero; #ifdef _CRAY CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #else cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #endif #else cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], TriTmp, MatvecTmp); #endif /* Scatter MatvecTmp[*] into SPA dense[*] temporarily * such that MatvecTmp[*] can be re-used for the * the next blok row update. dense[] will be copied into * global store after the whole panel has been finished. */ isub = isub1; for (i = 0; i < block_nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &MatvecTmp[i]); MatvecTmp[i] = zero; ++isub; } } /* for jj ... */ } /* for each block row ... */ /* Scatter the triangular solves into SPA dense[*] */ repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = TriTmp[i]; TriTmp[i] = zero; ++isub; } } /* for jj ... */ } else { /* 1-D block modification */ /* Sequence through each column in the panel */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += 4 * segsze * (segsze - 1); ops[GEMV] += 8 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } } else { /* segsze >= 4 */ /* * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense[]. */ no_zeros = kfnz - fsupc; /* Copy U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[*] into SPA dense[*] temporarily, such * that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; tempv[i] = zero; isub++; } /* Scatter the update from tempv1[*] into SPA dense[*] */ /* Start dense rectangular L */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } /* else segsze>=4 ... */ } /* for each column in the panel... */ } /* else 1-D update ... */ } /* for each updating supernode ... */ }
void psgstrf_panel_bmod( const int pnum, /* process number */ const int m, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int bcol, /* first column of the farthest busy snode*/ int *inv_perm_r,/* in; inverse of the row pivoting */ int *etree, /* in */ int *nseg, /* modified */ int *segrep, /* modified */ int *repfnz, /* modified, size n-by-w */ int *panel_lsub,/* modified */ int *w_lsub_end,/* modified */ int *spa_marker,/* modified; size n-by-w */ float *dense, /* modified, size n-by-w */ float *tempv, /* working array - zeros on input/output */ pxgstrf_shared_t *pxgstrf_shared /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features combined 1D and 2D blocking of the source updating s-node. * It consists of two steps: * (1) accumulates updates from "done" s-nodes. * (2) accumulates updates from "busy" s-nodes. * * Before entering this routine, the nonzeros of the original A in * this panel were already copied into the SPA dense[n,w]. * * Updated/Output arguments * ======================== * L[*,j:j+w-1] and U[*,j:j+w-1] are returned collectively in the * m-by-w vector dense[*,w]. The locations of nonzeros in L[*,j:j+w-1] * are given by lsub[*] and U[*,j:j+w-1] by (nseg,segrep,repfnz). * */ GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */ register int j, k, ksub; register int fsupc, nsupc, nsupr, nrow; register int kcol, krep, ksupno, dadsupno; register int jj; /* index through each column in the panel */ int *xsup, *xsup_end, *supno; int *lsub, *xlsub, *xlsub_end; int *repfnz_col; /* repfnz[] for a column in the panel */ float *dense_col; /* dense[] for a column in the panel */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ static int first = 1, rowblk, colblk; #ifdef PROFILE double t1, t2; /* temporary time */ #endif #ifdef PREDICT_OPT register float pmod, max_child_eft = 0, sum_pmod = 0, min_desc_eft = 0; register float pmod_eft; register int kid, ndesc = 0; #endif #if ( DEBUGlevel>=2 ) int dbg_addr = 0*m; #endif if ( first ) { rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } xsup = Glu->xsup; xsup_end = Glu->xsup_end; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL) check_panel_dfs_list(pnum, "begin", jcol, *nseg, segrep);*/ if (jcol == BADPAN) printf("(%d) Enter psgstrf_panel_bmod() jcol %d,BADCOL %d,dense_col[%d] %.10f\n", pnum, jcol, BADCOL, BADROW, dense[dbg_addr+BADROW]); #endif /* -------------------------------------------------------------------- For each non-busy supernode segment of U[*,jcol] in topological order, perform sup-panel update. -------------------------------------------------------------------- */ k = *nseg - 1; for (ksub = 0; ksub < *nseg; ++ksub) { /* * krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nrow = nsupr - nsupc; #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops; #endif if ( nsupc >= colblk && nrow >= rowblk ) { /* 2-D block update */ #ifdef GEMV2 psgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } else { /* 1-D block update */ #ifdef GEMV2 psgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops - pmod; kid = (Glu->pan_status[krep].size > 0) ? krep : (krep + Glu->pan_status[krep].size); desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv; desc_eft[ndesc++].pmod = pmod; #endif #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) non-busy update: krep %d, repfnz %d, dense_col[%d] %.10e\n", pnum, krep, repfnz[dbg_addr+krep], BADROW, dense[dbg_addr+BADROW]); #endif } /* for each updating supernode ... */ #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) After non-busy update: dense_col[%d] %.10e\n", pnum, BADROW, dense[dbg_addr+BADROW]); #endif /* --------------------------------------------------------------------- * Now wait for the "busy" s-nodes to become "done" -- this amounts to * climbing up the e-tree along the path starting from "bcol". * Several points are worth noting: * * (1) There are two possible relations between supernodes and panels * along the path of the e-tree: * o |s-node| < |panel| * want to climb up the e-tree one column at a time in order * to achieve more concurrency * o |s-node| > |panel| * want to climb up the e-tree one panel at a time; this * processor is stalled anyway while waiting for the panel. * * (2) Need to accommodate new fills, append them in panel_lsub[*,w]. * o use an n-by-w marker array, as part of the SPA (not scalable!) * * (3) Symbolically, need to find out repfnz[S, w], for each (busy) * supernode S. * o use dense[inv_perm_r[kcol]], filter all zeros * o detect the first nonzero in each segment * (at this moment, the boundary of the busy supernode/segment * S has already been identified) * * --------------------------------------------------------------------- */ kcol = bcol; while ( kcol < jcol ) { /* Pointers to each column of the w-wide arrays. */ repfnz_col = repfnz; dense_col = dense; col_marker = spa_marker; col_lsub = panel_lsub; /* Wait for the supernode, and collect wait-time statistics. */ if ( pxgstrf_shared->spin_locks[kcol] ) { #ifdef PROFILE TIC(t1); #endif await( &pxgstrf_shared->spin_locks[kcol] ); #ifdef PROFILE TOC(t2, t1); Gstat->panstat[jcol].pipewaits++; Gstat->panstat[jcol].spintime += t2; Gstat->procstat[pnum].spintime += t2; #ifdef DOPRINT PRINT_SPIN_TIME(1); #endif #endif } /* Find leading column "fsupc" in the supernode that contains column "kcol" */ ksupno = supno[kcol]; fsupc = kcol; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL) */ if ( jcol==BADCOL ) printf("(%d) psgstrf_panel_bmod[1] kcol %d, ksupno %d, fsupc %d\n", pnum, kcol, ksupno, fsupc); #endif /* Wait for the whole supernode to become "done" -- climb up e-tree one column at a time */ do { krep = SUPER_REP( ksupno ); kcol = etree[kcol]; if ( kcol >= jcol ) break; if ( pxgstrf_shared->spin_locks[kcol] ) { #ifdef PROFILE TIC(t1); #endif await ( &pxgstrf_shared->spin_locks[kcol] ); #ifdef PROFILE TOC(t2, t1); Gstat->panstat[jcol].pipewaits++; Gstat->panstat[jcol].spintime += t2; Gstat->procstat[pnum].spintime += t2; #ifdef DOPRINT PRINT_SPIN_TIME(2); #endif #endif } dadsupno = supno[kcol]; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL)*/ if ( jcol==BADCOL ) printf("(%d) psgstrf_panel_bmod[2] krep %d, dad=kcol %d, dadsupno %d\n", pnum, krep, kcol, dadsupno); #endif } while ( dadsupno == ksupno ); /* Append the new segment into segrep[*]. After column_bmod(), copy_to_ucol() will use them. */ segrep[*nseg] = krep; ++(*nseg); /* Determine repfnz[krep, w] for each column in the panel */ for (jj = jcol; jj < jcol + w; ++jj, dense_col += m, repfnz_col += m, col_marker += m, col_lsub += m) { /* * Note: relaxed supernode may not form a path on the e-tree, * but its column numbers are contiguous. */ #ifdef SCATTER_FOUND for (kcol = fsupc; kcol <= krep; ++kcol) { if ( col_marker[inv_perm_r[kcol]] == jj ) { repfnz_col[krep] = kcol; /* Append new fills in panel_lsub[*,jj]. */ j = w_lsub_end[jj - jcol]; /*#pragma ivdep*/ for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) { ksub = lsub[k]; if ( col_marker[ksub] != jj ) { col_marker[ksub] = jj; col_lsub[j++] = ksub; } } w_lsub_end[jj - jcol] = j; break; /* found the leading nonzero in the segment */ } } #else for (kcol = fsupc; kcol <= krep; ++kcol) { if ( dense_col[inv_perm_r[kcol]] != 0.0 ) { repfnz_col[krep] = kcol; break; /* Found the leading nonzero in the U-segment */ } } /* In this case, we always treat the L-subscripts of the busy s-node [kcol : krep] as the new fills, even if the corresponding U-segment may be all zero. */ /* Append new fills in panel_lsub[*,jj]. */ j = w_lsub_end[jj - jcol]; /*#pragma ivdep*/ for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) { ksub = lsub[k]; if ( col_marker[ksub] != jj ) { col_marker[ksub] = jj; col_lsub[j++] = ksub; } } w_lsub_end[jj - jcol] = j; #endif #if ( DEBUGlevel>=2 ) if (jj == BADCOL) { printf("(%d) psgstrf_panel_bmod[fills]: jj %d, repfnz_col[%d] %d, inv_pr[%d] %d\n", pnum, jj, krep, repfnz_col[krep], fsupc, inv_perm_r[fsupc]); printf("(%d) psgstrf_panel_bmod[fills] xlsub %d, xlsub_end %d, #lsub[%d] %d\n", pnum,xlsub[krep],xlsub_end[krep],krep, xlsub_end[krep]-xlsub[krep]); } #endif } /* for jj ... */ #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops; #endif /* Perform sup-panel updates - use combined 1D + 2D updates. */ nsupc = krep - fsupc + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nrow = nsupr - nsupc; if ( nsupc >= colblk && nrow >= rowblk ) { /* 2-D block update */ #ifdef GEMV2 psgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } else { /* 1-D block update */ #ifdef GEMV2 psgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops - pmod; kid = (pxgstrf_shared->pan_status[krep].size > 0) ? krep : (krep + pxgstrf_shared->pan_status[krep].size); desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv; desc_eft[ndesc++].pmod = pmod; #endif #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) After busy update: dense_col[%d] %.10f\n", pnum, BADROW, dense[dbg_addr+BADROW]); #endif /* Go to the parent of "krep" */ kcol = etree[krep]; } /* while kcol < jcol ... */ #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL)*/ if ( jcol==BADCOL ) check_panel_dfs_list(pnum, "after-busy", jcol, *nseg, segrep); #endif #ifdef PREDICT_OPT qsort(desc_eft, ndesc, sizeof(desc_eft_t), (int(*)())numcomp); pmod_eft = 0; for (j = 0; j < ndesc; ++j) { pmod_eft = SUPERLU_MAX( pmod_eft, desc_eft[j].eft ) + desc_eft[j].pmod; } if ( ndesc == 0 ) { /* No modifications from descendants */ pmod_eft = 0; for (j = cp_firstkid[jcol]; j != EMPTY; j = cp_nextkid[j]) { kid = (pxgstrf_shared->pan_status[j].size > 0) ? j : (j + pxgstrf_shared->pan_status[j].size); pmod_eft = SUPERLU_MAX( pmod_eft, cp_panel[kid].est + cp_panel[kid].pdiv ); } } cp_panel[jcol].est = pmod_eft; #endif }
int dgst01(int m, int n, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, double *resid) { /* Purpose ======= DGST01 reconstructs a matrix A from its L*U factorization and computes the residual norm(L*U - A) / ( N * norm(A) * EPS ), where EPS is the machine epsilon. Arguments ========== M (input) INT The number of rows of the matrix A. M >= 0. N (input) INT The number of columns of the matrix A. N >= 0. A (input) SuperMatrix *, dimension (A->nrow, A->ncol) The original M x N matrix A. L (input) SuperMatrix *, dimension (L->nrow, L->ncol) The factor matrix L. U (input) SuperMatrix *, dimension (U->nrow, U->ncol) The factor matrix U. perm_c (input) INT array, dimension (N) The column permutation from DGSTRF. perm_r (input) INT array, dimension (M) The pivot indices from DGSTRF. RESID (output) DOUBLE* norm(L*U - A) / ( N * norm(A) * EPS ) ===================================================================== */ /* Local variables */ double zero = 0.0; int i, j, k, arow, lptr,isub, urow, superno, fsupc, u_part; double utemp, comp_temp; double anorm, tnorm, cnorm; double eps; double *work; SCformat *Lstore; NCformat *Astore, *Ustore; double *Aval, *Lval, *Uval; int *colbeg, *colend; /* Function prototypes */ extern double dlangs(char *, SuperMatrix *); /* Quick exit if M = 0 or N = 0. */ if (m <= 0 || n <= 0) { *resid = 0.f; return 0; } work = (double *)doubleCalloc(m); Astore = A->Store; Aval = Astore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; colbeg = intMalloc(n); colend = intMalloc(n); for (i = 0; i < n; i++) { colbeg[perm_c[i]] = Astore->colptr[i]; colend[perm_c[i]] = Astore->colptr[i+1]; } /* Determine EPS and the norm of A. */ eps = dmach("Epsilon"); anorm = dlangs("1", A); cnorm = 0.; /* Compute the product L*U, one column at a time */ for (k = 0; k < n; ++k) { /* The U part outside the rectangular supernode */ for (i = U_NZ_START(k); i < U_NZ_START(k+1); ++i) { urow = U_SUB(i); utemp = Uval[i]; superno = Lstore->col_to_sup[urow]; fsupc = L_FST_SUPC(superno); u_part = urow - fsupc + 1; lptr = L_SUB_START(fsupc) + u_part; work[L_SUB(lptr-1)] -= utemp; /* L_ii = 1 */ for (j = L_NZ_START(urow) + u_part; j < L_NZ_START(urow+1); ++j) { isub = L_SUB(lptr); work[isub] -= Lval[j] * utemp; ++lptr; } } /* The U part inside the rectangular supernode */ superno = Lstore->col_to_sup[k]; fsupc = L_FST_SUPC(superno); urow = L_NZ_START(k); for (i = fsupc; i <= k; ++i) { utemp = Lval[urow++]; u_part = i - fsupc + 1; lptr = L_SUB_START(fsupc) + u_part; work[L_SUB(lptr-1)] -= utemp; /* L_ii = 1 */ for (j = L_NZ_START(i)+u_part; j < L_NZ_START(i+1); ++j) { isub = L_SUB(lptr); work[isub] -= Lval[j] * utemp; ++lptr; } } /* Now compute A[k] - (L*U)[k] (Both matrices may be permuted.) */ for (i = colbeg[k]; i < colend[k]; ++i) { arow = Astore->rowind[i]; work[perm_r[arow]] += Aval[i]; } /* Now compute the 1-norm of the column vector work */ tnorm = 0.; for (i = 0; i < m; ++i) { tnorm += fabs(work[i]); work[i] = zero; } cnorm = SUPERLU_MAX(tnorm, cnorm); } *resid = cnorm; if (anorm <= 0.f) { if (*resid != 0.f) { *resid = 1.f / eps; } } else { *resid = *resid / (float) n / anorm / eps; } SUPERLU_FREE(work); SUPERLU_FREE(colbeg); SUPERLU_FREE(colend); return 0; /* End of DGST01 */ } /* dgst01_ */
/*! \brief * * <pre> * Purpose * ======= * * PDGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates * for the solution. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * A (input) SuperMatrix* * The original matrix A, or the scaled A if equilibration was done. * A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be: * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from pdgstrf for * the possibly scaled and permuted matrix A. * See superlu_ddefs.h for the definition of 'LUstruct_t'. * * ScalePermstruct (input) ScalePermstruct_t* (global) * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input) double* (local) * The m_loc-by-NRHS right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * * ldb (input) int (local) * Leading dimension of matrix B. * * X (input/output) double* (local) * On entry, the solution matrix Y, as computed by PDGSTRS, of the * transformed system A1*Y = Pc*Pr*B. where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X. * On exit, the improved solution matrix Y. * * In order to obtain the solution X to the original system, * Y should be permutated by Pc^T, and premultiplied by diag(C) * if DiagScale = COL or BOTH. * This must be done after this routine is called. * * ldx (input) int (local) * Leading dimension of matrix X. * * nrhs (input) int * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * berr (output) double*, dimension (nrhs) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics about the refinement steps. * See util.h for the definition of SuperLUStat_t. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * </pre> */ void pdgsrfs(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, double *B, int_t ldb, double *X, int_t ldx, int nrhs, SOLVEstruct_t *SOLVEstruct, double *berr, SuperLUStat_t *stat, int *info) { #define ITMAX 20 Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; double *ax, *R, *dx, *temp, *work, *B_col, *X_col; int_t count, i, j, lwork, nz; int iam; double eps, lstres; double s, safmin, safe1, safe2; /* Data structures used by matrix-vector multiply routine. */ pdgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm; NRformat_loc *Astore; int_t m_loc, fst_row; /* Initialization. */ Astore = (NRformat_loc *) A->Store; m_loc = Astore->m_loc; fst_row = Astore->fst_row; iam = grid->iam; /* Test the input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10; else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12; else if ( nrhs < 0 ) *info = -13; if (*info != 0) { i = -(*info); pxerbla("PDGSRFS", grid, i); return; } /* Quick return if possible. */ if ( n == 0 || nrhs == 0 ) { return; } #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgsrfs()"); #endif lwork = 2 * m_loc; /* For ax/R/dx and temp */ if ( !(work = doubleMalloc_dist(lwork)) ) ABORT("Malloc fails for work[]"); ax = R = dx = work; temp = ax + m_loc; /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dmach("Epsilon"); safmin = dmach("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; #if ( DEBUGlevel>=1 ) if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", eps, anorm, safe1, safe2); #endif /* Do for each right-hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; B_col = &B[j*ldb]; X_col = &X[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ /* Matrix-vector multiply. */ pdgsmv(0, A, grid, gsmv_comm, X_col, ax); /* Compute residual, stored in R[]. */ for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i]; /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */ pdgsmv(1, A, grid, gsmv_comm, X_col, temp); for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]); s = 0.0; for (i = 0; i < m_loc; ++i) { if ( temp[i] > safe2 ) { s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]); } else if ( temp[i] != 0.0 ) { /* Adding SAFE1 to the numerator guards against spuriously zero residuals (underflow). */ s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) /temp[i]); } /* If temp[i] is exactly 0.0 (computed by PxGSMV), then we know the true residual also must be exactly 0.0. */ } MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); #if ( PRNTlevel>= 1 ) if ( !iam ) printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]); #endif if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { /* Compute new dx. */ pdgstrs(n, LUstruct, ScalePermstruct, grid, dx, m_loc, fst_row, m_loc, 1, SOLVEstruct, stat, info); /* Update solution. */ for (i = 0; i < m_loc; ++i) X_col[i] += dx[i]; lstres = berr[j]; ++count; } else { break; } } /* end while */ stat->RefineSteps = count; } /* for j ... */ /* Deallocate storage. */ SUPERLU_FREE(work); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgsrfs()"); #endif } /* PDGSRFS */
void cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_r, int *perm_c, equed_t equed, float *R, float *C, SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr, Gstat_t *Gstat, int *info) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * * Purpose * ======= * * cgsrfs improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A**T * X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose = Transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = NC, Dtype = _D, Mtype = GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = NCP, Dtype = _D, Mtype = TRU. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * equed (input) equed_t * Specifies the form of equilibration that was done. * = NOEQUIL: No equilibration. * = ROW: Row equilibration, i.e., A was premultiplied by diag(R). * = COL: Column equilibration, i.e., A was postmultiplied by * diag(C). * = BOTH: Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) double*, dimension (A->nrow) * The row scale factors for A. * If equed = ROW or BOTH, A is premultiplied by diag(R). * If equed = NOEQUIL or COL, R is not accessed. * * C (input) double*, dimension (A->ncol) * The column scale factors for A. * If equed = COL or BOTH, A is postmultiplied by diag(C). * If equed = NOEQUIL or ROW, C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * The right hand side matrix B. * * X (input/output) SuperMatrix* * X has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the solution matrix X, as computed by dgstrs(). * On exit, the improved solution matrix X. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * */ #define ITMAX 5 /* Table of constant values */ int ione = 1; complex ndone = {-1., 0.}; complex done = {1., 0.}; /* Local variables */ NCformat *Astore; complex *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; complex *Bmat, *Xmat, *Bptr, *Xptr; int kase; float safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; float s, xk, lstres, eps, safmin; char transc[1]; trans_t transt; complex *work; float *rwork; int *iwork; extern double slamch_(char *); extern int clacon_(int *, complex *, complex *, float *, int *); #ifdef _CRAY extern int CCOPY(int *, complex *, int *, complex *, int *); extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *); #else extern int ccopy_(int *, complex *, int *, complex *, int *); extern int caxpy_(int *, complex *, complex *, int *, complex *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = (trans == NOTRANS); if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SCP || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NCP || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) *info = -4; else if ( ldb < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) *info = -10; else if ( ldx < SUPERLU_MAX(0, A->nrow) || X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE ) *info = -11; if (*info != 0) { i = -(*info); xerbla_("cgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = (equed == ROW) || (equed == BOTH); colequ = (equed == COL) || (equed == BOTH); /* Allocate working space */ work = complexMalloc(2*A->nrow); rwork = (float *) SUPERLU_MALLOC( (size_t) A->nrow * sizeof(float) ); iwork = intMalloc(A->nrow); if ( !work || !rwork || !iwork ) SUPERLU_ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transc = 'N'; transt = TRANS; } else { *(unsigned char *)transc = 'T'; transt = NOTRANS; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = slamch_("Epsilon"); safmin = slamch_("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) SUPERLU_ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ #ifdef _CRAY CCOPY(&A->nrow, Bptr, &ione, work, &ione); #else ccopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if (notran) { for (k = 0; k < A->ncol; ++k) { xk = c_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += c_abs1(&Aval[i]) * c_abs1(&Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) { s = SUPERLU_MAX( s, c_abs1(&work[i]) / rwork[i] ); } else if ( rwork[i] != 0.0 ) { s = SUPERLU_MAX( s, (c_abs1(&work[i]) + safe1) / rwork[i] ); } /* If rwork[i] is exactly 0.0, then we know the true residual also must be exactly 0.0. */ } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ cgstrs (trans, L, U, perm_r, perm_c, &Bjcol, Gstat, info); #ifdef _CRAY CAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else caxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use CLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = c_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = c_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += c_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = c_abs1( &Xptr[irow] ); s += c_abs1(&Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { clacon_(&A->nrow, &work[A->nrow], work, &ferr[j], &kase); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], R[i]); } cgstrs (transt, L, U, perm_r, perm_c, &Bjcol, Gstat, info); for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], rwork[i]); } } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], rwork[i]); } cgstrs (trans, L, U, perm_r, perm_c, &Bjcol, Gstat, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], R[i]); } } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, C[i] * c_abs1( &Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, R[i] * c_abs1( &Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, c_abs1( &Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* cgsrfs */
float clangs(char *norm, SuperMatrix *A) { /* Purpose ======= CLANGS returns the value of the one norm, or the Frobenius norm, or the infinity norm, or the element of largest absolute value of a real matrix A. Description =========== CLANGE returns the value CLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' ( ( norm1(A), NORM = '1', 'O' or 'o' ( ( normI(A), NORM = 'I' or 'i' ( ( normF(A), NORM = 'F', 'f', 'E' or 'e' where norm1 denotes the one norm of a matrix (maximum column sum), normI denotes the infinity norm of a matrix (maximum row sum) and normF denotes the Frobenius norm of a matrix (square root of sum of squares). Note that max(abs(A(i,j))) is not a matrix norm. Arguments ========= NORM (input) CHARACTER*1 Specifies the value to be returned in CLANGE as described above. A (input) SuperMatrix* The M by N sparse matrix A. ===================================================================== */ /* Local variables */ NCformat *Astore; complex *Aval; int i, j, irow; float value, sum; float *rwork; Astore = A->Store; Aval = Astore->nzval; if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { value = 0.; } else if (lsame_(norm, "M")) { /* Find max(abs(A(i,j))). */ value = 0.; for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) value = SUPERLU_MAX( value, slu_c_abs( &Aval[i]) ); } else if (lsame_(norm, "O") || *(unsigned char *)norm == '1') { /* Find norm1(A). */ value = 0.; for (j = 0; j < A->ncol; ++j) { sum = 0.; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) sum += slu_c_abs( &Aval[i] ); value = SUPERLU_MAX(value,sum); } } else if (lsame_(norm, "I")) { /* Find normI(A). */ if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) ) ABORT("SUPERLU_MALLOC fails for rwork."); for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { irow = Astore->rowind[i]; rwork[irow] += slu_c_abs( &Aval[i] ); } value = 0.; for (i = 0; i < A->nrow; ++i) value = SUPERLU_MAX(value, rwork[i]); SUPERLU_FREE (rwork); } else if (lsame_(norm, "F") || lsame_(norm, "E")) { /* Find normF(A). */ ABORT("Not implemented."); } else ABORT("Illegal norm specified."); return (value); } /* clangs */