/*! \brief * * <pre> * Purpose * ======= * * pzgssvx_ABglobal solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right hand sides, and its dimensions ldb and nrhs * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has several options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * - A, the input matrix * * as well as the following options, which are described in more * detail below: * * - options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * - options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * - options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * - options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * - ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * - ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * - ScalePermstruct->R, array of row scale factors * - ScalePermstruct->C, array of column scale factors * - ScalePermstruct->perm_r, row permutation vector * - ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * - A, the input matrix A overwritten by the scaled and permuted matrix * Pc*Pr*diag(R)*A*diag(C) * where * Pr and Pc are row and columns permutation matrices determined * by ScalePermstruct->perm_r and ScalePermstruct->perm_c, * respectively, and * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * - LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Aout * Pc^T, where Aout is the matrix stored * in A on output.) * * 3. The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In this * case the algorithm saves time by reusing the previously computed * column permutation vector stored in ScalePermstruct->perm_c * and the "elimination tree" of A stored in LUstruct->etree. * * In this case the user must still specify the following options * as before: * * - options->Equil * - options->RowPerm * - options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * - A, the input matrix * - ScalePermstruct->perm_c, the column permutation * - LUstruct->etree, the elimination tree * * The outputs returned include * * - A, the input matrix A overwritten by the scaled and permuted matrix * as described above * - ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * - LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * - options->Equil * - options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are ignored. * This is because the permutations from ScalePermstruct->perm_r and * ScalePermstruct->perm_c are used as input. * * The user must also supply * * - A, the input matrix * - ScalePermstruct->DiagScale, how the previous matrix was row and/or * column scaled * - ScalePermstruct->R, the row scalings of the previous matrix, if any * - ScalePermstruct->C, the columns scalings of the previous matrix, * if any * - ScalePermstruct->perm_r, the row permutation of the previous matrix * - ScalePermstruct->perm_c, the column permutation of the previous * matrix * - all of LUstruct, the previously computed information about L and U * (the actual numerical values of L and U stored in * LUstruct->Llu are ignored) * * The outputs returned include * * - A, the input matrix A overwritten by the scaled and permuted matrix * as described above * - ScalePermstruct, modified to describe how the input matrix A was * equilibrated * (thus ScalePermstruct->DiagScale, R and C may be modified) * - LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * - A, the unfactored matrix, only in the case that iterative refinment * is to be done (specifically A must be the output A from * the previous call, so that it has been scaled and permuted) * - all of ScalePermstruct * - all of LUstruct, including the actual numerical values of L and U * * all of which are unmodified on output. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. * Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = SLU_DOUBLE: accumulate residual in double precision. * = SLU_EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. That is, A is stored in * compressed column format (also known as Harwell-Boeing format). * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine pzgstrf can factorize rectangular matrices. * On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C), * depending on ScalePermstruct->DiagScale, options->RowPerm and * options->colpem: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->RowPerm != NATURAL, A is further overwritten by * Pr*diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * Pc*Pr*diag(R)*A*diag(C). * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * NOTE: Currently, A must reside in all processes when calling * this routine. * * ScalePermstruct (input/output) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) doublecomplex* * On entry, the right-hand side matrix of dimension (A->nrow, nrhs). * On exit, the solution matrix if info = 0; * * NOTE: Currently, B must reside in all processes when calling * this routine. * * ldb (input) int (global) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_zdefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc', dimension A->ncol. * It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * berr (output) double*, dimension (nrhs) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * * See superlu_zdefs.h for the definitions of various data types. * </pre> */ void pzgssvx_ABglobal(superlu_options_t *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, doublecomplex B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, double *berr, SuperLUStat_t *stat, int *info) { SuperMatrix AC; NCformat *Astore; NCPformat *ACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by DDISTRIBUTE routine. They will be freed after DDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; doublecomplex *a; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *colptr, *rowind; int_t colequ, Equil, factored, job, notran, rowequ; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int iam; int ldx; /* LDA for matrix X (global). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; doublecomplex *X, *b_col, *b_work, *x_col; double t; static mem_usage_t num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif /* Test input parameters. */ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) *info = -1; else if ( options->IterRefine == SLU_EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < A->nrow ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pzgssvx_ABglobal", grid, -*info); return; } /* Initialization */ factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; m = A->nrow; n = A->ncol; Astore = A->Store; nnz = Astore->nnz; a = Astore->nzval; colptr = Astore->colptr; rowind = Astore->rowind; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgssvx_ABglobal()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; if ( Equil && Fact != SamePattern_SameRowPerm ) { /* Allocate storage if not done so before. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; zd_mult(&a[i], &a[i], R[i]); /* Scale rows. */ } } break; case COL: for (j = 0; j < n; ++j) for (i = colptr[j]; i < colptr[j+1]; ++i) zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */ break; case BOTH: for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; zd_mult(&a[i], &a[i], R[irow]); /* Scale rows. */ zd_mult(&a[i], &a[i], C[j]); /* Scale columns. */ } } break; } } else { if ( !iam ) { /* Compute row and column scalings to equilibrate matrix A. */ zgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); } else { if ( iinfo > 0 ) { if ( iinfo <= m ) fprintf(stderr, "The %d-th row of A is exactly zero\n", iinfo); else fprintf(stderr, "The %d-th column of A is exactly zero\n", iinfo-n); exit(-1); } } } else { MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); } else { ABORT("ZGSEQU failed\n"); } } /* Equilibrate matrix A. */ zlaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? *equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* end if Equil ... */ /* ------------------------------------------------------------ Permute rows of A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } } else if ( !factored ) { if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation for large diagonal. */ zldperm(job, m, nnz, colptr, rowind, a, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; zd_mult(&a[i], &a[i], R1[irow]); /* Scale rows. */ zd_mult(&a[i], &a[i], C1[j]); /* Scale columns. */ rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) /* New diagonal */ dprod *= slud_z_abs1(&a[i]); #endif } } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } else { /* No equilibration. */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i])); else if ( job == 4 ) dsum += slud_z_abs1(&a[i]); else if ( job == 5 ) dprod *= slud_z_abs1(&a[i]); } #endif } } } #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* else !factored */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; } else { /* options->RowPerm == NOROWPERM */ for (i = 0; i < m; ++i) perm_r[i] = i; } if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = zlangs_dist(norm, A); } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && Fact == DOFACT ) /* Use an ordering provided by SuperLU */ get_perm_c_dist(iam, permc_spec, A, perm_c); /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ sp_colorder(options, A, perm_c, etree, &AC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ ACstore = AC.Store; for (j = 0; j < n; ++j) for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { irow = ACstore->rowind[i]; ACstore->rowind[i] = perm_c[irow]; } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Perform a symbolic factorization on matrix A and set up the nonzero data structures which are suitable for supernodal GENP. */ if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); iinfo = symbfact(options, iam, &AC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr, "symbfact() error returns %d\n", iinfo); exit(-1); } } } /* Distribute the L and U factors onto the process grid. */ t = SuperLU_timer_(); dist_mem_use = zdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factor. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } /* Perform numerical factorization in parallel. */ t = SuperLU_timer_(); pzgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; zQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif #if ( PRNTlevel>=2 ) if ( !iam ) printf(".. pzgstrf INFO = %d\n", *info); #endif } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ /* Permute columns of A to form A*Pc' using the existing perm_c. * NOTE: rows of A were previously permuted to Pc*A. */ sp_colorder(options, A, perm_c, NULL, &AC); } /* if !factored ... */ /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doublecomplexMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], R[i]); b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) zd_mult(&b_col[i], &b_col[i], C[i]); b_col += ldb; } } /* ------------------------------------------------------------ Permute the right-hand side to form Pr*B. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { if ( notran ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; for (i = 0; i < m; ++i) b_col[i] = b_work[i]; b_col += ldb; } } } /* ------------------------------------------------------------ Permute the right-hand side to form Pc*B. ------------------------------------------------------------*/ if ( notran ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; for (i = 0; i < m; ++i) b_col[i] = b_work[i]; b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doublecomplexMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ pzgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ t = SuperLU_timer_(); pzgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, X, ldx, nrhs, berr, stat, info); stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix X <= Pc'*X. */ for (j = 0; j < nrhs; j++) { b_col = &B[j*ldb]; x_col = &X[j*ldx]; for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; } /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], C[i]); b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < n; ++i) zd_mult(&b_col[i], &b_col[i], R[i]); b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate R and/or C if it is not used. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored || (factored && options->IterRefine) ) Destroy_CompCol_Permuted_dist(&AC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pzgssvx_ABglobal()"); #endif }
/*! \brief <pre> Purpose ======= ZLAQGS_DIST equilibrates a general sparse M by N matrix A using the row and column scaling factors in the vectors R and C. See supermatrix.h for the definition of 'SuperMatrix' structure. Arguments ========= A (input/output) SuperMatrix* On exit, the equilibrated matrix. See EQUED for the form of the equilibrated matrix. The type of A can be: Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. R (input) double*, dimension (A->nrow) The row scale factors for A. C (input) double*, dimension (A->ncol) The column scale factors for A. ROWCND (input) double Ratio of the smallest R(i) to the largest R(i). COLCND (input) double Ratio of the smallest C(i) to the largest C(i). AMAX (input) double Absolute value of largest matrix entry. EQUED (output) char* Specifies the form of equilibration that was done. = 'N': No equilibration = 'R': Row equilibration, i.e., A has been premultiplied by diag(R). = 'C': Column equilibration, i.e., A has been postmultiplied by diag(C). = 'B': Both row and column equilibration, i.e., A has been replaced by diag(R) * A * diag(C). Internal Parameters =================== THRESH is a threshold value used to decide if row or column scaling should be done based on the ratio of the row or column scaling factors. If ROWCND < THRESH, row scaling is done, and if COLCND < THRESH, column scaling is done. LARGE and SMALL are threshold values used to decide if row scaling should be done based on the absolute size of the largest matrix element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. ===================================================================== </pre> */ void zlaqgs_dist(SuperMatrix *A, double *r, double *c, double rowcnd, double colcnd, double amax, char *equed) { #define THRESH (0.1) /* Local variables */ NCformat *Astore; doublecomplex *Aval; int i, j, irow; double large, small, cj; double temp; /* Quick return if possible */ if (A->nrow <= 0 || A->ncol <= 0) { *(unsigned char *)equed = 'N'; return; } Astore = (NCformat *) A->Store; Aval = (doublecomplex *) Astore->nzval; /* Initialize LARGE and SMALL. */ small = dmach("Safe minimum") / dmach("Precision"); large = 1. / small; if (rowcnd >= THRESH && amax >= small && amax <= large) { if (colcnd >= THRESH) *(unsigned char *)equed = 'N'; else { /* Column scaling */ for (j = 0; j < A->ncol; ++j) { cj = c[j]; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { zd_mult(&Aval[i], &Aval[i], cj); } } *(unsigned char *)equed = 'C'; } } else if (colcnd >= THRESH) { /* Row scaling, no column scaling */ for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; zd_mult(&Aval[i], &Aval[i], r[irow]); } *(unsigned char *)equed = 'R'; } else { /* Row and column scaling */ for (j = 0; j < A->ncol; ++j) { cj = c[j]; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; temp = cj * r[irow]; zd_mult(&Aval[i], &Aval[i], temp); } } *(unsigned char *)equed = 'B'; } return; } /* zlaqgs_dist */
int main(int argc, char *argv[]) { void zmatvec_mult(doublecomplex alpha, doublecomplex x[], doublecomplex beta, doublecomplex y[]); void zpsolve(int n, doublecomplex x[], doublecomplex y[]); extern int zfgmr( int n, void (*matvec_mult)(doublecomplex, doublecomplex [], doublecomplex, doublecomplex []), void (*psolve)(int n, doublecomplex [], doublecomplex[]), doublecomplex *rhs, doublecomplex *sol, double tol, int restrt, int *itmax, FILE *fits); extern int zfill_diag(int n, NCformat *Astore); char equed[1] = {'B'}; yes_no_t equil; trans_t trans; SuperMatrix A, L, U; SuperMatrix B, X; NCformat *Astore; NCformat *Ustore; SCformat *Lstore; doublecomplex *a; int *asub, *xa; int *etree; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ int nrhs, ldx, lwork, info, m, n, nnz; doublecomplex *rhsb, *rhsx, *xact; doublecomplex *work = NULL; double *R, *C; double u, rpg, rcond; doublecomplex zero = {0.0, 0.0}; doublecomplex one = {1.0, 0.0}; doublecomplex none = {-1.0, 0.0}; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; int restrt, iter, maxit, i; double resid; doublecomplex *x, *b; #ifdef DEBUG extern int num_drop_L, num_drop_U; #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Defaults */ lwork = 0; nrhs = 1; trans = NOTRANS; /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 0.1; //different from complete LU options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; options.RowPerm = LargeDiag; options.ILU_DropTol = 1e-4; options.ILU_FillTol = 1e-2; options.ILU_FillFactor = 10.0; options.ILU_DropRule = DROP_BASIC | DROP_AREA; options.ILU_Norm = INF_NORM; options.ILU_MILU = SILU; */ ilu_set_default_options(&options); /* Modify the defaults. */ options.PivotGrowth = YES; /* Compute reciprocal pivot growth */ options.ConditionNumber = YES;/* Compute reciprocal condition number */ if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) ABORT("Malloc fails for work[]."); } /* Read matrix A from a file in Harwell-Boeing format.*/ if (argc < 2) { printf("Usage:\n%s [OPTION] < [INPUT] > [OUTPUT]\nOPTION:\n" "-h -hb:\n\t[INPUT] is a Harwell-Boeing format matrix.\n" "-r -rb:\n\t[INPUT] is a Rutherford-Boeing format matrix.\n" "-t -triplet:\n\t[INPUT] is a triplet format matrix.\n", argv[0]); return 0; } else { switch (argv[1][1]) { case 'H': case 'h': printf("Input a Harwell-Boeing format matrix:\n"); zreadhb(&m, &n, &nnz, &a, &asub, &xa); break; case 'R': case 'r': printf("Input a Rutherford-Boeing format matrix:\n"); zreadrb(&m, &n, &nnz, &a, &asub, &xa); break; case 'T': case 't': printf("Input a triplet format matrix:\n"); zreadtriple(&m, &n, &nnz, &a, &asub, &xa); break; default: printf("Unrecognized format.\n"); return 0; } } zCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_Z, SLU_GE); Astore = A.Store; zfill_diag(n, Astore); printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); fflush(stdout); /* Generate the right-hand side */ if ( !(rhsb = doublecomplexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[]."); if ( !(rhsx = doublecomplexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[]."); zCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_Z, SLU_GE); zCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_Z, SLU_GE); xact = doublecomplexMalloc(n * nrhs); ldx = n; zGenXtrue(n, nrhs, xact, ldx); zFillRHS(trans, nrhs, xact, ldx, &A, &B); if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(R = (double *) SUPERLU_MALLOC(A.nrow * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R[]."); if ( !(C = (double *) SUPERLU_MALLOC(A.ncol * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C[]."); info = 0; #ifdef DEBUG num_drop_L = 0; num_drop_U = 0; #endif /* Initialize the statistics variables. */ StatInit(&stat); /* Compute the incomplete factorization and compute the condition number and pivot growth using dgsisx. */ B.ncol = 0; /* not to perform triangular solution */ zgsisx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, &mem_usage, &stat, &info); /* Set RHS for GMRES. */ if (!(b = doublecomplexMalloc(m))) ABORT("Malloc fails for b[]."); if (*equed == 'R' || *equed == 'B') { for (i = 0; i < n; ++i) zd_mult(&b[i], &rhsb[i], R[i]); } else { for (i = 0; i < m; i++) b[i] = rhsb[i]; } printf("zgsisx(): info %d, equed %c\n", info, equed[0]); if (info > 0 || rcond < 1e-8 || rpg > 1e8) printf("WARNING: This preconditioner might be unstable.\n"); if ( info == 0 || info == n+1 ) { if ( options.PivotGrowth == YES ) printf("Recip. pivot growth = %e\n", rpg); if ( options.ConditionNumber == YES ) printf("Recip. condition number = %e\n", rcond); } else if ( info > 0 && lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); } Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("n(A) = %d, nnz(A) = %d\n", n, Astore->nnz); printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("Fill ratio: nnz(F)/nnz(A) = %.3f\n", ((double)(Lstore->nnz) + (double)(Ustore->nnz) - (double)n) / (double)Astore->nnz); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); fflush(stdout); /* Set the global variables. */ GLOBAL_A = &A; GLOBAL_L = &L; GLOBAL_U = &U; GLOBAL_STAT = &stat; GLOBAL_PERM_C = perm_c; GLOBAL_PERM_R = perm_r; GLOBAL_OPTIONS = &options; GLOBAL_R = R; GLOBAL_C = C; GLOBAL_MEM_USAGE = &mem_usage; /* Set the options to do solve-only. */ options.Fact = FACTORED; options.PivotGrowth = NO; options.ConditionNumber = NO; /* Set the variables used by GMRES. */ restrt = SUPERLU_MIN(n / 3 + 1, 50); maxit = 1000; iter = maxit; resid = 1e-8; if (!(x = doublecomplexMalloc(n))) ABORT("Malloc fails for x[]."); if (info <= n + 1) { int i_1 = 1; double maxferr = 0.0, nrmA, nrmB, res, t; doublecomplex temp; extern double dznrm2_(int *, doublecomplex [], int *); extern void zaxpy_(int *, doublecomplex *, doublecomplex [], int *, doublecomplex [], int *); /* Initial guess */ for (i = 0; i < n; i++) x[i] = zero; t = SuperLU_timer_(); /* Call GMRES */ zfgmr(n, zmatvec_mult, zpsolve, b, x, resid, restrt, &iter, stdout); t = SuperLU_timer_() - t; /* Output the result. */ nrmA = dznrm2_(&(Astore->nnz), (doublecomplex *)((DNformat *)A.Store)->nzval, &i_1); nrmB = dznrm2_(&m, b, &i_1); sp_zgemv("N", none, &A, x, 1, one, b, 1); res = dznrm2_(&m, b, &i_1); resid = res / nrmB; printf("||A||_F = %.1e, ||B||_2 = %.1e, ||B-A*X||_2 = %.1e, " "relres = %.1e\n", nrmA, nrmB, res, resid); if (iter >= maxit) { if (resid >= 1.0) iter = -180; else if (resid > 1e-8) iter = -111; } printf("iteration: %d\nresidual: %.1e\nGMRES time: %.2f seconds.\n", iter, resid, t); /* Scale the solution back if equilibration was performed. */ if (*equed == 'C' || *equed == 'B') for (i = 0; i < n; i++) zd_mult(&x[i], &x[i], C[i]); for (i = 0; i < m; i++) { z_sub(&temp, &x[i], &xact[i]); maxferr = SUPERLU_MAX(maxferr, z_abs1(&temp)); } printf("||X-X_true||_oo = %.1e\n", maxferr); } #ifdef DEBUG printf("%d entries in L and %d entries in U dropped.\n", num_drop_L, num_drop_U); #endif fflush(stdout); if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (rhsb); SUPERLU_FREE (rhsx); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (R); SUPERLU_FREE (C); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); if ( lwork >= 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } SUPERLU_FREE(b); SUPERLU_FREE(x); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif return 0; }
Z::doublecomplex operator()(Z::doublecomplex amesos_z, double amesos_d) { Z::doublecomplex amesos_zr; zd_mult(&amesos_zr, &amesos_z, amesos_d); // zd_mult is a macro, so no namespacing return( amesos_zr ); }
void zgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree, char *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info) { DNformat *Bstore, *Xstore; doublecomplex *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ, permc_spec, mc64; trans_t trant; char norm[1]; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; double diag_pivot_thresh; double t0; /* temporary time */ double *utime; int *perm = NULL; /* External functions */ extern double zlangs(char *, SuperMatrix *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; *info = 0; nofact = (options->Fact != FACTORED); equil = (options->Equil == YES); notran = (options->Trans == NOTRANS); mc64 = (options->RowPerm == LargeDiag); if ( nofact ) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } /* Test the input parameters */ if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && options->Fact != SamePattern_SameRowPerm && !notran && options->Trans != TRANS && options->Trans != CONJ && !equil && options->Equil != NO) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -2; else if (options->Fact == FACTORED && !(rowequ || colequ || lsame_(equed, "N"))) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -8; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -12; else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -13; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || (B->ncol != 0 && B->ncol != X->ncol) || X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) *info = -14; } } if (*info != 0) { i = -(*info); xerbla_("zgsisx", &i); return; } /* Initialization for factor parameters */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = options->DiagPivotThresh; utime = stat->utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == SLU_NC */ trant = options->Trans; AA = A; } if ( nofact ) { register int i, j; NCformat *Astore = AA->Store; int nnz = Astore->nnz; int *colptr = Astore->colptr; int *rowind = Astore->rowind; doublecomplex *nzval = (doublecomplex *)Astore->nzval; int n = AA->nrow; if ( mc64 ) { *equed = 'B'; rowequ = colequ = 1; t0 = SuperLU_timer_(); if ((perm = intMalloc(n)) == NULL) ABORT("SUPERLU_MALLOC fails for perm[]"); info1 = zldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C); if (info1 > 0) { /* MC64 fails, call zgsequ() later */ mc64 = 0; SUPERLU_FREE(perm); perm = NULL; } else { for (i = 0; i < n; i++) { R[i] = exp(R[i]); C[i] = exp(C[i]); } /* permute and scale the matrix */ for (j = 0; j < n; j++) { for (i = colptr[j]; i < colptr[j + 1]; i++) { zd_mult(&nzval[i], &nzval[i], R[rowind[i]] * C[j]); rowind[i] = perm[rowind[i]]; } } } utime[EQUIL] = SuperLU_timer_() - t0; } if ( !mc64 & equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); } utime[EQUIL] = SuperLU_timer_() - t0; } } if ( nrhs > 0 ) { /* Scale the right hand side if equilibration was performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); } } } if ( nofact ) { t0 = SuperLU_timer_(); /* * Gnet column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, AA, perm_c); utime[COLPERM] = SuperLU_timer_() - t0; t0 = SuperLU_timer_(); sp_preorder(options, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); zgsitrf(options, &AC, relax, panel_size, etree, work, lwork, perm_c, perm_r, L, U, stat, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } } if ( options->PivotGrowth ) { if ( *info > 0 ) return; /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); } if ( options->ConditionNumber ) { /* Estimate the reciprocal of the condition number of A. */ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = zlangs(norm, AA); zgscon(norm, L, U, anorm, rcond, stat, &info1); utime[RCOND] = SuperLU_timer_() - t0; } if ( nrhs > 0 ) { /* Compute the solution matrix X. */ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); zgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1); utime[SOLVE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); } } } else { if ( rowequ ) { if (perm) { doublecomplex *tmp; int n = A->nrow; if ((tmp = doublecomplexMalloc(n)) == NULL) ABORT("SUPERLU_MALLOC fails for tmp[]"); for (j = 0; j < nrhs; j++) { for (i = 0; i < n; i++) tmp[i] = Xmat[i + j * ldx]; /*dcopy*/ for (i = 0; i < n; i++) zd_mult(&Xmat[i+j*ldx], &tmp[perm[i]], R[i]); } SUPERLU_FREE(tmp); } else { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); } } } } } /* end if nrhs > 0 */ if ( options->ConditionNumber ) { /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1; } if (perm) SUPERLU_FREE(perm); if ( nofact ) { ilu_zQuerySpace(L, U, mem_usage); Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } }
void zgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree, char *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, double *ferr, double *berr, mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) { DNformat *Bstore, *Xstore; doublecomplex *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ, permc_spec; trans_t trant; char norm[1]; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; double diag_pivot_thresh, drop_tol; double t0; /* temporary time */ double *utime; /* External functions */ extern double zlangs(char *, SuperMatrix *); extern double dlamch_(char *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; *info = 0; nofact = (options->Fact != FACTORED); equil = (options->Equil == YES); notran = (options->Trans == NOTRANS); if ( nofact ) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } #if 0 printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", options->Fact, options->Trans, *equed); #endif /* Test the input parameters */ if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && options->Fact != SamePattern_SameRowPerm && !notran && options->Trans != TRANS && options->Trans != CONJ && !equil && options->Equil != NO) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -2; else if (options->Fact == FACTORED && !(rowequ || colequ || lsame_(equed, "N"))) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -8; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -12; else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -13; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || (B->ncol != 0 && B->ncol != X->ncol) || X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) *info = -14; } } if (*info != 0) { i = -(*info); xerbla_("zgssvx", &i); return; } /* Initialization for factor parameters */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = options->DiagPivotThresh; drop_tol = 0.0; utime = stat->utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == SLU_NC */ trant = options->Trans; AA = A; } if ( nofact && equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); } utime[EQUIL] = SuperLU_timer_() - t0; } if ( nrhs > 0 ) { /* Scale the right hand side if equilibration was performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); } } } if ( nofact ) { t0 = SuperLU_timer_(); /* * Gnet column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, AA, perm_c); utime[COLPERM] = SuperLU_timer_() - t0; t0 = SuperLU_timer_(); sp_preorder(options, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", relax, panel_size, sp_ienv(3), sp_ienv(4)); fflush(stdout); */ /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); zgstrf(options, &AC, drop_tol, relax, panel_size, etree, work, lwork, perm_c, perm_r, L, U, stat, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } } if ( options->PivotGrowth ) { if ( *info > 0 ) { if ( *info <= A->ncol ) { /* Compute the reciprocal pivot growth factor of the leading rank-deficient *info columns of A. */ *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); } return; } /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); } if ( options->ConditionNumber ) { /* Estimate the reciprocal of the condition number of A. */ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = zlangs(norm, AA); zgscon(norm, L, U, anorm, rcond, stat, info); utime[RCOND] = SuperLU_timer_() - t0; } if ( nrhs > 0 ) { /* Compute the solution matrix X. */ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); zgstrs (trant, L, U, perm_c, perm_r, X, stat, info); utime[SOLVE] = SuperLU_timer_() - t0; /* Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. */ t0 = SuperLU_timer_(); if ( options->IterRefine != NOREFINE ) { zgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, X, ferr, berr, stat, info); } else { for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; } utime[REFINE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); } } } else if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); } } } /* end if nrhs > 0 */ if ( options->ConditionNumber ) { /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; } if ( nofact ) { zQuerySpace(L, U, mem_usage); Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } }
void zgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, int *etree, void *work, int lwork, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, /* persistent to facilitate multiple factorizations */ SuperLUStat_t *stat, int *info) { /* Local working arrays */ NCPformat *Astore; int *iperm_r = NULL; /* inverse of perm_r; used when options->Fact == SamePattern_SameRowPerm */ int *iperm_c; /* inverse of perm_c */ int *swap, *iswap; /* swap is used to store the row permutation during the factorization. Initially, it is set to iperm_c (row indeces of Pc*A*Pc'). iswap is the inverse of swap. After the factorization, it is equal to perm_r. */ int *iwork; doublecomplex *zwork; int *segrep, *repfnz, *parent, *xplore; int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ int *marker, *marker_relax; doublecomplex *dense, *tempv; double *dtempv; int *relax_end, *relax_fsupc; doublecomplex *a; int *asub; int *xa_begin, *xa_end; int *xsup, *supno; int *xlsub, *xlusup, *xusub; int nzlumax; double *amax; doublecomplex drop_sum; double alpha, omega; /* used in MILU, mimicing DRIC */ double *dwork2; /* used by the second dropping rule */ /* Local scalars */ fact_t fact = options->Fact; double diag_pivot_thresh = options->DiagPivotThresh; double drop_tol = options->ILU_DropTol; /* tau */ double fill_ini = options->ILU_FillTol; /* tau^hat */ double gamma = options->ILU_FillFactor; int drop_rule = options->ILU_DropRule; milu_t milu = options->ILU_MILU; double fill_tol; int pivrow; /* pivotal row number in the original matrix A */ int nseg1; /* no of segments in U-column above panel row jcol */ int nseg; /* no of segments in each U-column */ register int jcol; register int kcol; /* end column of a relaxed snode */ register int icol; register int i, k, jj, new_next, iinfo; int m, n, min_mn, jsupno, fsupc, nextlu, nextu; int w_def; /* upper bound on panel width */ int usepr, iperm_r_allocated = 0; int nnzL, nnzU; int *panel_histo = stat->panel_histo; flops_t *ops = stat->ops; int last_drop;/* the last column which the dropping rules applied */ int quota; int nnzAj; /* number of nonzeros in A(:,1:j) */ int nnzLj, nnzUj; double tol_L = drop_tol, tol_U = drop_tol; doublecomplex zero = {0.0, 0.0}; double one = 1.0; /* Executable */ iinfo = 0; m = A->nrow; n = A->ncol; min_mn = SUPERLU_MIN(m, n); Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; /* Allocate storage common to the factor routines */ *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, gamma, L, U, Glu, &iwork, &zwork); if ( *info ) return; xsup = Glu->xsup; supno = Glu->supno; xlsub = Glu->xlsub; xlusup = Glu->xlusup; xusub = Glu->xusub; SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, &repfnz, &panel_lsub, &marker_relax, &marker); zSetRWork(m, panel_size, zwork, &dense, &tempv); usepr = (fact == SamePattern_SameRowPerm); if ( usepr ) { /* Compute the inverse of perm_r */ iperm_r = (int *) intMalloc(m); for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; iperm_r_allocated = 1; } iperm_c = (int *) intMalloc(n); for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; swap = (int *)intMalloc(n); for (k = 0; k < n; k++) swap[k] = iperm_c[k]; iswap = (int *)intMalloc(n); for (k = 0; k < n; k++) iswap[k] = perm_c[k]; amax = (double *) SUPERLU_MALLOC(panel_size * sizeof(double)); if (drop_rule & DROP_SECONDARY) dwork2 = SUPERLU_MALLOC(n * sizeof(double)); else dwork2 = NULL; nnzAj = 0; nnzLj = 0; nnzUj = 0; last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95)); alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim); /* Identify relaxed snodes */ relax_end = (int *) intMalloc(n); relax_fsupc = (int *) intMalloc(n); if ( options->SymmetricMode == YES ) ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); else ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); ifill (perm_r, m, EMPTY); ifill (marker, m * NO_MARKER, EMPTY); supno[0] = -1; xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; w_def = panel_size; /* Mark the rows used by relaxed supernodes */ ifill (marker_relax, m, EMPTY); i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end, asub, marker_relax); #if ( PRNTlevel >= 1) printf("%d relaxed supernodes.\n", i); #endif /* * Work on one "panel" at a time. A panel is one of the following: * (a) a relaxed supernode at the bottom of the etree, or * (b) panel_size contiguous columns, defined by the user */ for (jcol = 0; jcol < min_mn; ) { if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ kcol = relax_end[jcol]; /* end of the relaxed snode */ panel_histo[kcol-jcol+1]++; /* Drop small rows in the previous supernode. */ if (jcol > 0 && jcol < last_drop) { int first = xsup[supno[jcol - 1]]; int last = jcol - 1; int quota; /* Compute the quota */ if (drop_rule & DROP_PROWS) quota = gamma * Astore->nnz / m * (m - first) / m * (last - first + 1); else if (drop_rule & DROP_COLUMN) { int i; quota = 0; for (i = first; i <= last; i++) quota += xa_end[i] - xa_begin[i]; quota = gamma * quota * (m - first) / m; } else if (drop_rule & DROP_AREA) quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) - nnzLj; else quota = m * n; fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn); /* Drop small rows */ dtempv = (double *) tempv; i = ilu_zdrop_row(options, first, last, tol_L, quota, &nnzLj, &fill_tol, Glu, dtempv, dwork2, 0); /* Reset the parameters */ if (drop_rule & DROP_DYNAMIC) { if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) < nnzLj) tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); else tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); } if (fill_tol < 0) iinfo -= (int)fill_tol; #ifdef DEBUG num_drop_L += i * (last - first + 1); #endif } /* -------------------------------------- * Factorize the relaxed supernode(jcol:kcol) * -------------------------------------- */ /* Determine the union of the row structure of the snode */ if ( (*info = ilu_zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, marker, Glu)) != 0 ) return; nextu = xusub[jcol]; nextlu = xlusup[jcol]; jsupno = supno[jcol]; fsupc = xsup[jsupno]; new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); nzlumax = Glu->nzlumax; while ( new_next > nzlumax ) { if ((*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))) return; } for (icol = jcol; icol <= kcol; icol++) { xusub[icol+1] = nextu; amax[0] = 0.0; /* Scatter into SPA dense[*] */ for (k = xa_begin[icol]; k < xa_end[icol]; k++) { register double tmp = z_abs1 (&a[k]); if (tmp > amax[0]) amax[0] = tmp; dense[asub[k]] = a[k]; } nnzAj += xa_end[icol] - xa_begin[icol]; if (amax[0] == 0.0) { amax[0] = fill_ini; #if ( PRNTlevel >= 1) printf("Column %d is entirely zero!\n", icol); fflush(stdout); #endif } /* Numeric update within the snode */ zsnode_bmod(icol, jsupno, fsupc, dense, tempv, Glu, stat); if (usepr) pivrow = iperm_r[icol]; fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn); if ( (*info = ilu_zpivotL(icol, diag_pivot_thresh, &usepr, perm_r, iperm_c[icol], swap, iswap, marker_relax, &pivrow, amax[0] * fill_tol, milu, zero, Glu, stat)) ) { iinfo++; marker[pivrow] = kcol; } } jcol = kcol + 1; } else { /* Work on one panel of panel_size columns */ /* Adjust panel_size so that a panel won't overlap with the next * relaxed snode. */ panel_size = w_def; for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) if ( relax_end[k] != EMPTY ) { panel_size = k - jcol; break; } if ( k == min_mn ) panel_size = min_mn - jcol; panel_histo[panel_size]++; /* symbolic factor on a panel of columns */ ilu_zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, dense, amax, panel_lsub, segrep, repfnz, marker, parent, xplore, Glu); /* numeric sup-panel updates in topological order */ zpanel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, Glu, stat); /* Sparse LU within the panel, and below panel diagonal */ for (jj = jcol; jj < jcol + panel_size; jj++) { k = (jj - jcol) * m; /* column index for w-wide arrays */ nseg = nseg1; /* Begin after all the panel segments */ nnzAj += xa_end[jj] - xa_begin[jj]; if ((*info = ilu_zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], segrep, &repfnz[k], marker, parent, xplore, Glu))) return; /* Numeric updates */ if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], tempv, &segrep[nseg1], &repfnz[k], jcol, Glu, stat)) != 0) return; /* Make a fill-in position if the column is entirely zero */ if (xlsub[jj + 1] == xlsub[jj]) { register int i, row; int nextl; int nzlmax = Glu->nzlmax; int *lsub = Glu->lsub; int *marker2 = marker + 2 * m; /* Allocate memory */ nextl = xlsub[jj] + 1; if (nextl >= nzlmax) { int error = zLUMemXpand(jj, nextl, LSUB, &nzlmax, Glu); if (error) { *info = error; return; } lsub = Glu->lsub; } xlsub[jj + 1]++; assert(xlusup[jj]==xlusup[jj+1]); xlusup[jj + 1]++; ((doublecomplex *) Glu->lusup)[xlusup[jj]] = zero; /* Choose a row index (pivrow) for fill-in */ for (i = jj; i < n; i++) if (marker_relax[swap[i]] <= jj) break; row = swap[i]; marker2[row] = jj; lsub[xlsub[jj]] = row; #ifdef DEBUG printf("Fill col %d.\n", jj); fflush(stdout); #endif } /* Computer the quota */ if (drop_rule & DROP_PROWS) quota = gamma * Astore->nnz / m * jj / m; else if (drop_rule & DROP_COLUMN) quota = gamma * (xa_end[jj] - xa_begin[jj]) * (jj + 1) / m; else if (drop_rule & DROP_AREA) quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj; else quota = m; /* Copy the U-segments to ucol[*] and drop small entries */ if ((*info = ilu_zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], perm_r, &dense[k], drop_rule, milu, amax[jj - jcol] * tol_U, quota, &drop_sum, &nnzUj, Glu, dwork2)) != 0) return; /* Reset the dropping threshold if required */ if (drop_rule & DROP_DYNAMIC) { if (gamma * 0.9 * nnzAj * 0.5 < nnzLj) tol_U = SUPERLU_MIN(1.0, tol_U * 2.0); else tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5); } if (drop_sum.r != 0.0 && drop_sum.i != 0.0) { omega = SUPERLU_MIN(2.0*(1.0-alpha)/z_abs1(&drop_sum), 1.0); zd_mult(&drop_sum, &drop_sum, omega); } if (usepr) pivrow = iperm_r[jj]; fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn); if ( (*info = ilu_zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, iperm_c[jj], swap, iswap, marker_relax, &pivrow, amax[jj - jcol] * fill_tol, milu, drop_sum, Glu, stat)) ) { iinfo++; marker[m + pivrow] = jj; marker[2 * m + pivrow] = jj; } /* Reset repfnz[] for this column */ resetrep_col (nseg, segrep, &repfnz[k]); /* Start a new supernode, drop the previous one */ if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) { int first = xsup[supno[jj - 1]]; int last = jj - 1; int quota; /* Compute the quota */ if (drop_rule & DROP_PROWS) quota = gamma * Astore->nnz / m * (m - first) / m * (last - first + 1); else if (drop_rule & DROP_COLUMN) { int i; quota = 0; for (i = first; i <= last; i++) quota += xa_end[i] - xa_begin[i]; quota = gamma * quota * (m - first) / m; } else if (drop_rule & DROP_AREA) quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) - nnzLj; else quota = m * n; fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / (double)min_mn); /* Drop small rows */ dtempv = (double *) tempv; i = ilu_zdrop_row(options, first, last, tol_L, quota, &nnzLj, &fill_tol, Glu, dtempv, dwork2, 1); /* Reset the parameters */ if (drop_rule & DROP_DYNAMIC) { if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) < nnzLj) tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); else tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); } if (fill_tol < 0) iinfo -= (int)fill_tol; #ifdef DEBUG num_drop_L += i * (last - first + 1); #endif } /* if start a new supernode */ } /* for */ jcol += panel_size; /* Move to the next panel */ } /* else */ } /* for */ *info = iinfo; if ( m > n ) { k = 0; for (i = 0; i < m; ++i) if ( perm_r[i] == EMPTY ) { perm_r[i] = n + k; ++k; } } ilu_countnz(min_mn, &nnzL, &nnzU, Glu); fixupL(min_mn, perm_r, Glu); zLUWorkFree(iwork, zwork, Glu); /* Free work space and compress storage */ if ( fact == SamePattern_SameRowPerm ) { /* L and U structures may have changed due to possibly different pivoting, even though the storage is available. There could also be memory expansions, so the array locations may have changed, */ ((SCformat *)L->Store)->nnz = nnzL; ((SCformat *)L->Store)->nsuper = Glu->supno[n]; ((SCformat *)L->Store)->nzval = (doublecomplex *) Glu->lusup; ((SCformat *)L->Store)->nzval_colptr = Glu->xlusup; ((SCformat *)L->Store)->rowind = Glu->lsub; ((SCformat *)L->Store)->rowind_colptr = Glu->xlsub; ((NCformat *)U->Store)->nnz = nnzU; ((NCformat *)U->Store)->nzval = (doublecomplex *) Glu->ucol; ((NCformat *)U->Store)->rowind = Glu->usub; ((NCformat *)U->Store)->colptr = Glu->xusub; } else { zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, (doublecomplex *) Glu->lusup, Glu->xlusup, Glu->lsub, Glu->xlsub, Glu->supno, Glu->xsup, SLU_SC, SLU_Z, SLU_TRLU); zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, (doublecomplex *) Glu->ucol, Glu->usub, Glu->xusub, SLU_NC, SLU_Z, SLU_TRU); } ops[FACT] += ops[TRSV] + ops[GEMV]; stat->expansions = --(Glu->num_expansions); if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); SUPERLU_FREE (iperm_c); SUPERLU_FREE (relax_end); SUPERLU_FREE (swap); SUPERLU_FREE (iswap); SUPERLU_FREE (relax_fsupc); SUPERLU_FREE (amax); if ( dwork2 ) SUPERLU_FREE (dwork2); }
void pzgssvx(int nprocs, superlumt_options_t *superlumt_options, SuperMatrix *A, int *perm_c, int *perm_r, equed_t *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, double *ferr, double *berr, superlu_memusage_t *superlu_memusage, int *info) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * pzgssvx() solves the system of linear equations A*X=B or A'*X=B, using * the LU factorization from zgstrf(). Error bounds on the solution and * a condition estimate are also provided. It performs the following steps: * * 1. If A is stored column-wise (A->Stype = NC): * * 1.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate * the system: * trans = NOTRANS: diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B * trans = TRANS: (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = CONJ: (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B * (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ). * * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation matrix * that usually preserves sparsity. * For more details of this step, see zsp_colorder.c. * * 1.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to * factor the matrix A (after equilibration if fact = EQUILIBRATE) as * Pr*A*Pc = L*U, with Pr determined by partial pivoting. * * 1.4. Compute the reciprocal pivot growth factor. * * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the routine * returns with info = i. Otherwise, the factored form of A is used to * estimate the condition number of the matrix A. If the reciprocal of * the condition number is less than machine precision, * info = A->ncol+1 is returned as a warning, but the routine still * goes on to solve for X and computes error bounds as described below. * * 1.6. The system of equations is solved for X using the factored form * of A. * * 1.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 1.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ) * so that it solves the original system before equilibration. * * 2. If A is stored row-wise (A->Stype = NR), apply the above algorithm * to the tranpose of A: * * 2.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate * the system: * trans = NOTRANS:diag(R)*A'*diag(C)*inv(diag(C))*X = diag(R)*B * trans = TRANS: (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = CONJ: (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A' is * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B * (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ). * * 2.2. Permute columns of transpose(A) (rows of A), * forming transpose(A)*Pc, where Pc is a permutation matrix that * usually preserves sparsity. * For more details of this step, see zsp_colorder.c. * * 2.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to * factor the matrix A (after equilibration if fact = EQUILIBRATE) as * Pr*transpose(A)*Pc = L*U, with the permutation Pr determined by * partial pivoting. * * 2.4. Compute the reciprocal pivot growth factor. * * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the routine * returns with info = i. Otherwise, the factored form of transpose(A) * is used to estimate the condition number of the matrix A. * If the reciprocal of the condition number is less than machine * precision, info = A->nrow+1 is returned as a warning, but the * routine still goes on to solve for X and computes error bounds * as described below. * * 2.6. The system of equations is solved for X using the factored form * of transpose(A). * * 2.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 2.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ) * so that it solves the original system before equilibration. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * nprocs (input) int * Number of processes (or threads) to be spawned and used to perform * the LU factorization by pzgstrf(). There is a single thread of * control to call pzgstrf(), and all threads spawned by pzgstrf() * are terminated before returning from pzgstrf(). * * superlumt_options (input) superlumt_options_t* * The structure defines the input parameters and data structure * to control how the LU factorization will be performed. * The following fields should be defined for this structure: * * o fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, whether the matrix A should * be equilibrated before it is factored. * = FACTORED: On entry, L, U, perm_r and perm_c contain the * factored form of A. If equed is not NOEQUIL, the matrix A has * been equilibrated with scaling factors R and C. * A, L, U, perm_r are not modified. * = DOFACT: The matrix A will be factored, and the factors will be * stored in L and U. * = EQUILIBRATE: The matrix A will be equilibrated if necessary, * then factored into L and U. * * o trans (trans_t) * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A**T * X = B (Transpose) * = CONJ: A**H * X = B (Transpose) * * o refact (yes_no_t) * Specifies whether this is first time or subsequent factorization. * = NO: this factorization is treated as the first one; * = YES: it means that a factorization was performed prior to this * one. Therefore, this factorization will re-use some * existing data structures, such as L and U storage, column * elimination tree, and the symbolic information of the * Householder matrix. * * o panel_size (int) * A panel consists of at most panel_size consecutive columns. * * o relax (int) * To control degree of relaxing supernodes. If the number * of nodes (columns) in a subtree of the elimination tree is less * than relax, this subtree is considered as one supernode, * regardless of the row structures of those columns. * * o diag_pivot_thresh (double) * Diagonal pivoting threshold. At step j of the Gaussian * elimination, if * abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)), * use A_jj as pivot, else use A_ij with maximum magnitude. * 0 <= diag_pivot_thresh <= 1. The default value is 1, * corresponding to partial pivoting. * * o usepr (yes_no_t) * Whether the pivoting will use perm_r specified by the user. * = YES: use perm_r; perm_r is input, unchanged on exit. * = NO: perm_r is determined by partial pivoting, and is output. * * o drop_tol (double) (NOT IMPLEMENTED) * Drop tolerance parameter. At step j of the Gaussian elimination, * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. * 0 <= drop_tol <= 1. The default value of drop_tol is 0, * corresponding to not dropping any entry. * * o work (void*) of size lwork * User-supplied work space and space for the output data structures. * Not referenced if lwork = 0; * * o lwork (int) * Specifies the length of work array. * = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. * = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * superlu_memusage->total_needed; no other side effects. * * A (input/output) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol), where * A->nrow = A->ncol. Currently, the type of A can be: * Stype = NC or NR, Dtype = _D, Mtype = GE. In the future, * more general A will be handled. * * On entry, If superlumt_options->fact = FACTORED and equed is not * NOEQUIL, then A must have been equilibrated by the scaling factors * in R and/or C. On exit, A is not modified * if superlumt_options->fact = FACTORED or DOFACT, or * if superlumt_options->fact = EQUILIBRATE and equed = NOEQUIL. * * On exit, if superlumt_options->fact = EQUILIBRATE and equed is not * NOEQUIL, A is scaled as follows: * If A->Stype = NC: * equed = ROW: A := diag(R) * A * equed = COL: A := A * diag(C) * equed = BOTH: A := diag(R) * A * diag(C). * If A->Stype = NR: * equed = ROW: transpose(A) := diag(R) * transpose(A) * equed = COL: transpose(A) := transpose(A) * diag(C) * equed = BOTH: transpose(A) := diag(R) * transpose(A) * diag(C). * * perm_c (input/output) int* * If A->Stype = NC, Column permutation vector of size A->ncol, * which defines the permutation matrix Pc; perm_c[i] = j means * column i of A is in position j in A*Pc. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree * is already in postorder. * * If A->Stype = NR, column permutation vector of size A->nrow, * which describes permutation of columns of tranpose(A) * (rows of A) as described above. * * perm_r (input/output) int* * If A->Stype = NC, row permutation vector of size A->nrow, * which defines the permutation matrix Pr, and is determined * by partial pivoting. perm_r[i] = j means row i of A is in * position j in Pr*A. * * If A->Stype = NR, permutation vector of size A->ncol, which * determines permutation of rows of transpose(A) * (columns of A) as described above. * * If superlumt_options->usepr = NO, perm_r is output argument; * If superlumt_options->usepr = YES, the pivoting routine will try * to use the input perm_r, unless a certain threshold criterion * is violated. In that case, perm_r is overwritten by a new * permutation determined by partial pivoting or diagonal * threshold pivoting. * * equed (input/output) equed_t* * Specifies the form of equilibration that was done. * = NOEQUIL: No equilibration. * = ROW: Row equilibration, i.e., A was premultiplied by diag(R). * = COL: Column equilibration, i.e., A was postmultiplied by diag(C). * = BOTH: Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * If superlumt_options->fact = FACTORED, equed is an input argument, * otherwise it is an output argument. * * R (input/output) double*, dimension (A->nrow) * The row scale factors for A or transpose(A). * If equed = ROW or BOTH, A (if A->Stype = NC) or transpose(A) * (if A->Stype = NR) is multiplied on the left by diag(R). * If equed = NOEQUIL or COL, R is not accessed. * If fact = FACTORED, R is an input argument; otherwise, R is output. * If fact = FACTORED and equed = ROW or BOTH, each element of R must * be positive. * * C (input/output) double*, dimension (A->ncol) * The column scale factors for A or transpose(A). * If equed = COL or BOTH, A (if A->Stype = NC) or trnspose(A) * (if A->Stype = NR) is multiplied on the right by diag(C). * If equed = NOEQUIL or ROW, C is not accessed. * If fact = FACTORED, C is an input argument; otherwise, C is output. * If fact = FACTORED and equed = COL or BOTH, each element of C must * be positive. * * L (output) SuperMatrix* * The factor L from the factorization * Pr*A*Pc=L*U (if A->Stype = NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = NR). * Uses compressed row subscripts storage for supernodes, i.e., * L has types: Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization * Pr*A*Pc=L*U (if A->Stype = NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = NR). * Uses column-wise storage scheme, i.e., U has types: * Stype = NCP, Dtype = _D, Mtype = TRU. * * B (input/output) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the right hand side matrix. * On exit, * if equed = NOEQUIL, B is not modified; otherwise * if A->Stype = NC: * if trans = NOTRANS and equed = ROW or BOTH, B is overwritten * by diag(R)*B; * if trans = TRANS or CONJ and equed = COL of BOTH, B is * overwritten by diag(C)*B; * if A->Stype = NR: * if trans = NOTRANS and equed = COL or BOTH, B is overwritten * by diag(C)*B; * if trans = TRANS or CONJ and equed = ROW of BOTH, B is * overwritten by diag(R)*B. * * X (output) SuperMatrix* * X has types: Stype = DN, Dtype = _D, Mtype = GE. * If info = 0 or info = A->ncol+1, X contains the solution matrix * to the original system of equations. Note that A and B are modified * on exit if equed is not NOEQUIL, and the solution to the * equilibrated system is inv(diag(C))*X if trans = NOTRANS and * equed = COL or BOTH, or inv(diag(R))*X if trans = TRANS or CONJ * and equed = ROW or BOTH. * * recip_pivot_growth (output) double* * The reciprocal pivot growth factor computed as * max_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ). * If recip_pivot_growth is much less than 1, the stability of the * LU factorization could be poor. * * rcond (output) double* * The estimate of the reciprocal condition number of the matrix A * after equilibration (if done). If rcond is less than the machine * precision (in particular, if rcond = 0), the matrix is singular * to working precision. This condition is indicated by a return * code of info > 0. * * ferr (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * berr (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * superlu_memusage (output) superlu_memusage_t* * Record the memory usage statistics, consisting of following fields: * - for_lu (float) * The amount of space used in bytes for L\U data structures. * - total_needed (float) * The amount of space needed in bytes to perform factorization. * - expansions (int) * The number of memory expansions during the LU factorization. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly * singular, so the solution and error bounds * could not be computed. * = A->ncol+1: U is nonsingular, but RCOND is less than machine * precision, meaning that the matrix is singular to * working precision. Nevertheless, the solution and * error bounds are computed because there are a number * of situations where the computed solution can be more * accurate than the value of RCOND would suggest. * > A->ncol+1: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * */ NCformat *Astore; DNformat *Bstore, *Xstore; doublecomplex *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA; /* A in NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, dofact, notran, rowequ; char norm[1]; trans_t trant; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int n, relax, panel_size; Gstat_t Gstat; double t0; /* temporary time */ double *utime; flops_t *ops, flopcnt; /* External functions */ extern double zlangs(char *, SuperMatrix *); extern double dlamch_(char *); Astore = A->Store; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; n = A->ncol; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; superlumt_options->perm_c = perm_c; superlumt_options->perm_r = perm_r; *info = 0; dofact = (superlumt_options->fact == DOFACT); equil = (superlumt_options->fact == EQUILIBRATE); notran = (superlumt_options->trans == NOTRANS); if (dofact || equil) { *equed = NOEQUIL; rowequ = FALSE; colequ = FALSE; } else { rowequ = (*equed == ROW) || (*equed == BOTH); colequ = (*equed == COL) || (*equed == BOTH); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } /* ------------------------------------------------------------ Test the input parameters. ------------------------------------------------------------*/ if ( nprocs <= 0 ) *info = -1; else if ( (!dofact && !equil && (superlumt_options->fact != FACTORED)) || (!notran && (superlumt_options->trans != TRANS) && (superlumt_options->trans != CONJ)) || (superlumt_options->refact != YES && superlumt_options->refact != NO) || (superlumt_options->usepr != YES && superlumt_options->usepr != NO) || superlumt_options->lwork < -1 ) *info = -2; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -3; else if ((superlumt_options->fact == FACTORED) && !(rowequ || colequ || (*equed == NOEQUIL))) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -8; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -11; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || B->ncol != X->ncol || X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) *info = -12; } } if (*info != 0) { i = -(*info); xerbla_("pzgssvx", &i); return; } /* ------------------------------------------------------------ Allocate storage and initialize statistics variables. ------------------------------------------------------------*/ panel_size = superlumt_options->panel_size; relax = superlumt_options->relax; StatAlloc(n, nprocs, panel_size, relax, &Gstat); StatInit(n, nprocs, &Gstat); utime = Gstat.utime; ops = Gstat.ops; /* ------------------------------------------------------------ Convert A to NC format when necessary. ------------------------------------------------------------*/ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == NC */ trant = superlumt_options->trans; AA = A; } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = (*equed == ROW) || (*equed == BOTH); colequ = (*equed == COL) || (*equed == BOTH); } utime[EQUIL] = SuperLU_timer_() - t0; } /* ------------------------------------------------------------ Scale the right hand side. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); } } /* ------------------------------------------------------------ Perform the LU factorization. ------------------------------------------------------------*/ if ( dofact || equil ) { /* Obtain column etree, the column count (colcnt_h) and supernode partition (part_super_h) for the Householder matrix. */ t0 = SuperLU_timer_(); sp_colorder(AA, perm_c, superlumt_options, &AC); utime[ETREE] = SuperLU_timer_() - t0; #if ( PRNTlevel >= 2 ) printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", relax, panel_size, sp_ienv(3), sp_ienv(4)); fflush(stdout); #endif /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); pzgstrf(superlumt_options, &AC, perm_r, L, U, &Gstat, info); utime[FACT] = SuperLU_timer_() - t0; flopcnt = 0; for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops; ops[FACT] = flopcnt; if ( superlumt_options->lwork == -1 ) { superlu_memusage->total_needed = *info - A->ncol; return; } } if ( *info > 0 ) { if ( *info <= A->ncol ) { /* Compute the reciprocal pivot growth factor of the leading rank-deficient *info columns of A. */ *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); } } else { /* ------------------------------------------------------------ Compute the reciprocal pivot growth factor *recip_pivot_growth. ------------------------------------------------------------*/ *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); /* ------------------------------------------------------------ Estimate the reciprocal of the condition number of A. ------------------------------------------------------------*/ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = zlangs(norm, AA); zgscon(norm, L, U, anorm, rcond, info); utime[RCOND] = SuperLU_timer_() - t0; /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); zgstrs(trant, L, U, perm_r, perm_c, X, &Gstat, info); utime[SOLVE] = SuperLU_timer_() - t0; ops[SOLVE] = ops[TRISOLVE]; /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ t0 = SuperLU_timer_(); zgsrfs(trant, AA, L, U, perm_r, perm_c, *equed, R, C, B, X, ferr, berr, &Gstat, info); utime[REFINE] = SuperLU_timer_() - t0; /* ------------------------------------------------------------ Transform the solution matrix X to a solution of the original system. ------------------------------------------------------------*/ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); } } } else if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); } } /* Set INFO = A->ncol+1 if the matrix is singular to working precision.*/ if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; } superlu_zQuerySpace(nprocs, L, U, panel_size, superlu_memusage); /* ------------------------------------------------------------ Deallocate storage after factorization. ------------------------------------------------------------*/ if ( superlumt_options->refact == NO ) { SUPERLU_FREE(superlumt_options->etree); SUPERLU_FREE(superlumt_options->colcnt_h); SUPERLU_FREE(superlumt_options->part_super_h); } if ( dofact || equil ) { Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } /* ------------------------------------------------------------ Print timings, then deallocate statistic variables. ------------------------------------------------------------*/ #ifdef PROFILE { SCPformat *Lstore = (SCPformat *) L->Store; ParallelProfile(n, Lstore->nsuper+1, Gstat.num_panels, nprocs, &Gstat); } #endif PrintStat(&Gstat); StatFree(&Gstat); }
void pzlaqgs(SuperMatrix *A, double *r, double *c, double rowcnd, double colcnd, double amax, char *equed) { /* Purpose ======= PZLAQGS equilibrates a general sparse M by N matrix A using the row and column scaling factors in the vectors R and C. See supermatrix.h for the definition of 'SuperMatrix' structure. Arguments ========= A (input/output) SuperMatrix* On exit, the equilibrated matrix. See EQUED for the form of the equilibrated matrix. The type of A can be: Stype = NR_loc; Dtype = SLU_Z; Mtype = GE. R (input) double*, dimension (A->nrow) The row scale factors for A. C (input) double*, dimension (A->ncol) The column scale factors for A. ROWCND (input) double Ratio of the smallest R(i) to the largest R(i). COLCND (input) double Ratio of the smallest C(i) to the largest C(i). AMAX (input) double Absolute value of largest matrix entry. EQUED (output) char* Specifies the form of equilibration that was done. = 'N': No equilibration = 'R': Row equilibration, i.e., A has been premultiplied by diag(R). = 'C': Column equilibration, i.e., A has been postmultiplied by diag(C). = 'B': Both row and column equilibration, i.e., A has been replaced by diag(R) * A * diag(C). Internal Parameters =================== THRESH is a threshold value used to decide if row or column scaling should be done based on the ratio of the row or column scaling factors. If ROWCND < THRESH, row scaling is done, and if COLCND < THRESH, column scaling is done. LARGE and SMALL are threshold values used to decide if row scaling should be done based on the absolute size of the largest matrix element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. ===================================================================== */ #define THRESH (0.1) /* Local variables */ NRformat_loc *Astore; doublecomplex *Aval; int_t i, j, irow, jcol, m_loc; double large, small, cj; extern double dlamch_(char *); double temp; /* Quick return if possible */ if (A->nrow <= 0 || A->ncol <= 0) { *(unsigned char *)equed = 'N'; return; } Astore = A->Store; Aval = Astore->nzval; m_loc = Astore->m_loc; /* Initialize LARGE and SMALL. */ small = dlamch_("Safe minimum") / dlamch_("Precision"); large = 1. / small; if (rowcnd >= THRESH && amax >= small && amax <= large) { if (colcnd >= THRESH) *(unsigned char *)equed = 'N'; else { /* Column scaling */ irow = Astore->fst_row; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { jcol = Astore->colind[j]; zd_mult(&Aval[j], &Aval[j], c[jcol]); } ++irow; } *(unsigned char *)equed = 'C'; } } else if (colcnd >= THRESH) { /* Row scaling, no column scaling */ irow = Astore->fst_row; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) zd_mult(&Aval[j], &Aval[j], r[irow]); ++irow; } *(unsigned char *)equed = 'R'; } else { /* Both row and column scaling */ irow = Astore->fst_row; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { jcol = Astore->colind[j]; temp = r[irow] * c[jcol]; zd_mult(&Aval[j], &Aval[j], temp); } ++irow; } *(unsigned char *)equed = 'B'; } return; } /* pzlaqgs */
/*! \brief * * <pre> * Purpose * ======= * * ZGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * If equilibration was performed, the system becomes: * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_Z, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * zgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = SLU_NC, Dtype = SLU_Z, Mtype = SLU_TRU. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * equed (input) Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by * diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) double*, dimension (A->nrow) * The row scale factors for A. * If equed = 'R' or 'B', A is premultiplied by diag(R). * If equed = 'N' or 'C', R is not accessed. * * C (input) double*, dimension (A->ncol) * The column scale factors for A. * If equed = 'C' or 'B', A is postmultiplied by diag(C). * If equed = 'N' or 'R', C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. * The right hand side matrix B. * if equed = 'R' or 'B', B is premultiplied by diag(R). * * X (input/output) SuperMatrix* * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. * On entry, the solution matrix X, as computed by zgstrs(). * On exit, the improved solution matrix X. * if *equed = 'C' or 'B', X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * </pre> */ void zgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, char *equed, double *R, double *C, SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, SuperLUStat_t *stat, int *info) { #define ITMAX 5 /* Table of constant values */ int ione = 1; doublecomplex ndone = {-1., 0.}; doublecomplex done = {1., 0.}; /* Local variables */ NCformat *Astore; doublecomplex *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; doublecomplex *Bmat, *Xmat, *Bptr, *Xptr; int kase; double safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; double s, xk, lstres, eps, safmin; char transc[1]; trans_t transt; doublecomplex *work; double *rwork; int *iwork; int isave[3]; extern int zlacon2_(int *, doublecomplex *, doublecomplex *, double *, int *, int []); #ifdef _CRAY extern int CCOPY(int *, doublecomplex *, int *, doublecomplex *, int *); extern int CSAXPY(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); #else extern int zcopy_(int *, doublecomplex *, int *, doublecomplex *, int *); extern int zaxpy_(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = (trans == NOTRANS); if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_Z || L->Mtype != SLU_TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_Z || U->Mtype != SLU_TRU ) *info = -4; else if ( ldb < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -10; else if ( ldx < SUPERLU_MAX(0, A->nrow) || X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) *info = -11; if (*info != 0) { i = -(*info); input_error("zgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = strncmp(equed, "R", 1)==0 || strncmp(equed, "B", 1)==0; colequ = strncmp(equed, "C", 1)==0 || strncmp(equed, "B", 1)==0; /* Allocate working space */ work = doublecomplexMalloc(2*A->nrow); rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); iwork = intMalloc(A->nrow); if ( !work || !rwork || !iwork ) ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transc = 'N'; transt = TRANS; } else if ( trans == TRANS ) { *(unsigned char *)transc = 'T'; transt = NOTRANS; } else if ( trans == CONJ ) { *(unsigned char *)transc = 'C'; transt = NOTRANS; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dmach("Epsilon"); safmin = dmach("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ #ifdef _CRAY CCOPY(&A->nrow, Bptr, &ione, work, &ione); #else zcopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_zgemv(transc, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = z_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; } } else { /* trans = TRANS or CONJ */ for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += z_abs1(&Aval[i]) * z_abs1(&Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) { s = SUPERLU_MAX( s, z_abs1(&work[i]) / rwork[i] ); } else if ( rwork[i] != 0.0 ) { s = SUPERLU_MAX( s, (z_abs1(&work[i]) + safe1) / rwork[i] ); } /* If rwork[i] is exactly 0.0, then we know the true residual also must be exactly 0.0. */ } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); #ifdef _CRAY CAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else zaxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ stat->RefineSteps = count; /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use ZLACON2 to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = z_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; } } else { /* trans == TRANS or CONJ */ for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = z_abs1( &Xptr[irow] ); s += z_abs1(&Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = z_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = z_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { zlacon2_(&A->nrow, &work[A->nrow], work, &ferr[j], &kase, isave); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { zd_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) { zd_mult(&work[i], &work[i], R[i]); } zgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); for (i = 0; i < A->nrow; ++i) { zd_mult(&work[i], &work[i], rwork[i]); } } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) { zd_mult(&work[i], &work[i], rwork[i]); } zgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { zd_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) { zd_mult(&work[i], &work[i], R[i]); } } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, C[i] * z_abs1( &Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, R[i] * z_abs1( &Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, z_abs1( &Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* zgsrfs */
void zgsrfs(char *trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_r, int *perm_c, char *equed, double *R, double *C, SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, int *info) { /* * Purpose * ======= * * ZGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * If equilibration was performed, the system becomes: * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) char* * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose = Transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = NC, Dtype = _Z, Mtype = GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = _Z, Mtype = TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * zgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = NC, Dtype = _Z, Mtype = TRU. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * equed (input) Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by * diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) double*, dimension (A->nrow) * The row scale factors for A. * If equed = 'R' or 'B', A is premultiplied by diag(R). * If equed = 'N' or 'C', R is not accessed. * * C (input) double*, dimension (A->ncol) * The column scale factors for A. * If equed = 'C' or 'B', A is postmultiplied by diag(C). * If equed = 'N' or 'R', C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = DN, Dtype = _Z, Mtype = GE. * The right hand side matrix B. * if equed = 'R' or 'B', B is premultiplied by diag(R). * * X (input/output) SuperMatrix* * X has types: Stype = DN, Dtype = _Z, Mtype = GE. * On entry, the solution matrix X, as computed by zgstrs(). * On exit, the improved solution matrix X. * if *equed = 'C' or 'B', X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * */ #define ITMAX 5 /* Table of constant values */ int ione = 1; doublecomplex ndone = {-1., 0.}; doublecomplex done = {1., 0.}; /* Local variables */ NCformat *Astore; doublecomplex *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; doublecomplex *Bmat, *Xmat, *Bptr, *Xptr; int kase; double safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; double s, xk, lstres, eps, safmin; char transt[1]; doublecomplex *work; double *rwork; int *iwork; extern double dlamch_(char *); extern int zlacon_(int *, doublecomplex *, doublecomplex *, double *, int *); #ifdef _CRAY extern int CCOPY(int *, doublecomplex *, int *, doublecomplex *, int *); extern int CSAXPY(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); #else extern int zcopy_(int *, doublecomplex *, int *, doublecomplex *, int *); extern int zaxpy_(int *, doublecomplex *, doublecomplex *, int *, doublecomplex *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = lsame_(trans, "N"); if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != NC || A->Dtype != _Z || A->Mtype != GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SC || L->Dtype != _Z || L->Mtype != TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != NC || U->Dtype != _Z || U->Mtype != TRU ) *info = -4; else if ( ldb < MAX(0, A->nrow) || B->Stype != DN || B->Dtype != _Z || B->Mtype != GE ) *info = -10; else if ( ldx < MAX(0, A->nrow) || X->Stype != DN || X->Dtype != _Z || X->Mtype != GE ) *info = -11; if (*info != 0) { i = -(*info); xerbla_("zgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); /* Allocate working space */ work = doublecomplexMalloc(2*A->nrow); rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); iwork = intMalloc(A->nrow); if ( !work || !rwork || !iwork ) ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dlamch_("Epsilon"); safmin = dlamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ #ifdef _CRAY CCOPY(&A->nrow, Bptr, &ione, work, &ione); #else zcopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_zgemv(trans, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator and denominator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if (notran) { for (k = 0; k < A->ncol; ++k) { xk = z_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += z_abs1(&Aval[i]) * z_abs1(&Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) s = MAX( s, z_abs1(&work[i]) / rwork[i] ); else s = MAX( s, (z_abs1(&work[i]) + safe1) / (rwork[i] + safe1) ); } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ zgstrs (trans, L, U, perm_r, perm_c, &Bjcol, info); #ifdef _CRAY CAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else zaxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use ZLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = z_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = z_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += z_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = z_abs1( &Xptr[irow] ); s += z_abs1(&Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = z_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = z_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { zlacon_(&A->nrow, &work[A->nrow], work, &ferr[j], &kase); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { zd_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) { zd_mult(&work[i], &work[i], R[i]); } zgstrs (transt, L, U, perm_r, perm_c, &Bjcol, info); for (i = 0; i < A->nrow; ++i) { zd_mult(&work[i], &work[i], rwork[i]); } } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) { zd_mult(&work[i], &work[i], rwork[i]); } zgstrs (trans, L, U, perm_r, perm_c, &Bjcol, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { zd_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) { zd_mult(&work[i], &work[i], R[i]); } } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = MAX( lstres, C[i] * z_abs1( &Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = MAX( lstres, R[i] * z_abs1( &Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = MAX( lstres, z_abs1( &Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* zgsrfs */
void zgssvx(char *fact, char *trans, char *refact, SuperMatrix *A, factor_param_t *factor_params, int *perm_c, int *perm_r, int *etree, char *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, double *ferr, double *berr, mem_usage_t *mem_usage, int *info ) { /* * Purpose * ======= * * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using * the LU factorization from zgstrf(). Error bounds on the solution and * a condition estimate are also provided. It performs the following steps: * * 1. If A is stored column-wise (A->Stype = SLU_NC): * * 1.1. If fact = 'E', scaling factors are computed to equilibrate the * system: * trans = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B * trans = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if trans='N') * or diag(C)*B (if trans = 'T' or 'C'). * * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation * matrix that usually preserves sparsity. * For more details of this step, see sp_preorder.c. * * 1.3. If fact = 'N' or 'E', the LU decomposition is used to factor the * matrix A (after equilibration if fact = 'E') as Pr*A*Pc = L*U, * with Pr determined by partial pivoting. * * 1.4. Compute the reciprocal pivot growth factor. * * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the * routine returns with info = i. Otherwise, the factored form of * A is used to estimate the condition number of the matrix A. If * the reciprocal of the condition number is less than machine * precision, info = A->ncol+1 is returned as a warning, but the * routine still goes on to solve for X and computes error bounds * as described below. * * 1.6. The system of equations is solved for X using the factored form * of A. * * 1.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 1.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so * that it solves the original system before equilibration. * * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm * to the transpose of A: * * 2.1. If fact = 'E', scaling factors are computed to equilibrate the * system: * trans = 'N': diag(R)*A'*diag(C) *inv(diag(C))*X = diag(R)*B * trans = 'T': (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = 'C': (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A' is * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). * * 2.2. Permute columns of transpose(A) (rows of A), * forming transpose(A)*Pc, where Pc is a permutation matrix that * usually preserves sparsity. * For more details of this step, see sp_preorder.c. * * 2.3. If fact = 'N' or 'E', the LU decomposition is used to factor the * transpose(A) (after equilibration if fact = 'E') as * Pr*transpose(A)*Pc = L*U with the permutation Pr determined by * partial pivoting. * * 2.4. Compute the reciprocal pivot growth factor. * * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the * routine returns with info = i. Otherwise, the factored form * of transpose(A) is used to estimate the condition number of the * matrix A. If the reciprocal of the condition number * is less than machine precision, info = A->nrow+1 is returned as * a warning, but the routine still goes on to solve for X and * computes error bounds as described below. * * 2.6. The system of equations is solved for X using the factored form * of transpose(A). * * 2.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 2.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so * that it solves the original system before equilibration. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * fact (input) char* * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, whether the matrix A should * be equilibrated before it is factored. * = 'F': On entry, L, U, perm_r and perm_c contain the factored * form of A. If equed is not 'N', the matrix A has been * equilibrated with scaling factors R and C. * A, L, U, perm_r are not modified. * = 'N': The matrix A will be factored, and the factors will be * stored in L and U. * = 'E': The matrix A will be equilibrated if necessary, then * factored into L and U. * * trans (input) char* * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Transpose) * * refact (input) char* * Specifies whether we want to re-factor the matrix. * = 'N': Factor the matrix A. * = 'Y': Matrix A was factored before, now we want to re-factor * matrix A with perm_r and etree as inputs. Use * the same storage for the L\U factors previously allocated, * expand it if necessary. User should insure to use the same * memory model. In this case, perm_r may be modified due to * different pivoting determined by diagonal threshold. * If fact = 'F', then refact is not accessed. * * A (input/output) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number * of the linear equations is A->nrow. Currently, the type of A can be: * Stype = SLU_NC or SLU_NR, Dtype = SLU_Z, Mtype = SLU_GE. * In the future, more general A may be handled. * * On entry, If fact = 'F' and equed is not 'N', then A must have * been equilibrated by the scaling factors in R and/or C. * A is not modified if fact = 'F' or 'N', or if fact = 'E' and * equed = 'N' on exit. * * On exit, if fact = 'E' and equed is not 'N', A is scaled as follows: * If A->Stype = SLU_NC: * equed = 'R': A := diag(R) * A * equed = 'C': A := A * diag(C) * equed = 'B': A := diag(R) * A * diag(C). * If A->Stype = SLU_NR: * equed = 'R': transpose(A) := diag(R) * transpose(A) * equed = 'C': transpose(A) := transpose(A) * diag(C) * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). * * factor_params (input) factor_param_t* * The structure defines the input scalar parameters, consisting of * the following fields. If factor_params = NULL, the default * values are used for all the fields; otherwise, the values * are given by the user. * - panel_size (int): Panel size. A panel consists of at most * panel_size consecutive columns. If panel_size = -1, use * default value 8. * - relax (int): To control degree of relaxing supernodes. If the * number of nodes (columns) in a subtree of the elimination * tree is less than relax, this subtree is considered as one * supernode, regardless of the row structures of those columns. * If relax = -1, use default value 8. * - diag_pivot_thresh (double): Diagonal pivoting threshold. * At step j of the Gaussian elimination, if * abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)), * then use A_jj as pivot. 0 <= diag_pivot_thresh <= 1. * If diag_pivot_thresh = -1, use default value 1.0, * which corresponds to standard partial pivoting. * - drop_tol (double): Drop tolerance threshold. (NOT IMPLEMENTED) * At step j of the Gaussian elimination, if * abs(A_ij)/(max_i abs(A_ij)) < drop_tol, * then drop entry A_ij. 0 <= drop_tol <= 1. * If drop_tol = -1, use default value 0.0, which corresponds to * standard Gaussian elimination. * * perm_c (input/output) int* * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, * which defines the permutation matrix Pc; perm_c[i] = j means * column i of A is in position j in A*Pc. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree * is already in postorder. * * If A->Stype = SLU_NR, column permutation vector of size A->nrow, * which describes permutation of columns of transpose(A) * (rows of A) as described above. * * perm_r (input/output) int* * If A->Stype = SLU_NC, row permutation vector of size A->nrow, * which defines the permutation matrix Pr, and is determined * by partial pivoting. perm_r[i] = j means row i of A is in * position j in Pr*A. * * If A->Stype = SLU_NR, permutation vector of size A->ncol, which * determines permutation of rows of transpose(A) * (columns of A) as described above. * * If refact is not 'Y', perm_r is output argument; * If refact = 'Y', the pivoting routine will try to use the input * perm_r, unless a certain threshold criterion is violated. * In that case, perm_r is overwritten by a new permutation * determined by partial pivoting or diagonal threshold pivoting. * * etree (input/output) int*, dimension (A->ncol) * Elimination tree of Pc'*A'*A*Pc. * If fact is not 'F' and refact = 'Y', etree is an input argument, * otherwise it is an output argument. * Note: etree is a vector of parent pointers for a forest whose * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * equed (input/output) char* * Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * If fact = 'F', equed is an input argument, otherwise it is * an output argument. * * R (input/output) double*, dimension (A->nrow) * The row scale factors for A or transpose(A). * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). * If equed = 'N' or 'C', R is not accessed. * If fact = 'F', R is an input argument; otherwise, R is output. * If fact = 'F' and equed = 'R' or 'B', each element of R must * be positive. * * C (input/output) double*, dimension (A->ncol) * The column scale factors for A or transpose(A). * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). * If equed = 'N' or 'R', C is not accessed. * If fact = 'F', C is an input argument; otherwise, C is output. * If fact = 'F' and equed = 'C' or 'B', each element of C must * be positive. * * L (output) SuperMatrix* * The factor L from the factorization * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). * Uses compressed row subscripts storage for supernodes, i.e., * L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). * Uses column-wise storage scheme, i.e., U has types: * Stype = SLU_NC, Dtype = SLU_Z, Mtype = TRU. * * work (workspace/output) void*, size (lwork) (in bytes) * User supplied workspace, should be large enough * to hold data structures for factors L and U. * On exit, if fact is not 'F', L and U point to this array. * * lwork (input) int * Specifies the size of work array in bytes. * = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. * = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * mem_usage->total_needed; no other side effects. * * See argument 'mem_usage' for memory usage statistics. * * B (input/output) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. * On entry, the right hand side matrix. * On exit, * if equed = 'N', B is not modified; otherwise * if A->Stype = SLU_NC: * if trans = 'N' and equed = 'R' or 'B', B is overwritten by * diag(R)*B; * if trans = 'T' or 'C' and equed = 'C' of 'B', B is * overwritten by diag(C)*B; * if A->Stype = SLU_NR: * if trans = 'N' and equed = 'C' or 'B', B is overwritten by * diag(C)*B; * if trans = 'T' or 'C' and equed = 'R' of 'B', B is * overwritten by diag(R)*B. * * X (output) SuperMatrix* * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. * If info = 0 or info = A->ncol+1, X contains the solution matrix * to the original system of equations. Note that A and B are modified * on exit if equed is not 'N', and the solution to the equilibrated * system is inv(diag(C))*X if trans = 'N' and equed = 'C' or 'B', * or inv(diag(R))*X if trans = 'T' or 'C' and equed = 'R' or 'B'. * * recip_pivot_growth (output) double* * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). * The infinity norm is used. If recip_pivot_growth is much less * than 1, the stability of the LU factorization could be poor. * * rcond (output) double* * The estimate of the reciprocal condition number of the matrix A * after equilibration (if done). If rcond is less than the machine * precision (in particular, if rcond = 0), the matrix is singular * to working precision. This condition is indicated by a return * code of info > 0. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * mem_usage (output) mem_usage_t* * Record the memory usage statistics, consisting of following fields: * - for_lu (float) * The amount of space used in bytes for L\U data structures. * - total_needed (float) * The amount of space needed in bytes to perform factorization. * - expansions (int) * The number of memory expansions during the LU factorization. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly * singular, so the solution and error bounds * could not be computed. * = A->ncol+1: U is nonsingular, but RCOND is less than machine * precision, meaning that the matrix is singular to * working precision. Nevertheless, the solution and * error bounds are computed because there are a number * of situations where the computed solution can be more * accurate than the value of RCOND would suggest. * > A->ncol+1: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * */ DNformat *Bstore, *Xstore; doublecomplex *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ; char trant[1], norm[1]; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; double diag_pivot_thresh, drop_tol; double t0; /* temporary time */ double *utime; extern SuperLUStat_t SuperLUStat; /* External functions */ extern double zlangs(char *, SuperMatrix *); extern double dlamch_(char *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; #if 0 printf("zgssvx: fact=%c, trans=%c, refact=%c, equed=%c\n", *fact, *trans, *refact, *equed); #endif *info = 0; nofact = lsame_(fact, "N"); equil = lsame_(fact, "E"); notran = lsame_(trans, "N"); if (nofact || equil) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } /* Test the input parameters */ if (!nofact && !equil && !lsame_(fact, "F")) *info = -1; else if (!notran && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -2; else if ( !(lsame_(refact,"Y") || lsame_(refact, "N")) ) *info = -3; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -4; else if (lsame_(fact, "F") && !(rowequ || colequ || lsame_(equed, "N"))) *info = -9; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -10; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -11; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -15; else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -16; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || B->ncol != X->ncol || X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) *info = -17; } } if (*info != 0) { i = -(*info); xerbla_("zgssvx", &i); return; } /* Default values for factor_params */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = 1.0; drop_tol = 0.0; if ( factor_params != NULL ) { if ( factor_params->panel_size != -1 ) panel_size = factor_params->panel_size; if ( factor_params->relax != -1 ) relax = factor_params->relax; if ( factor_params->diag_pivot_thresh != -1 ) diag_pivot_thresh = factor_params->diag_pivot_thresh; if ( factor_params->drop_tol != -1 ) drop_tol = factor_params->drop_tol; } StatInit(panel_size, relax); utime = SuperLUStat.utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ *trant = 'T'; notran = 0; } else { *trant = 'N'; notran = 1; } } else { /* A->Stype == SLU_NC */ *trant = *trans; AA = A; } if ( equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); } utime[EQUIL] = SuperLU_timer_() - t0; } /* Scale the right hand side if equilibration was performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i + j*ldb], &Bmat[i + j*ldb], R[i]); } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i + j*ldb], &Bmat[i + j*ldb], C[i]); } } if ( nofact || equil ) { t0 = SuperLU_timer_(); sp_preorder(refact, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", relax, panel_size, sp_ienv(3), sp_ienv(4)); fflush(stdout); */ /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); zgstrf(refact, &AC, diag_pivot_thresh, drop_tol, relax, panel_size, etree, work, lwork, perm_r, perm_c, L, U, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } } if ( *info > 0 ) { if ( *info <= A->ncol ) { /* Compute the reciprocal pivot growth factor of the leading rank-deficient *info columns of A. */ *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); } return; } /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); /* Estimate the reciprocal of the condition number of A. */ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = zlangs(norm, AA); zgscon(norm, L, U, anorm, rcond, info); utime[RCOND] = SuperLU_timer_() - t0; /* Compute the solution matrix X. */ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); zgstrs (trant, L, U, perm_r, perm_c, X, info); utime[SOLVE] = SuperLU_timer_() - t0; /* Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. */ t0 = SuperLU_timer_(); zgsrfs(trant, AA, L, U, perm_r, perm_c, equed, R, C, B, X, ferr, berr, info); utime[REFINE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i + j*ldx], &Xmat[i + j*ldx], C[i]); } } } else if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+ j*ldx], &Xmat[i+ j*ldx], R[i]); } } /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; zQuerySpace(L, U, panel_size, mem_usage); if ( nofact || equil ) Destroy_CompCol_Permuted(&AC); if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } PrintStat( &SuperLUStat ); StatFree(); }
void zlaqgs(SuperMatrix *A, double *r, double *c, double rowcnd, double colcnd, double amax, equed_t *equed) { /* Purpose ======= zlaqgs() equilibrates a general sparse M by N matrix A using the row and scaling factors in the vectors R and C. See supermatrix.h for the definition of 'SuperMatrix' structure. Arguments ========= A (input/output) SuperMatrix* On exit, the equilibrated matrix. See EQUED for the form of the equilibrated matrix. The type of A can be: Stype = SLU_NC; Dtype = SLU_Z; Mtype = SLU_GE. R (input) double*, dimension (A->nrow) The row scale factors for A. C (input) double*, dimension (A->ncol) The column scale factors for A. ROWCND (input) double Ratio of the smallest R(i) to the largest R(i). COLCND (input) double Ratio of the smallest C(i) to the largest C(i). AMAX (input) double Absolute value of largest matrix entry. EQUED (output) equed_t* Specifies the form of equilibration that was done. = NOEQUIL: No equilibration = ROW: Row equilibration, i.e., A has been premultiplied by diag(R). = COL: Column equilibration, i.e., A has been postmultiplied by diag(C). = BOTH: Both row and column equilibration, i.e., A has been replaced by diag(R) * A * diag(C). Internal Parameters =================== THRESH is a threshold value used to decide if row or column scaling should be done based on the ratio of the row or column scaling factors. If ROWCND < THRESH, row scaling is done, and if COLCND < THRESH, column scaling is done. LARGE and SMALL are threshold values used to decide if row scaling should be done based on the absolute size of the largest matrix element. If AMAX > LARGE or AMAX < SMALL, row scaling is done. ===================================================================== */ #define THRESH (0.1) /* Local variables */ NCformat *Astore; doublecomplex *Aval; int i, j, irow; double large, small, cj; extern double dlamch_(char *); double temp; /* Quick return if possible */ if (A->nrow <= 0 || A->ncol <= 0) { *equed = NOEQUIL; return; } Astore = A->Store; Aval = Astore->nzval; /* Initialize LARGE and SMALL. */ small = dlamch_("Safe minimum") / dlamch_("Precision"); large = 1. / small; if (rowcnd >= THRESH && amax >= small && amax <= large) { if (colcnd >= THRESH) *equed = NOEQUIL; else { /* Column scaling */ for (j = 0; j < A->ncol; ++j) { cj = c[j]; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { zd_mult(&Aval[i], &Aval[i], cj); } } *equed = COL; } } else if (colcnd >= THRESH) { /* Row scaling, no column scaling */ for (j = 0; j < A->ncol; ++j) for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; zd_mult(&Aval[i], &Aval[i], r[irow]); } *equed = ROW; } else { /* Row and column scaling */ for (j = 0; j < A->ncol; ++j) { cj = c[j]; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; temp = cj * r[irow]; zd_mult(&Aval[i], &Aval[i], temp); } } *equed = BOTH; } return; } /* zlaqgs */
/*! \brief * <pre> * Purpose * ======= * ilu_zdrop_row() - Drop some small rows from the previous * supernode (L-part only). * </pre> */ int ilu_zdrop_row( superlu_options_t *options, /* options */ int first, /* index of the first column in the supernode */ int last, /* index of the last column in the supernode */ double drop_tol, /* dropping parameter */ int quota, /* maximum nonzero entries allowed */ int *nnzLj, /* in/out number of nonzeros in L(:, 1:last) */ double *fill_tol, /* in/out - on exit, fill_tol=-num_zero_pivots, * does not change if options->ILU_MILU != SMILU1 */ GlobalLU_t *Glu, /* modified */ double dwork[], /* working space * the length of dwork[] should be no less than * the number of rows in the supernode */ double dwork2[], /* working space with the same size as dwork[], * used only by the second dropping rule */ int lastc /* if lastc == 0, there is nothing after the * working supernode [first:last]; * if lastc == 1, there is one more column after * the working supernode. */ ) { register int i, j, k, m1; register int nzlc; /* number of nonzeros in column last+1 */ register int xlusup_first, xlsub_first; int m, n; /* m x n is the size of the supernode */ int r = 0; /* number of dropped rows */ register double *temp; register doublecomplex *lusup = Glu->lusup; register int *lsub = Glu->lsub; register int *xlsub = Glu->xlsub; register int *xlusup = Glu->xlusup; register double d_max = 0.0, d_min = 1.0; int drop_rule = options->ILU_DropRule; milu_t milu = options->ILU_MILU; norm_t nrm = options->ILU_Norm; doublecomplex zero = {0.0, 0.0}; doublecomplex one = {1.0, 0.0}; doublecomplex none = {-1.0, 0.0}; int i_1 = 1; int inc_diag; /* inc_diag = m + 1 */ int nzp = 0; /* number of zero pivots */ double alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim); xlusup_first = xlusup[first]; xlsub_first = xlsub[first]; m = xlusup[first + 1] - xlusup_first; n = last - first + 1; m1 = m - 1; inc_diag = m + 1; nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0; temp = dwork - n; /* Quick return if nothing to do. */ if (m == 0 || m == n || drop_rule == NODROP) { *nnzLj += m * n; return 0; } /* basic dropping: ILU(tau) */ for (i = n; i <= m1; ) { /* the average abs value of ith row */ switch (nrm) { case ONE_NORM: temp[i] = dzasum_(&n, &lusup[xlusup_first + i], &m) / (double)n; break; case TWO_NORM: temp[i] = dznrm2_(&n, &lusup[xlusup_first + i], &m) / sqrt((double)n); break; case INF_NORM: default: k = izamax_(&n, &lusup[xlusup_first + i], &m) - 1; temp[i] = z_abs1(&lusup[xlusup_first + i + m * k]); break; } /* drop small entries due to drop_tol */ if (drop_rule & DROP_BASIC && temp[i] < drop_tol) { r++; /* drop the current row and move the last undropped row here */ if (r > 1) /* add to last row */ { /* accumulate the sum (for MILU) */ switch (milu) { case SMILU_1: case SMILU_2: zaxpy_(&n, &one, &lusup[xlusup_first + i], &m, &lusup[xlusup_first + m - 1], &m); break; case SMILU_3: for (j = 0; j < n; j++) lusup[xlusup_first + (m - 1) + j * m].r += z_abs1(&lusup[xlusup_first + i + j * m]); break; case SILU: default: break; } zcopy_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); } /* if (r > 1) */ else /* move to last row */ { zswap_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); if (milu == SMILU_3) for (j = 0; j < n; j++) { lusup[xlusup_first + m1 + j * m].r = z_abs1(&lusup[xlusup_first + m1 + j * m]); lusup[xlusup_first + m1 + j * m].i = 0.0; } } lsub[xlsub_first + i] = lsub[xlsub_first + m1]; m1--; continue; } /* if dropping */ else { if (temp[i] > d_max) d_max = temp[i]; if (temp[i] < d_min) d_min = temp[i]; } i++; } /* for */ /* Secondary dropping: drop more rows according to the quota. */ quota = ceil((double)quota / (double)n); if (drop_rule & DROP_SECONDARY && m - r > quota) { register double tol = d_max; /* Calculate the second dropping tolerance */ if (quota > n) { if (drop_rule & DROP_INTERP) /* by interpolation */ { d_max = 1.0 / d_max; d_min = 1.0 / d_min; tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r)); } else /* by quick select */ { int len = m1 - n + 1; dcopy_(&len, dwork, &i_1, dwork2, &i_1); tol = dqselect(len, dwork2, quota - n); #if 0 register int *itemp = iwork - n; A = temp; for (i = n; i <= m1; i++) itemp[i] = i; qsort(iwork, m1 - n + 1, sizeof(int), _compare_); tol = temp[itemp[quota]]; #endif } } for (i = n; i <= m1; ) { if (temp[i] <= tol) { register int j; r++; /* drop the current row and move the last undropped row here */ if (r > 1) /* add to last row */ { /* accumulate the sum (for MILU) */ switch (milu) { case SMILU_1: case SMILU_2: zaxpy_(&n, &one, &lusup[xlusup_first + i], &m, &lusup[xlusup_first + m - 1], &m); break; case SMILU_3: for (j = 0; j < n; j++) lusup[xlusup_first + (m - 1) + j * m].r += z_abs1(&lusup[xlusup_first + i + j * m]); break; case SILU: default: break; } zcopy_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); } /* if (r > 1) */ else /* move to last row */ { zswap_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); if (milu == SMILU_3) for (j = 0; j < n; j++) { lusup[xlusup_first + m1 + j * m].r = z_abs1(&lusup[xlusup_first + m1 + j * m]); lusup[xlusup_first + m1 + j * m].i = 0.0; } } lsub[xlsub_first + i] = lsub[xlsub_first + m1]; m1--; temp[i] = temp[m1]; continue; } i++; } /* for */ } /* if secondary dropping */ for (i = n; i < m; i++) temp[i] = 0.0; if (r == 0) { *nnzLj += m * n; return 0; } /* add dropped entries to the diagnal */ if (milu != SILU) { register int j; doublecomplex t; double omega; for (j = 0; j < n; j++) { t = lusup[xlusup_first + (m - 1) + j * m]; if (t.r == 0.0 && t.i == 0.0) continue; omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / z_abs1(&t), 1.0); zd_mult(&t, &t, omega); switch (milu) { case SMILU_1: if ( !(z_eq(&t, &none)) ) { z_add(&t, &t, &one); zz_mult(&lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], &t); } else { zd_mult( &lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], *fill_tol); #ifdef DEBUG printf("[1] ZERO PIVOT: FILL col %d.\n", first + j); fflush(stdout); #endif nzp++; } break; case SMILU_2: zd_mult(&lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], 1.0 + z_abs1(&t)); break; case SMILU_3: z_add(&t, &t, &one); zz_mult(&lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], &t); break; case SILU: default: break; } } if (nzp > 0) *fill_tol = -nzp; } /* Remove dropped entries from the memory and fix the pointers. */ m1 = m - r; for (j = 1; j < n; j++) { register int tmp1, tmp2; tmp1 = xlusup_first + j * m1; tmp2 = xlusup_first + j * m; for (i = 0; i < m1; i++) lusup[i + tmp1] = lusup[i + tmp2]; } for (i = 0; i < nzlc; i++) lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m]; for (i = 0; i < nzlc; i++) lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i]; for (i = first + 1; i <= last + 1; i++) { xlusup[i] -= r * (i - first); xlsub[i] -= r; } if (lastc) { xlusup[last + 2] -= r * n; xlsub[last + 2] -= r; } *nnzLj += (m - r) * n; return r; }