/*! \brief Solves one of the systems of equations A*x = b, A'*x = b or A^H*x = b
 *
 * <pre>
 *   Purpose
 *   =======
 *
 *   sp_ztrsv() solves one of the systems of equations
 *       A*x = b,   or   A'*x = b,   or   A^H*x = b,
 *   where b and x are n element vectors and A is a sparse unit, or
 *   non-unit, upper or lower triangular matrix.
 *   No test for singularity or near-singularity is included in this
 *   routine. Such tests must be performed before calling this routine.
 *
 *   Parameters
 *   ==========
 *
 *   uplo   - (input) char*
 *            Selects the triangular factor. Only the first character is
 *            examined and the comparison is case-sensitive:
 *               uplo = 'U'   solve with the upper triangular factor.
 *               uplo = 'L'   solve with the lower triangular factor.
 *
 *   trans  - (input) char*
 *            Selects the system to be solved. Only the first character
 *            is examined and the comparison is case-sensitive:
 *               trans = 'N'   A*x = b.
 *               trans = 'T'   A'*x = b.
 *               trans = 'C'   A^H*x = b.
 *
 *   diag   - (input) char*
 *            Must start with 'U' or 'N' (validated only). The solves
 *            below always treat L as unit triangular and U as non-unit
 *            triangular, independently of this argument.
 *
 *   L      - (input) SuperMatrix*
 *            The factor L from the factorization Pr*A*Pc=L*U. Use
 *            compressed row subscripts storage for supernodes,
 *            i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
 *
 *   U      - (input) SuperMatrix*
 *            The factor U from the factorization Pr*A*Pc=L*U.
 *            U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU.
 *
 *   x      - (input/output) doublecomplex*
 *            Before entry, x must contain the n element right-hand side
 *            vector b. On exit, x is overwritten with the solution.
 *
 *   stat   - (input/output) SuperLUStat_t*
 *            stat->ops[SOLVE] is incremented by the flop count of the solve.
 *
 *   info   - (output) int*
 *            = 0: arguments accepted.
 *            If *info = -i, the i-th argument had an illegal value.
 *
 *   The function itself always returns 0.
 * </pre>
 */
int
sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
         SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info)
{
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    SCformat *Lstore;
    NCformat *Ustore;
    doublecomplex *Lval, *Uval;
    int incx = 1, incy = 1;
    doublecomplex temp;   /* holds a conjugated matrix entry */
    doublecomplex prod;   /* scratch for one product x[.] * a(.,.) */
    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
    doublecomplex comp_zero = {0.0, 0.0};   /* constant zero, never written */
    int nrow;
    int fsupc, nsupr, nsupc, luptr, istart, irow;
    int i, k, iptr, jcol;
    doublecomplex *work;
    flops_t solve_ops;

    /* Test the input parameters */
    *info = 0;
    if ( strncmp(uplo,"L", 1)!=0 && strncmp(uplo, "U", 1)!=0 ) *info = -1;
    else if ( strncmp(trans, "N", 1)!=0 && strncmp(trans, "T", 1)!=0 &&
              strncmp(trans, "C", 1)!=0 ) *info = -2;
    else if ( strncmp(diag, "U", 1)!=0 && strncmp(diag, "N", 1)!=0 )
        *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
        i = -(*info);
        input_error("sp_ztrsv", &i);
        return 0;
    }

    /* Quick return for an empty factor. This is done before the work
       array is allocated so that nothing is leaked on this path. */
    if ( strncmp(uplo, "L", 1)==0 ) {
        if ( L->nrow == 0 ) return 0;
    } else {
        if ( U->nrow == 0 ) return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( !(work = doublecomplexCalloc(L->nrow)) )
        ABORT("Malloc fails for work in sp_ztrsv().");

    if ( strncmp(trans, "N", 1)==0 ) {  /* Form x := inv(A)*x. */

        if ( strncmp(uplo, "L", 1)==0 ) {
            /* Form x := inv(L)*x */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);
                nrow = nsupr - nsupc;

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc;
                solve_ops += 8 * nrow * nsupc;

                if ( nsupc == 1 ) {
                    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
                        irow = L_SUB(iptr);
                        ++luptr;
                        zz_mult(&prod, &x[fsupc], &Lval[luptr]);
                        z_sub(&x[irow], &x[irow], &prod);
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);

                    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                          &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
                    ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);

                    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                           &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
                    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);

                    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
                              &x[fsupc], &work[0] );
#endif

                    iptr = istart + nsupc;
                    for (i = 0; i < nrow; ++i, ++iptr) {
                        irow = L_SUB(iptr);
                        z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
                        /* work[] is accumulated into (beta = 1), so it has
                           to be a true zero again for the next supernode. */
                        work[i] = comp_zero;
                    }
                }
            } /* for k ... */

        } else {
            /* Form x := inv(U)*x */

            for (k = Lstore->nsuper; k >= 0; k--) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;

                if ( nsupc == 1 ) {
                    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
                    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
                        irow = U_SUB(i);
                        zz_mult(&prod, &x[fsupc], &Uval[i]);
                        z_sub(&x[irow], &x[irow], &prod);
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);
#else
                    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
#else
                    zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif

                    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                        for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
                             i++) {
                            irow = U_SUB(i);
                            zz_mult(&prod, &x[jcol], &Uval[i]);
                            z_sub(&x[irow], &x[irow], &prod);
                        }
                    }
                }
            } /* for k ... */

        }
    } else if ( strncmp(trans, "T", 1)==0 ) { /* Form x := inv(A')*x */

        if ( strncmp(uplo, "L", 1)==0 ) {
            /* Form x := inv(L')*x */

            for (k = Lstore->nsuper; k >= 0; --k) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += 8 * (nsupr - nsupc) * nsupc;

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    iptr = istart + nsupc;
                    for (i = L_NZ_START(jcol) + nsupc;
                         i < L_NZ_START(jcol+1); i++) {
                        irow = L_SUB(iptr);
                        zz_mult(&prod, &x[irow], &Lval[i]);
                        z_sub(&x[jcol], &x[jcol], &prod);
                        iptr++;
                    }
                }

                if ( nsupc > 1 ) {
                    solve_ops += 4 * nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
                    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);
#else
                    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            }
        } else {
            /* Form x := inv(U')*x */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
                        irow = U_SUB(i);
                        zz_mult(&prod, &x[irow], &Uval[i]);
                        z_sub(&x[jcol], &x[jcol], &prod);
                    }
                }

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;

                if ( nsupc == 1 ) {
                    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
                } else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
                    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#else
                    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            } /* for k ... */
        }
    } else { /* Form x := conj(inv(A'))*x */

        if ( strncmp(uplo, "L", 1)==0 ) {
            /* Form x := conj(inv(L'))*x */

            for (k = Lstore->nsuper; k >= 0; --k) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += 8 * (nsupr - nsupc) * nsupc;

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    iptr = istart + nsupc;
                    for (i = L_NZ_START(jcol) + nsupc;
                         i < L_NZ_START(jcol+1); i++) {
                        irow = L_SUB(iptr);
                        zz_conj(&temp, &Lval[i]);
                        zz_mult(&prod, &x[irow], &temp);
                        z_sub(&x[jcol], &x[jcol], &prod);
                        iptr++;
                    }
                }

                if ( nsupc > 1 ) {
                    solve_ops += 4 * nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd(trans, strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
                    ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);
#else
                    ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            }
        } else {
            /* Form x := conj(inv(U'))*x */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
                        irow = U_SUB(i);
                        zz_conj(&temp, &Uval[i]);
                        zz_mult(&prod, &x[irow], &temp);
                        z_sub(&x[jcol], &x[jcol], &prod);
                    }
                }

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;

                if ( nsupc == 1 ) {
                    zz_conj(&temp, &Lval[luptr]);
                    z_div(&x[fsupc], &x[fsupc], &temp);
                } else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd(trans, strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
                    ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#else
                    ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            } /* for k ... */
        }
    }

    stat->ops[SOLVE] += solve_ops;
    SUPERLU_FREE(work);
    return 0;
}
float
sPivotGrowth(int ncols, SuperMatrix *A, int *perm_c,
             SuperMatrix *L, SuperMatrix *U)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * Compute the reciprocal pivot growth factor of the leading ncols columns
 * of the matrix, using the formula:
 *     min_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) )
 * A column whose largest U entry is exactly zero contributes the ratio 1.
 * The running minimum starts at 1 / slamch_("S").
 *
 * Arguments
 * =========
 *
 * ncols    (input) int
 *          The number of leading columns to examine.
 *
 * A        (input) SuperMatrix*
 *          Original matrix A, permuted by columns, of dimension
 *          (A->nrow, A->ncol). Accessed through NCformat
 *          (colptr / nzval); the values are read as float.
 *
 * perm_c   (input) int*, dimension (A->ncol)
 *          Column permutation; it is inverted locally to map a column of
 *          the factors back to the column of A that it came from.
 *
 * L        (input) SuperMatrix*
 *          The factor L, accessed through SCPformat (supernodal storage).
 *          Only read: the upper triangular part of each supernode's
 *          dense block holds entries of U.
 *
 * U        (input) SuperMatrix*
 *          The factor U, accessed through NCPformat (colbeg / colend).
 *          Only read.
 *
 * Returns the reciprocal pivot growth factor. Aborts through ABORT() if
 * the temporary inverse permutation cannot be allocated.
 */
    NCformat  *Astore;
    SCPformat *Lstore;
    NCPformat *Ustore;
    float     *Aval, *Lval, *Uval;
    int       fsupc, nsupr, luptr, nz_in_U;
    int       i, j, k, oldcol;
    int       *inv_perm_c;
    float     rpg, maxaj, maxuj;
    extern    double slamch_(char *);
    float     smlnum;
    float     *luval;

    /* Get machine constants. */
    smlnum = slamch_("S");
    rpg = 1. / smlnum;

    Astore = A->Store;
    Lstore = L->Store;
    Ustore = U->Store;
    Aval = Astore->nzval;
    Lval = Lstore->nzval;
    Uval = Ustore->nzval;

    inv_perm_c = (int *) SUPERLU_MALLOC( (size_t) A->ncol*sizeof(int) );
    if ( !inv_perm_c )
        ABORT("Malloc fails for inv_perm_c[] in sPivotGrowth().");
    for (j = 0; j < A->ncol; ++j) inv_perm_c[perm_c[j]] = j;

    for (k = 0; k <= Lstore->nsuper; ++k) {
        fsupc = L_FST_SUPC(k);
        nsupr = L_SUB_END(fsupc) - L_SUB_START(fsupc);
        luptr = L_NZ_START(fsupc);
        luval = &Lval[luptr];
        nz_in_U = 1;   /* column j of the supernode has j-fsupc+1 U entries */

        for (j = fsupc; j < L_LAST_SUPC(k) && j < ncols; ++j) {
            maxaj = 0.;
            oldcol = inv_perm_c[j];
            for (i = Astore->colptr[oldcol]; i < Astore->colptr[oldcol+1]; i++)
                maxaj = SUPERLU_MAX( maxaj, fabs(Aval[i]) );

            maxuj = 0.;
            for (i = Ustore->colbeg[j]; i < Ustore->colend[j]; i++)
                maxuj = SUPERLU_MAX( maxuj, fabs(Uval[i]) );

            /* Supernode */
            for (i = 0; i < nz_in_U; ++i)
                maxuj = SUPERLU_MAX( maxuj, fabs(luval[i]) );

            ++nz_in_U;
            luval += nsupr;   /* advance to the next column of the block */

            if ( maxuj == 0. )
                rpg = SUPERLU_MIN( rpg, 1.);
            else
                rpg = SUPERLU_MIN( rpg, maxaj / maxuj );
        }

        if ( j >= ncols ) break;
    }

    SUPERLU_FREE(inv_perm_c);
    return (rpg);
}
/*! \brief Build the local modified sparse row (MSR) arrays bindx/val.
 *
 * <pre>
 * For a submatrix with m = N_update rows, the MSR layout produced is:
 *    bindx[0]      = m + 1
 *    bindx[0..m]   = pointer to the start of each row
 *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
 *                    where ks = bindx[k], ke = bindx[k+1]-1
 *    val[k]        = A(k,k), k < m, diagonal elements (0 if not stored)
 *    val[m]        = not used
 *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
 * Both arrays are allocated here with length nnz + 1.
 *
 * A        (input)  matrix accessed through NCPformat
 *                   (colbeg / colend / rowind / nzval).
 * update   (input)  only update[0] and update[N_update-1] are read; rows
 *                   in that closed interval are treated as local.
 * N_update (input)  number of local rows; when 0 the routine returns
 *                   without touching *val or *bindx.
 * val      (output) newly allocated value array.
 * bindx    (output) newly allocated index array.
 * </pre>
 */
static void
dcreate_msr_matrix(SuperMatrix *A, int_t update[], int_t N_update,
                   double **val, int_t **bindx)
{
    int first_row, last_row, ncol;
    int col, p, row, slot;
    int total_nnz = 0, diag_seen = 0;
    NCPformat *store;
    double *a_val;
    int_t *cursor;

    if ( N_update == 0 ) return;

    ncol = A->ncol;
    store = A->Store;
    a_val = store->nzval;

    /* Pass 1 over A: count the off-diagonal nonzeros of each local row. */
    cursor = (int_t *) intCalloc_dist(N_update);
    if ( !cursor ) ABORT("Malloc fails for rowcnt[]");

    first_row = update[0];
    last_row = update[N_update-1];
    for (col = 0; col < ncol; ++col) {
        for (p = store->colbeg[col]; p < store->colend[col]; ++p) {
            row = store->rowind[p];
            if ( row < first_row || row > last_row ) continue;
            if ( row == col ) {
                ++diag_seen;              /* stored diagonal entry */
            } else {
                ++cursor[row - first_row];
            }
            ++total_nnz;
        }
    }

    /* Every local row owns a diagonal slot, stored or not. */
    total_nnz += (N_update - diag_seen);

    /* Allocate the two output arrays. */
    *val = (double *) doubleMalloc_dist(total_nnz+1);
    if ( !*val ) ABORT("Malloc fails for val[]");
    for (p = 0; p < N_update; ++p) {
        (*val)[p] = 0.0;                  /* diagonal defaults to zero */
    }
    *bindx = (int_t *) SUPERLU_MALLOC((total_nnz+1) * sizeof(int_t));
    if ( !*bindx ) ABORT("Malloc fails for bindx[]");

    /* Turn the counts into row pointers; cursor[] becomes the next free
       slot of each row. */
    (*bindx)[0] = N_update + 1;
    for (row = 0; row < N_update; ++row) {
        (*bindx)[row+1] = (*bindx)[row] + cursor[row];
        cursor[row] = (*bindx)[row];
    }

    /* Pass 2 over A: copy the entries into place. */
    for (col = 0; col < ncol; ++col) {
        for (p = store->colbeg[col]; p < store->colend[col]; ++p) {
            row = store->rowind[p];
            if ( row < first_row || row > last_row ) continue;
            if ( row == col ) {
                (*val)[row - first_row] = a_val[p];
                continue;
            }
            row -= first_row;
            slot = cursor[row];
            (*bindx)[slot] = col;
            (*val)[slot] = a_val[p];
            ++cursor[row];
        }
    }

    SUPERLU_FREE(cursor);
}
/* Free the memory whose address is stored in *handle.
 * The handle value itself is left unchanged.
 * NOTE(review): presumably the address of a SuperLUStat_t made by a matching
 * create routine that is not visible here - confirm. */
void
f_destroy_SuperLUStat_handle(fptr *handle)
{
    void *target = (void *) *handle;

    SUPERLU_FREE(target);
}
/* Free the memory whose address is stored in *handle.
 * The handle value itself is left unchanged.
 * NOTE(review): presumably the address of an options structure made by a
 * matching create routine that is not visible here - confirm. */
void
f_destroy_options_handle(fptr *handle)
{
    void *target = (void *) *handle;

    SUPERLU_FREE(target);
}
int main(int argc, char *argv[]) { superlu_options_t options; SuperLUStat_t stat; SuperMatrix A; ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; gridinfo_t grid; double *berr; doublecomplex *a, *b, *b1, *xtrue; int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol; int iam, info, ldb, ldx, nrhs; char trans[1]; char **cpp, c; FILE *fp, *fopen(); extern int cpp_defs(); nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ nrhs = 1; /* Number of right-hand side. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { c = *(*cpp+1); ++cpp; switch (c) { case 'h': printf("Options:\n"); printf("\t-r <int>: process rows (default %d)\n", nprow); printf("\t-c <int>: process columns (default %d)\n", npcol); exit(0); break; case 'r': nprow = atoi(*cpp); break; case 'c': npcol = atoi(*cpp); break; } } else { /* Last arg is considered a filename */ if ( !(fp = fopen(*cpp, "r")) ) { ABORT("File does not exist"); } break; } } /* ------------------------------------------------------------ INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); /* Bail out if I do not belong in the grid. */ iam = grid.iam; if ( iam >= nprow * npcol ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); #endif /* ------------------------------------------------------------ PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL THE OTHER PROCESSES. ------------------------------------------------------------*/ if ( !iam ) { /* Print the CPP definitions. */ cpp_defs(); /* Read the matrix stored on disk in Harwell-Boeing format. 
*/ zreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); printf("Input matrix file: %s\n", *cpp); printf("\tDimension\t%dx%d\t # nonzeros %d\n", m, n, nnz); printf("\tProcess grid\t%d X %d\n", grid.nprow, grid.npcol); /* Broadcast matrix A to the other PEs. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( a, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm ); MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); } else { /* Receive matrix A from PE 0. */ MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); /* Allocate storage for compressed column representation. */ zallocateA_dist(n, nnz, &a, &asub, &xa); MPI_Bcast( a, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid.comm ); MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); } /* Create compressed column matrix for A. */ zCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_Z, SLU_GE); /* Generate the exact solution and compute the right-hand side. */ if ( !(b = doublecomplexMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]"); if ( !(b1 = doublecomplexMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); if ( !(xtrue = doublecomplexMalloc_dist(n*nrhs)) ) ABORT("Malloc fails for xtrue[]"); *trans = 'N'; ldx = n; ldb = m; zGenXtrue_dist(n, nrhs, xtrue, ldx); zFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); for (j = 0; j < nrhs; ++j) for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); /* ------------------------------------------------------------ WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. 
------------------------------------------------------------*/ /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag; options.ReplaceTinyPivot = YES; options.Trans = NOTRANS; options.IterRefine = DOUBLE; options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; */ set_default_options_dist(&options); if (!iam) { print_sp_ienv_dist(&options); print_options_dist(&options); } /* Initialize ScalePermstruct and LUstruct. */ ScalePermstructInit(m, n, &ScalePermstruct); LUstructInit(n, &LUstruct); /* Initialize the statistics variables. */ PStatInit(&stat); /* Call the linear equation solver: factorize and solve. */ pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) { zinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ pzgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) { printf("Solve the system with a different B.\n"); zinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); } /* Print the statistics. */ PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ DEALLOCATE STORAGE. 
------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompCol_Matrix_dist(&A); Destroy_LU(n, &grid, &LUstruct); ScalePermstructFree(&ScalePermstruct); LUstructFree(&LUstruct); SUPERLU_FREE(b); SUPERLU_FREE(b1); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ out: superlu_gridexit(&grid); /* ------------------------------------------------------------ TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ MPI_Finalize(); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit main()"); #endif }
/* Free the memory whose address is stored in *handle.
 * The handle value itself is left unchanged.
 * NOTE(review): presumably the address of a SOLVEstruct object made by a
 * matching create routine that is not visible here - confirm. */
void
f_destroy_SOLVEstruct_handle(fptr *handle)
{
    void *target = (void *) *handle;

    SUPERLU_FREE(target);
}
void zgstrf (superlu_options_t *options, SuperMatrix *A, double drop_tol, int relax, int panel_size, int *etree, void *work, int lwork, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info) { /* Local working arrays */ NCPformat *Astore; int *iperm_r = NULL; /* inverse of perm_r; used when options->Fact == SamePattern_SameRowPerm */ int *iperm_c; /* inverse of perm_c */ int *iwork; doublecomplex *zwork; int *segrep, *repfnz, *parent, *xplore; int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ int *xprune; int *marker; doublecomplex *dense, *tempv; int *relax_end; doublecomplex *a; int *asub; int *xa_begin, *xa_end; int *xsup, *supno; int *xlsub, *xlusup, *xusub; int nzlumax; static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */ /* Local scalars */ fact_t fact = options->Fact; double diag_pivot_thresh = options->DiagPivotThresh; int pivrow; /* pivotal row number in the original matrix A */ int nseg1; /* no of segments in U-column above panel row jcol */ int nseg; /* no of segments in each U-column */ register int jcol; register int kcol; /* end column of a relaxed snode */ register int icol; register int i, k, jj, new_next, iinfo; int m, n, min_mn, jsupno, fsupc, nextlu, nextu; int w_def; /* upper bound on panel width */ int usepr, iperm_r_allocated = 0; int nnzL, nnzU; int *panel_histo = stat->panel_histo; flops_t *ops = stat->ops; iinfo = 0; m = A->nrow; n = A->ncol; min_mn = SUPERLU_MIN(m, n); Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; /* Allocate storage common to the factor routines */ *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, L, U, &Glu, &iwork, &zwork); if ( *info ) return; xsup = Glu.xsup; supno = Glu.supno; xlsub = Glu.xlsub; xlusup = Glu.xlusup; xusub = Glu.xusub; SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, &repfnz, &panel_lsub, &xprune, &marker); zSetRWork(m, panel_size, 
zwork, &dense, &tempv); usepr = (fact == SamePattern_SameRowPerm); if ( usepr ) { /* Compute the inverse of perm_r */ iperm_r = (int *) intMalloc(m); for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; iperm_r_allocated = 1; } iperm_c = (int *) intMalloc(n); for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; /* Identify relaxed snodes */ relax_end = (int *) intMalloc(n); if ( options->SymmetricMode == YES ) { heap_relax_snode(n, etree, relax, marker, relax_end); } else { relax_snode(n, etree, relax, marker, relax_end); } ifill (perm_r, m, EMPTY); ifill (marker, m * NO_MARKER, EMPTY); supno[0] = -1; xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; w_def = panel_size; /* * Work on one "panel" at a time. A panel is one of the following: * (a) a relaxed supernode at the bottom of the etree, or * (b) panel_size contiguous columns, defined by the user */ for (jcol = 0; jcol < min_mn; ) { if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ kcol = relax_end[jcol]; /* end of the relaxed snode */ panel_histo[kcol-jcol+1]++; /* -------------------------------------- * Factorize the relaxed supernode(jcol:kcol) * -------------------------------------- */ /* Determine the union of the row structure of the snode */ if ( (*info = zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, xprune, marker, &Glu)) != 0 ) return; nextu = xusub[jcol]; nextlu = xlusup[jcol]; jsupno = supno[jcol]; fsupc = xsup[jsupno]; new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); nzlumax = Glu.nzlumax; while ( new_next > nzlumax ) { *info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu); if ( (*info) ) return; } for (icol = jcol; icol<= kcol; icol++) { xusub[icol+1] = nextu; /* Scatter into SPA dense[*] */ for (k = xa_begin[icol]; k < xa_end[icol]; k++) dense[asub[k]] = a[k]; /* Numeric update within the snode */ zsnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat); *info = zpivotL(icol, diag_pivot_thresh, &usepr, perm_r,iperm_r, iperm_c, &pivrow, &Glu, stat); if ( (*info) ) if ( 
iinfo == 0 ) iinfo = *info; #ifdef DEBUG zprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu); #endif } jcol = icol; } else { /* Work on one panel of panel_size columns */ /* Adjust panel_size so that a panel won't overlap with the next * relaxed snode. */ panel_size = w_def; for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) if ( relax_end[k] != EMPTY ) { panel_size = k - jcol; break; } if ( k == min_mn ) panel_size = min_mn - jcol; panel_histo[panel_size]++; /* symbolic factor on a panel of columns */ zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, dense, panel_lsub, segrep, repfnz, xprune, marker, parent, xplore, &Glu); /* numeric sup-panel updates in topological order */ zpanel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, &Glu, stat); /* Sparse LU within the panel, and below panel diagonal */ for ( jj = jcol; jj < jcol + panel_size; jj++) { k = (jj - jcol) * m; /* column index for w-wide arrays */ nseg = nseg1; /* Begin after all the panel segments */ if ((*info = zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], segrep, &repfnz[k], xprune, marker, parent, xplore, &Glu)) != 0) return; /* Numeric updates */ if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], tempv, &segrep[nseg1], &repfnz[k], jcol, &Glu, stat)) != 0) return; /* Copy the U-segments to ucol[*] */ if ((*info = zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], perm_r, &dense[k], &Glu)) != 0) return; *info = zpivotL(jj, diag_pivot_thresh, &usepr, perm_r,iperm_r, iperm_c, &pivrow, &Glu, stat); if ( (*info) ) if ( iinfo == 0 ) iinfo = *info; /* Prune columns (0:jj-1) using column jj */ zpruneL(jj, perm_r, pivrow, nseg, segrep, &repfnz[k], xprune, &Glu); /* Reset repfnz[] for this column */ resetrep_col (nseg, segrep, &repfnz[k]); #ifdef DEBUG zprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu); #endif } jcol += panel_size; /* Move to the next panel */ } /* else */ } /* for */ *info = iinfo; if ( m > n ) { k = 0; for (i = 0; i < m; ++i) if ( perm_r[i] == EMPTY ) { perm_r[i] 
= n + k; ++k; } } countnz(min_mn, xprune, &nnzL, &nnzU, &Glu); fixupL(min_mn, perm_r, &Glu); zLUWorkFree(iwork, zwork, &Glu); /* Free work space and compress storage */ if ( fact == SamePattern_SameRowPerm ) { /* L and U structures may have changed due to possibly different pivoting, even though the storage is available. There could also be memory expansions, so the array locations may have changed, */ ((SCformat *)L->Store)->nnz = nnzL; ((SCformat *)L->Store)->nsuper = Glu.supno[n]; ((SCformat *)L->Store)->nzval = Glu.lusup; ((SCformat *)L->Store)->nzval_colptr = Glu.xlusup; ((SCformat *)L->Store)->rowind = Glu.lsub; ((SCformat *)L->Store)->rowind_colptr = Glu.xlsub; ((NCformat *)U->Store)->nnz = nnzU; ((NCformat *)U->Store)->nzval = Glu.ucol; ((NCformat *)U->Store)->rowind = Glu.usub; ((NCformat *)U->Store)->colptr = Glu.xusub; } else { zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, Glu.lusup, Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno, Glu.xsup, SLU_SC, SLU_Z, SLU_TRLU); zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, Glu.usub, Glu.xusub, SLU_NC, SLU_Z, SLU_TRU); } ops[FACT] += ops[TRSV] + ops[GEMV]; if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); SUPERLU_FREE (iperm_c); SUPERLU_FREE (relax_end); }
/*
 * cgscon() -- estimate the reciprocal condition number from the L and U
 * factors handed in by the caller.
 *
 *   norm   (input)  '1' or "O..." selects the one-norm, "I..." the
 *                   infinity-norm; any other value gives *info = -1.
 *   L      (input)  must be square with Stype SLU_SC, Dtype SLU_C and
 *                   Mtype SLU_TRLU, otherwise *info = -2.
 *   U      (input)  must be square with Stype SLU_NC, Dtype SLU_C and
 *                   Mtype SLU_TRU, otherwise *info = -3.
 *   anorm  (input)  norm of the original matrix; used as the final divisor.
 *   rcond  (output) 1 when L or U has zero rows; otherwise
 *                   (1 / ainvnm) / anorm, or 0 when the estimated norm of
 *                   inv(A) is exactly 0.  Not written when an argument is
 *                   rejected.
 *   stat   (in/out) forwarded unchanged to sp_ctrsv().
 *   info   (output) 0 after successful argument validation, -i when the
 *                   i-th argument is illegal; the same pointer is also
 *                   handed to every sp_ctrsv() call.
 */
void
cgscon(char *norm, SuperMatrix *L, SuperMatrix *U,
       float anorm, float *rcond, SuperLUStat_t *stat, int *info)
{
    int      step;          /* reverse-communication code from clacon2_() */
    int      direct_step;   /* value of step that asks for inv(L), inv(U) */
    int      use_one_norm;
    int      argno;
    float    inv_norm;      /* running estimate of norm(inv(A))           */
    complex *wrk;
    int      isave[3];      /* state preserved between clacon2_() calls   */
    extern int crscl_(int *, complex *, complex *, int *);
    extern int clacon2_(int *, complex *, complex *, float *, int *, int []);

    /* ---- Argument validation ---------------------------------------- */
    *info = 0;
    use_one_norm = (*(unsigned char *)norm == '1')
                   || (strncmp(norm, "O", 1) == 0);

    if ( !use_one_norm && strncmp(norm, "I", 1) != 0 ) {
        *info = -1;
    } else if ( L->nrow < 0 || L->nrow != L->ncol ||
                L->Stype != SLU_SC || L->Dtype != SLU_C ||
                L->Mtype != SLU_TRLU ) {
        *info = -2;
    } else if ( U->nrow < 0 || U->nrow != U->ncol ||
                U->Stype != SLU_NC || U->Dtype != SLU_C ||
                U->Mtype != SLU_TRU ) {
        *info = -3;
    }

    if ( *info != 0 ) {
        argno = -(*info);
        input_error("cgscon", &argno);
        return;
    }

    /* ---- Trivial case: empty system ---------------------------------- */
    *rcond = 0.;
    if ( L->nrow == 0 || U->nrow == 0 ) {
        *rcond = 1.;
        return;
    }

    wrk = complexCalloc( 3*L->nrow );
    if ( !wrk )
        ABORT("Malloc fails for work arrays in cgscon.");

    /* ---- Estimate the norm of inv(A) by reverse communication -------- */
    inv_norm    = 0.;
    direct_step = use_one_norm ? 1 : 2;
    step        = 0;

    for (;;) {
        clacon2_(&L->nrow, &wrk[L->nrow], &wrk[0], &inv_norm, &step, isave);
        if ( step == 0 )
            break;      /* the estimator has finished */

        if ( step == direct_step ) {
            /* Multiply by inv(L), then by inv(U). */
            sp_ctrsv("L", "No trans", "Unit", L, U, &wrk[0], stat, info);
            sp_ctrsv("U", "No trans", "Non-unit", L, U, &wrk[0], stat, info);
        } else {
            /* Multiply by inv(U'), then by inv(L'). */
            sp_ctrsv("U", "Transpose", "Non-unit", L, U, &wrk[0], stat, info);
            sp_ctrsv("L", "Transpose", "Unit", L, U, &wrk[0], stat, info);
        }
    }

    /* ---- Reciprocal condition number --------------------------------- */
    if ( inv_norm != 0. )
        *rcond = (1. / inv_norm) / anorm;

    SUPERLU_FREE (wrk);
    return;

} /* cgscon */
/*! \brief * * <pre> * Purpose * ======= * * SGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * If equilibration was performed, the system becomes: * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * sgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * equed (input) Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by * diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) float*, dimension (A->nrow) * The row scale factors for A. * If equed = 'R' or 'B', A is premultiplied by diag(R). 
* If equed = 'N' or 'C', R is not accessed. * * C (input) float*, dimension (A->ncol) * The column scale factors for A. * If equed = 'C' or 'B', A is postmultiplied by diag(C). * If equed = 'N' or 'R', C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. * The right hand side matrix B. * if equed = 'R' or 'B', B is premultiplied by diag(R). * * X (input/output) SuperMatrix* * X has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE. * On entry, the solution matrix X, as computed by sgstrs(). * On exit, the improved solution matrix X. * if *equed = 'C' or 'B', X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * FERR (output) float*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) float*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. 
* * </pre> */ void sgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, char *equed, float *R, float *C, SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr, SuperLUStat_t *stat, int *info) { #define ITMAX 5 /* Table of constant values */ int ione = 1; float ndone = -1.; float done = 1.; /* Local variables */ NCformat *Astore; float *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; float *Bmat, *Xmat, *Bptr, *Xptr; int kase; float safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; float s, xk, lstres, eps, safmin; char transc[1]; trans_t transt; float *work; float *rwork; int *iwork; extern int slacon_(int *, float *, float *, int *, float *, int *); #ifdef _CRAY extern int SCOPY(int *, float *, int *, float *, int *); extern int SSAXPY(int *, float *, float *, int *, float *, int *); #else extern int scopy_(int *, float *, int *, float *, int *); extern int saxpy_(int *, float *, float *, int *, float *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = (trans == NOTRANS); if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU ) *info = -4; else if ( ldb < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE ) *info = -10; else if ( ldx < SUPERLU_MAX(0, A->nrow) || X->Stype != SLU_DN || X->Dtype != SLU_S || X->Mtype != SLU_GE ) *info = -11; if (*info != 0) { i = -(*info); 
xerbla_("sgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); /* Allocate working space */ work = floatMalloc(2*A->nrow); rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) ); iwork = intMalloc(2*A->nrow); if ( !work || !rwork || !iwork ) ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transc = 'N'; transt = TRANS; } else { *(unsigned char *)transc = 'T'; transt = NOTRANS; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = slamch_("Epsilon"); safmin = slamch_("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. 
*/ #ifdef _CRAY SCOPY(&A->nrow, Bptr, &ione, work, &ione); #else scopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_sgemv(transc, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if (notran) { for (k = 0; k < A->ncol; ++k) { xk = fabs( Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += fabs(Aval[i]) * fabs(Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) { s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] ); } else if ( rwork[i] != 0.0 ) { /* Adding SAFE1 to the numerator guards against spuriously zero residuals (underflow). */ s = SUPERLU_MAX( s, (safe1 + fabs(work[i])) / rwork[i] ); } /* If rwork[i] is exactly 0.0, then we know the true residual also must be exactly 0.0. */ } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. 
*/ sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); #ifdef _CRAY SAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else saxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ stat->RefineSteps = count; /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use SLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = fabs( Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = fabs( Xptr[irow] ); s += fabs(Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { slacon_(&A->nrow, &work[A->nrow], work, &iwork[A->nrow], &ferr[j], &kase); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). 
*/ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; sgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; sgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* sgsrfs */
/*
 * Fortran-callable driver for the double-complex LU factorization and solve.
 *
 * iopt      (input) int*
 *           Specifies the operation:
 *             = 1, performs LU decomposition for the first time
 *             = 2, performs triangular solve
 *             = 3, free all the storage in the end
 *           Any other value prints a message to stderr and calls exit(-1).
 *
 * n, nnz    (input) int*
 *           Order of the matrix and number of stored entries (iopt == 1);
 *           n is also the row count of the right-hand side (iopt == 2).
 *
 * nrhs      (input) int*
 *           Number of right-hand side columns (iopt == 2).
 *
 * values, rowind, colptr  (input)
 *           Column-compressed matrix with 1-based indices.  For iopt == 1
 *           rowind/colptr are decremented in place to 0-based indexing for
 *           the factorization and incremented back before returning.
 *
 * b, ldb    (input/output)
 *           Dense right-hand side and its leading dimension; wrapped in a
 *           dense SuperMatrix and passed to zgstrs() when iopt == 2.
 *
 * f_factors (input/output) fptr*
 *           If iopt == 1, it is an output and contains the pointer pointing
 *           to the structure of the factored matrices (L, U, perm_c, perm_r).
 *           Otherwise, it is an input.
 *
 * info      (output) int*
 *           Receives the status written by zgstrf() (iopt == 1) or
 *           zgstrs() (iopt == 2).
 *
 * Allocation failures terminate through ABORT, matching the handling of the
 * permutation arrays.
 */
void
c_fortran_zgssv_(int *iopt, int *n, int *nnz, int *nrhs,
                 doublecomplex *values, int *rowind, int *colptr,
                 doublecomplex *b, int *ldb,
                 fptr *f_factors, /* a handle containing the address
                                     pointing to the factored matrices */
                 int *info)
{
    SuperMatrix A, AC, B;
    SuperMatrix *L, *U;
    int *perm_r; /* row permutations from partial pivoting */
    int *perm_c; /* column permutation vector */
    int *etree;  /* column elimination tree */
    SCformat *Lstore;
    NCformat *Ustore;
    int      i, panel_size, permc_spec, relax;
    trans_t  trans;
    mem_usage_t   mem_usage;
    superlu_options_t options;
    SuperLUStat_t stat;
    factors_t *LUfactors;

    trans = NOTRANS;

    if ( *iopt == 1 ) { /* LU decomposition */

        /* Set the default input options. */
        set_default_options(&options);

        /* Initialize the statistics variables. */
        StatInit(&stat);

        /* Adjust to 0-based indexing */
        for (i = 0; i < *nnz; ++i) --rowind[i];
        for (i = 0; i <= *n; ++i) --colptr[i];

        zCreate_CompCol_Matrix(&A, *n, *n, *nnz, values, rowind, colptr,
                               SLU_NC, SLU_Z, SLU_GE);

        /* The headers for L and U outlive this call (they are stored in
           the handle), so they live on the heap; stop on failure instead
           of handing NULL to the factorization. */
        L = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
        if ( !L ) ABORT("Malloc fails for L.");
        U = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
        if ( !U ) ABORT("Malloc fails for U.");
        if ( !(perm_r = intMalloc(*n)) ) ABORT("Malloc fails for perm_r[].");
        if ( !(perm_c = intMalloc(*n)) ) ABORT("Malloc fails for perm_c[].");
        if ( !(etree = intMalloc(*n)) ) ABORT("Malloc fails for etree[].");

        /*
         * Get column permutation vector perm_c[], according to permc_spec:
         *   permc_spec = 0: natural ordering
         *   permc_spec = 1: minimum degree on structure of A'*A
         *   permc_spec = 2: minimum degree on structure of A'+A
         *   permc_spec = 3: approximate minimum degree for unsymmetric matrices
         */
        permc_spec = options.ColPerm;
        get_perm_c(permc_spec, &A, perm_c);

        sp_preorder(&options, &A, perm_c, etree, &AC);

        panel_size = sp_ienv(1);
        relax = sp_ienv(2);

        zgstrf(&options, &AC, relax, panel_size,
               etree, NULL, 0, perm_c, perm_r, L, U, &stat, info);

        if ( *info == 0 ) {
            Lstore = (SCformat *) L->Store;
            Ustore = (NCformat *) U->Store;
            printf("No of nonzeros in factor L = %d\n", Lstore->nnz);
            printf("No of nonzeros in factor U = %d\n", Ustore->nnz);
            printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
            zQuerySpace(L, U, &mem_usage);
            printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
                   mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
        } else {
            printf("zgstrf() error returns INFO= %d\n", *info);
            if ( *info <= *n ) { /* factorization completes */
                zQuerySpace(L, U, &mem_usage);
                printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
                       mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
            }
        }

        /* Restore to 1-based indexing */
        for (i = 0; i < *nnz; ++i) ++rowind[i];
        for (i = 0; i <= *n; ++i) ++colptr[i];

        /* Save the LU factors in the factors handle */
        LUfactors = (factors_t*) SUPERLU_MALLOC(sizeof(factors_t));
        if ( !LUfactors ) ABORT("Malloc fails for LUfactors.");
        LUfactors->L = L;
        LUfactors->U = U;
        LUfactors->perm_c = perm_c;
        LUfactors->perm_r = perm_r;
        *f_factors = (fptr) LUfactors;

        /* Free un-wanted storage */
        SUPERLU_FREE(etree);
        Destroy_SuperMatrix_Store(&A);
        Destroy_CompCol_Permuted(&AC);
        StatFree(&stat);

    } else if ( *iopt == 2 ) { /* Triangular solve */
        /* Initialize the statistics variables. */
        StatInit(&stat);

        /* Extract the LU factors in the factors handle */
        LUfactors = (factors_t*) *f_factors;
        L = LUfactors->L;
        U = LUfactors->U;
        perm_c = LUfactors->perm_c;
        perm_r = LUfactors->perm_r;

        zCreate_Dense_Matrix(&B, *n, *nrhs, b, *ldb, SLU_DN, SLU_Z, SLU_GE);

        /* Solve the system A*X=B, overwriting B with X. */
        zgstrs (trans, L, U, perm_c, perm_r, &B, &stat, info);

        /* Only the wrapper store is released; b itself belongs to the caller. */
        Destroy_SuperMatrix_Store(&B);
        StatFree(&stat);

    } else if ( *iopt == 3 ) { /* Free storage */
        /* Free the LU factors in the factors handle */
        LUfactors = (factors_t*) *f_factors;
        SUPERLU_FREE (LUfactors->perm_r);
        SUPERLU_FREE (LUfactors->perm_c);
        Destroy_SuperNode_Matrix(LUfactors->L);
        Destroy_CompCol_Matrix(LUfactors->U);
        SUPERLU_FREE (LUfactors->L);
        SUPERLU_FREE (LUfactors->U);
        SUPERLU_FREE (LUfactors);
    } else {
        fprintf(stderr,"Invalid iopt=%d passed to c_fortran_zgssv()\n",*iopt);
        exit(-1);
    }
}
void cgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, char *equed, float *R, float *C, SuperMatrix *B, SuperMatrix *X, float *ferr, float *berr, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * CGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * If equilibration was performed, the system becomes: * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_C, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * cgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = SLU_NC, Dtype = SLU_C, Mtype = SLU_TRU. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * equed (input) Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by * diag(C). 
* = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) float*, dimension (A->nrow) * The row scale factors for A. * If equed = 'R' or 'B', A is premultiplied by diag(R). * If equed = 'N' or 'C', R is not accessed. * * C (input) float*, dimension (A->ncol) * The column scale factors for A. * If equed = 'C' or 'B', A is postmultiplied by diag(C). * If equed = 'N' or 'R', C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. * The right hand side matrix B. * if equed = 'R' or 'B', B is premultiplied by diag(R). * * X (input/output) SuperMatrix* * X has types: Stype = SLU_DN, Dtype = SLU_C, Mtype = SLU_GE. * On entry, the solution matrix X, as computed by cgstrs(). * On exit, the improved solution matrix X. * if *equed = 'C' or 'B', X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * FERR (output) float*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) float*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. 
* * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * */ #define ITMAX 5 /* Table of constant values */ int ione = 1; complex ndone = {-1., 0.}; complex done = {1., 0.}; /* Local variables */ NCformat *Astore; complex *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; complex *Bmat, *Xmat, *Bptr, *Xptr; int kase; float safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; float s, xk, lstres, eps, safmin; char transc[1]; trans_t transt; complex *work; float *rwork; int *iwork; extern double slamch_(char *); extern int clacon_(int *, complex *, complex *, float *, int *); #ifdef _CRAY extern int CCOPY(int *, complex *, int *, complex *, int *); extern int CSAXPY(int *, complex *, complex *, int *, complex *, int *); #else extern int ccopy_(int *, complex *, int *, complex *, int *); extern int caxpy_(int *, complex *, complex *, int *, complex *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = (trans == NOTRANS); if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_C || A->Mtype != SLU_GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) *info = -4; else if ( ldb < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) *info = -10; else if ( ldx < SUPERLU_MAX(0, A->nrow) || X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE ) 
*info = -11; if (*info != 0) { i = -(*info); xerbla_("cgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); /* Allocate working space */ work = complexMalloc(2*A->nrow); rwork = (float *) SUPERLU_MALLOC( A->nrow * sizeof(float) ); iwork = intMalloc(A->nrow); if ( !work || !rwork || !iwork ) ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transc = 'N'; transt = TRANS; } else { *(unsigned char *)transc = 'T'; transt = NOTRANS; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = slamch_("Epsilon"); safmin = slamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. 
*/ #ifdef _CRAY CCOPY(&A->nrow, Bptr, &ione, work, &ione); #else ccopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_cgemv(transc, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator and denominator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = slu_c_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if (notran) { for (k = 0; k < A->ncol; ++k) { xk = slu_c_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += slu_c_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += slu_c_abs1(&Aval[i]) * slu_c_abs1(&Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) s = SUPERLU_MAX( s, slu_c_abs1(&work[i]) / rwork[i] ); else s = SUPERLU_MAX( s, (slu_c_abs1(&work[i]) + safe1) / (rwork[i] + safe1) ); } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); #ifdef _CRAY CAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else caxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ stat->RefineSteps = count; /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. 
FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use CLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = slu_c_abs1( &Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = slu_c_abs1( &Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += slu_c_abs1(&Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = slu_c_abs1( &Xptr[irow] ); s += slu_c_abs1(&Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = slu_c_abs(&work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = slu_c_abs(&work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { clacon_(&A->nrow, &work[A->nrow], work, &ferr[j], &kase); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], R[i]); } cgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], rwork[i]); } } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). 
*/ for (i = 0; i < A->nrow; ++i) { cs_mult(&work[i], &work[i], rwork[i]); } cgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], C[i]); } else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) { cs_mult(&work[i], &work[i], R[i]); } } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, C[i] * slu_c_abs1( &Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, R[i] * slu_c_abs1( &Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, slu_c_abs1( &Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* cgsrfs */
/*! \brief Solves one of the systems of equations A*x = b, or A'*x = b
 *
 * <pre>
 * Purpose
 * =======
 *
 *   sp_strsv() solves one of the systems of equations
 *       A*x = b,   or   A'*x = b,
 *   where b and x are n element vectors and A is a sparse unit , or
 *   non-unit, upper or lower triangular matrix.
 *   No test for singularity or near-singularity is included in this
 *   routine. Such tests must be performed before calling this routine.
 *
 * Parameters
 * ==========
 *
 *   uplo   - (input) char*
 *            On entry, uplo specifies whether the matrix is an upper or
 *            lower triangular matrix as follows:
 *                uplo = 'U' or 'u'   A is an upper triangular matrix.
 *                uplo = 'L' or 'l'   A is a lower triangular matrix.
 *
 *   trans  - (input) char*
 *            On entry, trans specifies the equations to be solved as
 *            follows:
 *                trans = 'N' or 'n'   A*x = b.
 *                trans = 'T' or 't'   A'*x = b.
 *                trans = 'C' or 'c'   A'*x = b.
 *
 *   diag   - (input) char*
 *            On entry, diag specifies whether or not A is unit
 *            triangular as follows:
 *                diag = 'U' or 'u'   A is assumed to be unit triangular.
 *                diag = 'N' or 'n'   A is not assumed to be unit
 *                                    triangular.
 *            NOTE: in this routine diag is only validated.  The L solves
 *            always use a unit diagonal and the U solves always use the
 *            stored (non-unit) diagonal, whatever value is passed.
 *
 *   L       - (input) SuperMatrix*
 *             The factor L from the factorization Pr*A*Pc=L*U. Use
 *             compressed row subscripts storage for supernodes,
 *             i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
 *             The diagonal blocks used by the U solves are also read from
 *             the supernodal values of L.
 *
 *   U       - (input) SuperMatrix*
 *             The factor U from the factorization Pr*A*Pc=L*U.
 *             U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
 *
 *   x       - (input/output) float*
 *             Before entry, the incremented array X must contain the n
 *             element right-hand side vector b. On exit, X is overwritten
 *             with the solution vector x.
 *
 *   stat    - (input/output) SuperLUStat_t*
 *             The operation count of the solve is added to
 *             stat->ops[SOLVE].  Not touched on an argument error or on
 *             the quick return for an empty matrix.
 *
 *   info    - (output) int*
 *             If *info = -i, the i-th argument had an illegal value.
 *             In that case xerbla_() is called and x is left untouched.
 *
 *   Return value: always 0; errors are reported through *info only.
 * </pre>
 */
int
sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
         SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
{
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    SCformat *Lstore;
    NCformat *Ustore;
    float    *Lval, *Uval;
    int incx = 1, incy = 1;
    float alpha = 1.0, beta = 1.0;
    int nrow;
    int fsupc, nsupr, nsupc, luptr, istart, irow;
    int i, k, iptr, jcol;
    float *work;
    flops_t solve_ops;

    /* Test the input parameters */
    *info = 0;
    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") &&
              !lsame_(trans, "C")) *info = -2;
    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
        i = -(*info);
        xerbla_("sp_strsv", &i);
        return 0;
    }

    /* Quick return for an empty triangular factor.  This check is done
       BEFORE the work array is allocated: previously the quick returns
       sat after the allocation and leaked it.  The matrix tested is the
       same one the original tested for each value of uplo. */
    if ( lsame_(uplo, "L") ) {
        if ( L->nrow == 0 ) return 0;
    } else {
        if ( U->nrow == 0 ) return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( !(work = floatCalloc(L->nrow)) )
        ABORT("Malloc fails for work in sp_strsv().");

    if ( lsame_(trans, "N") ) {    /* Form x := inv(A)*x. */

        if ( lsame_(uplo, "L") ) {
            /* Form x := inv(L)*x */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);
                nrow = nsupr - nsupc;   /* rows below the diagonal block */

                solve_ops += nsupc * (nsupc - 1);
                solve_ops += 2 * nrow * nsupc;

                if ( nsupc == 1 ) {
                    /* Single-column supernode: skip the (unit) diagonal
                       entry and update the rows below it directly. */
                    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
                        irow = L_SUB(iptr);
                        ++luptr;
                        x[irow] -= x[fsupc] * Lval[luptr];
                    }
                } else {
                    /* Dense triangular solve on the diagonal block, then
                       a dense mat-vec of the off-diagonal block into work. */
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);

                    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                          &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
                    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);

                    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                           &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
                    slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);

                    smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
                              &x[fsupc], &work[0] );
#endif

                    iptr = istart + nsupc;
                    for (i = 0; i < nrow; ++i, ++iptr) {
                        irow = L_SUB(iptr);
                        x[irow] -= work[i];    /* Scatter */
                        work[i] = 0.0;         /* keep work zeroed for the
                                                  next supernode */
                    }
                }
            } /* for k ... */

        } else {
            /* Form x := inv(U)*x */

            for (k = Lstore->nsuper; k >= 0; k--) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += nsupc * (nsupc + 1);

                if ( nsupc == 1 ) {
                    /* The diagonal entry of U is read from Lval. */
                    x[fsupc] /= Lval[luptr];
                    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
                        irow = U_SUB(i);
                        x[irow] -= x[fsupc] * Uval[i];
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);
#else
                    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
#else
                    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif

                    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                        for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
                             i++) {
                            irow = U_SUB(i);
                            x[irow] -= x[jcol] * Uval[i];
                        }
                    }
                }
            } /* for k ... */

        }
    } else { /* Form x := inv(A')*x */

        if ( lsame_(uplo, "L") ) {
            /* Form x := inv(L')*x */

            for (k = Lstore->nsuper; k >= 0; --k) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += 2 * (nsupr - nsupc) * nsupc;

                /* Gather the contribution of the rows below the diagonal
                   block into each column of the supernode. */
                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    iptr = istart + nsupc;
                    for (i = L_NZ_START(jcol) + nsupc;
                         i < L_NZ_START(jcol+1); i++) {
                        irow = L_SUB(iptr);
                        x[jcol] -= x[irow] * Lval[i];
                        iptr++;
                    }
                }

                if ( nsupc > 1 ) {
                    solve_ops += nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
                    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                          &x[fsupc], &incx);
#else
                    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            }
        } else {
            /* Form x := inv(U')*x */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
                        irow = U_SUB(i);
                        x[jcol] -= x[irow] * Uval[i];
                    }
                }

                solve_ops += nsupc * (nsupc + 1);

                if ( nsupc == 1 ) {
                    x[fsupc] /= Lval[luptr];
                } else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
                    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#else
                    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            } /* for k ... */
        }
    }

    stat->ops[SOLVE] += solve_ops;
    SUPERLU_FREE(work);
    return 0;
}
void pzgsrfs_ABXglobal(int_t n, SuperMatrix *A, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, doublecomplex *B, int_t ldb, doublecomplex *X, int_t ldx, int nrhs, double *berr, SuperLUStat_t *stat, int *info) { #define ITMAX 20 Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; /* * Data structures used by matrix-vector multiply routine. */ int_t N_update; /* Number of variables updated on this process */ int_t *update; /* vector elements (global index) updated on this processor. */ int_t *bindx; doublecomplex *val; int_t *mv_sup_to_proc; /* Supernode to process mapping in matrix-vector multiply. */ /*-- end data structures for matrix-vector multiply --*/ doublecomplex *b, *ax, *R, *B_col, *temp, *work, *X_col, *x_trs, *dx_trs; double *rwork; int_t notran; int_t count, ii, j, jj, k, knsupc, lk, lwork, nprow, nsupers, nz, p; int i, iam, pkk; int_t *ilsum, *xsup; double eps, lstres; double s, safmin, safe1, safe2; /* NEW STUFF */ int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */ int_t *diag_len; /* Length of the X vector on diagonal processes. */ /*-- Function prototypes --*/ extern void pzgstrs1(int_t, LUstruct_t *, gridinfo_t *, doublecomplex *, int, SuperLUStat_t *, int *); /*extern double dlamch_(char *);*/ /* Test the input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NCP || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10; else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12; else if ( nrhs < 0 ) *info = -13; if (*info != 0) { i = -(*info); xerbla_("pzgsrfs_ABXglobal", &i); return; } /* Quick return if possible. */ if ( n == 0 || nrhs == 0 ) { return; } /* Initialization. 
*/ iam = grid->iam; nprow = grid->nprow; nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; ilsum = Llu->ilsum; notran = 1; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgsrfs_ABXglobal()"); #endif get_diag_procs(n, Glu_persist, grid, &num_diag_procs, &diag_procs, &diag_len); #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. number of diag processes = %d\n", num_diag_procs); PrintInt10("diag_procs", num_diag_procs, diag_procs); PrintInt10("diag_len", num_diag_procs, diag_len); } #endif if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) ) ABORT("Calloc fails for mv_sup_to_proc[]"); pzgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update, &val, &bindx, mv_sup_to_proc); i = CEILING( nsupers, nprow ); /* Number of local block rows */ ii = Llu->ldalsum + i * XK_H; k = SUPERLU_MAX(N_update, sp_ienv_dist(3)); jj = diag_len[0]; for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); jj = SUPERLU_MAX( jj, N_update ); lwork = N_update /* For ax and R */ + ii /* For dx_trs */ + ii /* For x_trs */ + k /* For b */ + jj; /* for temp */ if ( !(work = doublecomplexMalloc_dist(lwork)) ) ABORT("Malloc fails for work[]"); ax = R = work; dx_trs = work + N_update; x_trs = dx_trs + ii; b = x_trs + ii; temp = b + k; if ( !(rwork = SUPERLU_MALLOC(N_update * sizeof(double))) ) ABORT("Malloc fails for rwork[]"); #if ( DEBUGlevel>=2 ) { doublecomplex *dwork = doublecomplexMalloc_dist(n); for (i = 0; i < n; ++i) { if ( i & 1 ) dwork[i].r = 1.; else dwork[i].r = 2.; dwork[i].i = 0.; } /* Check correctness of matrix-vector multiply. */ pzgsmv_AXglobal(N_update, update, val, bindx, dwork, ax); PrintDouble5("Mult A*x", N_update, ax); SUPERLU_FREE(dwork); } #endif /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dlamch_("Epsilon"); safmin = dlamch_("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. 
*/ safe1 = nz * safmin; safe2 = safe1 / eps; #if ( DEBUGlevel>=1 ) if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n", eps, anorm, safe1, safe2); #endif /* Do for each right-hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; /* Copy X into x on the diagonal processes. */ B_col = &B[j*ldb]; X_col = &X[j*ldx]; for (p = 0; p < num_diag_procs; ++p) { pkk = diag_procs[p]; if ( iam == pkk ) { for (k = p; k < nsupers; k += num_diag_procs) { knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; jj = FstBlockC( k ); for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj]; dx_trs[ii-XK_H].r = k;/* Block number prepended in header. */ } } } /* Copy B into b distributed the same way as matrix-vector product. */ if ( N_update ) ii = update[0]; for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ /* Matrix-vector multiply. */ pzgsmv_AXglobal(N_update, update, val, bindx, X_col, ax); /* Compute residual. */ for (i = 0; i < N_update; ++i) z_sub(&R[i], &b[i], &ax[i]); /* Compute abs(op(A))*abs(X) + abs(B). */ pzgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, rwork); for (i = 0; i < N_update; ++i) rwork[i] += slud_z_abs1(&b[i]); s = 0.0; for (i = 0; i < N_update; ++i) { if ( rwork[i] > safe2 ) { s = SUPERLU_MAX(s, slud_z_abs1(&R[i]) / rwork[i]); } else if ( rwork[i] != 0.0 ) { s = SUPERLU_MAX(s, (safe1 + slud_z_abs1(&R[i])) / rwork[i]); } /* If temp[i] is exactly 0.0 (computed by PxGSMV), then we know the true residual also must be exactly 0.0. */ } MPI_Allreduce( &s, &berr[j], 1, MPI_DOUBLE, MPI_MAX, grid->comm ); #if ( PRNTlevel>= 1 ) if ( !iam ) printf("(%2d) .. Step %2d: berr[j] = %e\n", iam, count, berr[j]); #endif if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) { /* Compute new dx. 
*/ redist_all_to_diag(n, R, Glu_persist, Llu, grid, mv_sup_to_proc, dx_trs); pzgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info); /* Update solution. */ for (p = 0; p < num_diag_procs; ++p) if ( iam == diag_procs[p] ) for (k = p; k < nsupers; k += num_diag_procs) { lk = LBi( k, grid ); ii = ilsum[lk] + (lk+1)*XK_H; knsupc = SuperSize( k ); for (i = 0; i < knsupc; ++i) z_add(&x_trs[i + ii], &x_trs[i + ii], &dx_trs[i + ii]); } lstres = berr[j]; ++count; /* Transfer x_trs (on diagonal processes) into X (on all processes). */ gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid, num_diag_procs, diag_procs, diag_len, X_col, temp); } else { break; } } /* end while */ stat->RefineSteps = count; } /* for j ... */ /* Deallocate storage used by matrix-vector multiplication. */ SUPERLU_FREE(diag_procs); SUPERLU_FREE(diag_len); if ( N_update ) { SUPERLU_FREE(update); SUPERLU_FREE(bindx); SUPERLU_FREE(val); } SUPERLU_FREE(mv_sup_to_proc); SUPERLU_FREE(work); SUPERLU_FREE(rwork); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pzgsrfs_ABXglobal()"); #endif } /* PZGSRFS_ABXGLOBAL */
/*
 * Releases a SuperMatrix: Destroy_CompCol_Matrix() is applied to it first,
 * then the SuperMatrix object itself is released with SUPERLU_FREE().
 * A null pointer is accepted and ignored.
 */
void tlin::freeS(SuperMatrix *m) {
  if (m) {
    Destroy_CompCol_Matrix(m);
    SUPERLU_FREE(m);
  }
}
void pdgssvx(int nprocs, pdgstrf_options_t *pdgstrf_options, SuperMatrix *A, int *perm_c, int *perm_r, equed_t *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, double *ferr, double *berr, superlu_memusage_t *superlu_memusage, int *info) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose * ======= * * pdgssvx() solves the system of linear equations A*X=B or A'*X=B, using * the LU factorization from dgstrf(). Error bounds on the solution and * a condition estimate are also provided. It performs the following steps: * * 1. If A is stored column-wise (A->Stype = NC): * * 1.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate * the system: * trans = NOTRANS: diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B * trans = TRANS: (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = CONJ: (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B * (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ). * * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation matrix * that usually preserves sparsity. * For more details of this step, see sp_colorder.c. * * 1.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to * factor the matrix A (after equilibration if fact = EQUILIBRATE) as * Pr*A*Pc = L*U, with Pr determined by partial pivoting. * * 1.4. Compute the reciprocal pivot growth factor. * * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the routine * returns with info = i. Otherwise, the factored form of A is used to * estimate the condition number of the matrix A. 
If the reciprocal of * the condition number is less than machine precision, * info = A->ncol+1 is returned as a warning, but the routine still * goes on to solve for X and computes error bounds as described below. * * 1.6. The system of equations is solved for X using the factored form * of A. * * 1.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 1.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ) * so that it solves the original system before equilibration. * * 2. If A is stored row-wise (A->Stype = NR), apply the above algorithm * to the tranpose of A: * * 2.1. If fact = EQUILIBRATE, scaling factors are computed to equilibrate * the system: * trans = NOTRANS:diag(R)*A'*diag(C)*inv(diag(C))*X = diag(R)*B * trans = TRANS: (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = CONJ: (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A' is * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B * (if trans = NOTRANS) or diag(C)*B (if trans = TRANS or CONJ). * * 2.2. Permute columns of transpose(A) (rows of A), * forming transpose(A)*Pc, where Pc is a permutation matrix that * usually preserves sparsity. * For more details of this step, see sp_colorder.c. * * 2.3. If fact = DOFACT or EQUILIBRATE, the LU decomposition is used to * factor the matrix A (after equilibration if fact = EQUILIBRATE) as * Pr*transpose(A)*Pc = L*U, with the permutation Pr determined by * partial pivoting. * * 2.4. Compute the reciprocal pivot growth factor. * * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the routine * returns with info = i. Otherwise, the factored form of transpose(A) * is used to estimate the condition number of the matrix A. 
* If the reciprocal of the condition number is less than machine * precision, info = A->nrow+1 is returned as a warning, but the * routine still goes on to solve for X and computes error bounds * as described below. * * 2.6. The system of equations is solved for X using the factored form * of transpose(A). * * 2.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 2.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = NOTRANS) or diag(R) (if trans = TRANS or CONJ) * so that it solves the original system before equilibration. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * nprocs (input) int * Number of processes (or threads) to be spawned and used to perform * the LU factorization by pdgstrf(). There is a single thread of * control to call pdgstrf(), and all threads spawned by pdgstrf() * are terminated before returning from pdgstrf(). * * pdgstrf_options (input) pdgstrf_options_t* * The structure defines the input parameters and data structure * to control how the LU factorization will be performed. * The following fields should be defined for this structure: * * o fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, whether the matrix A should * be equilibrated before it is factored. * = FACTORED: On entry, L, U, perm_r and perm_c contain the * factored form of A. If equed is not NOEQUIL, the matrix A has * been equilibrated with scaling factors R and C. * A, L, U, perm_r are not modified. * = DOFACT: The matrix A will be factored, and the factors will be * stored in L and U. * = EQUILIBRATE: The matrix A will be equilibrated if necessary, * then factored into L and U. 
* * o trans (trans_t) * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A**T * X = B (Transpose) * = CONJ: A**H * X = B (Transpose) * * o refact (yes_no_t) * Specifies whether this is first time or subsequent factorization. * = NO: this factorization is treated as the first one; * = YES: it means that a factorization was performed prior to this * one. Therefore, this factorization will re-use some * existing data structures, such as L and U storage, column * elimination tree, and the symbolic information of the * Householder matrix. * * o panel_size (int) * A panel consists of at most panel_size consecutive columns. * * o relax (int) * To control degree of relaxing supernodes. If the number * of nodes (columns) in a subtree of the elimination tree is less * than relax, this subtree is considered as one supernode, * regardless of the row structures of those columns. * * o diag_pivot_thresh (double) * Diagonal pivoting threshold. At step j of the Gaussian * elimination, if * abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)), * use A_jj as pivot, else use A_ij with maximum magnitude. * 0 <= diag_pivot_thresh <= 1. The default value is 1, * corresponding to partial pivoting. * * o usepr (yes_no_t) * Whether the pivoting will use perm_r specified by the user. * = YES: use perm_r; perm_r is input, unchanged on exit. * = NO: perm_r is determined by partial pivoting, and is output. * * o drop_tol (double) (NOT IMPLEMENTED) * Drop tolerance parameter. At step j of the Gaussian elimination, * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. * 0 <= drop_tol <= 1. The default value of drop_tol is 0, * corresponding to not dropping any entry. * * o work (void*) of size lwork * User-supplied work space and space for the output data structures. * Not referenced if lwork = 0; * * o lwork (int) * Specifies the length of work array. 
* = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. * = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * superlu_memusage->total_needed; no other side effects. * * A (input/output) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol), where * A->nrow = A->ncol. Currently, the type of A can be: * Stype = NC or NR, Dtype = _D, Mtype = GE. In the future, * more general A will be handled. * * On entry, If pdgstrf_options->fact = FACTORED and equed is not * NOEQUIL, then A must have been equilibrated by the scaling factors * in R and/or C. On exit, A is not modified * if pdgstrf_options->fact = FACTORED or DOFACT, or * if pdgstrf_options->fact = EQUILIBRATE and equed = NOEQUIL. * * On exit, if pdgstrf_options->fact = EQUILIBRATE and equed is not * NOEQUIL, A is scaled as follows: * If A->Stype = NC: * equed = ROW: A := diag(R) * A * equed = COL: A := A * diag(C) * equed = BOTH: A := diag(R) * A * diag(C). * If A->Stype = NR: * equed = ROW: transpose(A) := diag(R) * transpose(A) * equed = COL: transpose(A) := transpose(A) * diag(C) * equed = BOTH: transpose(A) := diag(R) * transpose(A) * diag(C). * * perm_c (input/output) int* * If A->Stype = NC, Column permutation vector of size A->ncol, * which defines the permutation matrix Pc; perm_c[i] = j means * column i of A is in position j in A*Pc. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree * is already in postorder. * * If A->Stype = NR, column permutation vector of size A->nrow, * which describes permutation of columns of tranpose(A) * (rows of A) as described above. 
* * perm_r (input/output) int* * If A->Stype = NC, row permutation vector of size A->nrow, * which defines the permutation matrix Pr, and is determined * by partial pivoting. perm_r[i] = j means row i of A is in * position j in Pr*A. * * If A->Stype = NR, permutation vector of size A->ncol, which * determines permutation of rows of transpose(A) * (columns of A) as described above. * * If pdgstrf_options->usepr = NO, perm_r is output argument; * If pdgstrf_options->usepr = YES, the pivoting routine will try * to use the input perm_r, unless a certain threshold criterion * is violated. In that case, perm_r is overwritten by a new * permutation determined by partial pivoting or diagonal * threshold pivoting. * * equed (input/output) equed_t* * Specifies the form of equilibration that was done. * = NOEQUIL: No equilibration. * = ROW: Row equilibration, i.e., A was premultiplied by diag(R). * = COL: Column equilibration, i.e., A was postmultiplied by diag(C). * = BOTH: Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * If pdgstrf_options->fact = FACTORED, equed is an input argument, * otherwise it is an output argument. * * R (input/output) double*, dimension (A->nrow) * The row scale factors for A or transpose(A). * If equed = ROW or BOTH, A (if A->Stype = NC) or transpose(A) * (if A->Stype = NR) is multiplied on the left by diag(R). * If equed = NOEQUIL or COL, R is not accessed. * If fact = FACTORED, R is an input argument; otherwise, R is output. * If fact = FACTORED and equed = ROW or BOTH, each element of R must * be positive. * * C (input/output) double*, dimension (A->ncol) * The column scale factors for A or transpose(A). * If equed = COL or BOTH, A (if A->Stype = NC) or trnspose(A) * (if A->Stype = NR) is multiplied on the right by diag(C). * If equed = NOEQUIL or ROW, C is not accessed. * If fact = FACTORED, C is an input argument; otherwise, C is output. 
* If fact = FACTORED and equed = COL or BOTH, each element of C must * be positive. * * L (output) SuperMatrix* * The factor L from the factorization * Pr*A*Pc=L*U (if A->Stype = NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = NR). * Uses compressed row subscripts storage for supernodes, i.e., * L has types: Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization * Pr*A*Pc=L*U (if A->Stype = NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = NR). * Uses column-wise storage scheme, i.e., U has types: * Stype = NCP, Dtype = _D, Mtype = TRU. * * B (input/output) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the right hand side matrix. * On exit, * if equed = NOEQUIL, B is not modified; otherwise * if A->Stype = NC: * if trans = NOTRANS and equed = ROW or BOTH, B is overwritten * by diag(R)*B; * if trans = TRANS or CONJ and equed = COL of BOTH, B is * overwritten by diag(C)*B; * if A->Stype = NR: * if trans = NOTRANS and equed = COL or BOTH, B is overwritten * by diag(C)*B; * if trans = TRANS or CONJ and equed = ROW of BOTH, B is * overwritten by diag(R)*B. * * X (output) SuperMatrix* * X has types: Stype = DN, Dtype = _D, Mtype = GE. * If info = 0 or info = A->ncol+1, X contains the solution matrix * to the original system of equations. Note that A and B are modified * on exit if equed is not NOEQUIL, and the solution to the * equilibrated system is inv(diag(C))*X if trans = NOTRANS and * equed = COL or BOTH, or inv(diag(R))*X if trans = TRANS or CONJ * and equed = ROW or BOTH. * * recip_pivot_growth (output) double* * The reciprocal pivot growth factor computed as * max_j ( max_i(abs(A_ij)) / max_i(abs(U_ij)) ). * If recip_pivot_growth is much less than 1, the stability of the * LU factorization could be poor. * * rcond (output) double* * The estimate of the reciprocal condition number of the matrix A * after equilibration (if done). 
If rcond is less than the machine * precision (in particular, if rcond = 0), the matrix is singular * to working precision. This condition is indicated by a return * code of info > 0. * * ferr (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * berr (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * superlu_memusage (output) superlu_memusage_t* * Record the memory usage statistics, consisting of following fields: * - for_lu (float) * The amount of space used in bytes for L\U data structures. * - total_needed (float) * The amount of space needed in bytes to perform factorization. * - expansions (int) * The number of memory expansions during the LU factorization. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly * singular, so the solution and error bounds * could not be computed. * = A->ncol+1: U is nonsingular, but RCOND is less than machine * precision, meaning that the matrix is singular to * working precision. Nevertheless, the solution and * error bounds are computed because there are a number * of situations where the computed solution can be more * accurate than the value of RCOND would suggest. 
* > A->ncol+1: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * */ NCformat *Astore; DNformat *Bstore, *Xstore; double *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA; /* A in NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, dofact, notran, rowequ; char norm[1]; trans_t trant; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int n, relax, panel_size; Gstat_t Gstat; double t0; /* temporary time */ double *utime; flops_t *ops, flopcnt; /* External functions */ extern double dlangs(char *, SuperMatrix *); extern double dlamch_(char *); Astore = A->Store; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; n = A->ncol; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; pdgstrf_options->perm_c = perm_c; pdgstrf_options->perm_r = perm_r; *info = 0; dofact = (pdgstrf_options->fact == DOFACT); equil = (pdgstrf_options->fact == EQUILIBRATE); notran = (pdgstrf_options->trans == NOTRANS); if (dofact || equil) { *equed = NOEQUIL; rowequ = FALSE; colequ = FALSE; } else { rowequ = (*equed == ROW) || (*equed == BOTH); colequ = (*equed == COL) || (*equed == BOTH); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } /* ------------------------------------------------------------ Test the input parameters. 
------------------------------------------------------------*/ if ( nprocs <= 0 ) *info = -1; else if ( (!dofact && !equil && (pdgstrf_options->fact != FACTORED)) || (!notran && (pdgstrf_options->trans != TRANS) && (pdgstrf_options->trans != CONJ)) || (pdgstrf_options->refact != YES && pdgstrf_options->refact != NO) || (pdgstrf_options->usepr != YES && pdgstrf_options->usepr != NO) || pdgstrf_options->lwork < -1 ) *info = -2; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -3; else if ((pdgstrf_options->fact == FACTORED) && !(rowequ || colequ || (*equed == NOEQUIL))) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = MIN(rcmin, R[j]); rcmax = MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = MAX(rcmin,smlnum) / MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = MIN(rcmin, C[j]); rcmax = MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -8; else if (A->nrow > 0) colcnd = MAX(rcmin,smlnum) / MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( B->ncol < 0 || Bstore->lda < MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -11; else if ( X->ncol < 0 || Xstore->lda < MAX(0, A->nrow) || B->ncol != X->ncol || X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) *info = -12; } } if (*info != 0) { i = -(*info); xerbla_("dgssvx", &i); return; } /* ------------------------------------------------------------ Allocate storage and initialize statistics variables. 
------------------------------------------------------------*/ panel_size = pdgstrf_options->panel_size; relax = pdgstrf_options->relax; StatAlloc(n, nprocs, panel_size, relax, &Gstat); StatInit(n, nprocs, &Gstat); utime = Gstat.utime; ops = Gstat.ops; /* ------------------------------------------------------------ Convert A to NC format when necessary. ------------------------------------------------------------*/ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == NC */ trant = pdgstrf_options->trans; AA = A; } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = (*equed == ROW) || (*equed == BOTH); colequ = (*equed == COL) || (*equed == BOTH); } utime[EQUIL] = SuperLU_timer_() - t0; } /* ------------------------------------------------------------ Scale the right hand side. ------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { Bmat[i + j*ldb] *= R[i]; } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { Bmat[i + j*ldb] *= C[i]; } } /* ------------------------------------------------------------ Perform the LU factorization. 
------------------------------------------------------------*/ if ( dofact || equil ) { /* Obtain column etree, the column count (colcnt_h) and supernode partition (part_super_h) for the Householder matrix. */ t0 = SuperLU_timer_(); sp_colorder(AA, perm_c, pdgstrf_options, &AC); utime[ETREE] = SuperLU_timer_() - t0; #if ( PRNTlevel >= 2 ) printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", relax, panel_size, sp_ienv(3), sp_ienv(4)); fflush(stdout); #endif /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); pdgstrf(pdgstrf_options, &AC, perm_r, L, U, &Gstat, info); utime[FACT] = SuperLU_timer_() - t0; flopcnt = 0; for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops; ops[FACT] = flopcnt; if ( pdgstrf_options->lwork == -1 ) { superlu_memusage->total_needed = *info - A->ncol; return; } } if ( *info > 0 ) { if ( *info <= A->ncol ) { /* Compute the reciprocal pivot growth factor of the leading rank-deficient *info columns of A. */ *recip_pivot_growth = dPivotGrowth(*info, AA, perm_c, L, U); } } else { /* ------------------------------------------------------------ Compute the reciprocal pivot growth factor *recip_pivot_growth. ------------------------------------------------------------*/ *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); /* ------------------------------------------------------------ Estimate the reciprocal of the condition number of A. ------------------------------------------------------------*/ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = dlangs(norm, AA); dgscon(norm, L, U, anorm, rcond, info); utime[RCOND] = SuperLU_timer_() - t0; /* ------------------------------------------------------------ Compute the solution matrix X. 
------------------------------------------------------------*/ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); dgstrs(trant, L, U, perm_r, perm_c, X, &Gstat, info); utime[SOLVE] = SuperLU_timer_() - t0; ops[SOLVE] = ops[TRISOLVE]; /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ t0 = SuperLU_timer_(); dgsrfs(trant, AA, L, U, perm_r, perm_c, *equed, R, C, B, X, ferr, berr, &Gstat, info); utime[REFINE] = SuperLU_timer_() - t0; /* ------------------------------------------------------------ Transform the solution matrix X to a solution of the original system. ------------------------------------------------------------*/ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { Xmat[i + j*ldx] *= C[i]; } } } else if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { Xmat[i + j*ldx] *= R[i]; } } /* Set INFO = A->ncol+1 if the matrix is singular to working precision.*/ if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; } superlu_QuerySpace(nprocs, L, U, panel_size, superlu_memusage); /* ------------------------------------------------------------ Deallocate storage after factorization. ------------------------------------------------------------*/ if ( pdgstrf_options->refact == NO ) { SUPERLU_FREE(pdgstrf_options->etree); SUPERLU_FREE(pdgstrf_options->colcnt_h); SUPERLU_FREE(pdgstrf_options->part_super_h); } if ( dofact || equil ) { Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } /* ------------------------------------------------------------ Print timings, then deallocate statistic variables. 
------------------------------------------------------------*/ PrintStat(&Gstat); StatFree(&Gstat); }
// Releases a SuperMatrix handle: hands it to Destroy_Dense_Matrix() and then
// frees the SuperMatrix object itself with SUPERLU_FREE.
// A null pointer is accepted and ignored.
void tlin::freeD(SuperMatrix *m) {
  if (m) {
    Destroy_Dense_Matrix(m);
    SUPERLU_FREE(m);
  }
}
void cgssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree, char *equed, float *R, float *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, float *recip_pivot_growth, float *rcond, float *ferr, float *berr, mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info ) { DNformat *Bstore, *Xstore; complex *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ, permc_spec; trans_t trant; char norm[1]; int i, j, info1; float amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; float diag_pivot_thresh; double t0; /* temporary time */ double *utime; /* External functions */ extern float clangs(char *, SuperMatrix *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; *info = 0; nofact = (options->Fact != FACTORED); equil = (options->Equil == YES); notran = (options->Trans == NOTRANS); if ( nofact ) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); smlnum = slamch_("Safe minimum"); bignum = 1. 
/ smlnum; } #if 0 printf("dgssvx: Fact=%4d, Trans=%4d, equed=%c\n", options->Fact, options->Trans, *equed); #endif /* Test the input parameters */ if (options->Fact != DOFACT && options->Fact != SamePattern && options->Fact != SamePattern_SameRowPerm && options->Fact != FACTORED && options->Trans != NOTRANS && options->Trans != TRANS && options->Trans != CONJ && options->Equil != NO && options->Equil != YES) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_C || A->Mtype != SLU_GE ) *info = -2; else if (options->Fact == FACTORED && !(rowequ || colequ || lsame_(equed, "N"))) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) 
*info = -8; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -12; else if ( B->ncol < 0 ) *info = -13; else if ( B->ncol > 0 ) { /* no checking if B->ncol=0 */ if ( Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) *info = -13; } if ( X->ncol < 0 ) *info = -14; else if ( X->ncol > 0 ) { /* no checking if X->ncol=0 */ if ( Xstore->lda < SUPERLU_MAX(0, A->nrow) || (B->ncol != 0 && B->ncol != X->ncol) || X->Stype != SLU_DN || X->Dtype != SLU_C || X->Mtype != SLU_GE ) *info = -14; } } } if (*info != 0) { i = -(*info); xerbla_("cgssvx", &i); return; } /* Initialization for factor parameters */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = options->DiagPivotThresh; utime = stat->utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == SLU_NC */ trant = options->Trans; AA = A; } if ( nofact && equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ cgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. 
*/ claqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); } utime[EQUIL] = SuperLU_timer_() - t0; } if ( nofact ) { t0 = SuperLU_timer_(); /* * Gnet column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, AA, perm_c); utime[COLPERM] = SuperLU_timer_() - t0; t0 = SuperLU_timer_(); sp_preorder(options, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", relax, panel_size, sp_ienv(3), sp_ienv(4)); fflush(stdout); */ /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); cgstrf(options, &AC, relax, panel_size, etree, work, lwork, perm_c, perm_r, L, U, stat, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } } if ( options->PivotGrowth ) { if ( *info > 0 ) { if ( *info <= A->ncol ) { /* Compute the reciprocal pivot growth factor of the leading rank-deficient *info columns of A. */ *recip_pivot_growth = cPivotGrowth(*info, AA, perm_c, L, U); } return; } /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = cPivotGrowth(A->ncol, AA, perm_c, L, U); } if ( options->ConditionNumber ) { /* Estimate the reciprocal of the condition number of A. 
*/ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = clangs(norm, AA); cgscon(norm, L, U, anorm, rcond, stat, info); utime[RCOND] = SuperLU_timer_() - t0; } if ( nrhs > 0 ) { /* Scale the right hand side if equilibration was performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], R[i]); } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) cs_mult(&Bmat[i+j*ldb], &Bmat[i+j*ldb], C[i]); } /* Compute the solution matrix X. */ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); cgstrs (trant, L, U, perm_c, perm_r, X, stat, info); utime[SOLVE] = SuperLU_timer_() - t0; /* Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. */ t0 = SuperLU_timer_(); if ( options->IterRefine != NOREFINE ) { cgsrfs(trant, AA, L, U, perm_c, perm_r, equed, R, C, B, X, ferr, berr, stat, info); } else { for (j = 0; j < nrhs; ++j) ferr[j] = berr[j] = 1.0; } utime[REFINE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], C[i]); } } else if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) cs_mult(&Xmat[i+j*ldx], &Xmat[i+j*ldx], R[i]); } } /* end if nrhs > 0 */ if ( options->ConditionNumber ) { /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ if ( *rcond < slamch_("E") ) *info = A->ncol + 1; } if ( nofact ) { cQuerySpace(L, U, mem_usage); Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } }
main(int argc, char *argv[]) { /* * Purpose * ======= * * ZDRIVE is the main test program for the DOUBLE COMPLEX linear * equation driver routines ZGSSV and ZGSSVX. * * The program is invoked by a shell script file -- ztest.csh. * The output from the tests are written into a file -- ztest.out. * * ===================================================================== */ doublecomplex *a, *a_save; int *asub, *asub_save; int *xa, *xa_save; SuperMatrix A, B, X, L, U; SuperMatrix ASAV, AC; mem_usage_t mem_usage; int *perm_r; /* row permutation from partial pivoting */ int *perm_c, *pc_save; /* column permutation */ int *etree; doublecomplex zero = {0.0, 0.0}; double *R, *C; double *ferr, *berr; double *rwork; doublecomplex *wwork; void *work; int info, lwork, nrhs, panel_size, relax; int m, n, nnz; doublecomplex *xact; doublecomplex *rhsb, *solx, *bsav; int ldb, ldx; double rpg, rcond; int i, j, k1; double rowcnd, colcnd, amax; int maxsuper, rowblk, colblk; int prefact, nofact, equil, iequed; int nt, nrun, nfail, nerrs, imat, fimat, nimat; int nfact, ifact, itran; int kl, ku, mode, lda; int zerot, izero, ioff; double u; double anorm, cndnum; doublecomplex *Afull; double result[NTESTS]; superlu_options_t options; fact_t fact; trans_t trans; SuperLUStat_t stat; static char matrix_type[8]; static char equed[1], path[4], sym[1], dist[1]; /* Fixed set of parameters */ int iseed[] = {1988, 1989, 1990, 1991}; static char equeds[] = {'N', 'R', 'C', 'B'}; static fact_t facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm}; static trans_t transs[] = {NOTRANS, TRANS, CONJ}; /* Some function prototypes */ extern int zgst01(int, int, SuperMatrix *, SuperMatrix *, SuperMatrix *, int *, int *, double *); extern int zgst02(trans_t, int, int, int, SuperMatrix *, doublecomplex *, int, doublecomplex *, int, double *resid); extern int zgst04(int, int, doublecomplex *, int, doublecomplex *, int, double rcond, double *resid); extern int zgst07(trans_t, int, int, SuperMatrix *, 
doublecomplex *, int, doublecomplex *, int, doublecomplex *, int, double *, double *, double *); extern int zlatb4_(char *, int *, int *, int *, char *, int *, int *, double *, int *, double *, char *); extern int zlatms_(int *, int *, char *, int *, char *, double *d, int *, double *, double *, int *, int *, char *, doublecomplex *, int *, doublecomplex *, int *); extern int sp_zconvert(int, int, doublecomplex *, int, int, int, doublecomplex *a, int *, int *, int *); /* Executable statements */ strcpy(path, "ZGE"); nrun = 0; nfail = 0; nerrs = 0; /* Defaults */ lwork = 0; n = 1; nrhs = 1; panel_size = sp_ienv(1); relax = sp_ienv(2); u = 1.0; strcpy(matrix_type, "LA"); parse_command_line(argc, argv, matrix_type, &n, &panel_size, &relax, &nrhs, &maxsuper, &rowblk, &colblk, &lwork, &u); if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) { fprintf(stderr, "expert: cannot allocate %d bytes\n", lwork); exit (-1); } } /* Set the default input options. */ set_default_options(&options); options.DiagPivotThresh = u; options.PrintStat = NO; options.PivotGrowth = YES; options.ConditionNumber = YES; options.IterRefine = DOUBLE; if ( strcmp(matrix_type, "LA") == 0 ) { /* Test LAPACK matrix suite. 
*/ m = n; lda = SUPERLU_MAX(n, 1); nnz = n * n; /* upper bound */ fimat = 1; nimat = NTYPES; Afull = doublecomplexCalloc(lda * n); zallocateA(n, nnz, &a, &asub, &xa); } else { /* Read a sparse matrix */ fimat = nimat = 0; zreadhb(&m, &n, &nnz, &a, &asub, &xa); } zallocateA(n, nnz, &a_save, &asub_save, &xa_save); rhsb = doublecomplexMalloc(m * nrhs); bsav = doublecomplexMalloc(m * nrhs); solx = doublecomplexMalloc(n * nrhs); ldb = m; ldx = n; zCreate_Dense_Matrix(&B, m, nrhs, rhsb, ldb, SLU_DN, SLU_Z, SLU_GE); zCreate_Dense_Matrix(&X, n, nrhs, solx, ldx, SLU_DN, SLU_Z, SLU_GE); xact = doublecomplexMalloc(n * nrhs); etree = intMalloc(n); perm_r = intMalloc(n); perm_c = intMalloc(n); pc_save = intMalloc(n); R = (double *) SUPERLU_MALLOC(m*sizeof(double)); C = (double *) SUPERLU_MALLOC(n*sizeof(double)); ferr = (double *) SUPERLU_MALLOC(nrhs*sizeof(double)); berr = (double *) SUPERLU_MALLOC(nrhs*sizeof(double)); j = SUPERLU_MAX(m,n) * SUPERLU_MAX(4,nrhs); rwork = (double *) SUPERLU_MALLOC(j*sizeof(double)); for (i = 0; i < j; ++i) rwork[i] = 0.; if ( !R ) ABORT("SUPERLU_MALLOC fails for R"); if ( !C ) ABORT("SUPERLU_MALLOC fails for C"); if ( !ferr ) ABORT("SUPERLU_MALLOC fails for ferr"); if ( !berr ) ABORT("SUPERLU_MALLOC fails for berr"); if ( !rwork ) ABORT("SUPERLU_MALLOC fails for rwork"); wwork = doublecomplexCalloc( SUPERLU_MAX(m,n) * SUPERLU_MAX(4,nrhs) ); for (i = 0; i < n; ++i) perm_c[i] = pc_save[i] = i; options.ColPerm = MY_PERMC; for (imat = fimat; imat <= nimat; ++imat) { /* All matrix types */ if ( imat ) { /* Skip types 5, 6, or 7 if the matrix size is too small. */ zerot = (imat >= 5 && imat <= 7); if ( zerot && n < imat-4 ) continue; /* Set up parameters with ZLATB4 and generate a test matrix with ZLATMS. 
*/ zlatb4_(path, &imat, &n, &n, sym, &kl, &ku, &anorm, &mode, &cndnum, dist); zlatms_(&n, &n, dist, iseed, sym, &rwork[0], &mode, &cndnum, &anorm, &kl, &ku, "No packing", Afull, &lda, &wwork[0], &info); if ( info ) { printf(FMT3, "ZLATMS", info, izero, n, nrhs, imat, nfail); continue; } /* For types 5-7, zero one or more columns of the matrix to test that INFO is returned correctly. */ if ( zerot ) { if ( imat == 5 ) izero = 1; else if ( imat == 6 ) izero = n; else izero = n / 2 + 1; ioff = (izero - 1) * lda; if ( imat < 7 ) { for (i = 0; i < n; ++i) Afull[ioff + i] = zero; } else { for (j = 0; j < n - izero + 1; ++j) for (i = 0; i < n; ++i) Afull[ioff + i + j*lda] = zero; } } else { izero = 0; } /* Convert to sparse representation. */ sp_zconvert(n, n, Afull, lda, kl, ku, a, asub, xa, &nnz); } else { izero = 0; zerot = 0; } zCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_Z, SLU_GE); /* Save a copy of matrix A in ASAV */ zCreate_CompCol_Matrix(&ASAV, m, n, nnz, a_save, asub_save, xa_save, SLU_NC, SLU_Z, SLU_GE); zCopy_CompCol_Matrix(&A, &ASAV); /* Form exact solution. */ zGenXtrue(n, nrhs, xact, ldx); StatInit(&stat); for (iequed = 0; iequed < 4; ++iequed) { *equed = equeds[iequed]; if (iequed == 0) nfact = 4; else nfact = 1; /* Only test factored, pre-equilibrated matrix */ for (ifact = 0; ifact < nfact; ++ifact) { fact = facts[ifact]; options.Fact = fact; for (equil = 0; equil < 2; ++equil) { options.Equil = equil; prefact = ( options.Fact == FACTORED || options.Fact == SamePattern_SameRowPerm ); /* Need a first factor */ nofact = (options.Fact != FACTORED); /* Not factored */ /* Restore the matrix A. */ zCopy_CompCol_Matrix(&ASAV, &A); if ( zerot ) { if ( prefact ) continue; } else if ( options.Fact == FACTORED ) { if ( equil || iequed ) { /* Compute row and column scale factors to equilibrate matrix A. */ zgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &info); /* Force equilibration. 
*/ if ( !info && n > 0 ) { if ( lsame_(equed, "R") ) { rowcnd = 0.; colcnd = 1.; } else if ( lsame_(equed, "C") ) { rowcnd = 1.; colcnd = 0.; } else if ( lsame_(equed, "B") ) { rowcnd = 0.; colcnd = 0.; } } /* Equilibrate the matrix. */ zlaqgs(&A, R, C, rowcnd, colcnd, amax, equed); } } if ( prefact ) { /* Need a factor for the first time */ /* Save Fact option. */ fact = options.Fact; options.Fact = DOFACT; /* Preorder the matrix, obtain the column etree. */ sp_preorder(&options, &A, perm_c, etree, &AC); /* Factor the matrix AC. */ zgstrf(&options, &AC, relax, panel_size, etree, work, lwork, perm_c, perm_r, &L, &U, &stat, &info); if ( info ) { printf("** First factor: info %d, equed %c\n", info, *equed); if ( lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); exit(0); } } Destroy_CompCol_Permuted(&AC); /* Restore Fact option. */ options.Fact = fact; } /* if .. first time factor */ for (itran = 0; itran < NTRAN; ++itran) { trans = transs[itran]; options.Trans = trans; /* Restore the matrix A. */ zCopy_CompCol_Matrix(&ASAV, &A); /* Set the right hand side. */ zFillRHS(trans, nrhs, xact, ldx, &A, &B); zCopy_Dense_Matrix(m, nrhs, rhsb, ldb, bsav, ldb); /*---------------- * Test zgssv *----------------*/ if ( options.Fact == DOFACT && itran == 0) { /* Not yet factored, and untransposed */ zCopy_Dense_Matrix(m, nrhs, rhsb, ldb, solx, ldx); zgssv(&options, &A, perm_c, perm_r, &L, &U, &X, &stat, &info); if ( info && info != izero ) { printf(FMT3, "zgssv", info, izero, n, nrhs, imat, nfail); } else { /* Reconstruct matrix from factors and compute residual. */ zgst01(m, n, &A, &L, &U, perm_c, perm_r, &result[0]); nt = 1; if ( izero == 0 ) { /* Compute residual of the computed solution. */ zCopy_Dense_Matrix(m, nrhs, rhsb, ldb, wwork, ldb); zgst02(trans, m, n, nrhs, &A, solx, ldx, wwork,ldb, &result[1]); nt = 2; } /* Print information about the tests that did not pass the threshold. 
*/ for (i = 0; i < nt; ++i) { if ( result[i] >= THRESH ) { printf(FMT1, "zgssv", n, i, result[i]); ++nfail; } } nrun += nt; } /* else .. info == 0 */ /* Restore perm_c. */ for (i = 0; i < n; ++i) perm_c[i] = pc_save[i]; if (lwork == 0) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } } /* if .. end of testing zgssv */ /*---------------- * Test zgssvx *----------------*/ /* Equilibrate the matrix if fact = FACTORED and equed = 'R', 'C', or 'B'. */ if ( options.Fact == FACTORED && (equil || iequed) && n > 0 ) { zlaqgs(&A, R, C, rowcnd, colcnd, amax, equed); } /* Solve the system and compute the condition number and error bounds using zgssvx. */ zgssvx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, ferr, berr, &mem_usage, &stat, &info); if ( info && info != izero ) { printf(FMT3, "zgssvx", info, izero, n, nrhs, imat, nfail); if ( lwork == -1 ) { printf("** Estimated memory: %.0f bytes\n", mem_usage.total_needed); exit(0); } } else { if ( !prefact ) { /* Reconstruct matrix from factors and compute residual. */ zgst01(m, n, &A, &L, &U, perm_c, perm_r, &result[0]); k1 = 0; } else { k1 = 1; } if ( !info ) { /* Compute residual of the computed solution.*/ zCopy_Dense_Matrix(m, nrhs, bsav, ldb, wwork, ldb); zgst02(trans, m, n, nrhs, &ASAV, solx, ldx, wwork, ldb, &result[1]); /* Check solution from generated exact solution. */ zgst04(n, nrhs, solx, ldx, xact, ldx, rcond, &result[2]); /* Check the error bounds from iterative refinement. */ zgst07(trans, n, nrhs, &ASAV, bsav, ldb, solx, ldx, xact, ldx, ferr, berr, &result[3]); /* Print information about the tests that did not pass the threshold. */ for (i = k1; i < NTESTS; ++i) { if ( result[i] >= THRESH ) { printf(FMT2, "zgssvx", options.Fact, trans, *equed, n, imat, i, result[i]); ++nfail; } } nrun += NTESTS; } /* if .. info == 0 */ } /* else .. end of testing zgssvx */ } /* for itran ... 
*/ if ( lwork == 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } } /* for equil ... */ } /* for ifact ... */ } /* for iequed ... */ #if 0 if ( !info ) { PrintPerf(&L, &U, &mem_usage, rpg, rcond, ferr, berr, equed); } #endif } /* for imat ... */ /* Print a summary of the results. */ PrintSumm("ZGE", nfail, nrun, nerrs); SUPERLU_FREE (rhsb); SUPERLU_FREE (bsav); SUPERLU_FREE (solx); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (pc_save); SUPERLU_FREE (R); SUPERLU_FREE (C); SUPERLU_FREE (ferr); SUPERLU_FREE (berr); SUPERLU_FREE (rwork); SUPERLU_FREE (wwork); Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); Destroy_CompCol_Matrix(&A); Destroy_CompCol_Matrix(&ASAV); if ( lwork > 0 ) { SUPERLU_FREE (work); Destroy_SuperMatrix_Store(&L); Destroy_SuperMatrix_Store(&U); } StatFree(&stat); return 0; }
void heap_relax_snode ( const int n, int *et, /* column elimination tree */ const int relax_columns, /* max no of columns allowed in a relaxed snode */ int *descendants, /* no of descendants of each node in the etree */ int *relax_end /* last column in a supernode */ ) { register int i, j, k, l, parent; register int snode_start; /* beginning of a snode */ int *et_save, *post, *inv_post, *iwork; int nsuper_et = 0, nsuper_et_post = 0; /* The etree may not be postordered, but is heap ordered. */ iwork = (int*) intMalloc(3*n+2); if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]"); inv_post = iwork + n+1; et_save = inv_post + n+1; /* Post order etree */ post = (int *) TreePostorder(n, et); for (i = 0; i < n+1; ++i) inv_post[post[i]] = i; /* Renumber etree in postorder */ for (i = 0; i < n; ++i) { iwork[post[i]] = post[et[i]]; et_save[i] = et[i]; /* Save the original etree */ } for (i = 0; i < n; ++i) et[i] = iwork[i]; /* Compute the number of descendants of each node in the etree */ ifill (relax_end, n, EMPTY); for (j = 0; j < n; j++) descendants[j] = 0; for (j = 0; j < n; j++) { parent = et[j]; if ( parent != n ) /* not the dummy root */ descendants[parent] += descendants[j] + 1; } /* Identify the relaxed supernodes by postorder traversal of the etree. */ for (j = 0; j < n; ) { parent = et[j]; snode_start = j; while ( parent != n && descendants[parent] < relax_columns ) { j = parent; parent = et[j]; } /* Found a supernode in postordered etree; j is the last column. */ ++nsuper_et_post; k = n; for (i = snode_start; i <= j; ++i) k = SUPERLU_MIN(k, inv_post[i]); l = inv_post[j]; if ( (l - k) == (j - snode_start) ) { /* It's also a supernode in the original etree */ relax_end[k] = l; /* Last column is recorded */ ++nsuper_et; } else { for (i = snode_start; i <= j; ++i) { l = inv_post[i]; if ( descendants[i] == 0 ) { relax_end[l] = l; ++nsuper_et; } } } j++; /* Search for a new leaf */ while ( descendants[j] != 0 && j < n ) j++; } #if ( PRNTlevel>=1 ) printf(".. 
heap_snode_relax:\n" "\tNo of relaxed snodes in postordered etree:\t%d\n" "\tNo of relaxed snodes in original etree:\t%d\n", nsuper_et_post, nsuper_et); #endif /* Recover the original etree */ for (i = 0; i < n; ++i) et[i] = et_save[i]; SUPERLU_FREE(post); SUPERLU_FREE(iwork); }
/*
 * Release the object referred to by *handle: the handle value is converted
 * back to a pointer and passed to SUPERLU_FREE.  *handle itself is left
 * unchanged.
 * NOTE(review): presumably called from the Fortran interface to dispose of
 * a SuperMatrix created on the C side -- confirm against the callers.
 */
void f_destroy_SuperMatrix_handle(fptr *handle)
{
    void *object = (void *) *handle;

    SUPERLU_FREE(object);
}
main(int argc, char *argv[]) { SuperMatrix A; NCformat *Astore; double *a; int *asub, *xa; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ SuperMatrix L; /* factor L */ SCformat *Lstore; SuperMatrix U; /* factor U */ NCformat *Ustore; SuperMatrix B; int nrhs, ldx, info, m, n, nnz; double *xact, *rhs; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 1.0; options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; */ set_default_options(&options); /* Read the matrix in Harwell-Boeing format. */ dreadhb(&m, &n, &nnz, &a, &asub, &xa); dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); Astore = A.Store; printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); nrhs = 1; if ( !(rhs = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhs[]."); dCreate_Dense_Matrix(&B, m, nrhs, rhs, m, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); ldx = n; dGenXtrue(n, nrhs, xact, ldx); dFillRHS(options.Trans, nrhs, xact, ldx, &A, &B); if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); /* Initialize the statistics variables. */ StatInit(&stat); dgssv(&options, &A, perm_c, perm_r, &L, &U, &B, &stat, &info); if ( info == 0 ) { /* This is how you could access the solution matrix. */ double *sol = (double*) ((DNformat*) B.Store)->nzval; /* Compute the infinity norm of the error. 
*/ dinf_norm_error(nrhs, &B, xact); Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("FILL ratio = %.1f\n", (float)(Lstore->nnz + Ustore->nnz - n)/nnz); dQuerySpace(&L, &U, &mem_usage); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); } else { printf("dgssv() error returns INFO= %d\n", info); if ( info <= n ) { /* factorization completes */ dQuerySpace(&L, &U, &mem_usage); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); } } if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (rhs); SUPERLU_FREE (xact); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif }
/*
 * Release the object referred to by *handle: the handle value is converted
 * back to a pointer and passed to SUPERLU_FREE.  *handle itself is left
 * unchanged.
 * NOTE(review): presumably the handle refers to a process-grid descriptor
 * created on the C side for Fortran callers -- confirm against the callers.
 */
void f_destroy_gridinfo_handle(fptr *handle)
{
    void *object = (void *) *handle;

    SUPERLU_FREE(object);
}
void zgssvx(char *fact, char *trans, char *refact, SuperMatrix *A, factor_param_t *factor_params, int *perm_c, int *perm_r, int *etree, char *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, double *ferr, double *berr, mem_usage_t *mem_usage, int *info ) { /* * Purpose * ======= * * ZGSSVX solves the system of linear equations A*X=B or A'*X=B, using * the LU factorization from zgstrf(). Error bounds on the solution and * a condition estimate are also provided. It performs the following steps: * * 1. If A is stored column-wise (A->Stype = SLU_NC): * * 1.1. If fact = 'E', scaling factors are computed to equilibrate the * system: * trans = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B * trans = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A is * overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if trans='N') * or diag(C)*B (if trans = 'T' or 'C'). * * 1.2. Permute columns of A, forming A*Pc, where Pc is a permutation * matrix that usually preserves sparsity. * For more details of this step, see sp_preorder.c. * * 1.3. If fact = 'N' or 'E', the LU decomposition is used to factor the * matrix A (after equilibration if fact = 'E') as Pr*A*Pc = L*U, * with Pr determined by partial pivoting. * * 1.4. Compute the reciprocal pivot growth factor. * * 1.5. If some U(i,i) = 0, so that U is exactly singular, then the * routine returns with info = i. Otherwise, the factored form of * A is used to estimate the condition number of the matrix A. If * the reciprocal of the condition number is less than machine * precision, info = A->ncol+1 is returned as a warning, but the * routine still goes on to solve for X and computes error bounds * as described below. * * 1.6. 
The system of equations is solved for X using the factored form * of A. * * 1.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 1.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so * that it solves the original system before equilibration. * * 2. If A is stored row-wise (A->Stype = SLU_NR), apply the above algorithm * to the transpose of A: * * 2.1. If fact = 'E', scaling factors are computed to equilibrate the * system: * trans = 'N': diag(R)*A'*diag(C) *inv(diag(C))*X = diag(R)*B * trans = 'T': (diag(R)*A'*diag(C))**T *inv(diag(R))*X = diag(C)*B * trans = 'C': (diag(R)*A'*diag(C))**H *inv(diag(R))*X = diag(C)*B * Whether or not the system will be equilibrated depends on the * scaling of the matrix A, but if equilibration is used, A' is * overwritten by diag(R)*A'*diag(C) and B by diag(R)*B * (if trans='N') or diag(C)*B (if trans = 'T' or 'C'). * * 2.2. Permute columns of transpose(A) (rows of A), * forming transpose(A)*Pc, where Pc is a permutation matrix that * usually preserves sparsity. * For more details of this step, see sp_preorder.c. * * 2.3. If fact = 'N' or 'E', the LU decomposition is used to factor the * transpose(A) (after equilibration if fact = 'E') as * Pr*transpose(A)*Pc = L*U with the permutation Pr determined by * partial pivoting. * * 2.4. Compute the reciprocal pivot growth factor. * * 2.5. If some U(i,i) = 0, so that U is exactly singular, then the * routine returns with info = i. Otherwise, the factored form * of transpose(A) is used to estimate the condition number of the * matrix A. If the reciprocal of the condition number * is less than machine precision, info = A->nrow+1 is returned as * a warning, but the routine still goes on to solve for X and * computes error bounds as described below. * * 2.6. 
The system of equations is solved for X using the factored form * of transpose(A). * * 2.7. Iterative refinement is applied to improve the computed solution * matrix and calculate error bounds and backward error estimates * for it. * * 2.8. If equilibration was used, the matrix X is premultiplied by * diag(C) (if trans = 'N') or diag(R) (if trans = 'T' or 'C') so * that it solves the original system before equilibration. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * fact (input) char* * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, whether the matrix A should * be equilibrated before it is factored. * = 'F': On entry, L, U, perm_r and perm_c contain the factored * form of A. If equed is not 'N', the matrix A has been * equilibrated with scaling factors R and C. * A, L, U, perm_r are not modified. * = 'N': The matrix A will be factored, and the factors will be * stored in L and U. * = 'E': The matrix A will be equilibrated if necessary, then * factored into L and U. * * trans (input) char* * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Transpose) * * refact (input) char* * Specifies whether we want to re-factor the matrix. * = 'N': Factor the matrix A. * = 'Y': Matrix A was factored before, now we want to re-factor * matrix A with perm_r and etree as inputs. Use * the same storage for the L\U factors previously allocated, * expand it if necessary. User should insure to use the same * memory model. In this case, perm_r may be modified due to * different pivoting determined by diagonal threshold. * If fact = 'F', then refact is not accessed. * * A (input/output) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number * of the linear equations is A->nrow. Currently, the type of A can be: * Stype = SLU_NC or SLU_NR, Dtype = SLU_Z, Mtype = SLU_GE. 
* In the future, more general A may be handled. * * On entry, If fact = 'F' and equed is not 'N', then A must have * been equilibrated by the scaling factors in R and/or C. * A is not modified if fact = 'F' or 'N', or if fact = 'E' and * equed = 'N' on exit. * * On exit, if fact = 'E' and equed is not 'N', A is scaled as follows: * If A->Stype = SLU_NC: * equed = 'R': A := diag(R) * A * equed = 'C': A := A * diag(C) * equed = 'B': A := diag(R) * A * diag(C). * If A->Stype = SLU_NR: * equed = 'R': transpose(A) := diag(R) * transpose(A) * equed = 'C': transpose(A) := transpose(A) * diag(C) * equed = 'B': transpose(A) := diag(R) * transpose(A) * diag(C). * * factor_params (input) factor_param_t* * The structure defines the input scalar parameters, consisting of * the following fields. If factor_params = NULL, the default * values are used for all the fields; otherwise, the values * are given by the user. * - panel_size (int): Panel size. A panel consists of at most * panel_size consecutive columns. If panel_size = -1, use * default value 8. * - relax (int): To control degree of relaxing supernodes. If the * number of nodes (columns) in a subtree of the elimination * tree is less than relax, this subtree is considered as one * supernode, regardless of the row structures of those columns. * If relax = -1, use default value 8. * - diag_pivot_thresh (double): Diagonal pivoting threshold. * At step j of the Gaussian elimination, if * abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)), * then use A_jj as pivot. 0 <= diag_pivot_thresh <= 1. * If diag_pivot_thresh = -1, use default value 1.0, * which corresponds to standard partial pivoting. * - drop_tol (double): Drop tolerance threshold. (NOT IMPLEMENTED) * At step j of the Gaussian elimination, if * abs(A_ij)/(max_i abs(A_ij)) < drop_tol, * then drop entry A_ij. 0 <= drop_tol <= 1. * If drop_tol = -1, use default value 0.0, which corresponds to * standard Gaussian elimination. 
* * perm_c (input/output) int* * If A->Stype = SLU_NC, Column permutation vector of size A->ncol, * which defines the permutation matrix Pc; perm_c[i] = j means * column i of A is in position j in A*Pc. * On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree * is already in postorder. * * If A->Stype = SLU_NR, column permutation vector of size A->nrow, * which describes permutation of columns of transpose(A) * (rows of A) as described above. * * perm_r (input/output) int* * If A->Stype = SLU_NC, row permutation vector of size A->nrow, * which defines the permutation matrix Pr, and is determined * by partial pivoting. perm_r[i] = j means row i of A is in * position j in Pr*A. * * If A->Stype = SLU_NR, permutation vector of size A->ncol, which * determines permutation of rows of transpose(A) * (columns of A) as described above. * * If refact is not 'Y', perm_r is output argument; * If refact = 'Y', the pivoting routine will try to use the input * perm_r, unless a certain threshold criterion is violated. * In that case, perm_r is overwritten by a new permutation * determined by partial pivoting or diagonal threshold pivoting. * * etree (input/output) int*, dimension (A->ncol) * Elimination tree of Pc'*A'*A*Pc. * If fact is not 'F' and refact = 'Y', etree is an input argument, * otherwise it is an output argument. * Note: etree is a vector of parent pointers for a forest whose * vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * equed (input/output) char* * Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). 
* If fact = 'F', equed is an input argument, otherwise it is * an output argument. * * R (input/output) double*, dimension (A->nrow) * The row scale factors for A or transpose(A). * If equed = 'R' or 'B', A (if A->Stype = SLU_NC) or transpose(A) * (if A->Stype = SLU_NR) is multiplied on the left by diag(R). * If equed = 'N' or 'C', R is not accessed. * If fact = 'F', R is an input argument; otherwise, R is output. * If fact = 'F' and equed = 'R' or 'B', each element of R must * be positive. * * C (input/output) double*, dimension (A->ncol) * The column scale factors for A or transpose(A). * If equed = 'C' or 'B', A (if A->Stype = SLU_NC) or transpose(A) * (if A->Stype = SLU_NR) is multiplied on the right by diag(C). * If equed = 'N' or 'R', C is not accessed. * If fact = 'F', C is an input argument; otherwise, C is output. * If fact = 'F' and equed = 'C' or 'B', each element of C must * be positive. * * L (output) SuperMatrix* * The factor L from the factorization * Pr*A*Pc=L*U (if A->Stype SLU_= NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). * Uses compressed row subscripts storage for supernodes, i.e., * L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization * Pr*A*Pc=L*U (if A->Stype = SLU_NC) or * Pr*transpose(A)*Pc=L*U (if A->Stype = SLU_NR). * Uses column-wise storage scheme, i.e., U has types: * Stype = SLU_NC, Dtype = SLU_Z, Mtype = TRU. * * work (workspace/output) void*, size (lwork) (in bytes) * User supplied workspace, should be large enough * to hold data structures for factors L and U. * On exit, if fact is not 'F', L and U point to this array. * * lwork (input) int * Specifies the size of work array in bytes. * = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. 
* = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * mem_usage->total_needed; no other side effects. * * See argument 'mem_usage' for memory usage statistics. * * B (input/output) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. * On entry, the right hand side matrix. * On exit, * if equed = 'N', B is not modified; otherwise * if A->Stype = SLU_NC: * if trans = 'N' and equed = 'R' or 'B', B is overwritten by * diag(R)*B; * if trans = 'T' or 'C' and equed = 'C' of 'B', B is * overwritten by diag(C)*B; * if A->Stype = SLU_NR: * if trans = 'N' and equed = 'C' or 'B', B is overwritten by * diag(C)*B; * if trans = 'T' or 'C' and equed = 'R' of 'B', B is * overwritten by diag(R)*B. * * X (output) SuperMatrix* * X has types: Stype = SLU_DN, Dtype = SLU_Z, Mtype = SLU_GE. * If info = 0 or info = A->ncol+1, X contains the solution matrix * to the original system of equations. Note that A and B are modified * on exit if equed is not 'N', and the solution to the equilibrated * system is inv(diag(C))*X if trans = 'N' and equed = 'C' or 'B', * or inv(diag(R))*X if trans = 'T' or 'C' and equed = 'R' or 'B'. * * recip_pivot_growth (output) double* * The reciprocal pivot growth factor max_j( norm(A_j)/norm(U_j) ). * The infinity norm is used. If recip_pivot_growth is much less * than 1, the stability of the LU factorization could be poor. * * rcond (output) double* * The estimate of the reciprocal condition number of the matrix A * after equilibration (if done). If rcond is less than the machine * precision (in particular, if rcond = 0), the matrix is singular * to working precision. This condition is indicated by a return * code of info > 0. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). 
* If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * mem_usage (output) mem_usage_t* * Record the memory usage statistics, consisting of following fields: * - for_lu (float) * The amount of space used in bytes for L\U data structures. * - total_needed (float) * The amount of space needed in bytes to perform factorization. * - expansions (int) * The number of memory expansions during the LU factorization. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly * singular, so the solution and error bounds * could not be computed. * = A->ncol+1: U is nonsingular, but RCOND is less than machine * precision, meaning that the matrix is singular to * working precision. Nevertheless, the solution and * error bounds are computed because there are a number * of situations where the computed solution can be more * accurate than the value of RCOND would suggest. * > A->ncol+1: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. 
* */ DNformat *Bstore, *Xstore; doublecomplex *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ; char trant[1], norm[1]; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; double diag_pivot_thresh, drop_tol; double t0; /* temporary time */ double *utime; extern SuperLUStat_t SuperLUStat; /* External functions */ extern double zlangs(char *, SuperMatrix *); extern double dlamch_(char *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; #if 0 printf("zgssvx: fact=%c, trans=%c, refact=%c, equed=%c\n", *fact, *trans, *refact, *equed); #endif *info = 0; nofact = lsame_(fact, "N"); equil = lsame_(fact, "E"); notran = lsame_(trans, "N"); if (nofact || equil) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } /* Test the input parameters */ if (!nofact && !equil && !lsame_(fact, "F")) *info = -1; else if (!notran && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -2; else if ( !(lsame_(refact,"Y") || lsame_(refact, "N")) ) *info = -3; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_Z || A->Mtype != SLU_GE ) *info = -4; else if (lsame_(fact, "F") && !(rowequ || colequ || lsame_(equed, "N"))) *info = -9; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) 
*info = -10; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -11; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -15; else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_Z || B->Mtype != SLU_GE ) *info = -16; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || B->ncol != X->ncol || X->Stype != SLU_DN || X->Dtype != SLU_Z || X->Mtype != SLU_GE ) *info = -17; } } if (*info != 0) { i = -(*info); xerbla_("zgssvx", &i); return; } /* Default values for factor_params */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = 1.0; drop_tol = 0.0; if ( factor_params != NULL ) { if ( factor_params->panel_size != -1 ) panel_size = factor_params->panel_size; if ( factor_params->relax != -1 ) relax = factor_params->relax; if ( factor_params->diag_pivot_thresh != -1 ) diag_pivot_thresh = factor_params->diag_pivot_thresh; if ( factor_params->drop_tol != -1 ) drop_tol = factor_params->drop_tol; } StatInit(panel_size, relax); utime = SuperLUStat.utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); zCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ *trant = 'T'; notran = 0; } else { *trant = 'N'; notran = 1; } } else { /* A->Stype == SLU_NC */ *trant = *trans; AA = A; } if ( equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. 
*/ zgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ zlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); } utime[EQUIL] = SuperLU_timer_() - t0; } /* Scale the right hand side if equilibration was performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i + j*ldb], &Bmat[i + j*ldb], R[i]); } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Bmat[i + j*ldb], &Bmat[i + j*ldb], C[i]); } } if ( nofact || equil ) { t0 = SuperLU_timer_(); sp_preorder(refact, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", relax, panel_size, sp_ienv(3), sp_ienv(4)); fflush(stdout); */ /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); zgstrf(refact, &AC, diag_pivot_thresh, drop_tol, relax, panel_size, etree, work, lwork, perm_r, perm_c, L, U, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } } if ( *info > 0 ) { if ( *info <= A->ncol ) { /* Compute the reciprocal pivot growth factor of the leading rank-deficient *info columns of A. */ *recip_pivot_growth = zPivotGrowth(*info, AA, perm_c, L, U); } return; } /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = zPivotGrowth(A->ncol, AA, perm_c, L, U); /* Estimate the reciprocal of the condition number of A. */ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = zlangs(norm, AA); zgscon(norm, L, U, anorm, rcond, info); utime[RCOND] = SuperLU_timer_() - t0; /* Compute the solution matrix X. 
*/ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); zgstrs (trant, L, U, perm_r, perm_c, X, info); utime[SOLVE] = SuperLU_timer_() - t0; /* Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. */ t0 = SuperLU_timer_(); zgsrfs(trant, AA, L, U, perm_r, perm_c, equed, R, C, B, X, ferr, berr, info); utime[REFINE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i + j*ldx], &Xmat[i + j*ldx], C[i]); } } } else if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { zd_mult(&Xmat[i+ j*ldx], &Xmat[i+ j*ldx], R[i]); } } /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ if ( *rcond < dlamch_("E") ) *info = A->ncol + 1; zQuerySpace(L, U, panel_size, mem_usage); if ( nofact || equil ) Destroy_CompCol_Permuted(&AC); if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } PrintStat( &SuperLUStat ); StatFree(); }
/*
 * Release the storage referenced by a ScalePerm handle.
 *
 * handle - (input) fptr*
 *          Points to a handle whose value is reinterpreted as an address and
 *          passed to SUPERLU_FREE.  The handle value itself is left unchanged,
 *          so it must not be used again after this call.
 */
void f_destroy_ScalePerm_handle(fptr *handle)
{
    void *scaleperm_mem = (void *)*handle;

    SUPERLU_FREE(scaleperm_mem);
}
/*
 * Dispose of a SuperMatrix.
 *
 * A           - matrix to dispose of.
 * destroyData - when true, the matrix is handed to Destroy_CompCol_Matrix();
 *               when false, only A.Store is passed to SUPERLU_FREE
 *               (presumably leaving the value/index arrays to their owner --
 *               confirm against the SuperLU helpers).
 */
void tlin::destroyS(SuperMatrix &A, bool destroyData) {
  if (!destroyData) {
    SUPERLU_FREE(A.Store);
    return;
  }

  Destroy_CompCol_Matrix(&A);
}
void pdgssvx(superlu_options_t_Distributed *options, SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, double B[], int ldb, int nrhs, gridinfo_t *grid, LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, double *berr, SuperLUStat_t *stat, int *info) { /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * * Purpose * ======= * * PDGSSVX solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * * Static pivoting is a technique that combines the numerical stability * of partial pivoting with the scalability of Cholesky (no pivoting), * to run accurately and efficiently on large numbers of processors. * * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed * description of the parallel algorithms. * * Here are the options for using this code: * * 1. Independent of all the other options specified below, the * user must supply * * - B, the matrix of right-hand sides, distributed by block rows, * and its dimensions ldb (local) and nrhs (global) * - grid, a structure describing the 2D processor mesh * - options->IterRefine, which determines whether or not to * improve the accuracy of the computed solution using * iterative refinement * * On output, B is overwritten with the solution X. * * 2. Depending on options->Fact, the user has four options * for solving A*X=B. The standard option is for factoring * A "from scratch". (The other options, described below, * are used when A is sufficiently similar to a previously * solved problem to save time by reusing part or all of * the previous factorization.) * * - options->Fact = DOFACT: A is factored "from scratch" * * In this case the user must also supply * * o A, the input matrix * * as well as the following options to determine what matrix to * factorize. 
* * o options->Equil, to specify how to scale the rows and columns * of A to "equilibrate" it (to try to reduce its * condition number and so improve the * accuracy of the computed solution) * * o options->RowPerm, to specify how to permute the rows of A * (typically to control numerical stability) * * o options->ColPerm, to specify how to permute the columns of A * (typically to control fill-in and enhance * parallelism during factorization) * * o options->ReplaceTinyPivot, to specify how to deal with tiny * pivots encountered during factorization * (to control numerical stability) * * The outputs returned include * * o ScalePermstruct, modified to describe how the input matrix A * was equilibrated and permuted: * . ScalePermstruct->DiagScale, indicates whether the rows and/or * columns of A were scaled * . ScalePermstruct->R, array of row scale factors * . ScalePermstruct->C, array of column scale factors * . ScalePermstruct->perm_r, row permutation vector * . ScalePermstruct->perm_c, column permutation vector * * (part of ScalePermstruct may also need to be supplied on input, * depending on options->RowPerm and options->ColPerm as described * later). * * o A, the input matrix A overwritten by the scaled and permuted * matrix Pc*Pr*diag(R)*A*diag(C), where * Pr and Pc are row and columns permutation matrices determined * by ScalePermstruct->perm_r and ScalePermstruct->perm_c, * respectively, and * diag(R) and diag(C) are diagonal scaling matrices determined * by ScalePermstruct->DiagScale, ScalePermstruct->R and * ScalePermstruct->C * * o LUstruct, which contains the L and U factorization of A1 where * * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * (Note that A1 = Aout * Pc^T, where Aout is the matrix stored * in A on output.) * * 3. 
The second value of options->Fact assumes that a matrix with the same * sparsity pattern as A has already been factored: * * - options->Fact = SamePattern: A is factored, assuming that it has * the same nonzero pattern as a previously factored matrix. In * this case the algorithm saves time by reusing the previously * computed column permutation vector stored in * ScalePermstruct->perm_c and the "elimination tree" of A * stored in LUstruct->etree * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->RowPerm * o options->ReplaceTinyPivot * * but not options->ColPerm, whose value is ignored. This is because the * previous column permutation from ScalePermstruct->perm_c is used as * input. The user must also supply * * o A, the input matrix * o ScalePermstruct->perm_c, the column permutation * o LUstruct->etree, the elimination tree * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated and row permuted * o LUstruct, modified to contain the new L and U factors * * 4. The third value of options->Fact assumes that a matrix B with the same * sparsity pattern as A has already been factored, and where the * row permutation of B can be reused for A. This is useful when A and B * have similar numerical values, so that the same row permutation * will make both factorizations numerically stable. This lets us reuse * all of the previously computed structure of L and U. * * - options->Fact = SamePattern_SameRowPerm: A is factored, * assuming not only the same nonzero pattern as the previously * factored matrix B, but reusing B's row permutation. * * In this case the user must still specify the following options * as before: * * o options->Equil * o options->ReplaceTinyPivot * * but not options->RowPerm or options->ColPerm, whose values are * ignored. 
This is because the permutations from ScalePermstruct->perm_r * and ScalePermstruct->perm_c are used as input. * * The user must also supply * * o A, the input matrix * o ScalePermstruct->DiagScale, how the previous matrix was row * and/or column scaled * o ScalePermstruct->R, the row scalings of the previous matrix, * if any * o ScalePermstruct->C, the columns scalings of the previous matrix, * if any * o ScalePermstruct->perm_r, the row permutation of the previous * matrix * o ScalePermstruct->perm_c, the column permutation of the previous * matrix * o all of LUstruct, the previously computed information about * L and U (the actual numerical values of L and U * stored in LUstruct->Llu are ignored) * * The outputs returned include * * o A, the input matrix A overwritten by the scaled and permuted * matrix as described above * o ScalePermstruct, modified to describe how the input matrix A was * equilibrated (thus ScalePermstruct->DiagScale, * R and C may be modified) * o LUstruct, modified to contain the new L and U factors * * 5. The fourth and last value of options->Fact assumes that A is * identical to a matrix that has already been factored on a previous * call, and reuses its entire LU factorization * * - options->Fact = Factored: A is identical to a previously * factorized matrix, so the entire previous factorization * can be reused. * * In this case all the other options mentioned above are ignored * (options->Equil, options->RowPerm, options->ColPerm, * options->ReplaceTinyPivot) * * The user must also supply * * o A, the unfactored matrix, only in the case that iterative * refinment is to be done (specifically A must be the output * A from the previous call, so that it has been scaled and * permuted) * o all of ScalePermstruct * o all of LUstruct, including the actual numerical values of * L and U * * all of which are unmodified on output. 
* * Arguments * ========= * * options (input) superlu_options_t_Distributed* (global) * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following fields should be defined for this structure: * * o Fact (fact_t) * Specifies whether or not the factored form of the matrix * A is supplied on entry, and if not, how the matrix A should * be factorized based on the previous history. * * = DOFACT: The matrix A will be factorized from scratch. * Inputs: A * options->Equil, RowPerm, ColPerm, ReplaceTinyPivot * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * all of ScalePermstruct * all of LUstruct * * = SamePattern: the matrix A will be factorized assuming * that a factorization of a matrix with the same sparsity * pattern was performed prior to this one. Therefore, this * factorization will reuse column permutation vector * ScalePermstruct->perm_c and the elimination tree * LUstruct->etree * Inputs: A * options->Equil, RowPerm, ReplaceTinyPivot * ScalePermstruct->perm_c * LUstruct->etree * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * rest of ScalePermstruct (DiagScale, R, C, perm_r) * rest of LUstruct (GLU_persist, Llu) * * = SamePattern_SameRowPerm: the matrix A will be factorized * assuming that a factorization of a matrix with the same * sparsity pattern and similar numerical values was performed * prior to this one. Therefore, this factorization will reuse * both row and column scaling factors R and C, and the * both row and column permutation vectors perm_r and perm_c, * distributed data structure set up from the previous symbolic * factorization. * Inputs: A * options->Equil, ReplaceTinyPivot * all of ScalePermstruct * all of LUstruct * Outputs: modified A * (possibly row and/or column scaled and/or * permuted) * modified LUstruct->Llu * = FACTORED: the matrix A is already factored. 
* Inputs: all of ScalePermstruct * all of LUstruct * * o Equil (yes_no_t) * Specifies whether to equilibrate the system. * = NO: no equilibration. * = YES: scaling factors are computed to equilibrate the system: * diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B. * Whether or not the system will be equilibrated depends * on the scaling of the matrix A, but if equilibration is * used, A is overwritten by diag(R)*A*diag(C) and B by * diag(R)*B. * * o RowPerm (rowperm_t) * Specifies how to permute rows of the matrix A. * = NATURAL: use the natural ordering. * = LargeDiag: use the Duff/Koster algorithm to permute rows of * the original matrix to make the diagonal large * relative to the off-diagonal. * = MY_PERMR: use the ordering given in ScalePermstruct->perm_r * input by the user. * * o ColPerm (colperm_t) * Specifies what type of column permutation to use to reduce fill. * = NATURAL: natural ordering. * = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A. * = MMD_ATA: minimum degree ordering on structure of A'*A. * = COLAMD: approximate minimum degree column ordering. * = MY_PERMC: the ordering given in ScalePermstruct->perm_c. * * o ReplaceTinyPivot (yes_no_t) * = NO: do not modify pivots * = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during * LU factorization. * * o IterRefine (IterRefine_t) * Specifies how to perform iterative refinement. * = NO: no iterative refinement. * = DOUBLE: accumulate residual in double precision. * = EXTRA: accumulate residual in extra precision. * * NOTE: all options must be indentical on all processes when * calling this routine. * * A (input/output) SuperMatrix* (local) * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. * That is, A is stored in distributed compressed row format. * See supermatrix.h for the definition of 'SuperMatrix'. 
* This routine only handles square A, however, the LU factorization * routine PDGSTRF can factorize rectangular matrices. * On exit, A may be overwtirren by Pc*Pr*diag(R)*A*diag(C), * depending on ScalePermstruct->DiagScale, options->RowPerm and * options->ColPerm: * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by * diag(R)*A*diag(C). * if options->RowPerm != NATURAL, A is further overwritten by * Pr*diag(R)*A*diag(C). * if options->ColPerm != NATURAL, A is further overwritten by * Pc*Pr*diag(R)*A*diag(C). * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * * ScalePermstruct (input/output) ScalePermstruct_t* (global) * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: * * o DiagScale (DiagScale_t) * Specifies the form of equilibration that was done. * = NOEQUIL: no equilibration. * = ROW: row equilibration, i.e., A was premultiplied by * diag(R). * = COL: Column equilibration, i.e., A was postmultiplied * by diag(C). * = BOTH: both row and column equilibration, i.e., A was * replaced by diag(R)*A*diag(C). * If options->Fact = FACTORED or SamePattern_SameRowPerm, * DiagScale is an input argument; otherwise it is an output * argument. * * o perm_r (int*) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * If options->RowPerm = MY_PERMR, or * options->Fact = SamePattern_SameRowPerm, perm_r is an * input argument; otherwise it is an output argument. * * o perm_c (int*) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * If options->ColPerm = MY_PERMC or options->Fact = SamePattern * or options->Fact = SamePattern_SameRowPerm, perm_c is an * input argument; otherwise, it is an output argument. 
* On exit, perm_c may be overwritten by the product of the input * perm_c and a permutation that postorders the elimination tree * of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree * is already in postorder. * * o R (double*) dimension (A->nrow) * The row scale factors for A. * If DiagScale = ROW or BOTH, A is multiplied on the left by * diag(R). * If DiagScale = NOEQUIL or COL, R is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, R is * an input argument; otherwise, R is an output argument. * * o C (double*) dimension (A->ncol) * The column scale factors for A. * If DiagScale = COL or BOTH, A is multiplied on the right by * diag(C). * If DiagScale = NOEQUIL or ROW, C is not defined. * If options->Fact = FACTORED or SamePattern_SameRowPerm, C is * an input argument; otherwise, C is an output argument. * * B (input/output) double* (local) * On entry, the right-hand side matrix of dimension (m_loc, nrhs), * where, m_loc is the number of rows stored locally on my * process and is defined in the data structure of matrix A. * On exit, the solution matrix if info = 0; * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * The number of right-hand sides. * If nrhs = 0, only LU decomposition is performed, the forward * and back substitutions are skipped. * * grid (input) gridinfo_t* (global) * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * It contains the following fields: * * o etree (int*) dimension (A->ncol) (global) * Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'. 
* It is computed in sp_colorder() during the first factorization, * and is reused in the subsequent factorizations of the matrices * with the same nonzero pattern. * On exit of sp_colorder(), the columns of A are permuted so that * the etree is in a certain postorder. This postorder is reflected * in ScalePermstruct->perm_c. * NOTE: * Etree is a vector of parent pointers for a forest whose vertices * are the integers 0 to A->ncol-1; etree[root]==A->ncol. * * o Glu_persist (Glu_persist_t*) (global) * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (LocalLU_t*) (local) * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * SOLVEstruct (input/output) SOLVEstruct_t* * The data structure to hold the communication pattern used * in the phases of triangular solution and iterative refinement. * This pattern should be intialized only once for repeated solutions. * If options->SolveInitialized = YES, it is an input argument. * If options->SolveInitialized = NO and nrhs != 0, it is an output * argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'. * * berr (output) double*, dimension (nrhs) (global) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * so the solution could not be computed. 
* > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * * See superlu_ddefs.h for the definitions of varioous data types. * */ NRformat_loc *Astore; SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ NCPformat *GACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of supernodes in L. (usub, xusub) contains the compressed subscript of nonzero segments in U. If options->Fact != SamePattern_SameRowPerm, they are computed by SYMBFACT routine, and then used by PDDISTRIBUTE routine. They will be freed after PDDISTRIBUTE routine. If options->Fact == SamePattern_SameRowPerm, these structures are not used. */ fact_t Fact; double *a; int_t *colptr, *rowind; int_t *perm_r; /* row permutations from partial pivoting */ int_t *perm_c; /* column permutation vector */ int_t *etree; /* elimination tree */ int_t *rowptr, *colind; /* Local A in NR*/ int_t *rowind_loc, *colptr_loc; int_t colequ, Equil, factored, job, notran, rowequ, need_value; int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; int_t nnz_loc, m_loc, fst_row, icol; int iam; int ldx; /* LDA for matrix X (local). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; static mem_usage_t_Distributed num_mem_usage, symb_mem_usage; #if ( PRNTlevel>= 2 ) double dmin, dsum, dprod; #endif int_t procs; /* Initialization. */ m = A->nrow; n = A->ncol; Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; a = Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; /* Test the input parameters. 
*/ *info = 0; Fact = options->Fact; if ( Fact < 0 || Fact > FACTORED ) *info = -1; else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) *info = -1; else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) *info = -1; else if ( options->IterRefine < 0 || options->IterRefine > EXTRA ) *info = -1; else if ( options->IterRefine == EXTRA ) { *info = -1; fprintf(stderr, "Extra precise iterative refinement yet to support."); } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( ldb < m_loc ) *info = -5; else if ( nrhs < 0 ) *info = -6; if ( *info ) { i = -(*info); pxerbla("pdgssvx", grid, -*info); return; } factored = (Fact == FACTORED); Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); iam = grid->iam; job = 5; if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { rowequ = (ScalePermstruct->DiagScale == ROW) || (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; /* The following arrays are replicated on all processes. */ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; /********/ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgssvx()"); #endif if ( Equil ) { /* Allocate storage if not done so before. 
*/ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: if ( !(C = (double *) doubleMalloc_dist(n)) ) ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: if ( !(R = (double *) doubleMalloc_dist(m)) ) ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------*/ if ( Equil ) { #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter equil"); #endif t = SuperLU_timer_(); if ( Fact == SamePattern_SameRowPerm ) { /* Reuse R and C. */ switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: break; case ROW: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { a[i] *= R[irow]; /* Scale rows. */ } ++irow; } break; case COL: for (j = 0; j < m_loc; ++j) for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= C[icol]; /* Scale columns. */ } break; case BOTH: irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ } ++irow; } break; } } else { /* Compute the row and column scalings. */ pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); /* Equilibrate matrix A if it is badly-scaled. */ pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); if ( lsame_(equed, "R") ) { ScalePermstruct->DiagScale = rowequ = ROW; } else if ( lsame_(equed, "C") ) { ScalePermstruct->DiagScale = colequ = COL; } else if ( lsame_(equed, "B") ) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) if ( !iam ) { printf(".. equilibrated? 
*equed = %c\n", *equed); /*fflush(stdout);*/ } #endif } /* if Fact ... */ stat->utime[EQUIL] = SuperLU_timer_() - t; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit equil"); #endif } /* if Equil ... */ /* * Gather A from the distributed compressed row format to * global A in compressed column format. * Numerical values are gathered only when a row permutation * for large diagonal is sought after. */ need_value = (options->RowPerm == LargeDiag && Fact != SamePattern_SameRowPerm && !factored); pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); GAstore = (NCformat *) GA.Store; colptr = GAstore->colptr; rowind = GAstore->rowind; nnz = GAstore->nnz; if ( need_value ) a_GA = GAstore->nzval; else assert(GAstore->nzval == NULL); /* ------------------------------------------------------------ Find the row permutation for A. ------------------------------------------------------------*/ if ( options->RowPerm != NO ) { t = SuperLU_timer_(); if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */ /* Permute the global matrix GA for symbfact() */ for (i = 0; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } else if ( !factored && Fact != SamePattern_SameRowPerm ) { /* Get a new perm_r[] */ if ( job == 5 ) { /* Allocate storage for scaling factors. */ if ( !(R1 = (double *) SUPERLU_MALLOC(m * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R1[]"); if ( !(C1 = (double *) SUPERLU_MALLOC(n * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C1[]"); } if ( !iam ) { /* Process 0 finds a row permutation for large diagonal. 
*/ dldperm(job, m, nnz, colptr, rowind, a_GA, perm_r, R1, C1); MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } else { MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); if ( job == 5 && Equil ) { MPI_Bcast( R1, m, MPI_DOUBLE, 0, grid->comm ); MPI_Bcast( C1, n, MPI_DOUBLE, 0, grid->comm ); } } #if ( PRNTlevel>=2 ) dmin = dlamch_("Overflow"); dsum = 0.0; dprod = 1.0; #endif if ( job == 5 ) { if ( Equil ) { for (i = 0; i < n; ++i) { R1[i] = exp(R1[i]); C1[i] = exp(C1[i]); } /* Permute the global matrix GA for symbfact(). */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) /* New diagonal */ dprod *= fabs(a[i]); #endif } } /* Scale the distributed matrix */ irow = fst_row; for (j = 0; j < m_loc; ++j) { for (i = rowptr[j]; i < rowptr[j+1]; ++i) { icol = colind[i]; a[i] *= R1[irow] * C1[icol]; } ++irow; } /* Multiply together the scaling factors. */ if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; else for (i = 0; i < m; ++i) R[i] = R1[i]; if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; else for (i = 0; i < n; ++i) C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; } else { /* No equilibration. Only permute the global A. 
*/ for (i = colptr[0]; i < colptr[n]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; } } SUPERLU_FREE (R1); SUPERLU_FREE (C1); } else { /* job = 2,3,4 */ for (j = 0; j < n; ++j) { for (i = colptr[j]; i < colptr[j+1]; ++i) { irow = rowind[i]; rowind[i] = perm_r[irow]; #if ( PRNTlevel>=2 ) if ( rowind[i] == j ) { /* New diagonal */ if ( job == 2 || job == 3 ) dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if ( job == 4 ) dsum += fabs(a[i]); else if ( job == 5 ) dprod *= fabs(a[i]); } #endif } } } #if ( PRNTlevel>=2 ) if ( job == 2 || job == 3 ) { if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); } else if ( job == 4 ) { if ( !iam ) printf("\tsum of diagonal %e\n", dsum); } else if ( job == 5 ) { if ( !iam ) printf("\t product of diagonal %e\n", dprod); } #endif } /* else !factored */ t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. LDPERM job %d\t time: %.2f\n", job, t); #endif } else { /* options->RowPerm == NOROWPERM */ for (i = 0; i <m; ++i) perm_r[i] = i; } #if ( DEBUGlevel>=1 ) if ( !iam ) PrintInt10("perm_r", m, perm_r); #endif if ( !factored || options->IterRefine ) { /* Compute norm(A), which will be used to adjust small diagonal. */ if ( notran ) *(unsigned char *)norm = '1'; else *(unsigned char *)norm = 'I'; anorm = pdlangs(norm, A, grid); #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. anorm %e\n", anorm); #endif } /* ------------------------------------------------------------ Perform the LU factorization. 
------------------------------------------------------------*/ if ( !factored ) { t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && Fact == DOFACT ) get_perm_c_dist(iam, permc_spec, &GA, perm_c); /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. Adjust perm_c[] to be consistent with a postorder of etree. Permute columns of A to form A*Pc'. */ sp_colorder(options, &GA, perm_c, etree, &GAC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ { int_t *GACcolbeg, *GACcolend, *GACrowind; GACstore = GAC.Store; GACcolbeg = GACstore->colbeg; GACcolend = GACstore->colend; GACrowind = GACstore->rowind; for (j = 0; j < n; ++j) { for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { irow = GACrowind[i]; GACrowind[i] = perm_c[irow]; } } } stat->utime[COLPERM] = SuperLU_timer_() - t; /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures which are suitable for supernodal GENP. */ if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); if ( !(Glu_freeable = (Glu_freeable_t *) SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) ABORT("Malloc fails for Glu_freeable."); /* Every process does this. 
*/ iinfo = symbfact(iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); stat->utime[SYMBFAC] = SuperLU_timer_() - t; if ( iinfo < 0 ) { /* Successful return */ QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) if ( !iam ) { printf("\tNo of supers %ld\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) %ld\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) %ld\n", Glu_freeable->xusub[n]); printf("\tint %d, short %d, float %d, double %d\n", sizeof(int_t), sizeof(short), sizeof(float), sizeof(double)); printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); } #endif } else { if ( !iam ) { fprintf(stderr, "symbfact() error returns %d\n", iinfo); exit(-1); } } } /* Apply column permutation to the original distributed A */ for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ t = SuperLU_timer_(); dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ if ( Fact != SamePattern_SameRowPerm ) { iinfo = symbfact_SubFree(Glu_freeable); SUPERLU_FREE(Glu_freeable); } /* Perform numerical factorization in parallel. 
*/ t = SuperLU_timer_(); pdgstrf(options, m, n, anorm, LUstruct, grid, stat, info); stat->utime[FACT] = SuperLU_timer_() - t; #if ( PRNTlevel>=1 ) { int_t TinyPivots; float for_lu, total, max, avg, temp; dQuerySpace_dist(n, LUstruct, grid, &num_mem_usage); MPI_Reduce( &num_mem_usage.for_lu, &for_lu, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); temp = SUPERLU_MAX(symb_mem_usage.total, symb_mem_usage.for_lu + (float)dist_mem_use + num_mem_usage.for_lu); temp = SUPERLU_MAX(temp, num_mem_usage.total); MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, grid->comm ); stat->TinyPivots = TinyPivots; if ( !iam ) { printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", for_lu*1e-6, total*1e-6); printf("\tAll space (MB):" "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); } } #endif } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ /* Permute columns of A to form A*Pc' using the existing perm_c. * NOTE: rows of A were previously permuted to Pc*A. * * XSL: NO; this is different now. */ sp_colorder(options, &GA, perm_c, NULL, &GAC); /* ????? */ } /* if !factored ... */ /* Destroy GA */ Destroy_CompCol_Matrix_dist(&GA); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Before solve"); #endif /* ------------------------------------------------------------ Compute the solution matrix X. ------------------------------------------------------------*/ if ( nrhs ) { if ( !(b_work = doubleMalloc_dist(n)) ) ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------------ Scale the right-hand side if equilibration was performed. 
------------------------------------------------------------*/ if ( notran ) { if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } } else if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } /* Save a copy of the right-hand side. */ ldx = ldb; if ( !(X = doubleMalloc_dist(((size_t)ldx) * nrhs)) ) ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } /* ------------------------------------------------------------ Solve the linear system. ------------------------------------------------------------*/ if ( options->SolveInitialized == NO ) { dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, SOLVEstruct); } pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info); #if ( DEBUGlevel>=2 ) printf("\n(%d) .. After pdgstrs(): x =\n", iam); for (i = 0; i < m_loc; ++i) printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, X[i]); #endif /* ------------------------------------------------------------ Use iterative refinement to improve the computed solution and compute error bounds and backward error estimates for it. ------------------------------------------------------------*/ if ( options->IterRefine ) { /* Improve the solution by iterative refinement. */ t = SuperLU_timer_(); pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, B, ldb, X, ldx, nrhs, SOLVEstruct, berr, stat, info); stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix B <= Pc'*X. */ pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, SOLVEstruct->inv_perm_c, X, ldx, B, ldb, nrhs, grid); #if ( DEBUGlevel>=2 ) printf("\n (%d) .. 
After pdPermute_Dense_Matrix(): b =\n", iam); for (i = 0; i < m_loc; ++i) printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); #endif /* Transform the solution matrix X to a solution of the original system before the equilibration. */ if ( notran ) { if ( colequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= C[irow]; ++irow; } b_col += ldb; } } } else if ( rowequ ) { b_col = B; for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { b_col[i] *= R[irow]; ++irow; } b_col += ldb; } } SUPERLU_FREE(b_work); SUPERLU_FREE(X); } /* end if nrhs != 0 */ #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate storage. */ if ( Equil && Fact != SamePattern_SameRowPerm ) { switch ( ScalePermstruct->DiagScale ) { case NOEQUIL: SUPERLU_FREE(R); SUPERLU_FREE(C); break; case ROW: SUPERLU_FREE(C); break; case COL: SUPERLU_FREE(R); break; } } if ( !factored || (factored && options->IterRefine) ) Destroy_CompCol_Permuted_dist(&GAC); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgssvx()"); #endif }
// Releases a SuperMatrix wrapper.
//   destroyData == true : A is handed to Destroy_Dense_Matrix().
//   destroyData == false: only A.Store is released through SUPERLU_FREE.
// NOTE(review): presumably the "false" path is for matrices whose value
// array is owned by the caller -- confirm against the call sites.
void tlin::destroyD(SuperMatrix &A, bool destroyData) {
  if (!destroyData) {
    SUPERLU_FREE(A.Store);
    return;
  }

  Destroy_Dense_Matrix(&A);
}
/*
 * cgssv -- simple driver: factor A and solve A*X = B, overwriting B with X.
 *
 * Steps performed (in order):
 *   1. Validate the arguments.
 *   2. If A is stored row-wise (SLU_NR), wrap its arrays in a temporary
 *      SLU_NC matrix with swapped dimensions and solve with trans = TRANS.
 *   3. Obtain the column permutation (unless options->ColPerm == MY_PERMC),
 *      run sp_preorder(), then cgstrf().
 *   4. If cgstrf() reports *info == 0, call cgstrs().
 *   5. Release etree, the permuted matrix AC and, for SLU_NR input, the
 *      temporary wrapper.
 *
 * Arguments
 *   options  (input)  only options->Fact == DOFACT is accepted.
 *   A        (input)  square matrix; Stype SLU_NC or SLU_NR, Dtype SLU_C,
 *                     Mtype SLU_GE.
 *   perm_c   (in/out) column permutation; computed here by get_perm_c()
 *                     unless options->ColPerm == MY_PERMC.
 *   perm_r   (in/out) passed through to cgstrf() / cgstrs().
 *   L, U     (output) passed through to cgstrf() / cgstrs().
 *   B        (in/out) dense matrix (SLU_DN, SLU_C, SLU_GE) with
 *                     lda >= max(0, A->nrow).
 *   stat     (output) stat->utime[COLPERM, ETREE, FACT, SOLVE] are written.
 *   info     (output) 0 on entry to the computation;
 *                     -1 : options->Fact != DOFACT
 *                     -2 : A fails the checks listed above
 *                     -7 : B fails the checks listed above
 *                     otherwise whatever cgstrf() / cgstrs() store in it.
 *
 * On an argument error xerbla_() is called with the positive argument index
 * and the routine returns without doing any work.
 */
void
cgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
      SuperLUStat_t *stat, int *info )
{
    DNformat    *Bstore;
    SuperMatrix *AA;          /* A in SLU_NC format used by the factorization. */
    SuperMatrix AC;           /* Matrix postmultiplied by Pc. */
    int         lwork = 0, *etree, i;
    int         panel_size;   /* panel size */
    int         relax;        /* no of columns in a relaxed snodes */
    int         permc_spec;
    trans_t     trans = NOTRANS;
    double      *utime;
    double      t;            /* Temporary time */

    /* Test the input parameters ... */
    *info = 0;
    Bstore = B->Store;
    if ( options->Fact != DOFACT ) {
        *info = -1;
    } else if ( A->nrow != A->ncol || A->nrow < 0 ||
                (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
                A->Dtype != SLU_C || A->Mtype != SLU_GE ) {
        *info = -2;
    } else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
                B->Stype != SLU_DN || B->Dtype != SLU_C ||
                B->Mtype != SLU_GE ) {
        *info = -7;
    }
    if ( *info != 0 ) {
        i = -(*info);
        xerbla_("cgssv", &i);
        return;
    }

    utime = stat->utime;

    /* Convert A to SLU_NC format when necessary. */
    if ( A->Stype == SLU_NR ) {
        NRformat *Astore = A->Store;

        AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
        /* The wrapper is dereferenced immediately below, so a failed
           allocation must not be allowed to continue. */
        if ( !AA ) ABORT("Malloc fails for AA in cgssv().");

        /* The row-compressed arrays are reused as the column-compressed
           arrays of a matrix with swapped dimensions; the solve is then
           done with trans = TRANS. */
        cCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz,
                               Astore->nzval, Astore->colind, Astore->rowptr,
                               SLU_NC, A->Dtype, A->Mtype);
        trans = TRANS;
    } else {
        /* The argument check above leaves SLU_NC as the only other case. */
        AA = A;
    }

    t = SuperLU_timer_();
    /*
     * Get column permutation vector perm_c[], according to permc_spec:
     *   permc_spec = NATURAL:  natural ordering
     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
     *   permc_spec = COLAMD:   approximate minimum degree column ordering
     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
     */
    permc_spec = options->ColPerm;
    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
        get_perm_c(permc_spec, AA, perm_c);
    utime[COLPERM] = SuperLU_timer_() - t;

    etree = intMalloc(A->ncol);

    t = SuperLU_timer_();
    sp_preorder(options, AA, perm_c, etree, &AC);
    utime[ETREE] = SuperLU_timer_() - t;

    panel_size = sp_ienv(1);
    relax = sp_ienv(2);

    t = SuperLU_timer_();
    /* Compute the LU factorization of A. */
    cgstrf(options, &AC, relax, panel_size, etree,
           NULL, lwork, perm_c, perm_r, L, U, stat, info);
    utime[FACT] = SuperLU_timer_() - t;

    t = SuperLU_timer_();
    if ( *info == 0 ) {
        /* Solve the system A*X=B, overwriting B with X. */
        cgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
    }
    utime[SOLVE] = SuperLU_timer_() - t;

    SUPERLU_FREE (etree);
    Destroy_CompCol_Permuted(&AC);
    if ( A->Stype == SLU_NR ) {
        /* Only the wrapper and its Store are released here; the nzval /
           colind / rowptr arrays were borrowed from A's store above. */
        Destroy_SuperMatrix_Store(AA);
        SUPERLU_FREE(AA);
    }
}
int_t dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, gridinfo_t *grid, int_t *colptr[], int_t *rowind[], double *a[]) { /* * -- Distributed SuperLU routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley. * March 15, 2003 * * Purpose * ======= * Re-distribute A on the 2D process mesh. * * Arguments * ========= * * A (input) SuperMatrix* * The distributed input matrix A of dimension (A->nrow, A->ncol). * A may be overwritten by diag(R)*A*diag(C)*Pc^T. * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. * * ScalePermstruct (input) ScalePermstruct_t* * The data structure to store the scaling and permutation vectors * describing the transformations performed to the original matrix A. * * Glu_freeable (input) *Glu_freeable_t * The global structure describing the graph of L and U. * * grid (input) gridinfo_t* * The 2D process mesh. * * colptr (output) int* * * rowind (output) int* * * a (output) double* * * Return value * ============ * */ NRformat_loc *Astore; int_t *perm_r; /* row permutation vector */ int_t *perm_c; /* column permutation vector */ int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; int_t nnz_loc; /* number of local nonzeros */ int_t nnz_remote; /* number of remote nonzeros to be sent */ int_t SendCnt; /* number of remote nonzeros to be sent */ int_t RecvCnt; /* number of remote nonzeros to be sent */ int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; int_t *ia, *ja, **ia_send, *index, *itemp; int_t *ptr_to_send; double *aij, **aij_send, *nzval, *dtemp; double *nzval_a; int iam, it, p, procs; MPI_Request *send_req; MPI_Status status; /* ------------------------------------------------------------ INITIALIZATION. 
------------------------------------------------------------*/ iam = grid->iam; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter dReDistribute_A()"); #endif perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; procs = grid->nprow * grid->npcol; Astore = (NRformat_loc *) A->Store; n = A->ncol; m_loc = Astore->m_loc; fst_row = Astore->fst_row; nnzToRecv = intCalloc_dist(2*procs); nnzToSend = nnzToRecv + procs; /* ------------------------------------------------------------ COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, THEN ALLOCATE SPACE. THIS ACCOUNTS FOR THE FIRST PASS OF A. ------------------------------------------------------------*/ for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); ++nnzToSend[p]; } } /* All-to-all communication */ MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, grid->comm); maxnnzToRecv = 0; nnz_loc = SendCnt = RecvCnt = 0; for (p = 0; p < procs; ++p) { if ( p != iam ) { SendCnt += nnzToSend[p]; RecvCnt += nnzToRecv[p]; maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); } else { nnz_loc += nnzToRecv[p]; /*assert(nnzToSend[p] == nnzToRecv[p]);*/ } } k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ /* Allocate space for storing the triplets after redistribution. */ if ( !(ia = intMalloc_dist(2*k)) ) ABORT("Malloc fails for ia[]."); ja = ia + k; if ( !(aij = doubleMalloc_dist(k)) ) ABORT("Malloc fails for aij[]."); /* Allocate temporary storage for sending/receiving the A triplets. 
*/ if ( procs > 1 ) { if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) ABORT("Malloc fails for ia_send[]."); if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) ABORT("Malloc fails for aij_send[]."); if ( !(index = intMalloc_dist(2*SendCnt)) ) ABORT("Malloc fails for index[]."); if ( !(nzval = doubleMalloc_dist(SendCnt)) ) ABORT("Malloc fails for nzval[]."); if ( !(ptr_to_send = intCalloc_dist(procs)) ) ABORT("Malloc fails for ptr_to_send[]."); if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) ABORT("Malloc fails for itemp[]."); if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) ABORT("Malloc fails for dtemp[]."); for (i = 0, j = 0, p = 0; p < procs; ++p) { if ( p != iam ) { ia_send[p] = &index[i]; i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ aij_send[p] = &nzval[j]; j += nnzToSend[p]; } } } /* if procs > 1 */ if ( !(*colptr = intCalloc_dist(n+1)) ) ABORT("Malloc fails for *colptr[]."); /* ------------------------------------------------------------ LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. THIS ACCOUNTS FOR THE SECOND PASS OF A. ------------------------------------------------------------*/ nnz_loc = 0; /* Reset the local nonzero count. 
*/ nzval_a = Astore->nzval; for (i = 0; i < m_loc; ++i) { for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ jcol = Astore->colind[j]; gbi = BlockNum( irow ); gbj = BlockNum( jcol ); p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); if ( p != iam ) { /* remote */ k = ptr_to_send[p]; ia_send[p][k] = irow; ia_send[p][k + nnzToSend[p]] = jcol; aij_send[p][k] = nzval_a[j]; ++ptr_to_send[p]; } else { /* local */ ia[nnz_loc] = irow; ja[nnz_loc] = jcol; aij[nnz_loc] = nzval_a[j]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } /* ------------------------------------------------------------ PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. NOTE: Can possibly use MPI_Alltoallv. ------------------------------------------------------------*/ for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToSend[p]; MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] ); it = nnzToSend[p]; MPI_Isend( aij_send[p], it, MPI_DOUBLE, p, iam+procs, grid->comm, &send_req[procs+p] ); } } for (p = 0; p < procs; ++p) { if ( p != iam ) { it = 2*nnzToRecv[p]; MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); it = nnzToRecv[p]; MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, grid->comm, &status ); for (i = 0; i < nnzToRecv[p]; ++i) { ia[nnz_loc] = itemp[i]; jcol = itemp[i + nnzToRecv[p]]; /*assert(jcol<n);*/ ja[nnz_loc] = jcol; aij[nnz_loc] = dtemp[i]; ++nnz_loc; ++(*colptr)[jcol]; /* Count nonzeros in each column */ } } } for (p = 0; p < procs; ++p) { if ( p != iam ) { MPI_Wait( &send_req[p], &status); MPI_Wait( &send_req[procs+p], &status); } } /* ------------------------------------------------------------ DEALLOCATE TEMPORARY STORAGE ------------------------------------------------------------*/ SUPERLU_FREE(nnzToRecv); if ( procs > 1 ) { SUPERLU_FREE(send_req); SUPERLU_FREE(ia_send); SUPERLU_FREE(aij_send); SUPERLU_FREE(index); SUPERLU_FREE(nzval); 
SUPERLU_FREE(ptr_to_send); SUPERLU_FREE(itemp); SUPERLU_FREE(dtemp); } /* ------------------------------------------------------------ CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. ------------------------------------------------------------*/ if ( !(*rowind = intMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *rowind[]."); if ( !(*a = doubleMalloc_dist(nnz_loc)) ) ABORT("Malloc fails for *a[]."); /* Initialize the array of column pointers */ k = 0; jsize = (*colptr)[0]; (*colptr)[0] = 0; for (j = 1; j < n; ++j) { k += jsize; jsize = (*colptr)[j]; (*colptr)[j] = k; } /* Copy the triplets into the column oriented storage */ for (i = 0; i < nnz_loc; ++i) { j = ja[i]; k = (*colptr)[j]; (*rowind)[k] = ia[i]; (*a)[k] = aij[i]; ++(*colptr)[j]; } /* Reset the column pointers to the beginning of each column */ for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; (*colptr)[0] = 0; SUPERLU_FREE(ia); SUPERLU_FREE(aij); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit dReDistribute_A()"); #endif } /* dReDistribute_A */