void RKWidget::setSize ( const size_t value ) { int add = value%nblock; size_t newval = value + add; if ( newval == N_ ) return; dirty = true; if ( N_ != 0 ) { if(nStage != 0) delete[] b_k; SUPERLU_FREE (rhsb); SUPERLU_FREE (rhsx); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (R); SUPERLU_FREE (C); SUPERLU_FREE (ferr); SUPERLU_FREE (berr); if(aexist) { // ??? Destroy_CompCol_Matrix(&A); //delete[] a; //delete[] xa; //delete[] asub; } Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); if ( lwork == 0 && !dirty) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&Up); } else if ( lwork > 0 ) { SUPERLU_FREE(work); } aexist= false; dirty = true; } if(nStage != 0) b_k = new double[value*nStage]; if ( !(rhsb = doubleMalloc(value)) ) ABORT("Malloc fails for rhsb[]."); if ( !(rhsx = doubleMalloc(value)) ) ABORT("Malloc fails for rhsx[]."); dCreate_Dense_Matrix(&B, value, 1, rhsb, value, SLU_DN, SLU_D, SLU_GE); dCreate_Dense_Matrix(&X, value, 1, rhsx, value, SLU_DN, SLU_D, SLU_GE); if ( !(etree = intMalloc(value)) ) ABORT("Malloc fails for etree[]."); if ( !(perm_r = intMalloc(value)) ) ABORT("Malloc fails for perm_r[]."); if ( !(perm_c = intMalloc(value)) ) ABORT("Malloc fails for perm_c[]."); if ( !(R = (double *) SUPERLU_MALLOC(value * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R[]."); if ( !(C = (double *) SUPERLU_MALLOC(value * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C[]."); if ( !(ferr = (double *) SUPERLU_MALLOC( sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for ferr[]."); if ( !(berr = (double *) SUPERLU_MALLOC( sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for berr[]."); resize(value); }
void tlin::allocD(SuperMatrix *&A, int rows, int cols) { A = (SuperMatrix *)SUPERLU_MALLOC(sizeof(SuperMatrix)); double *values = doubleMalloc(rows * cols * sizeof(double)); dCreate_Dense_Matrix(A, rows, cols, values, rows, SLU_DN, SLU_D, SLU_GE); }
void tlin::createS(SuperMatrix &A, int rows, int cols, int nnz) { double *values = doubleMalloc(nnz); int *rowind = intMalloc(nnz); int *colptr = intMalloc(cols + 1); dCreate_CompCol_Matrix(&A, rows, cols, nnz, values, rowind, colptr, SLU_NC, SLU_D, SLU_GE); }
void tlin::allocS(SuperMatrix *&A, int rows, int cols, int nnz) { A = (SuperMatrix *)SUPERLU_MALLOC(sizeof(SuperMatrix)); double *values = doubleMalloc(nnz); int *rowind = intMalloc(nnz); int *colptr = intMalloc(cols + 1); dCreate_CompCol_Matrix(A, rows, cols, nnz, values, rowind, colptr, SLU_NC, SLU_D, SLU_GE); }
/* * Allocate storage for various statistics. */ void StatAlloc(const int n, const int nprocs, const int panel_size, const int relax, Gstat_t *Gstat) { register int w; w = SUPERLU_MAX( panel_size, relax ) + 1; Gstat->panel_histo = intCalloc(w); Gstat->utime = (double *) doubleMalloc(NPHASES); Gstat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t)); if ( !(Gstat->procstat = (procstat_t *) SUPERLU_MALLOC(nprocs*sizeof(procstat_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for procstat[]" ); #if (PRNTlevel==1) printf(".. StatAlloc(): n %d, nprocs %d, panel_size %d, relax %d\n", n, nprocs, panel_size, relax); #endif #ifdef PROFILE if ( !(Gstat->panstat = (panstat_t*) SUPERLU_MALLOC(n * sizeof(panstat_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for panstat[]" ); Gstat->panhows = intCalloc(3); Gstat->height = intCalloc(n+1); if ( !(Gstat->flops_by_height = (float *) SUPERLU_MALLOC(n * sizeof(float))) ) SUPERLU_ABORT("SUPERLU_MALLOC failed for flops_by_height[]"); #endif #ifdef PREDICT_OPT if ( !(cp_panel = (cp_panel_t *) SUPERLU_MALLOC(n * sizeof(cp_panel_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for cp_panel[]" ); if ( !(desc_eft = (desc_eft_t *) SUPERLU_MALLOC(n * sizeof(desc_eft_t))) ) SUPERLU_ABORT( "SUPERLU_MALLOC failed for desc_eft[]" ); cp_firstkid = intMalloc(n+1); cp_nextkid = intMalloc(n+1); #endif }
/* * Convert a row compressed storage into a column compressed storage. */ void dCompRow_to_CompCol(int m, int n, int nnz, double *a, int *colind, int *rowptr, double **at, int **rowind, int **colptr) { register int i, j, col, relpos; int *marker; /* Allocate storage for another copy of the matrix. */ *at = (double *) doubleMalloc(nnz); *rowind = (int *) intMalloc(nnz); *colptr = (int *) intMalloc(n+1); marker = (int *) intCalloc(n); /* Get counts of each column of A, and set up column pointers */ for (i = 0; i < m; ++i) for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]]; (*colptr)[0] = 0; for (j = 0; j < n; ++j) { (*colptr)[j+1] = (*colptr)[j] + marker[j]; marker[j] = (*colptr)[j]; } /* Transfer the matrix into the compressed column storage. */ for (i = 0; i < m; ++i) { for (j = rowptr[i]; j < rowptr[i+1]; ++j) { col = colind[j]; relpos = marker[col]; (*rowind)[relpos] = i; (*at)[relpos] = a[j]; ++marker[col]; } } SUPERLU_FREE(marker); }
void tlin::createD(SuperMatrix &A, int rows, int cols) { double *values = doubleMalloc(rows * cols * sizeof(double)); dCreate_Dense_Matrix(&A, rows, cols, values, rows, SLU_DN, SLU_D, SLU_GE); }
main(int argc, char *argv[]) { /* * Purpose * ======= * * The driver program DLINSOLX2. * * This example illustrates how to use DGSSVX to solve systems repeatedly * with the same sparsity pattern of matrix A. * In this case, the column permutation vector perm_c is computed once. * The following data structures will be reused in the subsequent call to * DGSSVX: perm_c, etree * */ char equed[1]; yes_no_t equil; trans_t trans; SuperMatrix A, A1, L, U; SuperMatrix B, B1, X; NCformat *Astore; NCformat *Ustore; SCformat *Lstore; double *a, *a1; int *asub, *xa, *asub1, *xa1; int *perm_r; /* row permutations from partial pivoting */ int *perm_c; /* column permutation vector */ int *etree; void *work; int info, lwork, nrhs, ldx; int i, j, m, n, nnz; double *rhsb, *rhsb1, *rhsx, *xact; double *R, *C; double *ferr, *berr; double u, rpg, rcond; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; extern void parse_command_line(); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Defaults */ lwork = 0; nrhs = 1; equil = YES; u = 1.0; trans = NOTRANS; /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 1.0; options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; */ set_default_options(&options); /* Can use command line input to modify the defaults. */ parse_command_line(argc, argv, &lwork, &u, &equil, &trans); options.Equil = equil; options.DiagPivotThresh = u; options.Trans = trans; if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) { ABORT("DLINSOLX: cannot allocate work[]"); } } /* Read matrix A from a file in Harwell-Boeing format.*/ dreadhb(&m, &n, &nnz, &a, &asub, &xa); if ( !(a1 = doubleMalloc(nnz)) ) ABORT("Malloc fails for a1[]."); if ( !(asub1 = intMalloc(nnz)) ) ABORT("Malloc fails for asub1[]."); if ( !(xa1 = intMalloc(n+1)) ) ABORT("Malloc fails for xa1[]."); for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); Astore = A.Store; printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); if ( !(rhsb = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[]."); if ( !(rhsb1 = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb1[]."); if ( !(rhsx = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[]."); dCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_D, SLU_GE); dCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); ldx = n; dGenXtrue(n, nrhs, xact, ldx); dFillRHS(trans, nrhs, xact, ldx, &A, &B); for (j = 0; j < nrhs; ++j) for (i = 0; i < m; ++i) rhsb1[i+j*m] = rhsb[i+j*m]; if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[]."); if ( !(R = (double *) SUPERLU_MALLOC(A.nrow * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R[]."); if ( !(C = (double *) SUPERLU_MALLOC(A.ncol * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C[]."); if ( !(ferr = (double *) SUPERLU_MALLOC(nrhs * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for ferr[]."); if ( !(berr = (double *) SUPERLU_MALLOC(nrhs * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for berr[]."); /* Initialize the statistics variables. */ StatInit(&stat); /* ------------------------------------------------------------ WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME: AX = B ------------------------------------------------------------*/ dgssvx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, ferr, berr, &mem_usage, &stat, &info); printf("First system: dgssvx() returns info %d\n", info); if ( info == 0 || info == n+1 ) { /* This is how you could access the solution matrix. */ double *sol = (double*) ((DNformat*) X.Store)->nzval; if ( options.PivotGrowth ) printf("Recip. pivot growth = %e\n", rpg); if ( options.ConditionNumber ) printf("Recip. condition number = %e\n", rcond); Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("FILL ratio = %.1f\n", (float)(Lstore->nnz + Ustore->nnz - n)/nnz); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); if ( options.IterRefine ) { printf("Iterative Refinement:\n"); printf("%8s%8s%16s%16s\n", "rhs", "Steps", "FERR", "BERR"); for (i = 0; i < nrhs; ++i) printf("%8d%8d%16e%16e\n", i+1, stat.RefineSteps, ferr[i], berr[i]); } fflush(stdout); } else if ( info > 0 && lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); } if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); Destroy_CompCol_Matrix(&A); Destroy_Dense_Matrix(&B); if ( lwork >= 0 ) { /* Deallocate storage associated with L and U. */ Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } /* ------------------------------------------------------------ NOW WE SOLVE ANOTHER LINEAR SYSTEM: A1*X = B1 ONLY THE SPARSITY PATTERN OF A1 IS THE SAME AS THAT OF A. ------------------------------------------------------------*/ options.Fact = SamePattern; StatInit(&stat); /* Initialize the statistics variables. */ dCreate_CompCol_Matrix(&A1, m, n, nnz, a1, asub1, xa1, SLU_NC, SLU_D, SLU_GE); dCreate_Dense_Matrix(&B1, m, nrhs, rhsb1, m, SLU_DN, SLU_D, SLU_GE); dgssvx(&options, &A1, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B1, &X, &rpg, &rcond, ferr, berr, &mem_usage, &stat, &info); printf("\nSecond system: dgssvx() returns info %d\n", info); if ( info == 0 || info == n+1 ) { /* This is how you could access the solution matrix. */ double *sol = (double*) ((DNformat*) X.Store)->nzval; if ( options.PivotGrowth ) printf("Recip. pivot growth = %e\n", rpg); if ( options.ConditionNumber ) printf("Recip. condition number = %e\n", rcond); Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); if ( options.IterRefine ) { printf("Iterative Refinement:\n"); printf("%8s%8s%16s%16s\n", "rhs", "Steps", "FERR", "BERR"); for (i = 0; i < nrhs; ++i) printf("%8d%8d%16e%16e\n", i+1, stat.RefineSteps, ferr[i], berr[i]); } fflush(stdout); } else if ( info > 0 && lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); } if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (R); SUPERLU_FREE (C); SUPERLU_FREE (ferr); SUPERLU_FREE (berr); Destroy_CompCol_Matrix(&A1); Destroy_Dense_Matrix(&B1); Destroy_Dense_Matrix(&X); if ( lwork >= 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif }
main(int argc, char *argv[]) { SuperMatrix A; NCformat *Astore; double *a; int *asub, *xa; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ SuperMatrix L; /* factor L */ SCformat *Lstore; SuperMatrix U; /* factor U */ NCformat *Ustore; SuperMatrix B; int nrhs, ldx, info, m, n, nnz; double *xact, *rhs; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 1.0; options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; */ set_default_options(&options); /* Now we modify the default options to use the symmetric mode. */ options.SymmetricMode = YES; options.ColPerm = MMD_AT_PLUS_A; options.DiagPivotThresh = 0.001; #if 1 /* Read matrix A from a file in Harwell-Boeing format.*/ if (argc < 2) { printf("Usage:\n%s [OPTION] < [INPUT] > [OUTPUT]\nOPTION:\n" "-h -hb:\n\t[INPUT] is a Harwell-Boeing format matrix.\n" "-r -rb:\n\t[INPUT] is a Rutherford-Boeing format matrix.\n" "-t -triplet:\n\t[INPUT] is a triplet format matrix.\n", argv[0]); return 0; } else { switch (argv[1][1]) { case 'H': case 'h': printf("Input a Harwell-Boeing format matrix:\n"); dreadhb(&m, &n, &nnz, &a, &asub, &xa); break; case 'R': case 'r': printf("Input a Rutherford-Boeing format matrix:\n"); dreadrb(&m, &n, &nnz, &a, &asub, &xa); break; case 'T': case 't': printf("Input a triplet format matrix:\n"); dreadtriple(&m, &n, &nnz, &a, &asub, &xa); break; default: printf("Unrecognized format.\n"); return 0; } } #else /* Read the matrix in Harwell-Boeing format. */ dreadhb(&m, &n, &nnz, &a, &asub, &xa); #endif dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); Astore = A.Store; printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); nrhs = 1; if ( !(rhs = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhs[]."); dCreate_Dense_Matrix(&B, m, nrhs, rhs, m, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); ldx = n; dGenXtrue(n, nrhs, xact, ldx); dFillRHS(options.Trans, nrhs, xact, ldx, &A, &B); if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); /* Initialize the statistics variables. */ StatInit(&stat); dgssv(&options, &A, perm_c, perm_r, &L, &U, &B, &stat, &info); if ( info == 0 ) { /* This is how you could access the solution matrix. */ double *sol = (double*) ((DNformat*) B.Store)->nzval; /* Compute the infinity norm of the error. */ dinf_norm_error(nrhs, &B, xact); Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("FILL ratio = %.1f\n", (float)(Lstore->nnz + Ustore->nnz - n)/nnz); dQuerySpace(&L, &U, &mem_usage); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); } else { printf("dgssv() error returns INFO= %d\n", info); if ( info <= n ) { /* factorization completes */ dQuerySpace(&L, &U, &mem_usage); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); } } if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (rhs); SUPERLU_FREE (xact); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif }
int main(int argc, char *argv[]) { void dmatvec_mult(double alpha, double x[], double beta, double y[]); void dpsolve(int n, double x[], double y[]); extern int dfgmr( int n, void (*matvec_mult)(double, double [], double, double []), void (*psolve)(int n, double [], double[]), double *rhs, double *sol, double tol, int restrt, int *itmax, FILE *fits); extern int dfill_diag(int n, NCformat *Astore); char equed[1] = {'B'}; yes_no_t equil; trans_t trans; SuperMatrix A, L, U; SuperMatrix B, X; NCformat *Astore; NCformat *Ustore; SCformat *Lstore; double *a; int *asub, *xa; int *etree; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ int nrhs, ldx, lwork, info, m, n, nnz; double *rhsb, *rhsx, *xact; double *work = NULL; double *R, *C; double u, rpg, rcond; double zero = 0.0; double one = 1.0; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; int restrt, iter, maxit, i; double resid; double *x, *b; #ifdef DEBUG extern int num_drop_L, num_drop_U; #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Defaults */ lwork = 0; nrhs = 1; equil = YES; u = 0.1; /* u=1.0 for complete factorization */ trans = NOTRANS; /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 0.1; //different from complete LU options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; options.RowPerm = LargeDiag; options.ILU_DropTol = 1e-4; options.ILU_FillTol = 1e-2; options.ILU_FillFactor = 10.0; options.ILU_DropRule = DROP_BASIC | DROP_AREA; options.ILU_Norm = INF_NORM; options.ILU_MILU = SILU; */ ilu_set_default_options(&options); /* Modify the defaults. */ options.PivotGrowth = YES; /* Compute reciprocal pivot growth */ options.ConditionNumber = YES;/* Compute reciprocal condition number */ if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) ABORT("Malloc fails for work[]."); } /* Read matrix A from a file in Harwell-Boeing format.*/ if (argc < 2) { printf("Usage:\n%s [OPTION] < [INPUT] > [OUTPUT]\nOPTION:\n" "-h -hb:\n\t[INPUT] is a Harwell-Boeing format matrix.\n" "-r -rb:\n\t[INPUT] is a Rutherford-Boeing format matrix.\n" "-t -triplet:\n\t[INPUT] is a triplet format matrix.\n", argv[0]); return 0; } else { switch (argv[1][1]) { case 'H': case 'h': printf("Input a Harwell-Boeing format matrix:\n"); dreadhb(&m, &n, &nnz, &a, &asub, &xa); break; case 'R': case 'r': printf("Input a Rutherford-Boeing format matrix:\n"); dreadrb(&m, &n, &nnz, &a, &asub, &xa); break; case 'T': case 't': printf("Input a triplet format matrix:\n"); dreadtriple(&m, &n, &nnz, &a, &asub, &xa); break; default: printf("Unrecognized format.\n"); return 0; } } dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); Astore = A.Store; dfill_diag(n, Astore); printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); fflush(stdout); if ( !(rhsb = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[]."); if ( !(rhsx = doubleMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[]."); dCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_D, SLU_GE); dCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); ldx = n; dGenXtrue(n, nrhs, xact, ldx); dFillRHS(trans, nrhs, xact, ldx, &A, &B); if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(R = (double *) SUPERLU_MALLOC(A.nrow * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for R[]."); if ( !(C = (double *) SUPERLU_MALLOC(A.ncol * sizeof(double))) ) ABORT("SUPERLU_MALLOC fails for C[]."); info = 0; #ifdef DEBUG num_drop_L = 0; num_drop_U = 0; #endif /* Initialize the statistics variables. */ StatInit(&stat); /* Compute the incomplete factorization and compute the condition number and pivot growth using dgsisx. */ dgsisx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, &mem_usage, &stat, &info); Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("dgsisx(): info %d\n", info); if (info > 0 || rcond < 1e-8 || rpg > 1e8) printf("WARNING: This preconditioner might be unstable.\n"); if ( info == 0 || info == n+1 ) { if ( options.PivotGrowth == YES ) printf("Recip. pivot growth = %e\n", rpg); if ( options.ConditionNumber == YES ) printf("Recip. condition number = %e\n", rcond); } else if ( info > 0 && lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); } printf("n(A) = %d, nnz(A) = %d\n", n, Astore->nnz); printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("Fill ratio: nnz(F)/nnz(A) = %.3f\n", ((double)(Lstore->nnz) + (double)(Ustore->nnz) - (double)n) / (double)Astore->nnz); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); fflush(stdout); /* Set the global variables. */ GLOBAL_A = &A; GLOBAL_L = &L; GLOBAL_U = &U; GLOBAL_STAT = &stat; GLOBAL_PERM_C = perm_c; GLOBAL_PERM_R = perm_r; /* Set the variables used by GMRES. */ restrt = SUPERLU_MIN(n / 3 + 1, 50); maxit = 1000; iter = maxit; resid = 1e-8; if (!(b = doubleMalloc(m))) ABORT("Malloc fails for b[]."); if (!(x = doubleMalloc(n))) ABORT("Malloc fails for x[]."); if (info <= n + 1) { int i_1 = 1; double maxferr = 0.0, nrmA, nrmB, res, t; double temp; extern double dnrm2_(int *, double [], int *); extern void daxpy_(int *, double *, double [], int *, double [], int *); /* Call GMRES. */ for (i = 0; i < n; i++) b[i] = rhsb[i]; for (i = 0; i < n; i++) x[i] = zero; t = SuperLU_timer_(); dfgmr(n, dmatvec_mult, dpsolve, b, x, resid, restrt, &iter, stdout); t = SuperLU_timer_() - t; /* Output the result. */ nrmA = dnrm2_(&(Astore->nnz), (double *)((DNformat *)A.Store)->nzval, &i_1); nrmB = dnrm2_(&m, b, &i_1); sp_dgemv("N", -1.0, &A, x, 1, 1.0, b, 1); res = dnrm2_(&m, b, &i_1); resid = res / nrmB; printf("||A||_F = %.1e, ||B||_2 = %.1e, ||B-A*X||_2 = %.1e, " "relres = %.1e\n", nrmA, nrmB, res, resid); if (iter >= maxit) { if (resid >= 1.0) iter = -180; else if (resid > 1e-8) iter = -111; } printf("iteration: %d\nresidual: %.1e\nGMRES time: %.2f seconds.\n", iter, resid, t); /* Scale the solution back if equilibration was performed. */ if (*equed == 'C' || *equed == 'B') for (i = 0; i < n; i++) x[i] *= C[i]; for (i = 0; i < m; i++) { maxferr = SUPERLU_MAX(maxferr, fabs(x[i] - xact[i])); } printf("||X-X_true||_oo = %.1e\n", maxferr); } #ifdef DEBUG printf("%d entries in L and %d entries in U dropped.\n", num_drop_L, num_drop_U); #endif fflush(stdout); if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (rhsb); SUPERLU_FREE (rhsx); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (R); SUPERLU_FREE (C); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); if ( lwork >= 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } SUPERLU_FREE(b); SUPERLU_FREE(x); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif return 0; }
/*! \brief * * <pre> * Purpose * ======= * * DGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * If equilibration was performed, the system becomes: * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * equed (input) Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by * diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) double*, dimension (A->nrow) * The row scale factors for A. * If equed = 'R' or 'B', A is premultiplied by diag(R). * If equed = 'N' or 'C', R is not accessed. * * C (input) double*, dimension (A->ncol) * The column scale factors for A. * If equed = 'C' or 'B', A is postmultiplied by diag(C). * If equed = 'N' or 'R', C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. * The right hand side matrix B. * if equed = 'R' or 'B', B is premultiplied by diag(R). * * X (input/output) SuperMatrix* * X has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. * On entry, the solution matrix X, as computed by dgstrs(). * On exit, the improved solution matrix X. * if *equed = 'C' or 'B', X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * * </pre> */ void dgsrfs(trans_t trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, char *equed, double *R, double *C, SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, SuperLUStat_t *stat, int *info) { #define ITMAX 5 /* Table of constant values */ int ione = 1; double ndone = -1.; double done = 1.; /* Local variables */ NCformat *Astore; double *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; double *Bmat, *Xmat, *Bptr, *Xptr; int kase; double safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; double s, xk, lstres, eps, safmin; char transc[1]; trans_t transt; double *work; double *rwork; int *iwork; int isave[3]; extern int dlacon2_(int *, double *, double *, int *, double *, int *, int []); #ifdef _CRAY extern int SCOPY(int *, double *, int *, double *, int *); extern int SSAXPY(int *, double *, double *, int *, double *, int *); #else extern int dcopy_(int *, double *, int *, double *, int *); extern int daxpy_(int *, double *, double *, int *, double *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = (trans == NOTRANS); if ( !notran && trans != TRANS && trans != CONJ ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) *info = -4; else if ( ldb < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -10; else if ( ldx < SUPERLU_MAX(0, A->nrow) || X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) *info = -11; if (*info != 0) { i = -(*info); input_error("dgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = strncmp(equed, "R", 1)==0 || strncmp(equed, "B", 1)==0; colequ = strncmp(equed, "C", 1)==0 || strncmp(equed, "B", 1)==0; /* Allocate working space */ work = doubleMalloc(2*A->nrow); rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); iwork = intMalloc(2*A->nrow); if ( !work || !rwork || !iwork ) ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transc = 'N'; transt = TRANS; } else if ( trans == TRANS ) { *(unsigned char *)transc = 'T'; transt = NOTRANS; } else if ( trans == CONJ ) { *(unsigned char *)transc = 'C'; transt = NOTRANS; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dmach("Epsilon"); safmin = dmach("Safe minimum"); /* Set SAFE1 essentially to be the underflow threshold times the number of additions in each row. */ safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ #ifdef _CRAY SCOPY(&A->nrow, Bptr, &ione, work, &ione); #else dcopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_dgemv(transc, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = fabs( Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; } } else { /* trans = TRANS or CONJ */ for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += fabs(Aval[i]) * fabs(Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) { s = SUPERLU_MAX( s, fabs(work[i]) / rwork[i] ); } else if ( rwork[i] != 0.0 ) { /* Adding SAFE1 to the numerator guards against spuriously zero residuals (underflow). */ s = SUPERLU_MAX( s, (safe1 + fabs(work[i])) / rwork[i] ); } /* If rwork[i] is exactly 0.0, then we know the true residual also must be exactly 0.0. */ } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); #ifdef _CRAY SAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else daxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ stat->RefineSteps = count; /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use DLACON2 to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = fabs( Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; } } else { /* trans == TRANS or CONJ */ for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = fabs( Xptr[irow] ); s += fabs(Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { dlacon2_(&A->nrow, &work[A->nrow], work, &iwork[A->nrow], &ferr[j], &kase, isave); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; dgstrs (transt, L, U, perm_c, perm_r, &Bjcol, stat, info); for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; dgstrs (trans, L, U, perm_c, perm_r, &Bjcol, stat, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, C[i] * fabs( Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, R[i] * fabs( Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = SUPERLU_MAX( lstres, fabs( Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* dgsrfs */
int fgmr(int n, void (*matvec) (double, double[], double, double[]), void (*psolve) (int, double[], double[]), double *rhs, double *sol, double tol, int im, int *itmax, FILE * fits) { /*---------------------------------------------------------------------- | *** Preconditioned FGMRES *** +----------------------------------------------------------------------- | This is a simple version of the ARMS preconditioned FGMRES algorithm. +----------------------------------------------------------------------- | Y. S. Dec. 2000. -- Apr. 2008 +----------------------------------------------------------------------- | on entry: |---------- | | rhs = real vector of length n containing the right hand side. | sol = real vector of length n containing an initial guess to the | solution on input. | tol = tolerance for stopping iteration | im = Krylov subspace dimension | (itmax) = max number of iterations allowed. | fits = NULL: no output | != NULL: file handle to output " resid vs time and its" | | on return: |---------- | fgmr int = 0 --> successful return. | int = 1 --> convergence not achieved in itmax iterations. | sol = contains an approximate solution (upon successful return). | itmax = has changed. It now contains the number of steps required | to converge -- +----------------------------------------------------------------------- | internal work arrays: |---------- | vv = work array of length [im+1][n] (used to store the Arnoldi | basis) | hh = work array of length [im][im+1] (Householder matrix) | z = work array of length [im][n] to store preconditioned vectors +----------------------------------------------------------------------- | subroutines called : | matvec - matrix-vector multiplication operation | psolve - (right) preconditionning operation | psolve can be a NULL pointer (GMRES without preconditioner) +---------------------------------------------------------------------*/ int maxits = *itmax; int i, i1, ii, j, k, k1, its, retval, i_1 = 1; double **hh, *c, *s, *rs, t, t0; double beta, eps1 = 0.0, gam, **vv, **z; its = 0; vv = (double **)SUPERLU_MALLOC((im + 1) * sizeof(double *)); for (i = 0; i <= im; i++) vv[i] = doubleMalloc(n); z = (double **)SUPERLU_MALLOC(im * sizeof(double *)); hh = (double **)SUPERLU_MALLOC(im * sizeof(double *)); for (i = 0; i < im; i++) { hh[i] = doubleMalloc(i + 2); z[i] = doubleMalloc(n); } c = doubleMalloc(im); s = doubleMalloc(im); rs = doubleMalloc(im + 1); /*---- outer loop starts here ----*/ do { /*---- compute initial residual vector ----*/ matvec(1.0, sol, 0.0, vv[0]); for (j = 0; j < n; j++) vv[0][j] = rhs[j] - vv[0][j]; /* vv[0]= initial residual */ beta = dnrm2_(&n, vv[0], &i_1); /*---- print info if fits != null ----*/ if (fits != NULL && its == 0) fprintf(fits, "%8d %10.2e\n", its, beta); /*if ( beta < tol * dnrm2_(&n, rhs, &i_1) )*/ if ( !(beta >= tol * dnrm2_(&n, rhs, &i_1)) ) break; t = 1.0 / beta; /*---- normalize: vv[0] = vv[0] / beta ----*/ for (j = 0; j < n; j++) vv[0][j] = vv[0][j] * t; if (its == 0) eps1 = tol * beta; /*---- initialize 1-st term of rhs of hessenberg system ----*/ rs[0] = beta; for (i = 0; i < im; i++) { its++; i1 = i + 1; /*------------------------------------------------------------ | (Right) Preconditioning Operation z_{j} = M^{-1} v_{j} +-----------------------------------------------------------*/ if (psolve) psolve(n, z[i], vv[i]); else dcopy_(&n, vv[i], &i_1, z[i], &i_1); /*---- matvec operation w = A z_{j} = A M^{-1} v_{j} ----*/ matvec(1.0, z[i], 0.0, vv[i1]); /*------------------------------------------------------------ | modified gram - schmidt... | h_{i,j} = (w,v_{i}) | w = w - h_{i,j} v_{i} +------------------------------------------------------------*/ t0 = dnrm2_(&n, vv[i1], &i_1); for (j = 0; j <= i; j++) { double negt; t = ddot_(&n, vv[j], &i_1, vv[i1], &i_1); hh[i][j] = t; negt = -t; daxpy_(&n, &negt, vv[j], &i_1, vv[i1], &i_1); } /*---- h_{j+1,j} = ||w||_{2} ----*/ t = dnrm2_(&n, vv[i1], &i_1); while (t < 0.5 * t0) { t0 = t; for (j = 0; j <= i; j++) { double negt; t = ddot_(&n, vv[j], &i_1, vv[i1], &i_1); hh[i][j] += t; negt = -t; daxpy_(&n, &negt, vv[j], &i_1, vv[i1], &i_1); } t = dnrm2_(&n, vv[i1], &i_1); } hh[i][i1] = t; if (t != 0.0) { /*---- v_{j+1} = w / h_{j+1,j} ----*/ t = 1.0 / t; for (k = 0; k < n; k++) vv[i1][k] = vv[i1][k] * t; } /*--------------------------------------------------- | done with modified gram schimdt and arnoldi step | now update factorization of hh +--------------------------------------------------*/ /*-------------------------------------------------------- | perform previous transformations on i-th column of h +-------------------------------------------------------*/ for (k = 1; k <= i; k++) { k1 = k - 1; t = hh[i][k1]; hh[i][k1] = c[k1] * t + s[k1] * hh[i][k]; hh[i][k] = -s[k1] * t + c[k1] * hh[i][k]; } gam = sqrt(pow(hh[i][i], 2) + pow(hh[i][i1], 2)); /*--------------------------------------------------- | if gamma is zero then any small value will do | affect only residual estimate +--------------------------------------------------*/ /* if (gam == 0.0) gam = epsmac; */ /*---- get next plane rotation ---*/ if (gam > 0.0) { c[i] = hh[i][i] / gam; s[i] = hh[i][i1] / gam; } else { c[i] = 1.0; s[i] = 0.0; } rs[i1] = -s[i] * rs[i]; rs[i] = c[i] * rs[i]; /*---------------------------------------------------- | determine residual norm and test for convergence +---------------------------------------------------*/ hh[i][i] = c[i] * hh[i][i] + s[i] * hh[i][i1]; beta = fabs(rs[i1]); if (fits != NULL) fprintf(fits, "%8d %10.2e\n", its, beta); if (beta <= eps1 || its >= maxits) break; } if (i == im) i--; /*---- now compute solution. 1st, solve upper triangular system ----*/ rs[i] = rs[i] / hh[i][i]; for (ii = 1; ii <= i; ii++) { k = i - ii; k1 = k + 1; t = rs[k]; for (j = k1; j <= i; j++) t = t - hh[j][k] * rs[j]; rs[k] = t / hh[k][k]; } /*---- linear combination of v[i]'s to get sol. ----*/ for (j = 0; j <= i; j++) { t = rs[j]; for (k = 0; k < n; k++) sol[k] += t * z[j][k]; } /* calculate the residual and output */ matvec(1.0, sol, 0.0, vv[0]); for (j = 0; j < n; j++) vv[0][j] = rhs[j] - vv[0][j]; /* vv[0]= initial residual */ /*---- print info if fits != null ----*/ beta = dnrm2_(&n, vv[0], &i_1); /*---- restart outer loop if needed ----*/ /*if (beta >= eps1 / tol)*/ if ( !(beta < eps1 / tol) ) { its = maxits + 10; break; } if (beta <= eps1) break; } while(its < maxits); retval = (its >= maxits); for (i = 0; i <= im; i++) SUPERLU_FREE(vv[i]); SUPERLU_FREE(vv); for (i = 0; i < im; i++) { SUPERLU_FREE(hh[i]); SUPERLU_FREE(z[i]); } SUPERLU_FREE(hh); SUPERLU_FREE(z); SUPERLU_FREE(c); SUPERLU_FREE(s); SUPERLU_FREE(rs); *itmax = its; return retval; } /*----end of fgmr ----*/
int main ( int argc, char *argv[] ) /**********************************************************************/ /* Purpose: SUPER_LU_D0 runs a small 5 by 5 example of the use of SUPER_LU. Modified: 23 April 2004 Reference: James Demmel, John Gilbert, Xiaoye Li, SuperLU Users's Guide, Sections 1 and 2. */ { double *a; SuperMatrix A; int *asub; SuperMatrix B; int i; int info; SuperMatrix L; int m; int n; int nnz; int nrhs; superlu_options_t options; int *perm_c; int *perm_r; int permc_spec; double *rhs; double sol[5]; SuperLUStat_t stat; SuperMatrix U; int *xa; /* Say hello. */ printf ( "\n" ); printf ( "SUPER_LU_D0:\n" ); printf ( " Simple 5 by 5 example of SUPER_LU solver.\n" ); /* Initialize parameters. */ m = 5; n = 5; nnz = 12; /* Set aside space for the arrays. */ a = doubleMalloc ( nnz ); if ( !a ) { ABORT ( "Malloc fails for a[]." ); } asub = intMalloc ( nnz ); if ( !asub ) { ABORT ( "Malloc fails for asub[]." ); } xa = intMalloc ( n+1 ); if ( !xa ) { ABORT ( "Malloc fails for xa[]." ); } /* Initialize matrix A. */ a[0] = 19.0; a[1] = 12.0; a[2] = 12.0; a[3] = 21.0; a[4] = 12.0; a[5] = 12.0; a[6] = 21.0; a[7] = 16.0; a[8] = 21.0; a[9] = 5.0; a[10]= 21.0; a[11]= 18.0; asub[0] = 0; asub[1] = 1; asub[2] = 4; asub[3] = 1; asub[4] = 2; asub[5] = 4; asub[6] = 0; asub[7] = 2; asub[8] = 0; asub[9] = 3; asub[10]= 3; asub[11]= 4; xa[0] = 0; xa[1] = 3; xa[2] = 6; xa[3] = 8; xa[4] = 10; xa[5] = 12; sol[0] = -0.031250000; sol[1] = 0.065476190; sol[2] = 0.013392857; sol[3] = 0.062500000; sol[4] = 0.032738095; /* Create matrix A in the format expected by SuperLU. */ dCreate_CompCol_Matrix ( &A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE ); /* Create the right-hand side matrix B. */ nrhs = 1; rhs = doubleMalloc ( m * nrhs ); if ( !rhs ) { ABORT("Malloc fails for rhs[]."); } for ( i = 0; i < m; i++ ) { rhs[i] = 1.0; } dCreate_Dense_Matrix ( &B, m, nrhs, rhs, m, SLU_DN, SLU_D, SLU_GE ); /* Set up the arrays for the permutations. */ perm_r = intMalloc ( m ); if ( !perm_r ) { ABORT ( "Malloc fails for perm_r[]." ); } perm_c = intMalloc ( n ); if ( !perm_c ) { ABORT ( "Malloc fails for perm_c[]." ); } /* Set the default input options, and then adjust some of them. */ set_default_options ( &options ); options.ColPerm = NATURAL; /* Initialize the statistics variables. */ StatInit ( &stat ); /* Factor the matrix and solve the linear system. */ dgssv ( &options, &A, perm_c, perm_r, &L, &U, &B, &stat, &info ); /* Print some of the results. */ dPrint_CompCol_Matrix ( "Matrix A", &A ); dPrint_SuperNode_Matrix ( "Factor L", &L ); dPrint_CompCol_Matrix ( "Factor U", &U ); dPrint_Dense_Matrix ( "Solution X", &B ); printf ( "\n" ); printf ( " The exact solution:\n" ); printf ( "\n" ); for ( i = 0; i < n; i++ ) { printf ( "%d %f\n", i, sol[i] ); } printf ( "\n" ); print_int_vec ( "perm_r", m, perm_r ); /* De-allocate storage. */ SUPERLU_FREE ( rhs ); SUPERLU_FREE ( perm_r ); SUPERLU_FREE ( perm_c ); Destroy_CompCol_Matrix ( &A ); Destroy_SuperMatrix_Store ( &B ); Destroy_SuperNode_Matrix ( &L ); Destroy_CompCol_Matrix ( &U ); StatFree ( &stat ); printf ( "\n" ); printf ( "SUPER_LU_D0:\n" ); printf ( " Normal end of execution.\n" ); return 0; }
void dgstrs(trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_r, int *perm_c, SuperMatrix *B, Gstat_t *Gstat, int *info) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose * ======= * * dgstrs() solves a system of linear equations A*X=B or A'*X=B * with A sparse and B dense, using the LU factorization computed by * pdgstrf(). * * Arguments * ========= * * trans (input) Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U as computed by * pdgstrf(). Use compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * pdgstrf(). Use column-wise storage scheme, i.e., U has types: * Stype = NCP, Dtype = _D, Mtype = TRU. * * perm_r (input) int* * Row permutation vector of size L->nrow, which defines the * permutation matrix Pr; perm_r[i] = j means row i of A is in * position j in Pr*A. * * perm_c (int*) dimension A->ncol * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * B (input/output) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * Gstat (output) Gstat_t* * Record all the statistics about the triangular solves; * See Gstat_t structure defined in util.h. * * info (output) Diagnostics * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; #endif register int j, k, jcol, iptr, luptr, ksupno, istart, irow, bptr; register int fsupc, nsuper; int i, n, nsupc, nsupr, nrow, nrhs, ldb; int *supno; DNformat *Bstore; SCPformat *Lstore; NCPformat *Ustore; double *Lval, *Uval, *Bmat; double *work, *work_col, *rhs_work, *soln; flops_t solve_ops; void dprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -4; else if ( ldb < MAX(0, L->nrow) ) *info = -6; if ( *info ) { i = -(*info); xerbla_("dgstrs", &i); return; } n = L->nrow; work = doubleCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = doubleMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; supno = Lstore->col_to_sup; nsuper = Lstore->nsuper; solve_ops = 0; if ( trans == NOTRANS ) { /* Permute right hand sides to form Pr*B */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ /*>> for (k = 0; k < n; k += nsupc) { ksupno = supno[k]; */ for (ksupno = 0; ksupno <= nsuper; ++ksupno) { fsupc = L_FST_SUPC(ksupno); istart = L_SUB_START(fsupc); nsupr = L_SUB_END(fsupc) - istart; nsupc = L_LAST_SUPC(ksupno) - fsupc; nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1) * nrhs; solve_ops += 2 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) { rhs_work = &Bmat[bptr]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_END(fsupc); iptr++){ irow = L_SUB(iptr); ++luptr; rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); SGEMM(ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) { rhs_work = &Bmat[bptr]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work_col[i]; /* Scatter */ work_col[i] = 0.0; iptr++; } } #else for (j = 0, bptr = 0; j < nrhs; j++, bptr += ldb) { rhs_work = &Bmat[bptr]; dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work[i]; work[i] = 0.0; iptr++; } } #endif } /* if-else: nsupc == 1 ... */ } /* for L-solve */ #if ( DEBUGlevel>=2 ) printf("After L-solve: y=\n"); dprint_soln(n, nrhs, Bmat); #endif /* * Back solve Ux=y. */ /*>> for (k = n-1; k >= 0; k -= nsupc) { ksupno = supno[k]; */ for (ksupno = nsuper; ksupno >= 0; --ksupno) { fsupc = L_FST_SUPC(ksupno); istart = L_SUB_START(fsupc); nsupr = L_SUB_END(fsupc) - istart; nsupc = L_LAST_SUPC(ksupno) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1) * nrhs; /* dense triangular matrix */ if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { rhs_work[fsupc] /= Lval[luptr]; rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); STRSM(ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0, bptr = fsupc; j < nrhs; j++, bptr += ldb) { dusolve (nsupr, nsupc, &Lval[luptr], &Bmat[bptr]); } #endif } /* matrix-vector update */ for (j = 0, bptr = 0; j < nrhs; ++j, bptr += ldb) { rhs_work = &Bmat[bptr]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 2*(U_NZ_END(jcol) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_END(jcol); i++ ){ irow = U_SUB(i); rhs_work[irow] -= rhs_work[jcol] * Uval[i]; } } } } /* for U-solve */ #if ( DEBUGlevel>=2 ) printf("After U-solve: x=\n"); dprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X <= Pc*X. */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } else { /* Solve A'*X=B */ /* Permute right hand sides to form Pc'*B. */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], info); /* Multiply by inv(L'). */ sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], info); } /* Compute the final solution X <= Pr'*X (=inv(Pr)*X) */ for (i = 0, bptr = 0; i < nrhs; i++, bptr += ldb) { rhs_work = &Bmat[bptr]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } /* if-else trans */ Gstat->ops[TRISOLVE] = solve_ops; SUPERLU_FREE(work); SUPERLU_FREE(soln); }
void dgstrsL(char *trans, SuperMatrix *L, int *perm_r, SuperMatrix *B, int *info) { /* * Purpose * ======= * * DGSTRSL only performs the L-solve using the LU factorization computed * by DGSTRF. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) char* * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A'* X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, i.e., U has types: * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. * * perm_r (input) int*, dimension (L->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * B (input/output) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; DNformat *Bstore; double *Bmat; SCformat *Lstore; double *Lval, *Uval; int nrow, notran; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, j, k, iptr, jcol, n, ldb, nrhs; double *work, *work_col, *rhs_work, *soln; flops_t solve_ops; extern SuperLUStat_t SuperLUStat; void dprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; notran = lsame_(trans, "N"); if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C") ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) *info = -2; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -4; if ( *info ) { i = -(*info); xerbla_("dgstrsL", &i); return; } n = L->nrow; work = doubleCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = doubleMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; solve_ops = 0; if ( notran ) { /* Permute right hand sides to form Pr*B */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1) * nrhs; solve_ops += 2 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ irow = L_SUB(iptr); ++luptr; rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work_col[i]; /* Scatter */ work_col[i] = 0.0; iptr++; } } #else for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work[i]; work[i] = 0.0; iptr++; } } #endif } /* else ... */ } /* for L-solve */ #ifdef DEBUG printf("After L-solve: y=\n"); dprint_soln(n, nrhs, Bmat); #endif SuperLUStat.ops[SOLVE] = solve_ops; } else { printf("Transposed solve not implemented.\n"); exit(0); } SUPERLU_FREE(work); SUPERLU_FREE(soln); }
void dgsrfs(char *trans, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_r, int *perm_c, char *equed, double *R, double *C, SuperMatrix *B, SuperMatrix *X, double *ferr, double *berr, int *info) { /* * Purpose * ======= * * DGSRFS improves the computed solution to a system of linear * equations and provides error bounds and backward error estimates for * the solution. * * If equilibration was performed, the system becomes: * (diag(R)*A_original*diag(C)) * X = diag(R)*B_original. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) char* * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose = Transpose) * * A (input) SuperMatrix* * The original matrix A in the system, or the scaled A if * equilibration was done. The type of A can be: * Stype = NC, Dtype = _D, Mtype = GE. * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = _D, Mtype = TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, * i.e., U has types: Stype = NC, Dtype = _D, Mtype = TRU. * * perm_r (input) int*, dimension (A->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * perm_c (input) int*, dimension (A->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * equed (input) Specifies the form of equilibration that was done. * = 'N': No equilibration. * = 'R': Row equilibration, i.e., A was premultiplied by diag(R). * = 'C': Column equilibration, i.e., A was postmultiplied by * diag(C). * = 'B': Both row and column equilibration, i.e., A was replaced * by diag(R)*A*diag(C). * * R (input) double*, dimension (A->nrow) * The row scale factors for A. * If equed = 'R' or 'B', A is premultiplied by diag(R). * If equed = 'N' or 'C', R is not accessed. * * C (input) double*, dimension (A->ncol) * The column scale factors for A. * If equed = 'C' or 'B', A is postmultiplied by diag(C). * If equed = 'N' or 'R', C is not accessed. * * B (input) SuperMatrix* * B has types: Stype = DN, Dtype = _D, Mtype = GE. * The right hand side matrix B. * if equed = 'R' or 'B', B is premultiplied by diag(R). * * X (input/output) SuperMatrix* * X has types: Stype = DN, Dtype = _D, Mtype = GE. * On entry, the solution matrix X, as computed by dgstrs(). * On exit, the improved solution matrix X. * if *equed = 'C' or 'B', X should be premultiplied by diag(C) * in order to obtain the solution to the original system. * * FERR (output) double*, dimension (B->ncol) * The estimated forward error bound for each solution vector * X(j) (the j-th column of the solution matrix X). * If XTRUE is the true solution corresponding to X(j), FERR(j) * is an estimated upper bound for the magnitude of the largest * element in (X(j) - XTRUE) divided by the magnitude of the * largest element in X(j). The estimate is as reliable as * the estimate for RCOND, and is almost always a slight * overestimate of the true error. * * BERR (output) double*, dimension (B->ncol) * The componentwise relative backward error of each solution * vector X(j) (i.e., the smallest relative change in * any element of A or B that makes X(j) an exact solution). * * info (output) int* * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Internal Parameters * =================== * * ITMAX is the maximum number of steps of iterative refinement. * */ #define ITMAX 5 /* Table of constant values */ int ione = 1; double ndone = -1.; double done = 1.; /* Local variables */ NCformat *Astore; double *Aval; SuperMatrix Bjcol; DNformat *Bstore, *Xstore, *Bjcol_store; double *Bmat, *Xmat, *Bptr, *Xptr; int kase; double safe1, safe2; int i, j, k, irow, nz, count, notran, rowequ, colequ; int ldb, ldx, nrhs; double s, xk, lstres, eps, safmin; char transt[1]; double *work; double *rwork; int *iwork; extern double dlamch_(char *); extern int dlacon_(int *, double *, double *, int *, double *, int *); #ifdef _CRAY extern int SCOPY(int *, double *, int *, double *, int *); extern int SSAXPY(int *, double *, double *, int *, double *, int *); #else extern int dcopy_(int *, double *, int *, double *, int *); extern int daxpy_(int *, double *, double *, int *, double *, int *); #endif Astore = A->Store; Aval = Astore->nzval; Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; /* Test the input parameters */ *info = 0; notran = lsame_(trans, "N"); if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != NC || A->Dtype != _D || A->Mtype != GE ) *info = -2; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SC || L->Dtype != _D || L->Mtype != TRLU ) *info = -3; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != NC || U->Dtype != _D || U->Mtype != TRU ) *info = -4; else if ( ldb < MAX(0, A->nrow) || B->Stype != DN || B->Dtype != _D || B->Mtype != GE ) *info = -10; else if ( ldx < MAX(0, A->nrow) || X->Stype != DN || X->Dtype != _D || X->Mtype != GE ) *info = -11; if (*info != 0) { i = -(*info); xerbla_("dgsrfs", &i); return; } /* Quick return if possible */ if ( A->nrow == 0 || nrhs == 0) { for (j = 0; j < nrhs; ++j) { ferr[j] = 0.; berr[j] = 0.; } return; } rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); /* Allocate working space */ work = doubleMalloc(2*A->nrow); rwork = (double *) SUPERLU_MALLOC( A->nrow * sizeof(double) ); iwork = intMalloc(2*A->nrow); if ( !work || !rwork || !iwork ) ABORT("Malloc fails for work/rwork/iwork."); if ( notran ) { *(unsigned char *)transt = 'T'; } else { *(unsigned char *)transt = 'N'; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ nz = A->ncol + 1; eps = dlamch_("Epsilon"); safmin = dlamch_("Safe minimum"); safe1 = nz * safmin; safe2 = safe1 / eps; /* Compute the number of nonzeros in each row (or column) of A */ for (i = 0; i < A->nrow; ++i) iwork[i] = 0; if ( notran ) { for (k = 0; k < A->ncol; ++k) for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) ++iwork[Astore->rowind[i]]; } else { for (k = 0; k < A->ncol; ++k) iwork[k] = Astore->colptr[k+1] - Astore->colptr[k]; } /* Copy one column of RHS B into Bjcol. */ Bjcol.Stype = B->Stype; Bjcol.Dtype = B->Dtype; Bjcol.Mtype = B->Mtype; Bjcol.nrow = B->nrow; Bjcol.ncol = 1; Bjcol.Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) ); if ( !Bjcol.Store ) ABORT("SUPERLU_MALLOC fails for Bjcol.Store"); Bjcol_store = Bjcol.Store; Bjcol_store->lda = ldb; Bjcol_store->nzval = work; /* address aliasing */ /* Do for each right hand side ... */ for (j = 0; j < nrhs; ++j) { count = 0; lstres = 3.; Bptr = &Bmat[j*ldb]; Xptr = &Xmat[j*ldx]; while (1) { /* Loop until stopping criterion is satisfied. */ /* Compute residual R = B - op(A) * X, where op(A) = A, A**T, or A**H, depending on TRANS. */ #ifdef _CRAY SCOPY(&A->nrow, Bptr, &ione, work, &ione); #else dcopy_(&A->nrow, Bptr, &ione, work, &ione); #endif sp_dgemv(trans, ndone, A, Xptr, ione, done, work, ione); /* Compute componentwise relative backward error from formula max(i) ( abs(R(i)) / ( abs(op(A))*abs(X) + abs(B) )(i) ) where abs(Z) is the componentwise absolute value of the matrix or vector Z. If the i-th component of the denominator is less than SAFE2, then SAFE1 is added to the i-th component of the numerator and denominator before dividing. */ for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if (notran) { for (k = 0; k < A->ncol; ++k) { xk = fabs( Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; s += fabs(Aval[i]) * fabs(Xptr[irow]); } rwork[k] += s; } } s = 0.; for (i = 0; i < A->nrow; ++i) { if (rwork[i] > safe2) s = MAX( s, fabs(work[i]) / rwork[i] ); else s = MAX( s, (fabs(work[i]) + safe1) / (rwork[i] + safe1) ); } berr[j] = s; /* Test stopping criterion. Continue iterating if 1) The residual BERR(J) is larger than machine epsilon, and 2) BERR(J) decreased by at least a factor of 2 during the last iteration, and 3) At most ITMAX iterations tried. */ if (berr[j] > eps && berr[j] * 2. <= lstres && count < ITMAX) { /* Update solution and try again. */ dgstrs (trans, L, U, perm_r, perm_c, &Bjcol, info); #ifdef _CRAY SAXPY(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #else daxpy_(&A->nrow, &done, work, &ione, &Xmat[j*ldx], &ione); #endif lstres = berr[j]; ++count; } else { break; } } /* end while */ /* Bound error from formula: norm(X - XTRUE) / norm(X) .le. FERR = norm( abs(inv(op(A)))* ( abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) / norm(X) where norm(Z) is the magnitude of the largest component of Z inv(op(A)) is the inverse of op(A) abs(Z) is the componentwise absolute value of the matrix or vector Z NZ is the maximum number of nonzeros in any row of A, plus 1 EPS is machine epsilon The i-th component of abs(R)+NZ*EPS*(abs(op(A))*abs(X)+abs(B)) is incremented by SAFE1 if the i-th component of abs(op(A))*abs(X) + abs(B) is less than SAFE2. Use DLACON to estimate the infinity-norm of the matrix inv(op(A)) * diag(W), where W = abs(R) + NZ*EPS*( abs(op(A))*abs(X)+abs(B) ))) */ for (i = 0; i < A->nrow; ++i) rwork[i] = fabs( Bptr[i] ); /* Compute abs(op(A))*abs(X) + abs(B). */ if ( notran ) { for (k = 0; k < A->ncol; ++k) { xk = fabs( Xptr[k] ); for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) rwork[Astore->rowind[i]] += fabs(Aval[i]) * xk; } } else { for (k = 0; k < A->ncol; ++k) { s = 0.; for (i = Astore->colptr[k]; i < Astore->colptr[k+1]; ++i) { irow = Astore->rowind[i]; xk = fabs( Xptr[irow] ); s += fabs(Aval[i]) * xk; } rwork[k] += s; } } for (i = 0; i < A->nrow; ++i) if (rwork[i] > safe2) rwork[i] = fabs(work[i]) + (iwork[i]+1)*eps*rwork[i]; else rwork[i] = fabs(work[i])+(iwork[i]+1)*eps*rwork[i]+safe1; kase = 0; do { dlacon_(&A->nrow, &work[A->nrow], work, &iwork[A->nrow], &ferr[j], &kase); if (kase == 0) break; if (kase == 1) { /* Multiply by diag(W)*inv(op(A)**T)*(diag(C) or diag(R)). */ if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; else if ( !notran && rowequ ) for (i = 0; i < A->nrow; ++i) work[i] *= R[i]; dgstrs (transt, L, U, perm_r, perm_c, &Bjcol, info); for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; } else { /* Multiply by (diag(C) or diag(R))*inv(op(A))*diag(W). */ for (i = 0; i < A->nrow; ++i) work[i] *= rwork[i]; dgstrs (trans, L, U, perm_r, perm_c, &Bjcol, info); if ( notran && colequ ) for (i = 0; i < A->ncol; ++i) work[i] *= C[i]; else if ( !notran && rowequ ) for (i = 0; i < A->ncol; ++i) work[i] *= R[i]; } } while ( kase != 0 ); /* Normalize error. */ lstres = 0.; if ( notran && colequ ) { for (i = 0; i < A->nrow; ++i) lstres = MAX( lstres, C[i] * fabs( Xptr[i]) ); } else if ( !notran && rowequ ) { for (i = 0; i < A->nrow; ++i) lstres = MAX( lstres, R[i] * fabs( Xptr[i]) ); } else { for (i = 0; i < A->nrow; ++i) lstres = MAX( lstres, fabs( Xptr[i]) ); } if ( lstres != 0. ) ferr[j] /= lstres; } /* for each RHS j ... */ SUPERLU_FREE(work); SUPERLU_FREE(rwork); SUPERLU_FREE(iwork); SUPERLU_FREE(Bjcol.Store); return; } /* dgsrfs */
void dgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, SuperMatrix *B, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * DGSTRS solves a system of linear equations A*X=B or A'*X=B * with A sparse and B dense, using the LU factorization computed by * DGSTRF. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose) * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, i.e., U has types: * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. * * perm_c (input) int*, dimension (L->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int*, dimension (L->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * B (input/output) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif #if 0 int incx = 1, incy = 1; #endif #ifdef USE_VENDOR_BLAS double alpha = 1.0, beta = 1.0; double *work_col; #endif DNformat *Bstore; double *Bmat; SCformat *Lstore; NCformat *Ustore; double *Lval, *Uval; int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; int i, j, k, iptr, jcol, n, ldb, nrhs; double *work, *rhs_work, *soln; flops_t solve_ops; void dprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) *info = -2; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) *info = -3; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -6; if ( *info ) { i = -(*info); superlu_xerbla("dgstrs", &i); return; } n = L->nrow; work = doubleCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = doubleMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( trans == NOTRANS ) { /* Permute right hand sides to form Pr*B */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1) * nrhs; solve_ops += 2 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ irow = L_SUB(iptr); ++luptr; rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L","L","N","U",&nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); hypre_F90_NAME_BLAS(dgemm,DGEMM)("N","N",&nrow,&nrhs,&nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work_col[i]; /* Scatter */ work_col[i] = 0.0; iptr++; } } #else for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; sludlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); sludmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work[i]; work[i] = 0.0; iptr++; } } #endif } /* else ... */ } /* for L-solve */ #ifdef DEBUG printf("After L-solve: y=\n"); dprint_soln(n, nrhs, Bmat); #endif /* * Back solve Ux=y. */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1) * nrhs; if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { rhs_work[fsupc] /= Lval[luptr]; rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else hypre_F90_NAME_BLAS(dtrsm,DTRSM)("L","U","N","N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0; j < nrhs; j++) sludusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); #endif } for (j = 0; j < nrhs; ++j) { rhs_work = &Bmat[j*ldb]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ irow = U_SUB(i); rhs_work[irow] -= rhs_work[jcol] * Uval[i]; } } } } /* for U-solve */ #ifdef DEBUG printf("After U-solve: x=\n"); dprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X := Pc*X. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = solve_ops; } else { /* Solve A'*X=B or CONJ(A)*X=B */ /* Permute right hand sides to form Pc'*B. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = 0; for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by inv(L'). */ sp_dtrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); } /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } SUPERLU_FREE(work); SUPERLU_FREE(soln); }
void dgsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree, char *equed, double *R, double *C, SuperMatrix *L, SuperMatrix *U, void *work, int lwork, SuperMatrix *B, SuperMatrix *X, double *recip_pivot_growth, double *rcond, mem_usage_t *mem_usage, SuperLUStat_t *stat, int *info) { DNformat *Bstore, *Xstore; double *Bmat, *Xmat; int ldb, ldx, nrhs; SuperMatrix *AA;/* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int colequ, equil, nofact, notran, rowequ, permc_spec, mc64; trans_t trant; char norm[1]; int i, j, info1; double amax, anorm, bignum, smlnum, colcnd, rowcnd, rcmax, rcmin; int relax, panel_size; double diag_pivot_thresh; double t0; /* temporary time */ double *utime; int *perm = NULL; /* External functions */ extern double dlangs(char *, SuperMatrix *); Bstore = B->Store; Xstore = X->Store; Bmat = Bstore->nzval; Xmat = Xstore->nzval; ldb = Bstore->lda; ldx = Xstore->lda; nrhs = B->ncol; *info = 0; nofact = (options->Fact != FACTORED); equil = (options->Equil == YES); notran = (options->Trans == NOTRANS); mc64 = (options->RowPerm == LargeDiag); if ( nofact ) { *(unsigned char *)equed = 'N'; rowequ = FALSE; colequ = FALSE; } else { rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); smlnum = dlamch_("Safe minimum"); bignum = 1. / smlnum; } /* Test the input parameters */ if (!nofact && options->Fact != DOFACT && options->Fact != SamePattern && options->Fact != SamePattern_SameRowPerm && !notran && options->Trans != TRANS && options->Trans != CONJ && !equil && options->Equil != NO) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; else if (options->Fact == FACTORED && !(rowequ || colequ || lsame_(equed, "N"))) *info = -6; else { if (rowequ) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, R[j]); rcmax = SUPERLU_MAX(rcmax, R[j]); } if (rcmin <= 0.) *info = -7; else if ( A->nrow > 0) rowcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else rowcnd = 1.; } if (colequ && *info == 0) { rcmin = bignum; rcmax = 0.; for (j = 0; j < A->nrow; ++j) { rcmin = SUPERLU_MIN(rcmin, C[j]); rcmax = SUPERLU_MAX(rcmax, C[j]); } if (rcmin <= 0.) *info = -8; else if (A->nrow > 0) colcnd = SUPERLU_MAX(rcmin,smlnum) / SUPERLU_MIN(rcmax,bignum); else colcnd = 1.; } if (*info == 0) { if ( lwork < -1 ) *info = -12; else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -13; else if ( X->ncol < 0 || Xstore->lda < SUPERLU_MAX(0, A->nrow) || (B->ncol != 0 && B->ncol != X->ncol) || X->Stype != SLU_DN || X->Dtype != SLU_D || X->Mtype != SLU_GE ) *info = -14; } } if (*info != 0) { i = -(*info); xerbla_("dgsisx", &i); return; } /* Initialization for factor parameters */ panel_size = sp_ienv(1); relax = sp_ienv(2); diag_pivot_thresh = options->DiagPivotThresh; utime = stat->utime; /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = A->Store; AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) ); dCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype); if ( notran ) { /* Reverse the transpose argument. */ trant = TRANS; notran = 0; } else { trant = NOTRANS; notran = 1; } } else { /* A->Stype == SLU_NC */ trant = options->Trans; AA = A; } if ( nofact ) { register int i, j; NCformat *Astore = AA->Store; int nnz = Astore->nnz; int *colptr = Astore->colptr; int *rowind = Astore->rowind; double *nzval = (double *)Astore->nzval; int n = AA->nrow; if ( mc64 ) { *equed = 'B'; /*rowequ = colequ = 1;*/ t0 = SuperLU_timer_(); if ((perm = intMalloc(n)) == NULL) ABORT("SUPERLU_MALLOC fails for perm[]"); info1 = dldperm(5, n, nnz, colptr, rowind, nzval, perm, R, C); if (info1 > 0) { /* MC64 fails, call dgsequ() later */ mc64 = 0; SUPERLU_FREE(perm); perm = NULL; } else { rowequ = colequ = 1; for (i = 0; i < n; i++) { R[i] = exp(R[i]); C[i] = exp(C[i]); } /* permute and scale the matrix */ for (j = 0; j < n; j++) { for (i = colptr[j]; i < colptr[j + 1]; i++) { nzval[i] *= R[rowind[i]] * C[j]; rowind[i] = perm[rowind[i]]; } } } utime[EQUIL] = SuperLU_timer_() - t0; } if ( !mc64 & equil ) { t0 = SuperLU_timer_(); /* Compute row and column scalings to equilibrate the matrix A. */ dgsequ(AA, R, C, &rowcnd, &colcnd, &amax, &info1); if ( info1 == 0 ) { /* Equilibrate matrix A. */ dlaqgs(AA, R, C, rowcnd, colcnd, amax, equed); rowequ = lsame_(equed, "R") || lsame_(equed, "B"); colequ = lsame_(equed, "C") || lsame_(equed, "B"); } utime[EQUIL] = SuperLU_timer_() - t0; } } if ( nofact ) { t0 = SuperLU_timer_(); /* * Gnet column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) get_perm_c(permc_spec, AA, perm_c); utime[COLPERM] = SuperLU_timer_() - t0; t0 = SuperLU_timer_(); sp_preorder(options, AA, perm_c, etree, &AC); utime[ETREE] = SuperLU_timer_() - t0; /* Compute the LU factorization of A*Pc. */ t0 = SuperLU_timer_(); dgsitrf(options, &AC, relax, panel_size, etree, work, lwork, perm_c, perm_r, L, U, stat, info); utime[FACT] = SuperLU_timer_() - t0; if ( lwork == -1 ) { mem_usage->total_needed = *info - A->ncol; return; } } if ( options->PivotGrowth ) { if ( *info > 0 ) return; /* Compute the reciprocal pivot growth factor *recip_pivot_growth. */ *recip_pivot_growth = dPivotGrowth(A->ncol, AA, perm_c, L, U); } if ( options->ConditionNumber ) { /* Estimate the reciprocal of the condition number of A. */ t0 = SuperLU_timer_(); if ( notran ) { *(unsigned char *)norm = '1'; } else { *(unsigned char *)norm = 'I'; } anorm = dlangs(norm, AA); dgscon(norm, L, U, anorm, rcond, stat, &info1); utime[RCOND] = SuperLU_timer_() - t0; } if ( nrhs > 0 ) { /* Solve the system */ double *tmp, *rhs_work; int n = A->nrow; if ( mc64 ) { if ((tmp = doubleMalloc(n)) == NULL) ABORT("SUPERLU_MALLOC fails for tmp[]"); } /* Scale and permute the right-hand side if equilibration and permutation from MC64 were performed. */ if ( notran ) { if ( rowequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) Bmat[i + j*ldb] *= R[i]; } if ( mc64 ) { for (j = 0; j < nrhs; ++j) { rhs_work = &Bmat[j*ldb]; for (i = 0; i < n; i++) tmp[perm[i]] = rhs_work[i]; for (i = 0; i < n; i++) rhs_work[i] = tmp[i]; } } } else if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) { Bmat[i + j*ldb] *= C[i]; } } /* Compute the solution matrix X. */ for (j = 0; j < nrhs; j++) /* Save a copy of the right hand sides */ for (i = 0; i < B->nrow; i++) Xmat[i + j*ldx] = Bmat[i + j*ldb]; t0 = SuperLU_timer_(); dgstrs (trant, L, U, perm_c, perm_r, X, stat, &info1); utime[SOLVE] = SuperLU_timer_() - t0; /* Transform the solution matrix X to a solution of the original system. */ if ( notran ) { if ( colequ ) { for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) { Xmat[i + j*ldx] *= C[i]; } } } else { /* transposed system */ if ( rowequ ) { if ( mc64 ) { for (j = 0; j < nrhs; j++) { for (i = 0; i < n; i++) tmp[i] = Xmat[i + j * ldx]; /*dcopy*/ for (i = 0; i < n; i++) Xmat[i + j * ldx] = R[i] * tmp[perm[i]]; } } else { for (j = 0; j < nrhs; ++j) for (i = 0; i < A->nrow; ++i) { Xmat[i + j*ldx] *= R[i]; } } } } if ( mc64 ) SUPERLU_FREE(tmp); } /* end if nrhs > 0 */ if ( options->ConditionNumber ) { /* Set INFO = A->ncol+1 if the matrix is singular to working precision. */ if ( *rcond < dlamch_("E") && *info == 0) *info = A->ncol + 1; } if (perm) SUPERLU_FREE(perm); if ( nofact ) { ilu_dQuerySpace(L, U, mem_usage); Destroy_CompCol_Permuted(&AC); } if ( A->Stype == SLU_NR ) { Destroy_SuperMatrix_Store(AA); SUPERLU_FREE(AA); } }
void zgsitrf(superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, int *etree, void *work, int lwork, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, /* persistent to facilitate multiple factorizations */ SuperLUStat_t *stat, int *info) { /* Local working arrays */ NCPformat *Astore; int *iperm_r = NULL; /* inverse of perm_r; used when options->Fact == SamePattern_SameRowPerm */ int *iperm_c; /* inverse of perm_c */ int *swap, *iswap; /* swap is used to store the row permutation during the factorization. Initially, it is set to iperm_c (row indeces of Pc*A*Pc'). iswap is the inverse of swap. After the factorization, it is equal to perm_r. */ int *iwork; doublecomplex *zwork; int *segrep, *repfnz, *parent, *xplore; int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */ int *marker, *marker_relax; doublecomplex *dense, *tempv; double *dtempv; int *relax_end, *relax_fsupc; doublecomplex *a; int *asub; int *xa_begin, *xa_end; int *xsup, *supno; int *xlsub, *xlusup, *xusub; int nzlumax; double *amax; doublecomplex drop_sum; double alpha, omega; /* used in MILU, mimicing DRIC */ double *dwork2; /* used by the second dropping rule */ /* Local scalars */ fact_t fact = options->Fact; double diag_pivot_thresh = options->DiagPivotThresh; double drop_tol = options->ILU_DropTol; /* tau */ double fill_ini = options->ILU_FillTol; /* tau^hat */ double gamma = options->ILU_FillFactor; int drop_rule = options->ILU_DropRule; milu_t milu = options->ILU_MILU; double fill_tol; int pivrow; /* pivotal row number in the original matrix A */ int nseg1; /* no of segments in U-column above panel row jcol */ int nseg; /* no of segments in each U-column */ register int jcol; register int kcol; /* end column of a relaxed snode */ register int icol; register int i, k, jj, new_next, iinfo; int m, n, min_mn, jsupno, fsupc, nextlu, nextu; int w_def; /* upper bound on panel width */ int usepr, iperm_r_allocated = 0; int nnzL, nnzU; int *panel_histo = stat->panel_histo; flops_t *ops = stat->ops; int last_drop;/* the last column which the dropping rules applied */ int quota; int nnzAj; /* number of nonzeros in A(:,1:j) */ int nnzLj, nnzUj; double tol_L = drop_tol, tol_U = drop_tol; doublecomplex zero = {0.0, 0.0}; double one = 1.0; /* Executable */ iinfo = 0; m = A->nrow; n = A->ncol; min_mn = SUPERLU_MIN(m, n); Astore = A->Store; a = Astore->nzval; asub = Astore->rowind; xa_begin = Astore->colbeg; xa_end = Astore->colend; /* Allocate storage common to the factor routines */ *info = zLUMemInit(fact, work, lwork, m, n, Astore->nnz, panel_size, gamma, L, U, Glu, &iwork, &zwork); if ( *info ) return; xsup = Glu->xsup; supno = Glu->supno; xlsub = Glu->xlsub; xlusup = Glu->xlusup; xusub = Glu->xusub; SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore, &repfnz, &panel_lsub, &marker_relax, &marker); zSetRWork(m, panel_size, zwork, &dense, &tempv); usepr = (fact == SamePattern_SameRowPerm); if ( usepr ) { /* Compute the inverse of perm_r */ iperm_r = (int *) intMalloc(m); for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k; iperm_r_allocated = 1; } iperm_c = (int *) intMalloc(n); for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k; swap = (int *)intMalloc(n); for (k = 0; k < n; k++) swap[k] = iperm_c[k]; iswap = (int *)intMalloc(n); for (k = 0; k < n; k++) iswap[k] = perm_c[k]; amax = (double *) doubleMalloc(panel_size); if (drop_rule & DROP_SECONDARY) dwork2 = (double *)doubleMalloc(n); else dwork2 = NULL; nnzAj = 0; nnzLj = 0; nnzUj = 0; last_drop = SUPERLU_MAX(min_mn - 2 * sp_ienv(7), (int)(min_mn * 0.95)); alpha = pow((double)n, -1.0 / options->ILU_MILU_Dim); /* Identify relaxed snodes */ relax_end = (int *) intMalloc(n); relax_fsupc = (int *) intMalloc(n); if ( options->SymmetricMode == YES ) ilu_heap_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); else ilu_relax_snode(n, etree, relax, marker, relax_end, relax_fsupc); ifill (perm_r, m, EMPTY); ifill (marker, m * NO_MARKER, EMPTY); supno[0] = -1; xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0; w_def = panel_size; /* Mark the rows used by relaxed supernodes */ ifill (marker_relax, m, EMPTY); i = mark_relax(m, relax_end, relax_fsupc, xa_begin, xa_end, asub, marker_relax); #if ( PRNTlevel >= 1) printf("%d relaxed supernodes.\n", i); #endif /* * Work on one "panel" at a time. A panel is one of the following: * (a) a relaxed supernode at the bottom of the etree, or * (b) panel_size contiguous columns, defined by the user */ for (jcol = 0; jcol < min_mn; ) { if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */ kcol = relax_end[jcol]; /* end of the relaxed snode */ panel_histo[kcol-jcol+1]++; /* Drop small rows in the previous supernode. */ if (jcol > 0 && jcol < last_drop) { int first = xsup[supno[jcol - 1]]; int last = jcol - 1; int quota; /* Compute the quota */ if (drop_rule & DROP_PROWS) quota = gamma * Astore->nnz / m * (m - first) / m * (last - first + 1); else if (drop_rule & DROP_COLUMN) { int i; quota = 0; for (i = first; i <= last; i++) quota += xa_end[i] - xa_begin[i]; quota = gamma * quota * (m - first) / m; } else if (drop_rule & DROP_AREA) quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) - nnzLj; else quota = m * n; fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / min_mn); /* Drop small rows */ dtempv = (double *) tempv; i = ilu_zdrop_row(options, first, last, tol_L, quota, &nnzLj, &fill_tol, Glu, dtempv, dwork2, 0); /* Reset the parameters */ if (drop_rule & DROP_DYNAMIC) { if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) < nnzLj) tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); else tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); } if (fill_tol < 0) iinfo -= (int)fill_tol; #ifdef DEBUG num_drop_L += i * (last - first + 1); #endif } /* -------------------------------------- * Factorize the relaxed supernode(jcol:kcol) * -------------------------------------- */ /* Determine the union of the row structure of the snode */ if ( (*info = ilu_zsnode_dfs(jcol, kcol, asub, xa_begin, xa_end, marker, Glu)) != 0 ) return; nextu = xusub[jcol]; nextlu = xlusup[jcol]; jsupno = supno[jcol]; fsupc = xsup[jsupno]; new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1); nzlumax = Glu->nzlumax; while ( new_next > nzlumax ) { if ((*info = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))) return; } for (icol = jcol; icol <= kcol; icol++) { xusub[icol+1] = nextu; amax[0] = 0.0; /* Scatter into SPA dense[*] */ for (k = xa_begin[icol]; k < xa_end[icol]; k++) { register double tmp = z_abs1 (&a[k]); if (tmp > amax[0]) amax[0] = tmp; dense[asub[k]] = a[k]; } nnzAj += xa_end[icol] - xa_begin[icol]; if (amax[0] == 0.0) { amax[0] = fill_ini; #if ( PRNTlevel >= 1) printf("Column %d is entirely zero!\n", icol); fflush(stdout); #endif } /* Numeric update within the snode */ zsnode_bmod(icol, jsupno, fsupc, dense, tempv, Glu, stat); if (usepr) pivrow = iperm_r[icol]; fill_tol = pow(fill_ini, 1.0 - (double)icol / (double)min_mn); if ( (*info = ilu_zpivotL(icol, diag_pivot_thresh, &usepr, perm_r, iperm_c[icol], swap, iswap, marker_relax, &pivrow, amax[0] * fill_tol, milu, zero, Glu, stat)) ) { iinfo++; marker[pivrow] = kcol; } } jcol = kcol + 1; } else { /* Work on one panel of panel_size columns */ /* Adjust panel_size so that a panel won't overlap with the next * relaxed snode. */ panel_size = w_def; for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) if ( relax_end[k] != EMPTY ) { panel_size = k - jcol; break; } if ( k == min_mn ) panel_size = min_mn - jcol; panel_histo[panel_size]++; /* symbolic factor on a panel of columns */ ilu_zpanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1, dense, amax, panel_lsub, segrep, repfnz, marker, parent, xplore, Glu); /* numeric sup-panel updates in topological order */ zpanel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, Glu, stat); /* Sparse LU within the panel, and below panel diagonal */ for (jj = jcol; jj < jcol + panel_size; jj++) { k = (jj - jcol) * m; /* column index for w-wide arrays */ nseg = nseg1; /* Begin after all the panel segments */ nnzAj += xa_end[jj] - xa_begin[jj]; if ((*info = ilu_zcolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k], segrep, &repfnz[k], marker, parent, xplore, Glu))) return; /* Numeric updates */ if ((*info = zcolumn_bmod(jj, (nseg - nseg1), &dense[k], tempv, &segrep[nseg1], &repfnz[k], jcol, Glu, stat)) != 0) return; /* Make a fill-in position if the column is entirely zero */ if (xlsub[jj + 1] == xlsub[jj]) { register int i, row; int nextl; int nzlmax = Glu->nzlmax; int *lsub = Glu->lsub; int *marker2 = marker + 2 * m; /* Allocate memory */ nextl = xlsub[jj] + 1; if (nextl >= nzlmax) { int error = zLUMemXpand(jj, nextl, LSUB, &nzlmax, Glu); if (error) { *info = error; return; } lsub = Glu->lsub; } xlsub[jj + 1]++; assert(xlusup[jj]==xlusup[jj+1]); xlusup[jj + 1]++; ((doublecomplex *) Glu->lusup)[xlusup[jj]] = zero; /* Choose a row index (pivrow) for fill-in */ for (i = jj; i < n; i++) if (marker_relax[swap[i]] <= jj) break; row = swap[i]; marker2[row] = jj; lsub[xlsub[jj]] = row; #ifdef DEBUG printf("Fill col %d.\n", jj); fflush(stdout); #endif } /* Computer the quota */ if (drop_rule & DROP_PROWS) quota = gamma * Astore->nnz / m * jj / m; else if (drop_rule & DROP_COLUMN) quota = gamma * (xa_end[jj] - xa_begin[jj]) * (jj + 1) / m; else if (drop_rule & DROP_AREA) quota = gamma * 0.9 * nnzAj * 0.5 - nnzUj; else quota = m; /* Copy the U-segments to ucol[*] and drop small entries */ if ((*info = ilu_zcopy_to_ucol(jj, nseg, segrep, &repfnz[k], perm_r, &dense[k], drop_rule, milu, amax[jj - jcol] * tol_U, quota, &drop_sum, &nnzUj, Glu, dwork2)) != 0) return; /* Reset the dropping threshold if required */ if (drop_rule & DROP_DYNAMIC) { if (gamma * 0.9 * nnzAj * 0.5 < nnzLj) tol_U = SUPERLU_MIN(1.0, tol_U * 2.0); else tol_U = SUPERLU_MAX(drop_tol, tol_U * 0.5); } if (drop_sum.r != 0.0 && drop_sum.i != 0.0) { omega = SUPERLU_MIN(2.0*(1.0-alpha)/z_abs1(&drop_sum), 1.0); zd_mult(&drop_sum, &drop_sum, omega); } if (usepr) pivrow = iperm_r[jj]; fill_tol = pow(fill_ini, 1.0 - (double)jj / (double)min_mn); if ( (*info = ilu_zpivotL(jj, diag_pivot_thresh, &usepr, perm_r, iperm_c[jj], swap, iswap, marker_relax, &pivrow, amax[jj - jcol] * fill_tol, milu, drop_sum, Glu, stat)) ) { iinfo++; marker[m + pivrow] = jj; marker[2 * m + pivrow] = jj; } /* Reset repfnz[] for this column */ resetrep_col (nseg, segrep, &repfnz[k]); /* Start a new supernode, drop the previous one */ if (jj > 0 && supno[jj] > supno[jj - 1] && jj < last_drop) { int first = xsup[supno[jj - 1]]; int last = jj - 1; int quota; /* Compute the quota */ if (drop_rule & DROP_PROWS) quota = gamma * Astore->nnz / m * (m - first) / m * (last - first + 1); else if (drop_rule & DROP_COLUMN) { int i; quota = 0; for (i = first; i <= last; i++) quota += xa_end[i] - xa_begin[i]; quota = gamma * quota * (m - first) / m; } else if (drop_rule & DROP_AREA) quota = gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) - nnzLj; else quota = m * n; fill_tol = pow(fill_ini, 1.0 - 0.5 * (first + last) / (double)min_mn); /* Drop small rows */ dtempv = (double *) tempv; i = ilu_zdrop_row(options, first, last, tol_L, quota, &nnzLj, &fill_tol, Glu, dtempv, dwork2, 1); /* Reset the parameters */ if (drop_rule & DROP_DYNAMIC) { if (gamma * nnzAj * (1.0 - 0.5 * (last + 1.0) / m) < nnzLj) tol_L = SUPERLU_MIN(1.0, tol_L * 2.0); else tol_L = SUPERLU_MAX(drop_tol, tol_L * 0.5); } if (fill_tol < 0) iinfo -= (int)fill_tol; #ifdef DEBUG num_drop_L += i * (last - first + 1); #endif } /* if start a new supernode */ } /* for */ jcol += panel_size; /* Move to the next panel */ } /* else */ } /* for */ *info = iinfo; if ( m > n ) { k = 0; for (i = 0; i < m; ++i) if ( perm_r[i] == EMPTY ) { perm_r[i] = n + k; ++k; } } ilu_countnz(min_mn, &nnzL, &nnzU, Glu); fixupL(min_mn, perm_r, Glu); zLUWorkFree(iwork, zwork, Glu); /* Free work space and compress storage */ if ( fact == SamePattern_SameRowPerm ) { /* L and U structures may have changed due to possibly different pivoting, even though the storage is available. There could also be memory expansions, so the array locations may have changed, */ ((SCformat *)L->Store)->nnz = nnzL; ((SCformat *)L->Store)->nsuper = Glu->supno[n]; ((SCformat *)L->Store)->nzval = (doublecomplex *) Glu->lusup; ((SCformat *)L->Store)->nzval_colptr = Glu->xlusup; ((SCformat *)L->Store)->rowind = Glu->lsub; ((SCformat *)L->Store)->rowind_colptr = Glu->xlsub; ((NCformat *)U->Store)->nnz = nnzU; ((NCformat *)U->Store)->nzval = (doublecomplex *) Glu->ucol; ((NCformat *)U->Store)->rowind = Glu->usub; ((NCformat *)U->Store)->colptr = Glu->xusub; } else { zCreate_SuperNode_Matrix(L, A->nrow, min_mn, nnzL, (doublecomplex *) Glu->lusup, Glu->xlusup, Glu->lsub, Glu->xlsub, Glu->supno, Glu->xsup, SLU_SC, SLU_Z, SLU_TRLU); zCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, (doublecomplex *) Glu->ucol, Glu->usub, Glu->xusub, SLU_NC, SLU_Z, SLU_TRU); } ops[FACT] += ops[TRSV] + ops[GEMV]; stat->expansions = --(Glu->num_expansions); if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r); SUPERLU_FREE (iperm_c); SUPERLU_FREE (relax_end); SUPERLU_FREE (swap); SUPERLU_FREE (iswap); SUPERLU_FREE (relax_fsupc); SUPERLU_FREE (amax); if ( dwork2 ) SUPERLU_FREE (dwork2); }
main(int argc, char *argv[]) { SuperMatrix A, AC, L, U, B; NCformat *Astore; SCPformat *Lstore; NCPformat *Ustore; superlumt_options_t superlumt_options; pxgstrf_shared_t pxgstrf_shared; pdgstrf_threadarg_t *pdgstrf_threadarg; int nprocs; fact_t fact; trans_t trans; yes_no_t refact, usepr; double u, drop_tol; double *a; int *asub, *xa; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ void *work; int info, lwork, nrhs, ldx; int m, n, nnz, permc_spec, panel_size, relax; int i, firstfact; double *rhsb, *xact; Gstat_t Gstat; flops_t flopcnt; void parse_command_line(); /* Default parameters to control factorization. */ nprocs = 1; fact = EQUILIBRATE; trans = NOTRANS; panel_size = sp_ienv(1); relax = sp_ienv(2); u = 1.0; usepr = NO; drop_tol = 0.0; work = NULL; lwork = 0; nrhs = 1; /* Get the number of processes from command line. */ parse_command_line(argc, argv, &nprocs); /* Read the input matrix stored in Harwell-Boeing format. */ dreadhb(&m, &n, &nnz, &a, &asub, &xa); /* Set up the sparse matrix data structure for A. */ dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); if (!(rhsb = doubleMalloc(m * nrhs))) SUPERLU_ABORT("Malloc fails for rhsb[]."); dCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); ldx = n; dGenXtrue(n, nrhs, xact, ldx); dFillRHS(trans, nrhs, xact, ldx, &A, &B); if (!(perm_r = intMalloc(m))) SUPERLU_ABORT("Malloc fails for perm_r[]."); if (!(perm_c = intMalloc(n))) SUPERLU_ABORT("Malloc fails for perm_c[]."); /******************************** * THE FIRST TIME FACTORIZATION * ********************************/ /* ------------------------------------------------------------ Allocate storage and initialize statistics variables. ------------------------------------------------------------*/ StatAlloc(n, nprocs, panel_size, relax, &Gstat); StatInit(n, nprocs, &Gstat); /* ------------------------------------------------------------ Get column permutation vector perm_c[], according to permc_spec: permc_spec = 0: natural ordering permc_spec = 1: minimum degree ordering on structure of A'*A permc_spec = 2: minimum degree ordering on structure of A'+A permc_spec = 3: approximate minimum degree for unsymmetric matrices ------------------------------------------------------------*/ permc_spec = 1; get_perm_c(permc_spec, &A, perm_c); /* ------------------------------------------------------------ Initialize the option structure superlumt_options using the user-input parameters; Apply perm_c to the columns of original A to form AC. ------------------------------------------------------------*/ refact= NO; pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax, u, usepr, drop_tol, perm_c, perm_r, work, lwork, &A, &AC, &superlumt_options, &Gstat); /* ------------------------------------------------------------ Compute the LU factorization of A. The following routine will create nprocs threads. ------------------------------------------------------------*/ pdgstrf(&superlumt_options, &AC, perm_r, &L, &U, &Gstat, &info); flopcnt = 0; for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops; Gstat.ops[FACT] = flopcnt; /* ------------------------------------------------------------ Solve the system A*X=B, overwriting B with X. ------------------------------------------------------------*/ dgstrs(trans, &L, &U, perm_r, perm_c, &B, &Gstat, &info); printf("\n** Result of sparse LU **\n"); dinf_norm_error(nrhs, &B, xact); /* Check inf. norm of the error */ Destroy_CompCol_Permuted(&AC); /* Free extra arrays in AC. */ /********************************* * THE SUBSEQUENT FACTORIZATIONS * *********************************/ /* ------------------------------------------------------------ Re-initialize statistics variables and options used by the factorization routine pdgstrf(). ------------------------------------------------------------*/ StatInit(n, nprocs, &Gstat); refact= YES; pdgstrf_init(nprocs, fact, trans, refact, panel_size, relax, u, usepr, drop_tol, perm_c, perm_r, work, lwork, &A, &AC, &superlumt_options, &Gstat); /* ------------------------------------------------------------ Compute the LU factorization of A. The following routine will create nprocs threads. ------------------------------------------------------------*/ pdgstrf(&superlumt_options, &AC, perm_r, &L, &U, &Gstat, &info); flopcnt = 0; for (i = 0; i < nprocs; ++i) flopcnt += Gstat.procstat[i].fcops; Gstat.ops[FACT] = flopcnt; /* ------------------------------------------------------------ Re-generate right-hand side B, then solve A*X= B. ------------------------------------------------------------*/ dFillRHS(trans, nrhs, xact, ldx, &A, &B); dgstrs(trans, &L, &U, perm_r, perm_c, &B, &Gstat, &info); /* ------------------------------------------------------------ Deallocate storage after factorization. ------------------------------------------------------------*/ pxgstrf_finalize(&superlumt_options, &AC); printf("\n** Result of sparse LU **\n"); dinf_norm_error(nrhs, &B, xact); /* Check inf. norm of the error */ Lstore = (SCPformat *) L.Store; Ustore = (NCPformat *) U.Store; printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); fflush(stdout); SUPERLU_FREE (rhsb); SUPERLU_FREE (xact); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); if ( lwork >= 0 ) { Destroy_SuperNode_SCP(&L); Destroy_CompCol_NCP(&U); } StatFree(&Gstat); }
int main(int argc, char *argv[]) { /* * Purpose * ======= * * DDRIVE is the main test program for the DOUBLE linear * equation driver routines DGSSV and DGSSVX. * * The program is invoked by a shell script file -- dtest.csh. * The output from the tests are written into a file -- dtest.out. * * ===================================================================== */ double *a, *a_save; int *asub, *asub_save; int *xa, *xa_save; SuperMatrix A, B, X, L, U; SuperMatrix ASAV, AC; GlobalLU_t Glu; /* Not needed on return. */ mem_usage_t mem_usage; int *perm_r; /* row permutation from partial pivoting */ int *perm_c, *pc_save; /* column permutation */ int *etree; double zero = 0.0; double *R, *C; double *ferr, *berr; double *rwork; double *wwork; void *work; int info, lwork, nrhs, panel_size, relax; int m, n, nnz; double *xact; double *rhsb, *solx, *bsav; int ldb, ldx; double rpg, rcond; int i, j, k1; double rowcnd, colcnd, amax; int maxsuper, rowblk, colblk; int prefact, nofact, equil, iequed; int nt, nrun, nfail, nerrs, imat, fimat, nimat; int nfact, ifact, itran; int kl, ku, mode, lda; int zerot, izero, ioff; double u; double anorm, cndnum; double *Afull; double result[NTESTS]; superlu_options_t options; fact_t fact; trans_t trans; SuperLUStat_t stat; static char matrix_type[8]; static char equed[1], path[4], sym[1], dist[1]; FILE *fp; /* Fixed set of parameters */ int iseed[] = {1988, 1989, 1990, 1991}; static char equeds[] = {'N', 'R', 'C', 'B'}; static fact_t facts[] = {FACTORED, DOFACT, SamePattern, SamePattern_SameRowPerm}; static trans_t transs[] = {NOTRANS, TRANS, CONJ}; /* Some function prototypes */ extern int dgst01(int, int, SuperMatrix *, SuperMatrix *, SuperMatrix *, int *, int *, double *); extern int dgst02(trans_t, int, int, int, SuperMatrix *, double *, int, double *, int, double *resid); extern int dgst04(int, int, double *, int, double *, int, double rcond, double *resid); extern int dgst07(trans_t, int, int, SuperMatrix *, double *, int, double *, int, double *, int, double *, double *, double *); extern int dlatb4_slu(char *, int *, int *, int *, char *, int *, int *, double *, int *, double *, char *); extern int dlatms_slu(int *, int *, char *, int *, char *, double *d, int *, double *, double *, int *, int *, char *, double *, int *, double *, int *); extern int sp_dconvert(int, int, double *, int, int, int, double *a, int *, int *, int *); /* Executable statements */ strcpy(path, "DGE"); nrun = 0; nfail = 0; nerrs = 0; /* Defaults */ lwork = 0; n = 1; nrhs = 1; panel_size = sp_ienv(1); relax = sp_ienv(2); u = 1.0; strcpy(matrix_type, "LA"); parse_command_line(argc, argv, matrix_type, &n, &panel_size, &relax, &nrhs, &maxsuper, &rowblk, &colblk, &lwork, &u, &fp); if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) { fprintf(stderr, "expert: cannot allocate %d bytes\n", lwork); exit (-1); } } /* Set the default input options. */ set_default_options(&options); options.DiagPivotThresh = u; options.PrintStat = NO; options.PivotGrowth = YES; options.ConditionNumber = YES; options.IterRefine = SLU_DOUBLE; if ( strcmp(matrix_type, "LA") == 0 ) { /* Test LAPACK matrix suite. */ m = n; lda = SUPERLU_MAX(n, 1); nnz = n * n; /* upper bound */ fimat = 1; nimat = NTYPES; Afull = doubleCalloc(lda * n); dallocateA(n, nnz, &a, &asub, &xa); } else { /* Read a sparse matrix */ fimat = nimat = 0; dreadhb(fp, &m, &n, &nnz, &a, &asub, &xa); } dallocateA(n, nnz, &a_save, &asub_save, &xa_save); rhsb = doubleMalloc(m * nrhs); bsav = doubleMalloc(m * nrhs); solx = doubleMalloc(n * nrhs); ldb = m; ldx = n; dCreate_Dense_Matrix(&B, m, nrhs, rhsb, ldb, SLU_DN, SLU_D, SLU_GE); dCreate_Dense_Matrix(&X, n, nrhs, solx, ldx, SLU_DN, SLU_D, SLU_GE); xact = doubleMalloc(n * nrhs); etree = intMalloc(n); perm_r = intMalloc(n); perm_c = intMalloc(n); pc_save = intMalloc(n); R = (double *) SUPERLU_MALLOC(m*sizeof(double)); C = (double *) SUPERLU_MALLOC(n*sizeof(double)); ferr = (double *) SUPERLU_MALLOC(nrhs*sizeof(double)); berr = (double *) SUPERLU_MALLOC(nrhs*sizeof(double)); j = SUPERLU_MAX(m,n) * SUPERLU_MAX(4,nrhs); rwork = (double *) SUPERLU_MALLOC(j*sizeof(double)); for (i = 0; i < j; ++i) rwork[i] = 0.; if ( !R ) ABORT("SUPERLU_MALLOC fails for R"); if ( !C ) ABORT("SUPERLU_MALLOC fails for C"); if ( !ferr ) ABORT("SUPERLU_MALLOC fails for ferr"); if ( !berr ) ABORT("SUPERLU_MALLOC fails for berr"); if ( !rwork ) ABORT("SUPERLU_MALLOC fails for rwork"); wwork = doubleCalloc( SUPERLU_MAX(m,n) * SUPERLU_MAX(4,nrhs) ); for (i = 0; i < n; ++i) perm_c[i] = pc_save[i] = i; options.ColPerm = MY_PERMC; for (imat = fimat; imat <= nimat; ++imat) { /* All matrix types */ if ( imat ) { /* Skip types 5, 6, or 7 if the matrix size is too small. */ zerot = (imat >= 5 && imat <= 7); if ( zerot && n < imat-4 ) continue; /* Set up parameters with DLATB4 and generate a test matrix with DLATMS. */ dlatb4_slu(path, &imat, &n, &n, sym, &kl, &ku, &anorm, &mode, &cndnum, dist); dlatms_slu(&n, &n, dist, iseed, sym, &rwork[0], &mode, &cndnum, &anorm, &kl, &ku, "No packing", Afull, &lda, &wwork[0], &info); if ( info ) { printf(FMT3, "DLATMS", info, izero, n, nrhs, imat, nfail); continue; } /* For types 5-7, zero one or more columns of the matrix to test that INFO is returned correctly. */ if ( zerot ) { if ( imat == 5 ) izero = 1; else if ( imat == 6 ) izero = n; else izero = n / 2 + 1; ioff = (izero - 1) * lda; if ( imat < 7 ) { for (i = 0; i < n; ++i) Afull[ioff + i] = zero; } else { for (j = 0; j < n - izero + 1; ++j) for (i = 0; i < n; ++i) Afull[ioff + i + j*lda] = zero; } } else { izero = 0; } /* Convert to sparse representation. */ sp_dconvert(n, n, Afull, lda, kl, ku, a, asub, xa, &nnz); } else { izero = 0; zerot = 0; } dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_D, SLU_GE); /* Save a copy of matrix A in ASAV */ dCreate_CompCol_Matrix(&ASAV, m, n, nnz, a_save, asub_save, xa_save, SLU_NC, SLU_D, SLU_GE); dCopy_CompCol_Matrix(&A, &ASAV); /* Form exact solution. */ dGenXtrue(n, nrhs, xact, ldx); StatInit(&stat); for (iequed = 0; iequed < 4; ++iequed) { *equed = equeds[iequed]; if (iequed == 0) nfact = 4; else nfact = 1; /* Only test factored, pre-equilibrated matrix */ for (ifact = 0; ifact < nfact; ++ifact) { fact = facts[ifact]; options.Fact = fact; for (equil = 0; equil < 2; ++equil) { options.Equil = equil; prefact = ( options.Fact == FACTORED || options.Fact == SamePattern_SameRowPerm ); /* Need a first factor */ nofact = (options.Fact != FACTORED); /* Not factored */ /* Restore the matrix A. */ dCopy_CompCol_Matrix(&ASAV, &A); if ( zerot ) { if ( prefact ) continue; } else if ( options.Fact == FACTORED ) { if ( equil || iequed ) { /* Compute row and column scale factors to equilibrate matrix A. */ dgsequ(&A, R, C, &rowcnd, &colcnd, &amax, &info); /* Force equilibration. */ if ( !info && n > 0 ) { if ( strncmp(equed, "R", 1)==0 ) { rowcnd = 0.; colcnd = 1.; } else if ( strncmp(equed, "C", 1)==0 ) { rowcnd = 1.; colcnd = 0.; } else if ( strncmp(equed, "B", 1)==0 ) { rowcnd = 0.; colcnd = 0.; } } /* Equilibrate the matrix. */ dlaqgs(&A, R, C, rowcnd, colcnd, amax, equed); } } if ( prefact ) { /* Need a factor for the first time */ /* Save Fact option. */ fact = options.Fact; options.Fact = DOFACT; /* Preorder the matrix, obtain the column etree. */ sp_preorder(&options, &A, perm_c, etree, &AC); /* Factor the matrix AC. */ dgstrf(&options, &AC, relax, panel_size, etree, work, lwork, perm_c, perm_r, &L, &U, &Glu, &stat, &info); if ( info ) { printf("** First factor: info %d, equed %c\n", info, *equed); if ( lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); exit(0); } } Destroy_CompCol_Permuted(&AC); /* Restore Fact option. */ options.Fact = fact; } /* if .. first time factor */ for (itran = 0; itran < NTRAN; ++itran) { trans = transs[itran]; options.Trans = trans; /* Restore the matrix A. */ dCopy_CompCol_Matrix(&ASAV, &A); /* Set the right hand side. */ dFillRHS(trans, nrhs, xact, ldx, &A, &B); dCopy_Dense_Matrix(m, nrhs, rhsb, ldb, bsav, ldb); /*---------------- * Test dgssv *----------------*/ if ( options.Fact == DOFACT && itran == 0) { /* Not yet factored, and untransposed */ dCopy_Dense_Matrix(m, nrhs, rhsb, ldb, solx, ldx); dgssv(&options, &A, perm_c, perm_r, &L, &U, &X, &stat, &info); if ( info && info != izero ) { printf(FMT3, "dgssv", info, izero, n, nrhs, imat, nfail); } else { /* Reconstruct matrix from factors and compute residual. */ dgst01(m, n, &A, &L, &U, perm_c, perm_r, &result[0]); nt = 1; if ( izero == 0 ) { /* Compute residual of the computed solution. */ dCopy_Dense_Matrix(m, nrhs, rhsb, ldb, wwork, ldb); dgst02(trans, m, n, nrhs, &A, solx, ldx, wwork,ldb, &result[1]); nt = 2; } /* Print information about the tests that did not pass the threshold. */ for (i = 0; i < nt; ++i) { if ( result[i] >= THRESH ) { printf(FMT1, "dgssv", n, i, result[i]); ++nfail; } } nrun += nt; } /* else .. info == 0 */ /* Restore perm_c. */ for (i = 0; i < n; ++i) perm_c[i] = pc_save[i]; if (lwork == 0) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } } /* if .. end of testing dgssv */ /*---------------- * Test dgssvx *----------------*/ /* Equilibrate the matrix if fact = FACTORED and equed = 'R', 'C', or 'B'. */ if ( options.Fact == FACTORED && (equil || iequed) && n > 0 ) { dlaqgs(&A, R, C, rowcnd, colcnd, amax, equed); } /* Solve the system and compute the condition number and error bounds using dgssvx. */ dgssvx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, ferr, berr, &Glu, &mem_usage, &stat, &info); if ( info && info != izero ) { printf(FMT3, "dgssvx", info, izero, n, nrhs, imat, nfail); if ( lwork == -1 ) { printf("** Estimated memory: %.0f bytes\n", mem_usage.total_needed); exit(0); } } else { if ( !prefact ) { /* Reconstruct matrix from factors and compute residual. */ dgst01(m, n, &A, &L, &U, perm_c, perm_r, &result[0]); k1 = 0; } else { k1 = 1; } if ( !info ) { /* Compute residual of the computed solution.*/ dCopy_Dense_Matrix(m, nrhs, bsav, ldb, wwork, ldb); dgst02(trans, m, n, nrhs, &ASAV, solx, ldx, wwork, ldb, &result[1]); /* Check solution from generated exact solution. */ dgst04(n, nrhs, solx, ldx, xact, ldx, rcond, &result[2]); /* Check the error bounds from iterative refinement. */ dgst07(trans, n, nrhs, &ASAV, bsav, ldb, solx, ldx, xact, ldx, ferr, berr, &result[3]); /* Print information about the tests that did not pass the threshold. */ for (i = k1; i < NTESTS; ++i) { if ( result[i] >= THRESH ) { printf(FMT2, "dgssvx", options.Fact, trans, *equed, n, imat, i, result[i]); ++nfail; } } nrun += NTESTS; } /* if .. info == 0 */ } /* else .. end of testing dgssvx */ } /* for itran ... */ if ( lwork == 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } } /* for equil ... */ } /* for ifact ... */ } /* for iequed ... */ #if 0 if ( !info ) { PrintPerf(&L, &U, &mem_usage, rpg, rcond, ferr, berr, equed); } #endif Destroy_SuperMatrix_Store(&A); Destroy_SuperMatrix_Store(&ASAV); StatFree(&stat); } /* for imat ... */ /* Print a summary of the results. */ PrintSumm("DGE", nfail, nrun, nerrs); if ( strcmp(matrix_type, "LA") == 0 ) SUPERLU_FREE (Afull); SUPERLU_FREE (rhsb); SUPERLU_FREE (bsav); SUPERLU_FREE (solx); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (pc_save); SUPERLU_FREE (R); SUPERLU_FREE (C); SUPERLU_FREE (ferr); SUPERLU_FREE (berr); SUPERLU_FREE (rwork); SUPERLU_FREE (wwork); Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); #if 0 Destroy_CompCol_Matrix(&A); Destroy_CompCol_Matrix(&ASAV); #else SUPERLU_FREE(a); SUPERLU_FREE(asub); SUPERLU_FREE(xa); SUPERLU_FREE(a_save); SUPERLU_FREE(asub_save); SUPERLU_FREE(xa_save); #endif if ( lwork > 0 ) { SUPERLU_FREE (work); Destroy_SuperMatrix_Store(&L); Destroy_SuperMatrix_Store(&U); } return 0; }
/*! \brief * * <pre> * Purpose * ======= * * dgstrsU only performs the U-solve using the LU factorization computed * by DGSTRF. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) trans_t * Specifies the form of the system of equations: * = NOTRANS: A * X = B (No transpose) * = TRANS: A'* X = B (Transpose) * = CONJ: A**H * X = B (Conjugate transpose) * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, i.e., U has types: * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. * * perm_c (input) int*, dimension (L->ncol) * Column permutation vector, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * perm_r (input) int*, dimension (L->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * B (input/output) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * </pre> */ void dgstrsU(trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, SuperMatrix *B, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif #ifdef USE_VENDOR_BLAS double alpha = 1.0, beta = 1.0; double *work_col; #endif DNformat *Bstore; double *Bmat; SCformat *Lstore; NCformat *Ustore; double *Lval, *Uval; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, j, k, jcol, n, ldb, nrhs; double *rhs_work, *soln; flops_t solve_ops; void dprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) *info = -2; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_D || U->Mtype != SLU_TRU ) *info = -3; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -6; if ( *info ) { i = -(*info); xerbla_("dgstrs", &i); return; } n = L->nrow; soln = doubleMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( trans == NOTRANS ) { /* * Back solve Ux=y. */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += nsupc * (nsupc + 1) * nrhs; if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { rhs_work[fsupc] /= Lval[luptr]; rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else dtrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0; j < nrhs; j++) dusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); #endif } for (j = 0; j < nrhs; ++j) { rhs_work = &Bmat[j*ldb]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ irow = U_SUB(i); rhs_work[irow] -= rhs_work[jcol] * Uval[i]; } } } } /* for U-solve */ #ifdef DEBUG printf("After U-solve: x=\n"); dprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X := Pc*X. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = solve_ops; } else { /* Solve U'x = b */ /* Permute right hand sides to form Pc'*B. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_dtrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); } } SUPERLU_FREE(soln); }
/*! \brief * * <pre> * Purpose * ======= * * dgstrsL only performs the L-solve using the LU factorization computed * by DGSTRF. * * See supermatrix.h for the definition of 'SuperMatrix' structure. * * Arguments * ========= * * trans (input) char* * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A'* X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * L (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SLU_SC, Dtype = SLU_D, Mtype = SLU_TRLU. * * U (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U as computed by * dgstrf(). Use column-wise storage scheme, i.e., U has types: * Stype = SLU_NC, Dtype = SLU_D, Mtype = SLU_TRU. * * perm_r (input) int*, dimension (L->nrow) * Row permutation vector, which defines the permutation matrix Pr; * perm_r[i] = j means row i of A is in position j in Pr*A. * * B (input/output) SuperMatrix* * B has types: Stype = SLU_DN, Dtype = SLU_D, Mtype = SLU_GE. * On entry, the right hand side matrix. * On exit, the solution matrix if info = 0; * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * </pre> */ void dgstrsL(char *trans, SuperMatrix *L, int *perm_r, SuperMatrix *B, int *info) { #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif int incx = 1, incy = 1; double alpha = 1.0, beta = 1.0; DNformat *Bstore; double *Bmat; SCformat *Lstore; double *Lval, *Uval; int nrow, notran; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, j, k, iptr, jcol, n, ldb, nrhs; double *work, *work_col, *rhs_work, *soln; flops_t solve_ops; extern SuperLUStat_t SuperLUStat; void dprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; notran = lsame_(trans, "N"); if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C") ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_D || L->Mtype != SLU_TRLU ) *info = -2; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_D || B->Mtype != SLU_GE ) *info = -4; if ( *info ) { i = -(*info); xerbla_("dgstrsL", &i); return; } n = L->nrow; work = doubleCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = doubleMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; solve_ops = 0; if ( notran ) { /* Permute right hand sides to form Pr*B */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; nrow = nsupr - nsupc; solve_ops += nsupc * (nsupc - 1) * nrhs; solve_ops += 2 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ irow = L_SUB(iptr); ++luptr; rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr]; } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else dtrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); dgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work_col[i]; /* Scatter */ work_col[i] = 0.0; iptr++; } } #else for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; dlsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); dmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); rhs_work[irow] -= work[i]; work[i] = 0.0; iptr++; } } #endif } /* else ... */ } /* for L-solve */ #ifdef DEBUG printf("After L-solve: y=\n"); dprint_soln(n, nrhs, Bmat); #endif SuperLUStat.ops[SOLVE] = solve_ops; } else { printf("Transposed solve not implemented.\n"); exit(0); } SUPERLU_FREE(work); SUPERLU_FREE(soln); }