int pcgstrf_column_bmod( const int pnum, /* process number */ const int jcol, /* current column in the panel */ const int fpanelc,/* first column in the panel */ const int nseg, /* number of s-nodes to update jcol */ int *segrep,/* in */ int *repfnz,/* in */ complex *dense, /* modified */ complex *tempv, /* working array */ pxgstrf_shared_t *pxgstrf_shared, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j]. * * Return value: * ============= * 0 - successful return * > 0 - number of bytes allocated when run out of space * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; complex alpha, beta; #endif GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ complex ukj, ukj1, ukj2; register int lptr, kfnz, isub, irow, i, no_zeros; register int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jsupno, k, ksub, krep, krep_ind, ksupno; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode.*/ int *xsup, *supno; int *lsub, *xlsub, *xlsub_end; complex *lusup; int *xlusup, *xlusup_end; complex *tempv1; int mem_error; register float flopcnt; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; complex comp_temp, comp_temp1; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; #if ( DEBUGlvel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n", pnum, jcol, nseg, krep, jsupno, ksupno); #endif if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;//sj Gstat->procstat[pnum].fcops += flopcnt; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\ fsupc %d, nsupr %d, nsupc %d\n", pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc); #endif /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense[irow], &dense[irow], &comp_temp); luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; } /* Scatter tempv1[] into SPA dense[*] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense[irow], &dense[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } /* else segsze >= 4 */ } /* if jsupno ... */ } /* for each segment... */ /* ------------------------------------------ Process the supernodal portion of L\U[*,j] ------------------------------------------ */ fsupc = SUPER_FSUPC (jsupno); nsupr = xlsub_end[fsupc] - xlsub[fsupc]; if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, pxgstrf_shared)) ) return mem_error; xlusup[jcol] = nextlu; lusup = Glu->lusup; /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; #ifdef DEBUG if (jcol == -1) printf("(%d) pcgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n", pnum, jcol, irow, lusup[nextlu]); #endif ++nextlu; } xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */ #if ( DEBUGlevel>=2 ) if (jcol == -1) { nrow = xlusup_end[jcol] - xlusup[jcol]; print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]], &lusup[xlusup[jcol]]); } #endif /* * For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * (1) fsupc < fpanelc, then fst_col := fpanelc * (2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* distance between the current supernode and the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* points to the beginning of jcol in supernode L\U[*,jsupno] */ ufirst = xlusup[jcol] + d_fsupc; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n", pnum, jcol, fsupc, nsupr, nsupc, nrow); #endif flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[isub], &lusup[isub], &tempv[i]); tempv[i] = zero; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
void cpanel_bmod ( const int m, /* in - number of rows in the matrix */ const int w, /* in */ const int jcol, /* in */ const int nseg, /* in */ complex *dense, /* out, of size n by w */ complex *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in, of size n by w */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; complex alpha, beta; #endif register int k, ksub; int fsupc, nsupc, nsupr, nrow; int krep, krep_ind; complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; int block_nrow; /* no of rows in a block row */ register int lptr; /* Points to the row subscripts of a supernode */ int kfnz, irow, no_zeros; register int isub, isub1, i; register int jj; /* Index through each column in the panel */ int *xsup, *supno; int *lsub, *xlsub; complex *lusup; int *xlusup; int *repfnz_col; /* repfnz[] for a column in the panel */ complex *dense_col; /* dense[] for a column in the panel */ complex *tempv1; /* Used in 1-D update */ complex *TriTmp, *MatvecTmp; /* used in 2-D update */ complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex comp_temp, comp_temp1; register int ldaTmp; register int r_ind, r_hi; static int first = 1, maxsuper, rowblk, colblk; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; if ( first ) { maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } ldaTmp = maxsuper + rowblk; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nrow = nsupr - nsupc; lptr = xlsub[fsupc]; krep_ind = lptr + nsupc - 1; repfnz_col = repfnz; dense_col = dense; if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ TriTmp = tempv; /* Sequence through each column in panel -- triangular solves */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += 4 * segsze * (segsze - 1); ops[GEMV] += 8 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr += nsupr*(nsupc-1) + nsupc-1; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } } else { /* segsze >= 4 */ /* Copy U[*,j] segment from dense[*] to TriTmp[*], which holds the result of triangular solves. */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; TriTmp[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); #endif } /* else ... */ } /* for jj ... end tri-solves */ /* Block row updates; push all the way into dense[*] block */ for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); luptr = xlusup[fsupc] + nsupc + r_ind; isub1 = lptr + nsupc + r_ind; repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; /* Sequence through each column in panel -- matrix-vector */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ /* Perform a block update, and scatter the result of matrix-vector to dense[]. */ no_zeros = kfnz - fsupc; luptr1 = luptr + nsupr * no_zeros; MatvecTmp = &TriTmp[maxsuper]; #ifdef USE_VENDOR_BLAS alpha = one; beta = zero; #ifdef _CRAY CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #else cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #endif #else cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], TriTmp, MatvecTmp); #endif /* Scatter MatvecTmp[*] into SPA dense[*] temporarily * such that MatvecTmp[*] can be re-used for the * the next blok row update. dense[] will be copied into * global store after the whole panel has been finished. */ isub = isub1; for (i = 0; i < block_nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &MatvecTmp[i]); MatvecTmp[i] = zero; ++isub; } } /* for jj ... */ } /* for each block row ... */ /* Scatter the triangular solves into SPA dense[*] */ repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = TriTmp[i]; TriTmp[i] = zero; ++isub; } } /* for jj ... */ } else { /* 1-D block modification */ /* Sequence through each column in the panel */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += 4 * segsze * (segsze - 1); ops[GEMV] += 8 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } } else { /* segsze >= 4 */ /* * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense[]. */ no_zeros = kfnz - fsupc; /* Copy U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[*] into SPA dense[*] temporarily, such * that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; tempv[i] = zero; isub++; } /* Scatter the update from tempv1[*] into SPA dense[*] */ /* Start dense rectangular L */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } /* else segsze>=4 ... */ } /* for each column in the panel... */ } /* else 1-D update ... */ } /* for each updating supernode ... */ }
int cgst01(int m, int n, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, float *resid) { /* Purpose ======= CGST01 reconstructs a matrix A from its L*U factorization and computes the residual norm(L*U - A) / ( N * norm(A) * EPS ), where EPS is the machine epsilon. Arguments ========== M (input) INT The number of rows of the matrix A. M >= 0. N (input) INT The number of columns of the matrix A. N >= 0. A (input) SuperMatrix *, dimension (A->nrow, A->ncol) The original M x N matrix A. L (input) SuperMatrix *, dimension (L->nrow, L->ncol) The factor matrix L. U (input) SuperMatrix *, dimension (U->nrow, U->ncol) The factor matrix U. perm_c (input) INT array, dimension (N) The column permutation from CGSTRF. perm_r (input) INT array, dimension (M) The pivot indices from CGSTRF. RESID (output) FLOAT* norm(L*U - A) / ( N * norm(A) * EPS ) ===================================================================== */ /* Local variables */ complex zero = {0.0, 0.0}; int i, j, k, arow, lptr,isub, urow, superno, fsupc, u_part; complex utemp, comp_temp; float anorm, tnorm, cnorm; float eps; complex *work; SCformat *Lstore; NCformat *Astore, *Ustore; complex *Aval, *Lval, *Uval; int *colbeg, *colend; /* Function prototypes */ extern float clangs(char *, SuperMatrix *); /* Quick exit if M = 0 or N = 0. */ if (m <= 0 || n <= 0) { *resid = 0.f; return 0; } work = (complex *)complexCalloc(m); Astore = A->Store; Aval = Astore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; colbeg = intMalloc(n); colend = intMalloc(n); for (i = 0; i < n; i++) { colbeg[perm_c[i]] = Astore->colptr[i]; colend[perm_c[i]] = Astore->colptr[i+1]; } /* Determine EPS and the norm of A. */ eps = smach("Epsilon"); anorm = clangs("1", A); cnorm = 0.; /* Compute the product L*U, one column at a time */ for (k = 0; k < n; ++k) { /* The U part outside the rectangular supernode */ for (i = U_NZ_START(k); i < U_NZ_START(k+1); ++i) { urow = U_SUB(i); utemp = Uval[i]; superno = Lstore->col_to_sup[urow]; fsupc = L_FST_SUPC(superno); u_part = urow - fsupc + 1; lptr = L_SUB_START(fsupc) + u_part; work[L_SUB(lptr-1)].r -= utemp.r; work[L_SUB(lptr-1)].i -= utemp.i; for (j = L_NZ_START(urow) + u_part; j < L_NZ_START(urow+1); ++j) { isub = L_SUB(lptr); cc_mult(&comp_temp, &utemp, &Lval[j]); c_sub(&work[isub], &work[isub], &comp_temp); ++lptr; } } /* The U part inside the rectangular supernode */ superno = Lstore->col_to_sup[k]; fsupc = L_FST_SUPC(superno); urow = L_NZ_START(k); for (i = fsupc; i <= k; ++i) { utemp = Lval[urow++]; u_part = i - fsupc + 1; lptr = L_SUB_START(fsupc) + u_part; work[L_SUB(lptr-1)].r -= utemp.r; work[L_SUB(lptr-1)].i -= utemp.i; for (j = L_NZ_START(i)+u_part; j < L_NZ_START(i+1); ++j) { isub = L_SUB(lptr); cc_mult(&comp_temp, &utemp, &Lval[j]); c_sub(&work[isub], &work[isub], &comp_temp); ++lptr; } } /* Now compute A[k] - (L*U)[k] (Both matrices may be permuted.) */ for (i = colbeg[k]; i < colend[k]; ++i) { arow = Astore->rowind[i]; work[perm_r[arow]].r += Aval[i].r; work[perm_r[arow]].i += Aval[i].i; } /* Now compute the 1-norm of the column vector work */ tnorm = 0.; for (i = 0; i < m; ++i) { tnorm += fabs(work[i].r) + fabs(work[i].i); work[i] = zero; } cnorm = SUPERLU_MAX(tnorm, cnorm); } *resid = cnorm; if (anorm <= 0.f) { if (*resid != 0.f) { *resid = 1.f / eps; } } else { *resid = *resid / (float) n / anorm / eps; } SUPERLU_FREE(work); SUPERLU_FREE(colbeg); SUPERLU_FREE(colend); return 0; /* End of CGST01 */ } /* cgst01_ */
int cpivotL( const int jcol, /* in */ const float u, /* in - diagonal pivoting threshold */ int *usepr, /* re-use the pivot sequence given by perm_r/iperm_r */ int *perm_r, /* may be modified */ int *iperm_r, /* in - inverse of perm_r */ int *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */ int *pivrow, /* out */ GlobalLU_t *Glu, /* modified - global LU data structures */ SuperLUStat_t *stat /* output */ ) { complex one = {1.0, 0.0}; int fsupc; /* first column in the supernode */ int nsupc; /* no of columns in the supernode */ int nsupr; /* no of rows in the supernode */ int lptr; /* points to the starting subscript of the supernode */ int pivptr, old_pivptr, diag, diagind; float pivmax, rtemp, thresh; complex temp; complex *lu_sup_ptr; complex *lu_col_ptr; int *lsub_ptr; int isub, icol, k, itemp; int *lsub, *xlsub; complex *lusup; int *xlusup; flops_t *ops = stat->ops; /* Initialize pointers */ lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ lptr = xlsub[fsupc]; nsupr = xlsub[fsupc+1] - lptr; lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ #ifdef DEBUG if ( jcol == MIN_COL ) { printf(_T("Before cdiv: col %d\n"), jcol); for (k = nsupc; k < nsupr; k++) printf(_T(" lu[%d] %f\n"), lsub_ptr[k], lu_col_ptr[k]); } #endif /* Determine the largest abs numerical value for partial pivoting; Also search for user-specified pivot, and diagonal element. */ if ( *usepr ) *pivrow = iperm_r[jcol]; diagind = iperm_c[jcol]; pivmax = 0.0; pivptr = nsupc; diag = EMPTY; old_pivptr = nsupc; for (isub = nsupc; isub < nsupr; ++isub) { rtemp = c_abs1 (&lu_col_ptr[isub]); if ( rtemp > pivmax ) { pivmax = rtemp; pivptr = isub; } if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub; if ( lsub_ptr[isub] == diagind ) diag = isub; } /* Test for singularity */ if ( pivmax == 0.0 ) { *pivrow = lsub_ptr[pivptr]; perm_r[*pivrow] = jcol; *usepr = 0; return (jcol+1); } thresh = u * pivmax; /* Choose appropriate pivotal element by our policy. */ if ( *usepr ) { rtemp = c_abs1 (&lu_col_ptr[old_pivptr]); if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr; else *usepr = 0; } if ( *usepr == 0 ) { /* Use diagonal pivot? */ if ( diag >= 0 ) { /* diagonal exists */ rtemp = c_abs1 (&lu_col_ptr[diag]); if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; } *pivrow = lsub_ptr[pivptr]; } /* Record pivot row */ perm_r[*pivrow] = jcol; /* Interchange row subscripts */ if ( pivptr != nsupc ) { itemp = lsub_ptr[pivptr]; lsub_ptr[pivptr] = lsub_ptr[nsupc]; lsub_ptr[nsupc] = itemp; /* Interchange numerical values as well, for the whole snode, such * that L is indexed the same way as A. */ for (icol = 0; icol <= nsupc; icol++) { itemp = pivptr + icol * nsupr; temp = lu_sup_ptr[itemp]; lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; lu_sup_ptr[nsupc + icol*nsupr] = temp; } } /* if */ /* cdiv operation */ ops[FACT] += 10 * (nsupr - nsupc); c_div(&temp, &one, &lu_col_ptr[nsupc]); for (k = nsupc+1; k < nsupr; k++) cc_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); return 0; }
int ilu_cpivotL( const int jcol, /* in */ const double u, /* in - diagonal pivoting threshold */ int *usepr, /* re-use the pivot sequence given by * perm_r/iperm_r */ int *perm_r, /* may be modified */ int diagind, /* diagonal of Pc*A*Pc' */ int *swap, /* in/out record the row permutation */ int *iswap, /* in/out inverse of swap, it is the same as perm_r after the factorization */ int *marker, /* in */ int *pivrow, /* in/out, as an input if *usepr!=0 */ double fill_tol, /* in - fill tolerance of current column * used for a singular column */ milu_t milu, /* in */ complex drop_sum, /* in - computed in ilu_ccopy_to_ucol() (MILU only) */ GlobalLU_t *Glu, /* modified - global LU data structures */ SuperLUStat_t *stat /* output */ ) { int n; /* number of columns */ int fsupc; /* first column in the supernode */ int nsupc; /* no of columns in the supernode */ int nsupr; /* no of rows in the supernode */ int lptr; /* points to the starting subscript of the supernode */ register int pivptr; int old_pivptr, diag, ptr0; register float pivmax, rtemp; float thresh; complex temp; complex *lu_sup_ptr; complex *lu_col_ptr; int *lsub_ptr; register int isub, icol, k, itemp; int *lsub, *xlsub; complex *lusup; int *xlusup; flops_t *ops = stat->ops; int info; complex one = {1.0, 0.0}; /* Initialize pointers */ n = Glu->n; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; fsupc = (Glu->xsup)[(Glu->supno)[jcol]]; nsupc = jcol - fsupc; /* excluding jcol; nsupc >= 0 */ lptr = xlsub[fsupc]; nsupr = xlsub[fsupc+1] - lptr; lu_sup_ptr = &lusup[xlusup[fsupc]]; /* start of the current supernode */ lu_col_ptr = &lusup[xlusup[jcol]]; /* start of jcol in the supernode */ lsub_ptr = &lsub[lptr]; /* start of row indices of the supernode */ /* Determine the largest abs numerical value for partial pivoting; Also search for user-specified pivot, and diagonal element. */ pivmax = -1.0; pivptr = nsupc; diag = EMPTY; old_pivptr = nsupc; ptr0 = EMPTY; for (isub = nsupc; isub < nsupr; ++isub) { if (marker[lsub_ptr[isub]] > jcol) continue; /* do not overlap with a later relaxed supernode */ switch (milu) { case SMILU_1: c_add(&temp, &lu_col_ptr[isub], &drop_sum); rtemp = c_abs1(&temp); break; case SMILU_2: case SMILU_3: /* In this case, drop_sum contains the sum of the abs. value */ rtemp = c_abs1(&lu_col_ptr[isub]); break; case SILU: default: rtemp = c_abs1(&lu_col_ptr[isub]); break; } if (rtemp > pivmax) { pivmax = rtemp; pivptr = isub; } if (*usepr && lsub_ptr[isub] == *pivrow) old_pivptr = isub; if (lsub_ptr[isub] == diagind) diag = isub; if (ptr0 == EMPTY) ptr0 = isub; } if (milu == SMILU_2 || milu == SMILU_3) pivmax += drop_sum.r; /* Test for singularity */ if (pivmax < 0.0) { fprintf(stderr, "[0]: jcol=%d, SINGULAR!!!\n", jcol); fflush(stderr); exit(1); } if ( pivmax == 0.0 ) { if (diag != EMPTY) *pivrow = lsub_ptr[pivptr = diag]; else if (ptr0 != EMPTY) *pivrow = lsub_ptr[pivptr = ptr0]; else { /* look for the first row which does not belong to any later supernodes */ for (icol = jcol; icol < n; icol++) if (marker[swap[icol]] <= jcol) break; if (icol >= n) { fprintf(stderr, "[1]: jcol=%d, SINGULAR!!!\n", jcol); fflush(stderr); exit(1); } *pivrow = swap[icol]; /* pick up the pivot row */ for (isub = nsupc; isub < nsupr; ++isub) if ( lsub_ptr[isub] == *pivrow ) { pivptr = isub; break; } } pivmax = fill_tol; lu_col_ptr[pivptr].r = pivmax; lu_col_ptr[pivptr].i = 0.0; *usepr = 0; #ifdef DEBUG printf("[0] ZERO PIVOT: FILL (%d, %d).\n", *pivrow, jcol); fflush(stdout); #endif info =jcol + 1; } /* if (*pivrow == 0.0) */ else { thresh = u * pivmax; /* Choose appropriate pivotal element by our policy. */ if ( *usepr ) { switch (milu) { case SMILU_1: c_add(&temp, &lu_col_ptr[old_pivptr], &drop_sum); rtemp = c_abs1(&temp); break; case SMILU_2: case SMILU_3: rtemp = c_abs1(&lu_col_ptr[old_pivptr]) + drop_sum.r; break; case SILU: default: rtemp = c_abs1(&lu_col_ptr[old_pivptr]); break; } if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = old_pivptr; else *usepr = 0; } if ( *usepr == 0 ) { /* Use diagonal pivot? */ if ( diag >= 0 ) { /* diagonal exists */ switch (milu) { case SMILU_1: c_add(&temp, &lu_col_ptr[diag], &drop_sum); rtemp = c_abs1(&temp); break; case SMILU_2: case SMILU_3: rtemp = c_abs1(&lu_col_ptr[diag]) + drop_sum.r; break; case SILU: default: rtemp = c_abs1(&lu_col_ptr[diag]); break; } if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag; } *pivrow = lsub_ptr[pivptr]; } info = 0; /* Reset the diagonal */ switch (milu) { case SMILU_1: c_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum); break; case SMILU_2: case SMILU_3: temp = c_sgn(&lu_col_ptr[pivptr]); cc_mult(&temp, &temp, &drop_sum); c_add(&lu_col_ptr[pivptr], &lu_col_ptr[pivptr], &drop_sum); break; case SILU: default: break; } } /* else */ /* Record pivot row */ perm_r[*pivrow] = jcol; if (jcol < n - 1) { register int t1, t2, t; t1 = iswap[*pivrow]; t2 = jcol; if (t1 != t2) { t = swap[t1]; swap[t1] = swap[t2]; swap[t2] = t; t1 = swap[t1]; t2 = t; t = iswap[t1]; iswap[t1] = iswap[t2]; iswap[t2] = t; } } /* if (jcol < n - 1) */ /* Interchange row subscripts */ if ( pivptr != nsupc ) { itemp = lsub_ptr[pivptr]; lsub_ptr[pivptr] = lsub_ptr[nsupc]; lsub_ptr[nsupc] = itemp; /* Interchange numerical values as well, for the whole snode, such * that L is indexed the same way as A. */ for (icol = 0; icol <= nsupc; icol++) { itemp = pivptr + icol * nsupr; temp = lu_sup_ptr[itemp]; lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr]; lu_sup_ptr[nsupc + icol*nsupr] = temp; } } /* if */ /* cdiv operation */ ops[FACT] += 10 * (nsupr - nsupc); c_div(&temp, &one, &lu_col_ptr[nsupc]); for (k = nsupc+1; k < nsupr; k++) cc_mult(&lu_col_ptr[k], &lu_col_ptr[k], &temp); return info; }
/*! \brief * <pre> * Purpose * ======= * ilu_cdrop_row() - Drop some small rows from the previous * supernode (L-part only). * </pre> */ int ilu_cdrop_row( superlu_options_t *options, /* options */ int first, /* index of the first column in the supernode */ int last, /* index of the last column in the supernode */ double drop_tol, /* dropping parameter */ int quota, /* maximum nonzero entries allowed */ int *nnzLj, /* in/out number of nonzeros in L(:, 1:last) */ double *fill_tol, /* in/out - on exit, fill_tol=-num_zero_pivots, * does not change if options->ILU_MILU != SMILU1 */ GlobalLU_t *Glu, /* modified */ float swork[], /* working space * the length of swork[] should be no less than * the number of rows in the supernode */ float swork2[], /* working space with the same size as swork[], * used only by the second dropping rule */ int lastc /* if lastc == 0, there is nothing after the * working supernode [first:last]; * if lastc == 1, there is one more column after * the working supernode. */ ) { register int i, j, k, m1; register int nzlc; /* number of nonzeros in column last+1 */ register int xlusup_first, xlsub_first; int m, n; /* m x n is the size of the supernode */ int r = 0; /* number of dropped rows */ register float *temp; register complex *lusup = (complex *) Glu->lusup; register int *lsub = Glu->lsub; register int *xlsub = Glu->xlsub; register int *xlusup = Glu->xlusup; register float d_max = 0.0, d_min = 1.0; int drop_rule = options->ILU_DropRule; milu_t milu = options->ILU_MILU; norm_t nrm = options->ILU_Norm; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; int i_1 = 1; int inc_diag; /* inc_diag = m + 1 */ int nzp = 0; /* number of zero pivots */ float alpha = pow((double)(Glu->n), -1.0 / options->ILU_MILU_Dim); xlusup_first = xlusup[first]; xlsub_first = xlsub[first]; m = xlusup[first + 1] - xlusup_first; n = last - first + 1; m1 = m - 1; inc_diag = m + 1; nzlc = lastc ? (xlusup[last + 2] - xlusup[last + 1]) : 0; temp = swork - n; /* Quick return if nothing to do. */ if (m == 0 || m == n || drop_rule == NODROP) { *nnzLj += m * n; return 0; } /* basic dropping: ILU(tau) */ for (i = n; i <= m1; ) { /* the average abs value of ith row */ switch (nrm) { case ONE_NORM: temp[i] = scasum_(&n, &lusup[xlusup_first + i], &m) / (double)n; break; case TWO_NORM: temp[i] = scnrm2_(&n, &lusup[xlusup_first + i], &m) / sqrt((double)n); break; case INF_NORM: default: k = icamax_(&n, &lusup[xlusup_first + i], &m) - 1; temp[i] = c_abs1(&lusup[xlusup_first + i + m * k]); break; } /* drop small entries due to drop_tol */ if (drop_rule & DROP_BASIC && temp[i] < drop_tol) { r++; /* drop the current row and move the last undropped row here */ if (r > 1) /* add to last row */ { /* accumulate the sum (for MILU) */ switch (milu) { case SMILU_1: case SMILU_2: caxpy_(&n, &one, &lusup[xlusup_first + i], &m, &lusup[xlusup_first + m - 1], &m); break; case SMILU_3: for (j = 0; j < n; j++) lusup[xlusup_first + (m - 1) + j * m].r += c_abs1(&lusup[xlusup_first + i + j * m]); break; case SILU: default: break; } ccopy_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); } /* if (r > 1) */ else /* move to last row */ { cswap_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); if (milu == SMILU_3) for (j = 0; j < n; j++) { lusup[xlusup_first + m1 + j * m].r = c_abs1(&lusup[xlusup_first + m1 + j * m]); lusup[xlusup_first + m1 + j * m].i = 0.0; } } lsub[xlsub_first + i] = lsub[xlsub_first + m1]; m1--; continue; } /* if dropping */ else { if (temp[i] > d_max) d_max = temp[i]; if (temp[i] < d_min) d_min = temp[i]; } i++; } /* for */ /* Secondary dropping: drop more rows according to the quota. */ quota = ceil((double)quota / (double)n); if (drop_rule & DROP_SECONDARY && m - r > quota) { register double tol = d_max; /* Calculate the second dropping tolerance */ if (quota > n) { if (drop_rule & DROP_INTERP) /* by interpolation */ { d_max = 1.0 / d_max; d_min = 1.0 / d_min; tol = 1.0 / (d_max + (d_min - d_max) * quota / (m - n - r)); } else /* by quick select */ { int len = m1 - n + 1; scopy_(&len, swork, &i_1, swork2, &i_1); tol = sqselect(len, swork2, quota - n); #if 0 register int *itemp = iwork - n; A = temp; for (i = n; i <= m1; i++) itemp[i] = i; qsort(iwork, m1 - n + 1, sizeof(int), _compare_); tol = temp[itemp[quota]]; #endif } } for (i = n; i <= m1; ) { if (temp[i] <= tol) { register int j; r++; /* drop the current row and move the last undropped row here */ if (r > 1) /* add to last row */ { /* accumulate the sum (for MILU) */ switch (milu) { case SMILU_1: case SMILU_2: caxpy_(&n, &one, &lusup[xlusup_first + i], &m, &lusup[xlusup_first + m - 1], &m); break; case SMILU_3: for (j = 0; j < n; j++) lusup[xlusup_first + (m - 1) + j * m].r += c_abs1(&lusup[xlusup_first + i + j * m]); break; case SILU: default: break; } ccopy_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); } /* if (r > 1) */ else /* move to last row */ { cswap_(&n, &lusup[xlusup_first + m1], &m, &lusup[xlusup_first + i], &m); if (milu == SMILU_3) for (j = 0; j < n; j++) { lusup[xlusup_first + m1 + j * m].r = c_abs1(&lusup[xlusup_first + m1 + j * m]); lusup[xlusup_first + m1 + j * m].i = 0.0; } } lsub[xlsub_first + i] = lsub[xlsub_first + m1]; m1--; temp[i] = temp[m1]; continue; } i++; } /* for */ } /* if secondary dropping */ for (i = n; i < m; i++) temp[i] = 0.0; if (r == 0) { *nnzLj += m * n; return 0; } /* add dropped entries to the diagnal */ if (milu != SILU) { register int j; complex t; float omega; for (j = 0; j < n; j++) { t = lusup[xlusup_first + (m - 1) + j * m]; if (t.r == 0.0 && t.i == 0.0) continue; omega = SUPERLU_MIN(2.0 * (1.0 - alpha) / c_abs1(&t), 1.0); cs_mult(&t, &t, omega); switch (milu) { case SMILU_1: if ( !(c_eq(&t, &none)) ) { c_add(&t, &t, &one); cc_mult(&lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], &t); } else { cs_mult( &lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], *fill_tol); #ifdef DEBUG printf("[1] ZERO PIVOT: FILL col %d.\n", first + j); fflush(stdout); #endif nzp++; } break; case SMILU_2: cs_mult(&lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], 1.0 + c_abs1(&t)); break; case SMILU_3: c_add(&t, &t, &one); cc_mult(&lusup[xlusup_first + j * inc_diag], &lusup[xlusup_first + j * inc_diag], &t); break; case SILU: default: break; } } if (nzp > 0) *fill_tol = -nzp; } /* Remove dropped entries from the memory and fix the pointers. */ m1 = m - r; for (j = 1; j < n; j++) { register int tmp1, tmp2; tmp1 = xlusup_first + j * m1; tmp2 = xlusup_first + j * m; for (i = 0; i < m1; i++) lusup[i + tmp1] = lusup[i + tmp2]; } for (i = 0; i < nzlc; i++) lusup[xlusup_first + i + n * m1] = lusup[xlusup_first + i + n * m]; for (i = 0; i < nzlc; i++) lsub[xlsub[last + 1] - r + i] = lsub[xlsub[last + 1] + i]; for (i = first + 1; i <= last + 1; i++) { xlusup[i] -= r * (i - first); xlsub[i] -= r; } if (lastc) { xlusup[last + 2] -= r * n; xlsub[last + 2] -= r; } *nnzLj += (m - r) * n; return r; }
/*! \brief Performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y * * <pre> * Purpose * ======= * * sp_cgemv() performs one of the matrix-vector operations * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, * where alpha and beta are scalars, x and y are vectors and A is a * sparse A->nrow by A->ncol matrix. * * Parameters * ========== * * TRANS - (input) char* * On entry, TRANS specifies the operation to be performed as * follows: * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * TRANS = 'C' or 'c' y := alpha*A^H*x + beta*y. * * ALPHA - (input) complex * On entry, ALPHA specifies the scalar alpha. * * A - (input) SuperMatrix* * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * * X - (input) complex*, array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * * INCX - (input) int * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * * BETA - (input) complex * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * * Y - (output) complex*, array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - (input) int * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * * ==== Sparse Level 2 Blas routine. * </pre> */ int sp_cgemv(char *trans, complex alpha, SuperMatrix *A, complex *x, int incx, complex beta, complex *y, int incy) { /* Local variables */ NCformat *Astore; complex *Aval; int info; complex temp, temp1; int lenx, leny, i, j, irow; int iy, jx, jy, kx, ky; int notran; complex comp_zero = {0.0, 0.0}; complex comp_one = {1.0, 0.0}; notran = ( strncmp(trans, "N", 1)==0 || strncmp(trans, "n", 1)==0 ); Astore = A->Store; Aval = Astore->nzval; /* Test the input parameters */ info = 0; if ( !notran && strncmp(trans, "T", 1)!=0 && strncmp(trans, "C", 1)!=0) info = 1; else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; else if (incx == 0) info = 5; else if (incy == 0) info = 8; if (info != 0) { input_error("sp_cgemv ", &info); return 0; } /* Quick return if possible. */ if (A->nrow == 0 || A->ncol == 0 || c_eq(&alpha, &comp_zero) && c_eq(&beta, &comp_one)) return 0; /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if ( notran ) { lenx = A->ncol; leny = A->nrow; } else { lenx = A->nrow; leny = A->ncol; } if (incx > 0) kx = 0; else kx = - (lenx - 1) * incx; if (incy > 0) ky = 0; else ky = - (leny - 1) * incy; /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if ( !c_eq(&beta, &comp_one) ) { if (incy == 1) { if ( c_eq(&beta, &comp_zero) ) for (i = 0; i < leny; ++i) y[i] = comp_zero; else for (i = 0; i < leny; ++i) cc_mult(&y[i], &beta, &y[i]); } else { iy = ky; if ( c_eq(&beta, &comp_zero) ) for (i = 0; i < leny; ++i) { y[iy] = comp_zero; iy += incy; } else for (i = 0; i < leny; ++i) { cc_mult(&y[iy], &beta, &y[iy]); iy += incy; } } } if ( c_eq(&alpha, &comp_zero) ) return 0; if ( notran ) { /* Form y := alpha*A*x + y. */ jx = kx; if (incy == 1) { for (j = 0; j < A->ncol; ++j) { if ( !c_eq(&x[jx], &comp_zero) ) { cc_mult(&temp, &alpha, &x[jx]); for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; cc_mult(&temp1, &temp, &Aval[i]); c_add(&y[irow], &y[irow], &temp1); } } jx += incx; } } else { ABORT("Not implemented."); } } else if (strncmp(trans, "T", 1) == 0 || strncmp(trans, "t", 1) == 0) { /* Form y := alpha*A'*x + y. */ jy = ky; if (incx == 1) { for (j = 0; j < A->ncol; ++j) { temp = comp_zero; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; cc_mult(&temp1, &Aval[i], &x[irow]); c_add(&temp, &temp, &temp1); } cc_mult(&temp1, &alpha, &temp); c_add(&y[jy], &y[jy], &temp1); jy += incy; } } else { ABORT("Not implemented."); } } else { /* trans == 'C' or 'c' */ /* Form y := alpha * conj(A) * x + y. */ complex temp2; jy = ky; if (incx == 1) { for (j = 0; j < A->ncol; ++j) { temp = comp_zero; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; temp2.r = Aval[i].r; temp2.i = -Aval[i].i; /* conjugation */ cc_mult(&temp1, &temp2, &x[irow]); c_add(&temp, &temp, &temp1); } cc_mult(&temp1, &alpha, &temp); c_add(&y[jy], &y[jy], &temp1); jy += incy; } } else { ABORT("Not implemented."); } } return 0; } /* sp_cgemv */
/*! \brief Solves one of the systems of equations A*x = b, or A'*x = b * * <pre> * Purpose * ======= * * sp_ctrsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A^H*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU. * * x - (input/output) complex* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * </pre> */ int sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, complex *x, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif SCformat *Lstore; NCformat *Ustore; complex *Lval, *Uval; int incx = 1, incy = 1; complex temp; complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; complex comp_zero = {0.0, 0.0}; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; complex *work; flops_t solve_ops; /* Test the input parameters */ *info = 0; if ( strncmp(uplo,"L", 1)!=0 && strncmp(uplo, "U", 1)!=0 ) *info = -1; else if ( strncmp(trans, "N", 1)!=0 && strncmp(trans, "T", 1)!=0 && strncmp(trans, "C", 1)!=0) *info = -2; else if ( strncmp(diag, "U", 1)!=0 && strncmp(diag, "N", 1)!=0 ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); input_error("sp_ctrsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = complexCalloc(L->nrow)) ) ABORT("Malloc fails for work in sp_ctrsv()."); if ( strncmp(trans, "N", 1)==0 ) { /* Form x := inv(A)*x. */ if ( strncmp(uplo, "L", 1)==0 ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc; solve_ops += 8 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ work[i] = comp_zero; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); cc_mult(&comp_zero, &x[fsupc], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[jcol], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } } } /* for k ... */ } } else if ( strncmp(trans, "T", 1)==0 ) { /* Form x := inv(A')*x */ if ( strncmp(uplo, "L", 1)==0 ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 8 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); cc_mult(&comp_zero, &x[irow], &Lval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); iptr++; } } if ( nsupc > 1 ) { solve_ops += 4 * nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[irow], &Uval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); } } /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } else { /* Form x := conj(inv(A'))*x */ if ( strncmp(uplo, "L", 1)==0 ) { /* Form x := conj(inv(L'))*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 8 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); cc_conj(&temp, &Lval[i]); cc_mult(&comp_zero, &x[irow], &temp); c_sub(&x[jcol], &x[jcol], &comp_zero); iptr++; } } if ( nsupc > 1 ) { solve_ops += 4 * nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd(trans, strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := conj(inv(U'))*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_conj(&temp, &Uval[i]); cc_mult(&comp_zero, &x[irow], &temp); c_sub(&x[jcol], &x[jcol], &comp_zero); } } /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; if ( nsupc == 1 ) { cc_conj(&temp, &Lval[luptr]); c_div(&x[fsupc], &x[fsupc], &temp); } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd(trans, strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } stat->ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
void pcgstrf_bmod1D( const int pnum, /* process number */ const int m, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int fsupc, /* leading column of the updating supernode */ const int krep, /* last column of the updating supernode */ const int nsupc, /* number of columns in the updating s-node */ int nsupr, /* number of rows in the updating supernode */ int nrow, /* number of rows below the diagonal block of the updating supernode */ int *repfnz, /* in */ int *panel_lsub, /* modified */ int *w_lsub_end, /* modified */ int *spa_marker, /* modified; size n-by-w */ complex *dense, /* modified */ complex *tempv, /* working array - zeros on entry/exit */ GlobalLU_t *Glu, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Results are returned in SPA dense[*,w]. * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; complex alpha, beta; #endif complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; register int lptr; /* start of row subscripts of the updating supernode */ register int i, krep_ind, kfnz, isub, irow, no_zeros; register int jj; /* index through each column in the panel */ int *repfnz_col; /* repfnz[] for a column in the panel */ complex *dense_col; /* dense[] for a column in the panel */ complex *tempv1; /* used to store matrix-vector result */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ int *lsub, *xlsub_end; complex *lusup; int *xlusup; register float flopcnt; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex comp_temp, comp_temp1; #ifdef TIMING double *utime = Gstat->utime; double f_time; #endif lsub = Glu->lsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; lptr = Glu->xlsub[fsupc]; krep_ind = lptr + nsupc - 1; /* Pointers to each column of the w-wide arrays. */ repfnz_col= repfnz; dense_col = dense; col_marker= spa_marker; col_lsub = panel_lsub; #if ( DEBUGlevel>=2 ) if (jcol == BADPAN && krep == BADREP) { printf("(%d) pcgstrf_bmod1D[1] jcol %d, fsupc %d, krep %d, nsupc %d, nsupr %d, nrow %d\n", pnum, jcol, fsupc, krep, nsupc, nsupr, nrow); PrintInt10("lsub[xlsub[2774]]", nsupr, &lsub[lptr]); } #endif /* * Sequence through each column in the panel ... */ for (jj = jcol; jj < jcol + w; ++jj, col_marker += m, col_lsub += m, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; /* Calculate flops: tri-solve + mat-vector */ Gstat->procstat[pnum].fcops += flopcnt; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; #if ( DEBUGlevel>=2 ) if (krep == BADCOL && jj == -1) { printf("(%d) pcgstrf_bmod1D[segsze=1]: k %d, j %d, ukj %.10e\n", pnum, lsub[krep_ind], jj, ukj); PrintInt10("segsze=1", nsupr, &lsub[lptr]); } #endif for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else if ( segsze <= 3 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else { /* segsze >= 4 */ /* * Perform a triangular solve and matrix-vector update, * then scatter the result of sup-col update to dense[*]. */ no_zeros = kfnz - fsupc; /* Gather U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; /*#pragma ivdep*/ for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef TIMING f_time = SuperLU_timer_(); #endif #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif /* _CRAY_PVP */ #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif /* Scatter tempv[*] into SPA dense[*] temporarily, * such that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; /*#pragma ivdep*/ for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; #if ( DEBUGlevel>=2 ) if (jj == -1 && krep == 3423) printf("(%d) pcgstrf_bmod1D[scatter] jj %d, dense_col[%d] %e\n", pnum, jj, irow, dense_col[irow]); #endif } /* Scatter the update from tempv1[*] into SPA dense[*] */ /*#pragma ivdep*/ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); /* Scatter-add */ #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif tempv1[i] = zero; isub++; } } /* else segsze >= 4 ... */ } /* for jj ... */ }
/* Return value: 0 - successful return * > 0 - number of bytes allocated when run out of space */ int ccolumn_bmod ( const int jcol, /* in */ const int nseg, /* in */ complex *dense, /* in */ complex *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in */ int fpanelc, /* in -- first column in the current panel */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { /* * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j] * */ #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; complex alpha, beta; /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; register int lptr, kfnz, isub, irow, i; register int no_zeros, new_next; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode. */ int *xsup, *supno; int *lsub, *xlsub; complex *lusup; int *xlusup; int nzlumax; complex *tempv1; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; complex comp_temp, comp_temp1; int mem_error; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; nzlumax = Glu->nzlumax; jcolp1 = jcol + 1; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc > fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense[irow], &dense[irow], &comp_temp); luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[] as a temporary storage */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; tempv[i] = zero; ++isub; } /* Scatter tempv1[] into SPA dense[] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense[irow], &dense[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } } /* if jsupno ... */ } /* for each segment... */ /* * Process the supernodal portion of L\U[*,j] */ nextlu = xlusup[jcol]; fsupc = xsup[jsupno]; /* Copy the SPA dense into L\U[*,j] */ new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; while ( new_next > nzlumax ) { if (mem_error = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) return (mem_error); lusup = Glu->lusup; lsub = Glu->lsub; } for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; ++nextlu; } xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ /* For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * 1) fsupc < fpanelc, then fst_col := fpanelc * 2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* Distance between the current supernode and the current panel. d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* Points to the beginning of jcol in snode L\U(jsupno) */ ufirst = xlusup[jcol] + d_fsupc; ops[TRSV] += 4 * nsupc * (nsupc - 1); ops[GEMV] += 8 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #else ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #endif alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #ifdef _CRAY CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[isub], &lusup[isub], &tempv[i]); tempv[i] = zero; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
void cgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, SuperMatrix *B, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif int incx = 1, incy = 1; #ifdef USE_VENDOR_BLAS complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; complex *work_col; #endif complex temp_comp; DNformat *Bstore; complex *Bmat; SCformat *Lstore; NCformat *Ustore; complex *Lval, *Uval; int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; int i, j, k, iptr, jcol, n, ldb, nrhs; complex *work, *rhs_work, *soln; flops_t solve_ops; void cprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) *info = -2; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) *info = -3; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) *info = -6; if ( *info ) { i = -(*info); xerbla_("cgstrs", &i); return; } n = L->nrow; work = complexCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = complexMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( trans == NOTRANS ) { /* Permute right hand sides to form Pr*B */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; nrow = nsupr - nsupc; solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; solve_ops += 8 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ irow = L_SUB(iptr); ++luptr; cc_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else ctrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); cgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); c_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); work_col[i].r = 0.0; work_col[i].i = 0.0; iptr++; } } #else for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; clsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); cmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); c_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); work[i].r = 0.; work[i].i = 0.; iptr++; } } #endif } /* else ... */ } /* for L-solve */ #ifdef DEBUG printf("After L-solve: y=\n"); cprint_soln(n, nrhs, Bmat); #endif /* * Back solve Ux=y. */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { c_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else ctrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0; j < nrhs; j++) cusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); #endif } for (j = 0; j < nrhs; ++j) { rhs_work = &Bmat[j*ldb]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ irow = U_SUB(i); cc_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); } } } } /* for U-solve */ #ifdef DEBUG printf("After U-solve: x=\n"); cprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X := Pc*X. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = solve_ops; } else { /* Solve A'*X=B or CONJ(A)*X=B */ /* Permute right hand sides to form Pc'*B. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = 0; if (trans == TRANS) { for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_ctrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by inv(L'). */ sp_ctrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); } } else { /* trans == CONJ */ for (k = 0; k < nrhs; ++k) { /* Multiply by conj(inv(U')). */ sp_ctrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by conj(inv(L')). */ sp_ctrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); } } /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } SUPERLU_FREE(work); SUPERLU_FREE(soln); }
int sp_cgemv(char *trans, complex alpha, SuperMatrix *A, complex *x, int incx, complex beta, complex *y, int incy) { /* Purpose ======= sp_cgemv() performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, where alpha and beta are scalars, x and y are vectors and A is a sparse A->nrow by A->ncol matrix. Parameters ========== TRANS - (input) char* On entry, TRANS specifies the operation to be performed as follows: TRANS = 'N' or 'n' y := alpha*A*x + beta*y. TRANS = 'T' or 't' y := alpha*A'*x + beta*y. TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. ALPHA - (input) complex On entry, ALPHA specifies the scalar alpha. A - (input) SuperMatrix* Before entry, the leading m by n part of the array A must contain the matrix of coefficients. X - (input) complex*, array of DIMENSION at least ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, the incremented array X must contain the vector x. INCX - (input) int On entry, INCX specifies the increment for the elements of X. INCX must not be zero. BETA - (input) complex On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. Y - (output) complex*, array of DIMENSION at least ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry with BETA non-zero, the incremented array Y must contain the vector y. On exit, Y is overwritten by the updated vector y. INCY - (input) int On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. ==== Sparse Level 2 Blas routine. */ /* Local variables */ NCformat *Astore; complex *Aval; int info; complex temp, temp1; int lenx, leny, i, j, irow; int iy, jx, jy, kx, ky; int notran; complex comp_zero = {0.0, 0.0}; complex comp_one = {1.0, 0.0}; notran = lsame_(trans, "N"); Astore = A->Store; Aval = Astore->nzval; /* Test the input parameters */ info = 0; if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1; else if ( A->nrow < 0 || A->ncol < 0 ) info = 3; else if (incx == 0) info = 5; else if (incy == 0) info = 8; if (info != 0) { xerbla_("sp_cgemv ", &info); return 0; } /* Quick return if possible. */ if (A->nrow == 0 || A->ncol == 0 || c_eq(&alpha, &comp_zero) && c_eq(&beta, &comp_one)) return 0; /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if (lsame_(trans, "N")) { lenx = A->ncol; leny = A->nrow; } else { lenx = A->nrow; leny = A->ncol; } if (incx > 0) kx = 0; else kx = - (lenx - 1) * incx; if (incy > 0) ky = 0; else ky = - (leny - 1) * incy; /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if ( !c_eq(&beta, &comp_one) ) { if (incy == 1) { if ( c_eq(&beta, &comp_zero) ) for (i = 0; i < leny; ++i) y[i] = comp_zero; else for (i = 0; i < leny; ++i) cc_mult(&y[i], &beta, &y[i]); } else { iy = ky; if ( c_eq(&beta, &comp_zero) ) for (i = 0; i < leny; ++i) { y[iy] = comp_zero; iy += incy; } else for (i = 0; i < leny; ++i) { cc_mult(&y[iy], &beta, &y[iy]); iy += incy; } } } if ( c_eq(&alpha, &comp_zero) ) return 0; if ( notran ) { /* Form y := alpha*A*x + y. */ jx = kx; if (incy == 1) { for (j = 0; j < A->ncol; ++j) { if ( !c_eq(&x[jx], &comp_zero) ) { cc_mult(&temp, &alpha, &x[jx]); for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; cc_mult(&temp1, &temp, &Aval[i]); c_add(&y[irow], &y[irow], &temp1); } } jx += incx; } } else { ABORT("Not implemented."); } } else { /* Form y := alpha*A'*x + y. */ jy = ky; if (incx == 1) { for (j = 0; j < A->ncol; ++j) { temp = comp_zero; for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { irow = Astore->rowind[i]; cc_mult(&temp1, &Aval[i], &x[irow]); c_add(&temp, &temp, &temp1); } cc_mult(&temp1, &alpha, &temp); c_add(&y[jy], &y[jy], &temp1); jy += incy; } } else { ABORT("Not implemented."); } } return 0; } /* sp_cgemv */
int sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, complex *x, int *info) { /* * Purpose * ======= * * sp_ctrsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU. * * x - (input/output) complex* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * */ #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif SCformat *Lstore; NCformat *Ustore; complex *Lval, *Uval; int incx = 1, incy = 1; complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; complex comp_zero = {0.0, 0.0}; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; complex *work; flops_t solve_ops; extern SuperLUStat_t SuperLUStat; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_ctrsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = complexCalloc(L->nrow)) ) ABORT("Malloc fails for work in sp_ctrsv()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += 4 * nsupc * (nsupc - 1); solve_ops += 8 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ work[i] = comp_zero; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 4 * nsupc * (nsupc + 1); if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); cc_mult(&comp_zero, &x[fsupc], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[jcol], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 8 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); cc_mult(&comp_zero, &x[irow], &Lval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); iptr++; } } if ( nsupc > 1 ) { solve_ops += 4 * nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[irow], &Uval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); } } solve_ops += 4 * nsupc * (nsupc + 1); if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } SuperLUStat.ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }