void dd (header *hd) { header *st=hd,*hd1,*result; int c1,c2,i,j,r; double *m1,*m2,*mr; complex *mc1,*mc2,*mcr,hc1,hc2; interval *mi1,*mi2,*mir,hi1,hi2; hd1=next_param(st); equal_params_2(&hd,&hd1); if (error) return; getmatrix(hd,&r,&c1,&m1); if (r!=1) wrong_arg(); getmatrix(hd1,&r,&c2,&m2); if (r!=1) wrong_arg(); if (c1!=c2) wrong_arg(); if (iscomplex(hd)) /* complex values */ { mc1=(complex *)m1; mc2=(complex *)m2; result=new_cmatrix(1,c1,""); if (error) return; mcr=(complex *)matrixof(result); memmove((char *)mcr,(char *)mc2,c1*sizeof(complex)); for (i=1; i<c1; i++) { for (j=c1-1; j>=i; j--) { if (mc1[j][0]==mc1[j-i][0] && mc1[j][1]==mc1[j-i][1]) wrong_arg(); c_sub(mcr[j],mcr[j-1],hc1); c_sub(mc1[j],mc1[j-i],hc2); c_div(hc1,hc2,mcr[j]); } } } else if (isinterval(hd)) /* complex values */ { mi1=(complex *)m1; mi2=(complex *)m2; result=new_imatrix(1,c1,""); if (error) return; mir=(interval *)matrixof(result); memmove((char *)mir,(char *)mi2,c1*sizeof(interval)); for (i=1; i<c1; i++) { for (j=c1-1; j>=i; j--) { i_sub(mir[j],mir[j-1],hi1); if (hi1[0]<=0 && hi1[1]>=0) { output("Interval points coincide\n"); error=1; return; } i_sub(mi1[j],mi1[j-i],hi2); i_div(hi1,hi2,mir[j]); } } } else if (isreal(hd)) { result=new_matrix(1,c1,""); if (error) return; mr=matrixof(result); memmove((char *)mr,(char *)m2,c1*sizeof(double)); for (i=1; i<c1; i++) { for (j=c1-1; j>=i; j--) { if (m1[j]==m1[j-i]) wrong_arg(); mr[j]=(mr[j]-mr[j-1])/(m1[j]-m1[j-i]); } } } else wrong_arg(); moveresult(st,result); }
/** * \brief Expand pole product * \param c resulting filter coefficients * \param poles pole locations * \param K number of poles * \ingroup vyv_gaussian * * This routine expands the product to obtain the filter coefficients: * \f[ \prod_{k=0}^{K-1}\frac{\mathrm{poles}[k]-1}{\mathrm{poles}[k]-z^{-1}} = \frac{c[0]}{1+\sum_{k=1}^K c[k] z^{-k}}. \f] */ static void expand_pole_product(double *c, const complex4c *poles, int K) { complex4c denom[VYV_MAX_K + 1]; int k, j; assert(K <= VYV_MAX_K); denom[0] = poles[0]; denom[1] = make_complex(-1, 0); for (k = 1; k < K; ++k) { denom[k + 1] = c_neg(denom[k]); for (j = k; j > 0; --j) denom[j] = c_sub(c_mul(denom[j], poles[k]), denom[j - 1]); denom[0] = c_mul(denom[0], poles[k]); } for (k = 1; k <= K; ++k) c[k] = c_div(denom[k], denom[0]).real; for (c[0] = 1, k = 1; k <= K; ++k) c[0] += c[k]; return; }
static struct polynom *pol_div(struct polynom *p,struct polynom *q,struct polynom *p1,struct polynom *p2) { int i,j; struct complex z,z1; z.x=0; z.y=0; // RS 7.2.2013 p->n=p1->n-p2->n; if (p->n<0) { sur_print("\nDegree of dividend < Degree of divisor"); WAIT; return(p); } for (i=0; i<=p->n; ++i) p->a[i].x=p->a[i].y=0.0; for (i=0; i<=p1->n; ++i) { q->a[i].x=p1->a[i].x; q->a[i].y=p1->a[i].y; } for (i=p1->n; i>=p2->n; --i) { if (c_zero(&(q->a[i]))) continue; c_div(&z,&(q->a[i]),&(p2->a[p2->n])); p->a[i-p2->n].x=z.x; p->a[i-p2->n].y=z.y; q->a[i].x=q->a[i].y=0.0; for (j=i-1; j>=i-p2->n; --j) c_sub(&(q->a[j]),&(q->a[j]), c_mult(&z1,&z,&(p2->a[p2->n-i+j]))); } i=p2->n-1; while (c_zero(&(q->a[i])) && i>0) --i; q->n=i; return(p); }
void Dn_up(struct c_complex z,long nstop,struct c_complex*D) /*:16*/ #line 216 "./mie.w" { struct c_complex zinv,k_over_z; long k; D[0]= c_inv(c_tan(z)); zinv= c_inv(z); for(k= 1;k<nstop;k++){ k_over_z= c_smul((double)k,zinv); D[k]= c_sub(c_inv(c_sub(k_over_z,D[k-1])),k_over_z); } }
void CholeskySolver<TYPE>::CholeskyDecomposition(const BaseMatrix<TYPE> &bm) { assert(bm.Nrows() == bm.Ncols()); int n = lm.Nrows(); const TYPE &e = SimpleSolver<TYPE>::epsilon; TYPE temp; for (int i = 1; i <= n; ++i) { if (i == 1) { temp = bm(i, i); } else { temp = bm(i, i) - (c_sub(lm, i, i, 1, i - 1) * t(c_sub(lm, i, i, 1, i - 1)))(1, 1); } if (temp <= e) { LinearEquationSolver<TYPE>::fail = true; return; } else { lm(i, i) = std::sqrt(temp); for (int j = i + 1; j <= n; ++j) { if (i == 1) { lm(j, i) = bm(j, i) / lm(i, i); } else { lm(j, i) = (bm(j, i) - (c_sub(lm, i, i, 1, i - 1) * t(c_sub(lm, j, j, 1, i - 1)))(1, 1)) / lm(i, i); } } } } }
void polydd (header *hd) { header *st=hd,*hd1,*result; int c1,c2,i,j,r; double *m1,*m2,*mr,x; complex *mc1,*mc2,*mcr,hc,xc; hd1=next_param(st); equal_params_2(&hd,&hd1); if (error) return; getmatrix(hd,&r,&c1,&m1); if (r!=1) wrong_arg(); getmatrix(hd1,&r,&c2,&m2); if (r!=1) wrong_arg(); if (c1!=c2) wrong_arg(); if (iscomplex(hd)) /* complex values */ { mc1=(complex *)m1; mc2=(complex *)m2; result=new_cmatrix(1,c1,""); if (error) return; mcr=(complex *)matrixof(result); c_copy(mcr[c1-1],mc2[c1-1]); for (i=c1-2; i>=0; i--) { c_copy(xc,mc1[i]); c_mult(xc,mcr[i+1],hc); c_sub(mc2[i],hc,mcr[i]); for (j=i+1; j<c1-1; j++) { c_mult(xc,mcr[j+1],hc); c_sub(mcr[j],hc,mcr[j]); } } } else { result=new_matrix(1,c1,""); if (error) return; mr=matrixof(result); mr[c1-1]=m2[c1-1]; for (i=c1-2; i>=0; i--) { x=m1[i]; mr[i]=m2[i]-x*mr[i+1]; for (j=i+1; j<c1-1; j++) mr[j]=mr[j]-x*mr[j+1]; } } moveresult(st,result); }
void Dn_down(struct c_complex z,long nstop,struct c_complex*D) /*:19*/ #line 247 "./mie.w" { long k; struct c_complex zinv,k_over_z; D[nstop-1]= Lentz_Dn(z,nstop); zinv= c_inv(z); for(k= nstop-1;k>=1;k--){ k_over_z= c_smul((double)k,zinv); D[k-1]= c_sub(k_over_z,c_inv(c_add(D[k],k_over_z))); } }
//傅里叶变化 void fft(int N,complex f[]) { complex t,wn;//中间变量 int i,j,k,m,n,l,r,M; int la,lb,lc; /*----计算分解的级数M=log2(N)----*/ for(i=N,M=1;(i=i/2)!=1;M++); /*----按照倒位序重新排列原信号----*/ for(i=1,j=N/2;i<=N-2;i++) { if(i<j) { t=f[j]; f[j]=f[i]; f[i]=t; } k=N/2; while(k<=j) { j=j-k; k=k/2; } j=j+k; } /*----FFT算法----*/ for(m=1;m<=M;m++) { la=pow(2,m); //la=2^m代表第m级每个分组所含节点数 lb=la/2; //lb代表第m级每个分组所含碟形单元数 //同时它也表示每个碟形单元上下节点之间的距离 /*----碟形运算----*/ for(l=1;l<=lb;l++) { r=(l-1)*pow(2,M-m); for(n=l-1;n<N-1;n=n+la) //遍历每个分组,分组总数为N/la { lc=n+lb; //n,lc分别代表一个碟形单元的上、下节点编号 Wn_i(N,r,&wn,1);//wn=Wnr c_mul(f[lc],wn,&t);//t = f[lc] * wn复数运算 c_sub(f[n],t,&(f[lc]));//f[lc] = f[n] - f[lc] * Wnr c_plus(f[n],t,&(f[n]));//f[n] = f[n] + f[lc] * Wnr } } } }
/*! \brief Check the inf-norm of the error vector */ void cinf_norm_error(int nrhs, SuperMatrix *X, complex *xtrue) { DNformat *Xstore; float err, xnorm; complex *Xmat, *soln_work; complex temp; int i, j; Xstore = X->Store; Xmat = Xstore->nzval; for (j = 0; j < nrhs; j++) { soln_work = &Xmat[j*Xstore->lda]; err = xnorm = 0.0; for (i = 0; i < X->nrow; i++) { c_sub(&temp, &soln_work[i], &xtrue[i]); err = SUPERLU_MAX(err, c_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, c_abs(&soln_work[i])); } err = err / xnorm; printf("||X - Xtrue||/||X|| = %e\n", err); } }
static struct polynom *pol_sub(struct polynom *p,struct polynom *p1,struct polynom *p2) { int i; // RS REM struct complex tulo; p->n=(p1->n>p2->n)? (p1->n):(p2->n); for (i=0; i<=p->n; ++i) { if (i<=p1->n) { if (i<=p2->n) c_sub(&(p->a[i]),&(p1->a[i]),&(p2->a[i])); else { p->a[i].x=p1->a[i].x; p->a[i].y=p1->a[i].y; } } else { p->a[i].x=-(p2->a[i].x); p->a[i].y=-(p2->a[i].y); } } i=p->n; while (c_zero(&(p->a[i])) && i>0) --i; p->n=i; return(p); }
/*! \brief Performs numeric block updates within the relaxed snode. */ int csnode_bmod ( const int jcol, /* in */ const int jsupno, /* in */ const int fsupc, /* in */ complex *dense, /* in */ complex *tempv, /* working array */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; complex alpha = {-1.0, 0.0}, beta = {1.0, 0.0}; #endif complex comp_zero = {0.0, 0.0}; int luptr, nsupc, nsupr, nrow; int isub, irow, i, iptr; register int ufirst, nextlu; int *lsub, *xlsub; complex *lusup; int *xlusup; flops_t *ops = stat->ops; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = (complex *) Glu->lusup; xlusup = Glu->xlusup; nextlu = xlusup[jcol]; /* * Process the supernodal portion of L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = comp_zero; ++nextlu; } xlusup[jcol + 1] = nextlu; /* Initialize xlusup for next column */ if ( fsupc < jcol ) { luptr = xlusup[fsupc]; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nsupc = jcol - fsupc; /* Excluding jcol */ ufirst = xlusup[jcol]; /* Points to the beginning of column jcol in supernode L\U(jsupno). */ nrow = nsupr - nsupc; ops[TRSV] += 4 * nsupc * (nsupc - 1); ops[GEMV] += 8 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else #if SCIPY_FIX if (nsupr < nsupc) { /* Fail early rather than passing in invalid parameters to TRSV. */ ABORT("failed to factorize matrix"); } #endif ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], &tempv[0] ); /* Scatter tempv[*] into lusup[*] */ iptr = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[iptr], &lusup[iptr], &tempv[i]); ++iptr; tempv[i] = comp_zero; } #endif } return 0; }
void LUdecomposition(long n, complex **A, complex *b, complex *x) { long i,j,k,*p; complex akk, bk, z, sum; complex *Ak; /* Allocate auxiliary variables */ p = (long *)Mem(MEM_ALLOC, n, sizeof(long)); /* Gaussian elimination with partial pivoting */ for (k=0; k<n-1; k++){ /* columns of A */ /* --- Pivoting --- */ akk.re = 0.0; akk.im = 0.0; j = k; for (i=k; i<n; i++){ if ( c_norm(A[i][k]) > c_norm(akk)){ akk = A[i][k]; CheckValue(FUNCTION_NAME, "A[i][k].re","", A[i][k].re, -INFTY, INFTY); CheckValue(FUNCTION_NAME, "A[i][k].im","", A[i][k].im, -INFTY, INFTY); j = i; } } if (j != k){ /* swap rows j and k*/ Ak = A[k]; A[k] = A[j]; /* swap pointers*/ A[j] = Ak; bk = b[k]; b[k] = b[j]; b[j] = bk; } p[k] = j; /* Keep list of permutations */ /* Sanity check */ if (akk.re != A[k][k].re && akk.im != A[k][k].im){ Die(FUNCTION_NAME, "Something went wrong with pivoting?"); return; } if (akk.re == 0.0 && akk.im == 0.0){ Die(FUNCTION_NAME, "Matrix singular?"); return; } /* Store L part */ for (i=k+1;i<n; i++){ A[i][k] = c_div(A[i][k],A[k][k]); CheckValue(FUNCTION_NAME, "A[i][k].re","", A[i][k].re, -INFTY, INFTY); CheckValue(FUNCTION_NAME, "A[i][k].im","", A[i][k].im, -INFTY, INFTY); } /* Update */ for (i=k+1; i<n; i++){ /* update b */ z = c_mul(A[i][k], b[k]); b[i] = c_sub(b[i], z); CheckValue(FUNCTION_NAME, "b[i].re","", b[i].re, -INFTY, INFTY); CheckValue(FUNCTION_NAME, "b[i].im","", b[i].im, -INFTY, INFTY); for (j=k+1; j<n; j++){ /* Update U part of A */ z = c_mul( A[i][k], A[k][j]); A[i][j] = c_sub(A[i][j],z); CheckValue(FUNCTION_NAME, "A[i][j].re","", A[i][j].re, -INFTY, INFTY); CheckValue(FUNCTION_NAME, "A[i][j].im","", A[i][j].im, -INFTY, INFTY); } } } /* Solve x */ x[n-1] = c_div(b[n-1],A[n-1][n-1]); for (i=n-2; i>=0; i--){ sum.re = 0.0; sum.im = 0.0; for (j=i+1; j<n; j++){ z = c_mul(A[i][j], x[j]); sum = c_add(sum, z); } z = c_sub(b[i], sum); x[i] = c_div(z, A[i][i]); } Mem(MEM_FREE, p); /* Return */ return; }
int sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, complex *x, int *info) { /* * Purpose * ======= * * sp_ctrsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A'*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU. * * x - (input/output) complex* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * */ #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif SCformat *Lstore; NCformat *Ustore; complex *Lval, *Uval; int incx = 1, incy = 1; complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; complex comp_zero = {0.0, 0.0}; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; complex *work; flops_t solve_ops; extern SuperLUStat_t SuperLUStat; /* Test the input parameters */ *info = 0; if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1; else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2; else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); xerbla_("sp_ctrsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = complexCalloc(L->nrow)) ) ABORT("Malloc fails for work in sp_ctrsv()."); if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; solve_ops += 4 * nsupc * (nsupc - 1); solve_ops += 8 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ work[i] = comp_zero; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 4 * nsupc * (nsupc + 1); if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); cc_mult(&comp_zero, &x[fsupc], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[jcol], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } } } /* for k ... */ } } else { /* Form x := inv(A')*x */ if ( lsame_(uplo, "L") ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 8 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); cc_mult(&comp_zero, &x[irow], &Lval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); iptr++; } } if ( nsupc > 1 ) { solve_ops += 4 * nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[irow], &Uval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); } } solve_ops += 4 * nsupc * (nsupc + 1); if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } SuperLUStat.ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
/*! \brief Solves one of the systems of equations A*x = b, or A'*x = b * * <pre> * Purpose * ======= * * sp_ctrsv() solves one of the systems of equations * A*x = b, or A'*x = b, * where b and x are n element vectors and A is a sparse unit , or * non-unit, upper or lower triangular matrix. * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * uplo - (input) char* * On entry, uplo specifies whether the matrix is an upper or * lower triangular matrix as follows: * uplo = 'U' or 'u' A is an upper triangular matrix. * uplo = 'L' or 'l' A is a lower triangular matrix. * * trans - (input) char* * On entry, trans specifies the equations to be solved as * follows: * trans = 'N' or 'n' A*x = b. * trans = 'T' or 't' A'*x = b. * trans = 'C' or 'c' A^H*x = b. * * diag - (input) char* * On entry, diag specifies whether or not A is unit * triangular as follows: * diag = 'U' or 'u' A is assumed to be unit triangular. * diag = 'N' or 'n' A is not assumed to be unit * triangular. * * L - (input) SuperMatrix* * The factor L from the factorization Pr*A*Pc=L*U. Use * compressed row subscripts storage for supernodes, * i.e., L has types: Stype = SC, Dtype = SLU_C, Mtype = TRLU. * * U - (input) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. * U has types: Stype = NC, Dtype = SLU_C, Mtype = TRU. * * x - (input/output) complex* * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * info - (output) int* * If *info = -i, the i-th argument had an illegal value. * </pre> */ int sp_ctrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, SuperMatrix *U, complex *x, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif SCformat *Lstore; NCformat *Ustore; complex *Lval, *Uval; int incx = 1, incy = 1; complex temp; complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; complex comp_zero = {0.0, 0.0}; int nrow; int fsupc, nsupr, nsupc, luptr, istart, irow; int i, k, iptr, jcol; complex *work; flops_t solve_ops; /* Test the input parameters */ *info = 0; if ( strncmp(uplo,"L", 1)!=0 && strncmp(uplo, "U", 1)!=0 ) *info = -1; else if ( strncmp(trans, "N", 1)!=0 && strncmp(trans, "T", 1)!=0 && strncmp(trans, "C", 1)!=0) *info = -2; else if ( strncmp(diag, "U", 1)!=0 && strncmp(diag, "N", 1)!=0 ) *info = -3; else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4; else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5; if ( *info ) { i = -(*info); input_error("sp_ctrsv", &i); return 0; } Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( !(work = complexCalloc(L->nrow)) ) ABORT("Malloc fails for work in sp_ctrsv()."); if ( strncmp(trans, "N", 1)==0 ) { /* Form x := inv(A)*x. */ if ( strncmp(uplo, "L", 1)==0 ) { /* Form x := inv(L)*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); nrow = nsupr - nsupc; /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc; solve_ops += 8 * nrow * nsupc; if ( nsupc == 1 ) { for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) { irow = L_SUB(iptr); ++luptr; cc_mult(&comp_zero, &x[fsupc], &Lval[luptr]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #else ctrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); cgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy); #endif #else clsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]); cmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc], &x[fsupc], &work[0] ); #endif iptr = istart + nsupc; for (i = 0; i < nrow; ++i, ++iptr) { irow = L_SUB(iptr); c_sub(&x[irow], &x[irow], &work[i]); /* Scatter */ work[i] = comp_zero; } } } /* for k ... */ } else { /* Form x := inv(U)*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) { irow = U_SUB(i); cc_mult(&comp_zero, &x[fsupc], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif #else cusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] ); #endif for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[jcol], &Uval[i]); c_sub(&x[irow], &x[irow], &comp_zero); } } } } /* for k ... */ } } else if ( strncmp(trans, "T", 1)==0 ) { /* Form x := inv(A')*x */ if ( strncmp(uplo, "L", 1)==0 ) { /* Form x := inv(L')*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 8 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); cc_mult(&comp_zero, &x[irow], &Lval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); iptr++; } } if ( nsupc > 1 ) { solve_ops += 4 * nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := inv(U')*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_mult(&comp_zero, &x[irow], &Uval[i]); c_sub(&x[jcol], &x[jcol], &comp_zero); } } /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; if ( nsupc == 1 ) { c_div(&x[fsupc], &x[fsupc], &Lval[luptr]); } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd("T", strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } else { /* Form x := conj(inv(A'))*x */ if ( strncmp(uplo, "L", 1)==0 ) { /* Form x := conj(inv(L'))*x */ if ( L->nrow == 0 ) return 0; /* Quick return */ for (k = Lstore->nsuper; k >= 0; --k) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 8 * (nsupr - nsupc) * nsupc; for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { iptr = istart + nsupc; for (i = L_NZ_START(jcol) + nsupc; i < L_NZ_START(jcol+1); i++) { irow = L_SUB(iptr); cc_conj(&temp, &Lval[i]); cc_mult(&comp_zero, &x[irow], &temp); c_sub(&x[jcol], &x[jcol], &comp_zero); iptr++; } } if ( nsupc > 1 ) { solve_ops += 4 * nsupc * (nsupc - 1); #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd(trans, strlen("T")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } } else { /* Form x := conj(inv(U'))*x */ if ( U->nrow == 0 ) return 0; /* Quick return */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc); nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) { irow = U_SUB(i); cc_conj(&temp, &Uval[i]); cc_mult(&comp_zero, &x[irow], &temp); c_sub(&x[jcol], &x[jcol], &comp_zero); } } /* 1 c_div costs 10 flops */ solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc; if ( nsupc == 1 ) { cc_conj(&temp, &Lval[luptr]); c_div(&x[fsupc], &x[fsupc], &temp); } else { #ifdef _CRAY ftcs1 = _cptofcd("U", strlen("U")); ftcs2 = _cptofcd(trans, strlen("T")); ftcs3 = _cptofcd("N", strlen("N")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #else ctrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr, &x[fsupc], &incx); #endif } } /* for k ... */ } } stat->ops[SOLVE] += solve_ops; SUPERLU_FREE(work); return 0; }
void cgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, SuperMatrix *B, SuperLUStat_t *stat, int *info) { #ifdef _CRAY _fcd ftcs1, ftcs2, ftcs3, ftcs4; #endif int incx = 1, incy = 1; #ifdef USE_VENDOR_BLAS complex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; complex *work_col; #endif complex temp_comp; DNformat *Bstore; complex *Bmat; SCformat *Lstore; NCformat *Ustore; complex *Lval, *Uval; int fsupc, nrow, nsupr, nsupc, luptr, istart, irow; int i, j, k, iptr, jcol, n, ldb, nrhs; complex *work, *rhs_work, *soln; flops_t solve_ops; void cprint_soln(); /* Test input parameters ... */ *info = 0; Bstore = B->Store; ldb = Bstore->lda; nrhs = B->ncol; if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1; else if ( L->nrow != L->ncol || L->nrow < 0 || L->Stype != SLU_SC || L->Dtype != SLU_C || L->Mtype != SLU_TRLU ) *info = -2; else if ( U->nrow != U->ncol || U->nrow < 0 || U->Stype != SLU_NC || U->Dtype != SLU_C || U->Mtype != SLU_TRU ) *info = -3; else if ( ldb < SUPERLU_MAX(0, L->nrow) || B->Stype != SLU_DN || B->Dtype != SLU_C || B->Mtype != SLU_GE ) *info = -6; if ( *info ) { i = -(*info); xerbla_("cgstrs", &i); return; } n = L->nrow; work = complexCalloc(n * nrhs); if ( !work ) ABORT("Malloc fails for local work[]."); soln = complexMalloc(n); if ( !soln ) ABORT("Malloc fails for local soln[]."); Bmat = Bstore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; solve_ops = 0; if ( trans == NOTRANS ) { /* Permute right hand sides to form Pr*B */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } /* Forward solve PLy=Pb. */ for (k = 0; k <= Lstore->nsuper; k++) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; nrow = nsupr - nsupc; solve_ops += 4 * nsupc * (nsupc - 1) * nrhs; solve_ops += 8 * nrow * nsupc * nrhs; if ( nsupc == 1 ) { for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; luptr = L_NZ_START(fsupc); for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){ irow = L_SUB(iptr); ++luptr; cc_mult(&temp_comp, &rhs_work[fsupc], &Lval[luptr]); c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); } } } else { luptr = L_NZ_START(fsupc); #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); CTRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); CGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #else ctrsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); cgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, &Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, &beta, &work[0], &n ); #endif for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; work_col = &work[j*n]; iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); c_sub(&rhs_work[irow], &rhs_work[irow], &work_col[i]); work_col[i].r = 0.0; work_col[i].i = 0.0; iptr++; } } #else for (j = 0; j < nrhs; j++) { rhs_work = &Bmat[j*ldb]; clsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]); cmatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc], &rhs_work[fsupc], &work[0] ); iptr = istart + nsupc; for (i = 0; i < nrow; i++) { irow = L_SUB(iptr); c_sub(&rhs_work[irow], &rhs_work[irow], &work[i]); work[i].r = 0.; work[i].i = 0.; iptr++; } } #endif } /* else ... */ } /* for L-solve */ #ifdef DEBUG printf("After L-solve: y=\n"); cprint_soln(n, nrhs, Bmat); #endif /* * Back solve Ux=y. */ for (k = Lstore->nsuper; k >= 0; k--) { fsupc = L_FST_SUPC(k); istart = L_SUB_START(fsupc); nsupr = L_SUB_START(fsupc+1) - istart; nsupc = L_FST_SUPC(k+1) - fsupc; luptr = L_NZ_START(fsupc); solve_ops += 4 * nsupc * (nsupc + 1) * nrhs; if ( nsupc == 1 ) { rhs_work = &Bmat[0]; for (j = 0; j < nrhs; j++) { c_div(&rhs_work[fsupc], &rhs_work[fsupc], &Lval[luptr]); rhs_work += ldb; } } else { #ifdef USE_VENDOR_BLAS #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("U", strlen("U")); ftcs3 = _cptofcd("N", strlen("N")); CTRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #else ctrsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha, &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb); #endif #else for (j = 0; j < nrhs; j++) cusolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] ); #endif } for (j = 0; j < nrhs; ++j) { rhs_work = &Bmat[j*ldb]; for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) { solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol)); for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){ irow = U_SUB(i); cc_mult(&temp_comp, &rhs_work[jcol], &Uval[i]); c_sub(&rhs_work[irow], &rhs_work[irow], &temp_comp); } } } } /* for U-solve */ #ifdef DEBUG printf("After U-solve: x=\n"); cprint_soln(n, nrhs, Bmat); #endif /* Compute the final solution X := Pc*X. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = solve_ops; } else { /* Solve A'*X=B or CONJ(A)*X=B */ /* Permute right hand sides to form Pc'*B. */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } stat->ops[SOLVE] = 0; if (trans == TRANS) { for (k = 0; k < nrhs; ++k) { /* Multiply by inv(U'). */ sp_ctrsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by inv(L'). */ sp_ctrsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info); } } else { /* trans == CONJ */ for (k = 0; k < nrhs; ++k) { /* Multiply by conj(inv(U')). */ sp_ctrsv("U", "C", "N", L, U, &Bmat[k*ldb], stat, info); /* Multiply by conj(inv(L')). */ sp_ctrsv("L", "C", "U", L, U, &Bmat[k*ldb], stat, info); } } /* Compute the final solution X := Pr'*X (=inv(Pr)*X) */ for (i = 0; i < nrhs; i++) { rhs_work = &Bmat[i*ldb]; for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]]; for (k = 0; k < n; k++) rhs_work[k] = soln[k]; } } SUPERLU_FREE(work); SUPERLU_FREE(soln); }
void Mie(double x,struct c_complex m,double*mu,long nangles,struct c_complex*s1, struct c_complex*s2,double*qext,double*qsca,double*qback,double*g) /*:34*/ #line 519 "./mie.w" { /*36:*/ #line 546 "./mie.w" struct c_complex*D; struct c_complex z1,an,bn,bnm1,anm1,qbcalc; double*pi0,*pi1,*tau; struct c_complex xi,xi0,xi1; double psi,psi0,psi1; double alpha,beta,factor; long n,k,nstop,sign; *qext= -1; *qsca= -1; *qback= -1; *g= -1; /*:36*/ #line 522 "./mie.w" /*37:*/ #line 559 "./mie.w" if(m.im> 0.0){ mie_error("This program requires m.im>=0",1); return; } if(x<=0.0){ mie_error("This program requires positive sphere sizes",2); return; } if(nangles<0){ mie_error("This program requires non-negative angle sizes",3); return; } if(nangles<0){ mie_error("This program requires non-negative angle sizes",4); return; } if((nangles> 0)&&(s1==NULL)){ mie_error("Space must be allocated for s1 if nangles!=0",5); return; } if((nangles> 0)&&(s2==NULL)){ mie_error("Space must be allocated for s2if nangles!=0",6); return; } if(x> 20000){ mie_error("Program not validated for spheres with x>20000",7); return; } /*:37*/ #line 524 "./mie.w" /*38:*/ #line 589 "./mie.w" if((m.re==0)&&(x<0.1)){ small_conducting_Mie(x,m,mu,nangles,s1,s2,qext,qsca,qback,g); return; } if((m.re> 0.0)&&(c_abs(m)*x<0.1)){ small_Mie(x,m,mu,nangles,s1,s2,qext,qsca,qback,g); return; } /*:38*/ #line 525 "./mie.w" /*40:*/ #line 616 "./mie.w" nstop= floor(x+4.05*pow(x,0.33333)+2.0); /*:40*/ #line 527 "./mie.w" /*39:*/ #line 600 "./mie.w" if(nangles> 0){ set_carray(s1,nangles,c_set(0.0,0.0)); set_carray(s2,nangles,c_set(0.0,0.0)); pi0= new_darray(nangles); pi1= new_darray(nangles); tau= new_darray(nangles); set_darray(pi0,nangles,0.0); set_darray(tau,nangles,0.0); set_darray(pi1,nangles,1.0); } /*:39*/ #line 529 "./mie.w" if(m.re> 0) /*41:*/ #line 634 "./mie.w" { struct c_complex z; z= c_smul(x,m); D= new_carray(nstop+1); if(D==NULL){ mie_error("Cannot allocate log array",8); return; } if(fabs(m.im*x)<((13.78*m.re-10.8)*m.re+3.9)) Dn_up(z,nstop,D); else Dn_down(z,nstop,D); } /*:41*/ #line 531 "./mie.w" /*42:*/ #line 671 "./mie.w" psi0= sin(x); psi1= psi0/x-cos(x); xi0= c_set(psi0,cos(x)); xi1= c_set(psi1,cos(x)/x+sin(x)); *qsca= 0.0; *g= 0.0; *qext= 0.0; sign= 1; qbcalc= c_set(0.0,0.0); anm1= c_set(0.0,0.0); bnm1= c_set(0.0,0.0); /*:42*/ #line 533 "./mie.w" for(n= 1;n<=nstop;n++){ /*43:*/ #line 696 "./mie.w" if(m.re==0.0){ an= c_sdiv(n*psi1/x-psi0,c_sub(c_smul(n/x,xi1),xi0)); bn= c_sdiv(psi1,xi1); }else if(m.im==0.0){ z1.re= D[n].re/m.re+n/x; an= c_sdiv(z1.re*psi1-psi0,c_sub(c_smul(z1.re,xi1),xi0)); z1.re= D[n].re*m.re+n/x; bn= c_sdiv(z1.re*psi1-psi0,c_sub(c_smul(z1.re,xi1),xi0)); }else{ z1= c_div(D[n],m); z1.re+= n/x; an= c_div(c_set(z1.re*psi1-psi0,z1.im*psi1),c_sub(c_mul(z1,xi1),xi0)); z1= c_mul(D[n],m); z1.re+= n/x; bn= c_div(c_set(z1.re*psi1-psi0,z1.im*psi1),c_sub(c_mul(z1,xi1),xi0)); } /*:43*/ #line 536 "./mie.w" /*44:*/ #line 734 "./mie.w" for(k= 0;k<nangles;k++){ factor= (2.0*n+1.0)/(n+1.0)/n; tau[k]= n*mu[k]*pi1[k]-(n+1)*pi0[k]; alpha= factor*pi1[k]; beta= factor*tau[k]; s1[k].re+= alpha*an.re+beta*bn.re; s1[k].im+= alpha*an.im+beta*bn.im; s2[k].re+= alpha*bn.re+beta*an.re; s2[k].im+= alpha*bn.im+beta*an.im; } for(k= 0;k<nangles;k++){ factor= pi1[k]; pi1[k]= ((2.0*n+1.0)*mu[k]*pi1[k]-(n+1.0)*pi0[k])/n; pi0[k]= factor; } /*:44*/ #line 537 "./mie.w" /*45:*/ #line 780 "./mie.w" factor= 2.0*n+1.0; *g+= (n-1.0/n)*(anm1.re*an.re+anm1.im*an.im+bnm1.re*bn.re+bnm1.im*bn.im); *g+= factor/n/(n+1.0)*(an.re*bn.re+an.im*bn.im); *qsca+= factor*(c_norm(an)+c_norm(bn)); *qext+= factor*(an.re+bn.re); sign*= -1; qbcalc.re+= sign*factor*(an.re-bn.re); qbcalc.im+= sign*factor*(an.im-bn.im); /*:45*/ #line 538 "./mie.w" /*46:*/ #line 804 "./mie.w" factor= (2.0*n+1.0)/x; xi= c_sub(c_smul(factor,xi1),xi0); xi0= xi1; xi1= xi; psi= factor*psi1-psi0; psi0= psi1; psi1= xi1.re; anm1= an; bnm1= bn; /*:46*/ #line 539 "./mie.w" } /*47:*/ #line 817 "./mie.w" *qsca*= 2/(x*x); *qext*= 2/(x*x); *g*= 4/(*qsca)/(x*x); *qback= c_norm(qbcalc)/(x*x); /*:47*/ #line 542 "./mie.w" /*48:*/ #line 823 "./mie.w" if(m.re> 0)free_carray(D); if(nangles> 0){ free_darray(pi0); free_darray(pi1); free_darray(tau); } /*:48*/ #line 543 "./mie.w" }
/* Return value: 0 - successful return * > 0 - number of bytes allocated when run out of space */ int ccolumn_bmod ( const int jcol, /* in */ const int nseg, /* in */ complex *dense, /* in */ complex *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in */ int fpanelc, /* in -- first column in the current panel */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { /* * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j] * */ #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; complex alpha, beta; /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno; register int lptr, kfnz, isub, irow, i; register int no_zeros, new_next; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode. */ int *xsup, *supno; int *lsub, *xlsub; complex *lusup; int *xlusup; int nzlumax; complex *tempv1; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; complex comp_temp, comp_temp1; int mem_error; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; nzlumax = Glu->nzlumax; jcolp1 = jcol + 1; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc > fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense[irow], &dense[irow], &comp_temp); luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[] as a temporary storage */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; tempv[i] = zero; ++isub; } /* Scatter tempv1[] into SPA dense[] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense[irow], &dense[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } } /* if jsupno ... */ } /* for each segment... */ /* * Process the supernodal portion of L\U[*,j] */ nextlu = xlusup[jcol]; fsupc = xsup[jsupno]; /* Copy the SPA dense into L\U[*,j] */ new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc]; while ( new_next > nzlumax ) { if (mem_error = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)) return (mem_error); lusup = Glu->lusup; lsub = Glu->lsub; } for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; ++nextlu; } xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */ /* For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * 1) fsupc < fpanelc, then fst_col := fpanelc * 2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* Distance between the current supernode and the current panel. d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* Points to the beginning of jcol in snode L\U(jsupno) */ ufirst = xlusup[jcol] + d_fsupc; ops[TRSV] += 4 * nsupc * (nsupc - 1); ops[GEMV] += 8 * nrow * nsupc; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #else ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); #endif alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #ifdef _CRAY CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[isub], &lusup[isub], &tempv[i]); tempv[i] = zero; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
void cpanel_bmod ( const int m, /* in - number of rows in the matrix */ const int w, /* in */ const int jcol, /* in */ const int nseg, /* in */ complex *dense, /* out, of size n by w */ complex *tempv, /* working array */ int *segrep, /* in */ int *repfnz, /* in, of size n by w */ GlobalLU_t *Glu, /* modified */ SuperLUStat_t *stat /* output */ ) { #ifdef USE_VENDOR_BLAS #ifdef _CRAY _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif int incx = 1, incy = 1; complex alpha, beta; #endif register int k, ksub; int fsupc, nsupc, nsupr, nrow; int krep, krep_ind; complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; int block_nrow; /* no of rows in a block row */ register int lptr; /* Points to the row subscripts of a supernode */ int kfnz, irow, no_zeros; register int isub, isub1, i; register int jj; /* Index through each column in the panel */ int *xsup, *supno; int *lsub, *xlsub; complex *lusup; int *xlusup; int *repfnz_col; /* repfnz[] for a column in the panel */ complex *dense_col; /* dense[] for a column in the panel */ complex *tempv1; /* Used in 1-D update */ complex *TriTmp, *MatvecTmp; /* used in 2-D update */ complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex comp_temp, comp_temp1; register int ldaTmp; register int r_ind, r_hi; static int first = 1, maxsuper, rowblk, colblk; flops_t *ops = stat->ops; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; lusup = Glu->lusup; xlusup = Glu->xlusup; if ( first ) { maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) ); rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } ldaTmp = maxsuper + rowblk; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub[fsupc+1] - xlsub[fsupc]; nrow = nsupr - nsupc; lptr = xlsub[fsupc]; krep_ind = lptr + nsupc - 1; repfnz_col = repfnz; dense_col = dense; if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */ TriTmp = tempv; /* Sequence through each column in panel -- triangular solves */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += 4 * segsze * (segsze - 1); ops[GEMV] += 8 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr += nsupr*(nsupc-1) + nsupc-1; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } } else { /* segsze >= 4 */ /* Copy U[*,j] segment from dense[*] to TriTmp[*], which holds the result of triangular solves. */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; TriTmp[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); #endif } /* else ... */ } /* for jj ... end tri-solves */ /* Block row updates; push all the way into dense[*] block */ for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { r_hi = SUPERLU_MIN(nrow, r_ind + rowblk); block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind); luptr = xlusup[fsupc] + nsupc + r_ind; isub1 = lptr + nsupc + r_ind; repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; /* Sequence through each column in panel -- matrix-vector */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ /* Perform a block update, and scatter the result of matrix-vector to dense[]. */ no_zeros = kfnz - fsupc; luptr1 = luptr + nsupr * no_zeros; MatvecTmp = &TriTmp[maxsuper]; #ifdef USE_VENDOR_BLAS alpha = one; beta = zero; #ifdef _CRAY CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #else cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy); #endif #else cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1], TriTmp, MatvecTmp); #endif /* Scatter MatvecTmp[*] into SPA dense[*] temporarily * such that MatvecTmp[*] can be re-used for the * the next blok row update. dense[] will be copied into * global store after the whole panel has been finished. */ isub = isub1; for (i = 0; i < block_nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &MatvecTmp[i]); MatvecTmp[i] = zero; ++isub; } } /* for jj ... */ } /* for each block row ... */ /* Scatter the triangular solves into SPA dense[*] */ repfnz_col = repfnz; TriTmp = tempv; dense_col = dense; for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = TriTmp[i]; TriTmp[i] = zero; ++isub; } } /* for jj ... */ } else { /* 1-D block modification */ /* Sequence through each column in the panel */ for (jj = jcol; jj < jcol + w; jj++, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; ops[TRSV] += 4 * segsze * (segsze - 1); ops[GEMV] += 8 * nrow * segsze; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; } } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); } } } else { /* segsze >= 4 */ /* * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense[]. */ no_zeros = kfnz - fsupc; /* Copy U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #ifdef _CRAY CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #ifdef _CRAY CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[*] into SPA dense[*] temporarily, such * that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; tempv[i] = zero; isub++; } /* Scatter the update from tempv1[*] into SPA dense[*] */ /* Start dense rectangular L */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } /* else segsze>=4 ... */ } /* for each column in the panel... */ } /* else 1-D update ... */ } /* for each updating supernode ... */ }
int cgst01(int m, int n, SuperMatrix *A, SuperMatrix *L, SuperMatrix *U, int *perm_c, int *perm_r, float *resid) { /* Purpose ======= CGST01 reconstructs a matrix A from its L*U factorization and computes the residual norm(L*U - A) / ( N * norm(A) * EPS ), where EPS is the machine epsilon. Arguments ========== M (input) INT The number of rows of the matrix A. M >= 0. N (input) INT The number of columns of the matrix A. N >= 0. A (input) SuperMatrix *, dimension (A->nrow, A->ncol) The original M x N matrix A. L (input) SuperMatrix *, dimension (L->nrow, L->ncol) The factor matrix L. U (input) SuperMatrix *, dimension (U->nrow, U->ncol) The factor matrix U. perm_c (input) INT array, dimension (N) The column permutation from CGSTRF. perm_r (input) INT array, dimension (M) The pivot indices from CGSTRF. RESID (output) FLOAT* norm(L*U - A) / ( N * norm(A) * EPS ) ===================================================================== */ /* Local variables */ complex zero = {0.0, 0.0}; int i, j, k, arow, lptr,isub, urow, superno, fsupc, u_part; complex utemp, comp_temp; float anorm, tnorm, cnorm; float eps; complex *work; SCformat *Lstore; NCformat *Astore, *Ustore; complex *Aval, *Lval, *Uval; int *colbeg, *colend; /* Function prototypes */ extern float clangs(char *, SuperMatrix *); /* Quick exit if M = 0 or N = 0. */ if (m <= 0 || n <= 0) { *resid = 0.f; return 0; } work = (complex *)complexCalloc(m); Astore = A->Store; Aval = Astore->nzval; Lstore = L->Store; Lval = Lstore->nzval; Ustore = U->Store; Uval = Ustore->nzval; colbeg = intMalloc(n); colend = intMalloc(n); for (i = 0; i < n; i++) { colbeg[perm_c[i]] = Astore->colptr[i]; colend[perm_c[i]] = Astore->colptr[i+1]; } /* Determine EPS and the norm of A. */ eps = smach("Epsilon"); anorm = clangs("1", A); cnorm = 0.; /* Compute the product L*U, one column at a time */ for (k = 0; k < n; ++k) { /* The U part outside the rectangular supernode */ for (i = U_NZ_START(k); i < U_NZ_START(k+1); ++i) { urow = U_SUB(i); utemp = Uval[i]; superno = Lstore->col_to_sup[urow]; fsupc = L_FST_SUPC(superno); u_part = urow - fsupc + 1; lptr = L_SUB_START(fsupc) + u_part; work[L_SUB(lptr-1)].r -= utemp.r; work[L_SUB(lptr-1)].i -= utemp.i; for (j = L_NZ_START(urow) + u_part; j < L_NZ_START(urow+1); ++j) { isub = L_SUB(lptr); cc_mult(&comp_temp, &utemp, &Lval[j]); c_sub(&work[isub], &work[isub], &comp_temp); ++lptr; } } /* The U part inside the rectangular supernode */ superno = Lstore->col_to_sup[k]; fsupc = L_FST_SUPC(superno); urow = L_NZ_START(k); for (i = fsupc; i <= k; ++i) { utemp = Lval[urow++]; u_part = i - fsupc + 1; lptr = L_SUB_START(fsupc) + u_part; work[L_SUB(lptr-1)].r -= utemp.r; work[L_SUB(lptr-1)].i -= utemp.i; for (j = L_NZ_START(i)+u_part; j < L_NZ_START(i+1); ++j) { isub = L_SUB(lptr); cc_mult(&comp_temp, &utemp, &Lval[j]); c_sub(&work[isub], &work[isub], &comp_temp); ++lptr; } } /* Now compute A[k] - (L*U)[k] (Both matrices may be permuted.) */ for (i = colbeg[k]; i < colend[k]; ++i) { arow = Astore->rowind[i]; work[perm_r[arow]].r += Aval[i].r; work[perm_r[arow]].i += Aval[i].i; } /* Now compute the 1-norm of the column vector work */ tnorm = 0.; for (i = 0; i < m; ++i) { tnorm += fabs(work[i].r) + fabs(work[i].i); work[i] = zero; } cnorm = SUPERLU_MAX(tnorm, cnorm); } *resid = cnorm; if (anorm <= 0.f) { if (*resid != 0.f) { *resid = 1.f / eps; } } else { *resid = *resid / (float) n / anorm / eps; } SUPERLU_FREE(work); SUPERLU_FREE(colbeg); SUPERLU_FREE(colend); return 0; /* End of CGST01 */ } /* cgst01_ */
void pcgstrf_bmod1D( const int pnum, /* process number */ const int m, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int fsupc, /* leading column of the updating supernode */ const int krep, /* last column of the updating supernode */ const int nsupc, /* number of columns in the updating s-node */ int nsupr, /* number of rows in the updating supernode */ int nrow, /* number of rows below the diagonal block of the updating supernode */ int *repfnz, /* in */ int *panel_lsub, /* modified */ int *w_lsub_end, /* modified */ int *spa_marker, /* modified; size n-by-w */ complex *dense, /* modified */ complex *tempv, /* working array - zeros on entry/exit */ GlobalLU_t *Glu, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Results are returned in SPA dense[*,w]. * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; complex alpha, beta; #endif complex ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; register int lptr; /* start of row subscripts of the updating supernode */ register int i, krep_ind, kfnz, isub, irow, no_zeros; register int jj; /* index through each column in the panel */ int *repfnz_col; /* repfnz[] for a column in the panel */ complex *dense_col; /* dense[] for a column in the panel */ complex *tempv1; /* used to store matrix-vector result */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ int *lsub, *xlsub_end; complex *lusup; int *xlusup; register float flopcnt; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex comp_temp, comp_temp1; #ifdef TIMING double *utime = Gstat->utime; double f_time; #endif lsub = Glu->lsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; lptr = Glu->xlsub[fsupc]; krep_ind = lptr + nsupc - 1; /* Pointers to each column of the w-wide arrays. */ repfnz_col= repfnz; dense_col = dense; col_marker= spa_marker; col_lsub = panel_lsub; #if ( DEBUGlevel>=2 ) if (jcol == BADPAN && krep == BADREP) { printf("(%d) pcgstrf_bmod1D[1] jcol %d, fsupc %d, krep %d, nsupc %d, nsupr %d, nrow %d\n", pnum, jcol, fsupc, krep, nsupc, nsupr, nrow); PrintInt10("lsub[xlsub[2774]]", nsupr, &lsub[lptr]); } #endif /* * Sequence through each column in the panel ... */ for (jj = jcol; jj < jcol + w; ++jj, col_marker += m, col_lsub += m, repfnz_col += m, dense_col += m) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; /* Calculate flops: tri-solve + mat-vector */ Gstat->procstat[pnum].fcops += flopcnt; /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; #if ( DEBUGlevel>=2 ) if (krep == BADCOL && jj == -1) { printf("(%d) pcgstrf_bmod1D[segsze=1]: k %d, j %d, ukj %.10e\n", pnum, lsub[krep_ind], jj, ukj); PrintInt10("segsze=1", nsupr, &lsub[lptr]); } #endif for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); ++luptr; #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else if ( segsze <= 3 ) { #ifdef TIMING f_time = SuperLU_timer_(); #endif ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; ++luptr; ++luptr1; ++luptr2; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense_col[irow], &dense_col[irow], &comp_temp); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else { /* segsze >= 4 */ /* * Perform a triangular solve and matrix-vector update, * then scatter the result of sup-col update to dense[*]. */ no_zeros = kfnz - fsupc; /* Gather U[*,j] segment from dense[*] to tempv[*]: * The result of triangular solve is in tempv[*]; * The result of matrix vector update is in dense_col[*] */ isub = lptr + no_zeros; /*#pragma ivdep*/ for (i = 0; i < segsze; ++i) { irow = lsub[isub]; tempv[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef TIMING f_time = SuperLU_timer_(); #endif #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif /* _CRAY_PVP */ #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif /* Scatter tempv[*] into SPA dense[*] temporarily, * such that tempv[*] can be used for the triangular solve of * the next column of the panel. They will be copied into * ucol[*] after the whole panel has been finished. */ isub = lptr + no_zeros; /*#pragma ivdep*/ for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; #if ( DEBUGlevel>=2 ) if (jj == -1 && krep == 3423) printf("(%d) pcgstrf_bmod1D[scatter] jj %d, dense_col[%d] %e\n", pnum, jj, irow, dense_col[irow]); #endif } /* Scatter the update from tempv1[*] into SPA dense[*] */ /*#pragma ivdep*/ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]); /* Scatter-add */ #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif tempv1[i] = zero; isub++; } } /* else segsze >= 4 ... */ } /* for jj ... */ }
static int polroot( struct polynom *p, struct complex *pz, /* pointer to root */ struct complex *pz0 /* pointer to initial value */ ) { int i; int n_iter; struct polynom d; struct complex v,v0,vd,delta,zmin; double y,ymin; // extern struct polynom *pol_der(); // extern struct complex *pol_value(); delta.x=0; delta.y=0; // RS 7.2.2013 //printf("\np->n=%d",p->n); getch(); if (p->n==1) { c_div(pz,&(p->a[0]),&(p->a[1])); pz->x=-(pz->x); pz->y=-(pz->y); return(1); } //printf("\nr2"); pol_value(p,pz0,&v0); //printf("\nr3"); if (c_zero(&v0)) { pz->x=pz0->x; pz->y=pz0->y; return(1); } //printf("\nr4"); zmin.x=pz0->x; zmin.y=pz0->y; ymin=v0.x*v0.x+v0.y*v0.y; pol_der(&d,p); //printf("\nr5"); n_iter=0; while (1) { pol_value(&d,pz0,&vd); c_div(&delta,&v0,&vd); c_sub(pz,pz0,&delta); pol_value(p,pz,&v); v0.x=v.x; v0.y=v.y; pz0->x=pz->x; pz0->y=pz->y; ++n_iter; PR_UP; sprintf(sbuf,"\npolroot: N=%d Re=%e Im=%e",n_iter,pz->x,pz->y); sur_print(sbuf); // RS CHA Rprintf /* Rprintf("zero: %d\n",c_zero(pz)); getch(); */ /* if (c_zero(&pz)) break; */ if (c_zero(pz)) break; y=v.x*v.x+v.y*v.y; if (y<ymin) { ymin=y; zmin.x=pz->x; zmin.y=pz->y; } if (fabs(delta.x)<roots_eps && fabs(delta.y)<roots_eps) break; if (n_iter>roots_max_iter) { pz->x=zmin.x; pz->y=zmin.y; break; } if (sur_kbhit()) { pz->x=zmin.x; pz->y=zmin.y; i=sur_getch(); if (i=='.') break; } } return (n_iter); }
int pcgstrf_snode_bmod( const int pnum, /* process number */ const int jcol, /* in - current column in the s-node */ const int jsupno, /* in */ const int fsupc, /* in - first column in the s-node */ complex *dense, /* in */ complex *tempv, /* working array */ GlobalLU_t *Glu, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Performs numeric block updates within the relaxed supernode. */ complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; #if ( MACH==CRAY_PVP ) _fcd ftcs1, ftcs2, ftcs3; #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; complex alpha = none, beta = one; #endif int luptr, nsupc, nsupr, nrow; int isub, irow, i, iptr; register int ufirst, nextlu; complex *lusup; int *lsub, *xlsub, *xlsub_end, *xlusup, *xlusup_end; register float flopcnt; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; nextlu = xlusup[jcol]; /* * Process the supernodal portion of L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; isub++) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; ++nextlu; } xlusup_end[jcol] = nextlu; if ( fsupc < jcol ) { luptr = xlusup[fsupc]; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nsupc = jcol - fsupc; /* Excluding jcol */ ufirst = xlusup[jcol]; /* Points to the beginning of column jcol in supernode L\U(jsupno). */ nrow = nsupr - nsupc; flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], &tempv[0] ); /* Scatter tempv[*] into lusup[*] */ iptr = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[iptr], &lusup[iptr], &tempv[i]); ++iptr; tempv[i] = zero; } #endif } return 0; }
void simulation_solve(simulation_t *simulation) { int m,s; int i,j,k, i_max, x; m = simulation->n_vars; int n = m+1; s = simulation_context_get_n_samples(simulation->nodelist->netlist->sc); for ( k = 0 ; k < m ; ++k ) { //pivot is col k //find a line with the maximum k i_max = simulation_max(simulation, k,m); if ( cell_is_zero(simulation, &simulation->cells[i_max][k]) ) { fprintf(stderr,"singular matrix, zero found @ row %d col %d (mean = %.1f)\n", i_max, k, (double)cell_mean(simulation,&simulation->cells[i_max][k])); simulation_dump(simulation); assert(0); } assert(i_max >= k); if ( i_max != k ) { cell_t *tmp; tmp = simulation->cells[k]; simulation->cells[k] = simulation->cells[i_max]; simulation->cells[i_max] = tmp; } for ( i = k + 1 ; i < m ; i ++ ) { if ( simulation->cells[i][k].state == CELL_ZERO ) { //fprintf(stderr,"continue\n"); continue; } assert( i != k); #ifdef USE_OMP #pragma omp parallel for #endif for ( j = n - 1 ; j > k ; --j ) { // Fij = Fij - Fkj * Fik/Fkk assert( j != k ); cell_t *line_i = simulation->cells[i]; cell_t *line_k = simulation->cells[k]; line_i[j].state = CELL_SET; for ( x = 0 ; x < s ; ++x ) { //assert( line_k[k].state != CELL_ZERO && !cell_is_zero(simulation,&line_k[k])); line_i[j].values[x] = c_let( c_sub(line_i[j].values[x], c_mul(line_k[j].values[x], c_div( line_i[k].values[x], line_k[k].values[x] )))); /* line_i[j].values[x] = line_i[j].values[x] - line_k[j].values[x] * ( line_i[k].values[x] / line_k[k].values[x] ); */ /* assert( i != k ); line_i[j].values[x] = line_i[j].values[x] * line_k[k].values[x] - line_k[j].values[x] * line_i[k].values[x]; */ } } simulation->cells[i][k].state = CELL_ZERO; #ifdef USE_OMP #pragma omp parallel for #endif for ( x = 0 ; x < s ; ++x ) { simulation->cells[i][k].values[x]=0.L; } } } //simulation_dump(simulation); //backward elimination //simulation_dump(simulation); if (1) for ( k = m - 1 ; k >= 0 ; --k ) { //set pivot to 1 // Lk = Lk / Fk,k #ifdef USE_OMP #pragma omp parallel for #endif for ( j = k+1 ; j < n ; ++j ) { assert( j != k ); simulation->cells[k][j].state = CELL_SET; for ( x = 0 ; x < s ; ++x ) { //assert( simulation->cells[k][k].state != CELL_ZERO && !cell_is_zero(simulation, &simulation->cells[k][k])); simulation->cells[k][j].values[x] = c_let( c_div(simulation->cells[k][j].values[x], simulation->cells[k][k].values[x] )); /* simulation->cells[k][j].values[x] = simulation->cells[k][j].values[x] / simulation->cells[k][k].values[x]; */ } } simulation->cells[k][k].state = CELL_POSITIVE_UNITY; #ifdef USE_OMP #pragma omp parallel for #endif for ( x = 0 ; x < s ; ++x ) { simulation->cells[k][k].values[x] = 1.L + I * 0.L; } } //zeroes pivot col for ( k = m - 1 ; k >= 0 ; --k ) // vertical pivot Fk,k { assert( simulation->cells[k][k].state == CELL_POSITIVE_UNITY ); // Li = Li - Lk * Fik #ifdef USE_OMP #pragma omp parallel for #endif for ( i = 0 ; i < k ; ++i) // vertical { assert( i != k ); for ( j = k + 1 ; j < m ; ++ j ) assert( simulation->cells[i][j].state == CELL_ZERO ); // => Fir = Fir - Fik x Fkr // => Fi,k = 0 int result_col = n - 1; simulation->cells[i][result_col].state = CELL_SET; for ( x = 0 ; x < s ; ++x ) { simulation->cells[i][result_col].values[x] = c_let( c_sub(simulation->cells[i][result_col].values[x], c_mul(simulation->cells[i][k].values[x], simulation->cells[k][result_col].values[x]))); /* simulation->cells[i][j].values[x] = simulation->cells[i][j].values[x] - simulation->cells[i][k].values[x] * simulation->cells[k][j].values[x]; */ } simulation->cells[i][k].state = CELL_ZERO; for ( x = 0 ; x < s ; ++x ) { simulation->cells[i][k].values[x] = 0.L; } } } //sanity check assert( m == simulation->n_vars ); assert( n == simulation->n_vars+1 ); for ( i = 0 ; i < m ; ++ i ) for ( j = 0 ; j < m ; ++ j ) if ( i==j ) assert(simulation->cells[i][j].state == CELL_POSITIVE_UNITY ); else assert(simulation->cells[i][j].state == CELL_ZERO ); }
void schurFactorization(long n, complex **A, complex **T, complex **U) { /* Schur factorization: A = U*T*U', T = upper triangular, U = unitary */ long i,j,iter,maxIter; double tol, diff1,diff2; complex T11, T12, T21, T22; complex sigma1, sigma2, sigma; complex z, z1, z2; complex **P, **Q, **R; /* Allocate auxiliary matrices */ P = (complex **)Mem(MEM_ALLOC,n, sizeof(complex *)); Q = (complex **)Mem(MEM_ALLOC,n, sizeof(complex *)); R = (complex **)Mem(MEM_ALLOC,n, sizeof(complex *)); for (i=0; i<n; i++){ P[i] = (complex *)Mem(MEM_ALLOC,n, sizeof(complex)); Q[i] = (complex *)Mem(MEM_ALLOC,n, sizeof(complex)); R[i] = (complex *)Mem(MEM_ALLOC,n, sizeof(complex)); } /* ------------------------------------------------------------*/ /* Parameters for iteration */ maxIter = 500; tol = 1E-30; /* ------------------------------------------------------------*/ /* Init U = eye(n) (identity matrix) */ for (i=0; i<n; i++){ U[i][i].re = 1.0; U[i][i].im = 0.0; } /* ------------------------------------------------------------*/ /* Reduce A to Hessenberg form */ hessFactorization(n,A,P,T); /* ------------------------------------------------------------*/ /* Compute Schur factorization of Hessenberg matrix T */ for (j=n-1; j>0; j--){ /* Main loop */ for (iter=0; iter<maxIter; iter++){ /* Iteration loop */ sigma.re = T[j][j].re; sigma.im = T[j][j].im; /* -- Use Wilkinson shift -- */ /* submatrix considered in the shift */ T11 = T[j-1][j-1]; T12 = T[j-1][j]; T21 = T[j][j-1]; T22 = T[j][j]; /* Compute eigenvalues of submatrix */ z.re = 0.0; z.im = 0.0; z2.re = 0.0; z2.im = 0.0; /* z = T11*T11 + T22*T22 - 2*T11*T22 + 4*T12*T21 */ z1 = c_mul(T11,T11); z = c_add(z ,z1); z2 = c_add(z2,z1); z1 = c_mul(T22,T22); z = c_add(z ,z1); z2 = c_add(z2,z1); z1 = c_mul(T11,T22); z1.re = -2.0 * z1.re; z1.im = -2.0 * z1.im; z = c_add(z,z1); z1 = c_mul(T12,T21); z1.re = 4.0 * z1.re; z1.im = 4.0 * z1.im; z = c_add(z,z1); /* Square root*/ z = c_sqrt(z); /* Eigenvalues */ sigma1 = c_add(z2,z); sigma2 = c_sub(z2,z); /* printf("sigma1 = %e %e\n", sigma1.re, sigma1.im); */ /* printf("sigma2 = %e %e\n", sigma2.re, sigma2.im); */ /* Select eigenvalue for shift*/ diff1 = c_norm( c_sub(T[j][j], sigma1) ); diff2 = c_norm( c_sub(T[j][j], sigma2) ); if (diff1 < diff2){ sigma.re = sigma1.re; sigma.im = sigma1.im; }else{ sigma.re = sigma2.re; sigma.im = sigma2.im; } /* --- QR step with Wilkinson shift --- */ /* Shift: T(1:j,1:j) = T(1:j,1:j) - sigma * eye(j) */ for (i=0; i<j+1; i++){ CheckValue(FUNCTION_NAME, "T[i][i].re","", T[i][i].re, -INFTY, INFTY); CheckValue(FUNCTION_NAME, "T[i][i].im","", T[i][i].im, -INFTY, INFTY); T[i][i].re = T[i][i].re - sigma.re; T[i][i].im = T[i][i].im - sigma.im; } /* Compute QR factorization of shifted Hessenberg matrix */ for (i=0; i<n; i++){ memset(Q[i], 0, n*sizeof(complex)); memset(R[i], 0, n*sizeof(complex)); } QRfactorization(n,T,Q,R); /* T = T_new = R * Q */ for (i=0; i<n; i++){ memset(T[i], 0, n*sizeof(complex)); } matProduct(n, n, n, R, Q, T); /* T(1:j,1:j) = T(1:j,1:j) + sigma * eye(j) */ for (i=0; i<j+1; i++){ T[i][i].re = T[i][i].re + sigma.re; T[i][i].im = T[i][i].im + sigma.im; } /* R = U_new = U * Q */ for (i=0; i<n; i++){ memset(R[i], 0, n*sizeof(complex)); } matProduct(n,n,n,U,Q,R); /* U = R */ for (i=0; i<n; i++){ memcpy(U[i],R[i], n*sizeof(complex)); } /* Check convergence */ if (c_norm( T[j][j-1] ) <= tol * (c_norm(T[j-1][j-1]) + c_norm(T[j][j]))){ T[j][j-1].re = 0.0; T[j][j-1].im = 0.0; break; } } /* end of iter loop */ } /* end of main loop */ /* -------------------------------------------------------------*/ /* U = P*U */ for (i=0; i<n; i++){ memset(U[i], 0, n*sizeof(complex)); } matProduct(n,n,n,P,R,U); /* -------------------------------------------------------------*/ /* Free auxiliary variables */ for (i=0; i<n; i++){ Mem(MEM_FREE,P[i]); Mem(MEM_FREE,Q[i]); Mem(MEM_FREE,R[i]); } Mem(MEM_FREE,P); Mem(MEM_FREE,Q); Mem(MEM_FREE,R); /* Return */ return; }
int main() { //initialize testing array float testVector[] = {0.1f,0.2f,0.3f,0.4f,0.5f}; /*COMMENTED OUT LENGTH PARAM AS IT IS INCLUDED IN HEADER FILE*/ //get the size of the array //int length = sizeof(testVector)/sizeof(float); //initiate empty output array of size length float outputArrayC[length]; //initialize the struct at p=r=q 0.1 and x=k=0 kalman_state currentState = {0.1f, 0.1f, 0.0f , 0.1f, 0.0f}; //call function Kalmanfilter_C Kalmanfilter_C(measurements, outputArrayC, ¤tState, length); //initiate empty output array of size length float outputArrayASM[length]; //reinitialize the struct at p=r=q 0.1 and x=k=0 currentState.p = DEF_p; currentState.r = DEF_r; currentState.k = DEF_k; currentState.q = DEF_q; currentState.x = DEF_x; //call subroutine Kalmanfilter_asm Kalmanfilter_asm(measurements, outputArrayASM, ¤tState, length ); //Check for correctness with a error tolerance of 0.000001 float errorTolerance = 0.000001f; float errorPercentage = 0.01; //is_valid(outputArrayC, outputArrayASM, length, errorTolerance, "c vs asm"); //is_valid_relative(outputArrayC, outputArrayASM, length, errorTolerance, errorPercentage,"c vs asm"); int p; //print KalmanFilter output for ( p = 0; p < length; p++ ) { printf("OutputASM: %f & OutputC %f\n", outputArrayASM[p], outputArrayC[p]); } float differenceC[length]; float differenceCMSIS[length]; //Difference arm_sub_f32 (measurements, outputArrayC, differenceCMSIS, length); c_sub(measurements, outputArrayC, differenceC, length); //is_valid(differenceC, differenceCMSIS, length, errorTolerance, "Difference"); //is_valid_relative(differenceC, differenceCMSIS, length, errorTolerance, errorPercentage,"Difference"); //Print difference vector for ( p = 0; p < length; p++ ) { printf("DifferenceC: %f & DifferenceCMSIS %f \n", differenceC[p], differenceCMSIS[p]); } //Mean float meanCMSIS; float meanC; arm_mean_f32 (differenceCMSIS, length , &meanCMSIS); c_mean(differenceC,length, &meanC); //is_valid(&meanC, &meanCMSIS, 1, errorTolerance, "mean"); //is_valid_relative(&meanC, &meanCMSIS, 1, errorTolerance, errorPercentage, "mean"); //Print mean values printf("MeanC: %f & MeanCMSIS %f \n", meanC, meanCMSIS); //STD float stdC; float stdCMSIS; arm_std_f32 (differenceCMSIS, length, &stdCMSIS); c_std(differenceC, length, &stdC); //is_valid(&stdC, &stdCMSIS, 1, errorTolerance, "STD"); //is_valid_relative(&stdC, &stdCMSIS, 1, errorTolerance, errorPercentage,"STD"); //Print std values printf("StandardDevC: %f & StandardDevCMSIS %f \n", stdC, stdCMSIS); //correlation float corC[2*length-1]; float corCMSIS[2*length-1]; arm_correlate_f32 (measurements, length, outputArrayC, length, corCMSIS); c_correlate(measurements, outputArrayC, corC, length); //is_valid(corC, corCMSIS, 2*length-1, errorTolerance, "correlation"); //is_valid_relative(corC, corCMSIS, 2*length-1, errorTolerance, errorPercentage, "correlation"); //convolution float convC[2*length-1]; float convCMSIS[2*length-1]; arm_conv_f32 (measurements, length, outputArrayC, length, convCMSIS); c_conv(measurements, outputArrayC, convC, length); //is_valid(convC, convCMSIS, 2*length-1, errorTolerance, "convolution"); //is_valid_relative(convC, convCMSIS, 2*length-1, errorTolerance, errorPercentage, "convolution"); //Print correlation and convolution values for ( p = 0; p < (2*length-1); p++ ) { printf("ConvC: %f & ConvCMSIS: %f \n", convC[p], convCMSIS[p]); } for ( p = 0; p < (2*length-1); p++ ) { printf("CorrelateC: %f & CorrelatCMSIS: %f \n", corC[p], corCMSIS[p]); } return 0; }
int pcgstrf_column_bmod( const int pnum, /* process number */ const int jcol, /* current column in the panel */ const int fpanelc,/* first column in the panel */ const int nseg, /* number of s-nodes to update jcol */ int *segrep,/* in */ int *repfnz,/* in */ complex *dense, /* modified */ complex *tempv, /* working array */ pxgstrf_shared_t *pxgstrf_shared, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose: * ======== * Performs numeric block updates (sup-col) in topological order. * It features: col-col, 2cols-col, 3cols-col, and sup-col updates. * Special processing on the supernodal portion of L\U[*,j]. * * Return value: * ============= * 0 - successful return * > 0 - number of bytes allocated when run out of space * */ #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; complex alpha, beta; #endif GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ /* krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in supernode * nsupr = no of rows in supernode (used as leading dimension) * luptr = location of supernodal LU-block in storage * kfnz = first nonz in the k-th supernodal segment * no_zeros = no of leading zeros in a supernodal U-segment */ complex ukj, ukj1, ukj2; register int lptr, kfnz, isub, irow, i, no_zeros; register int luptr, luptr1, luptr2; int fsupc, nsupc, nsupr, segsze; int nrow; /* No of rows in the matrix of matrix-vector */ int jsupno, k, ksub, krep, krep_ind, ksupno; int ufirst, nextlu; int fst_col; /* First column within small LU update */ int d_fsupc; /* Distance between the first column of the current panel and the first column of the current snode.*/ int *xsup, *supno; int *lsub, *xlsub, *xlsub_end; complex *lusup; int *xlusup, *xlusup_end; complex *tempv1; int mem_error; register float flopcnt; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; complex comp_temp, comp_temp1; xsup = Glu->xsup; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; xlusup_end = Glu->xlusup_end; jsupno = supno[jcol]; /* * For each nonz supernode segment of U[*,j] in topological order */ k = nseg - 1; for (ksub = 0; ksub < nseg; ksub++) { krep = segrep[k]; k--; ksupno = supno[krep]; #if ( DEBUGlvel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n", pnum, jcol, nseg, krep, jsupno, ksupno); #endif if ( jsupno != ksupno ) { /* Outside the rectangular supernode */ fsupc = xsup[ksupno]; fst_col = SUPERLU_MAX ( fsupc, fpanelc ); /* Distance from the current supernode to the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; luptr = xlusup[fst_col] + d_fsupc; lptr = xlsub[fsupc] + d_fsupc; kfnz = repfnz[krep]; kfnz = SUPERLU_MAX ( kfnz, fpanelc ); segsze = krep - kfnz + 1; nsupc = krep - fst_col + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nrow = nsupr - d_fsupc - nsupc; krep_ind = lptr + nsupc - 1; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze;//sj Gstat->procstat[pnum].fcops += flopcnt; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\ fsupc %d, nsupr %d, nsupc %d\n", pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc); #endif /* * Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; cc_mult(&comp_temp, &ukj, &lusup[luptr]); c_sub(&dense[irow], &dense[irow], &comp_temp); luptr++; } } else if ( segsze <= 3 ) { ukj = dense[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc-1; ukj1 = dense[lsub[krep_ind - 1]]; luptr1 = luptr - nsupr; if ( segsze == 2 ) { /* Case 2: 2cols-col update */ cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } else { /* Case 3: 3cols-col update */ ukj2 = dense[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]); c_sub(&ukj1, &ukj1, &comp_temp); cc_mult(&comp_temp, &ukj1, &lusup[luptr1]); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&ukj, &ukj, &comp_temp); dense[lsub[krep_ind]] = ukj; dense[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; cc_mult(&comp_temp, &ukj, &lusup[luptr]); cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]); c_add(&comp_temp, &comp_temp, &comp_temp1); cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]); c_add(&comp_temp, &comp_temp, &comp_temp1); c_sub(&dense[irow], &dense[irow], &comp_temp); } } } else { /* * Case: sup-col update * Perform a triangular solve and block update, * then scatter the result of sup-col update to dense */ no_zeros = kfnz - fst_col; /* Copy U[*,j] segment from dense[*] to tempv[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; tempv[i] = dense[irow]; ++isub; } /* Dense triangular solve -- start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #else ctrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, tempv, &incx ); #endif luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; alpha = one; beta = zero; #if ( MACH==CRAY_PVP ) CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #else cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tempv, &incx, &beta, tempv1, &incy ); #endif #else clsolve ( nsupr, segsze, &lusup[luptr], tempv ); luptr += segsze; /* Dense matrix-vector */ tempv1 = &tempv[segsze]; cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1); #endif /* Scatter tempv[] into SPA dense[*] */ isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense[irow] = tempv[i]; /* Scatter */ tempv[i] = zero; isub++; } /* Scatter tempv1[] into SPA dense[*] */ for (i = 0; i < nrow; i++) { irow = lsub[isub]; c_sub(&dense[irow], &dense[irow], &tempv1[i]); tempv1[i] = zero; ++isub; } } /* else segsze >= 4 */ } /* if jsupno ... */ } /* for each segment... */ /* ------------------------------------------ Process the supernodal portion of L\U[*,j] ------------------------------------------ */ fsupc = SUPER_FSUPC (jsupno); nsupr = xlsub_end[fsupc] - xlsub[fsupc]; if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, pxgstrf_shared)) ) return mem_error; xlusup[jcol] = nextlu; lusup = Glu->lusup; /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */ for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) { irow = lsub[isub]; lusup[nextlu] = dense[irow]; dense[irow] = zero; #ifdef DEBUG if (jcol == -1) printf("(%d) pcgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n", pnum, jcol, irow, lusup[nextlu]); #endif ++nextlu; } xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */ #if ( DEBUGlevel>=2 ) if (jcol == -1) { nrow = xlusup_end[jcol] - xlusup[jcol]; print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]], &lusup[xlusup[jcol]]); } #endif /* * For more updates within the panel (also within the current supernode), * should start from the first column of the panel, or the first column * of the supernode, whichever is bigger. There are 2 cases: * (1) fsupc < fpanelc, then fst_col := fpanelc * (2) fsupc >= fpanelc, then fst_col := fsupc */ fst_col = SUPERLU_MAX ( fsupc, fpanelc ); if ( fst_col < jcol ) { /* distance between the current supernode and the current panel; d_fsupc=0 if fsupc >= fpanelc. */ d_fsupc = fst_col - fsupc; lptr = xlsub[fsupc] + d_fsupc; luptr = xlusup[fst_col] + d_fsupc; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */ nsupc = jcol - fst_col; /* Excluding jcol */ nrow = nsupr - d_fsupc - nsupc; /* points to the beginning of jcol in supernode L\U[*,jsupno] */ ufirst = xlusup[jcol] + d_fsupc; #if ( DEBUGlevel>=2 ) if (jcol==BADCOL) printf("(%d) pcgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n", pnum, jcol, fsupc, nsupr, nsupc, nrow); #endif flopcnt = nsupc * (nsupc - 1) + 2 * nrow * nsupc; //sj Gstat->procstat[pnum].fcops += flopcnt; /* ops[TRSV] += nsupc * (nsupc - 1); ops[GEMV] += 2 * nrow * nsupc; */ #ifdef USE_VENDOR_BLAS alpha = none; beta = one; /* y := beta*y + alpha*A*x */ #if ( MACH==CRAY_PVP ) CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #else ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, &lusup[ufirst], &incx ); cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy ); #endif #else clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] ); cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], &lusup[ufirst], tempv ); /* Copy updates from tempv[*] into lusup[*] */ isub = ufirst + nsupc; for (i = 0; i < nrow; i++) { c_sub(&lusup[isub], &lusup[isub], &tempv[i]); tempv[i] = zero; ++isub; } #endif } /* if fst_col < jcol ... */ return 0; }
int main(int argc, char *argv[]) { void cmatvec_mult(complex alpha, complex x[], complex beta, complex y[]); void cpsolve(int n, complex x[], complex y[]); extern int cfgmr( int n, void (*matvec_mult)(complex, complex [], complex, complex []), void (*psolve)(int n, complex [], complex[]), complex *rhs, complex *sol, double tol, int restrt, int *itmax, FILE *fits); extern int cfill_diag(int n, NCformat *Astore); char equed[1] = {'B'}; yes_no_t equil; trans_t trans; SuperMatrix A, L, U; SuperMatrix B, X; NCformat *Astore; NCformat *Ustore; SCformat *Lstore; complex *a; int *asub, *xa; int *etree; int *perm_c; /* column permutation vector */ int *perm_r; /* row permutations from partial pivoting */ int nrhs, ldx, lwork, info, m, n, nnz; complex *rhsb, *rhsx, *xact; complex *work = NULL; float *R, *C; float u, rpg, rcond; complex zero = {0.0, 0.0}; complex one = {1.0, 0.0}; complex none = {-1.0, 0.0}; mem_usage_t mem_usage; superlu_options_t options; SuperLUStat_t stat; int restrt, iter, maxit, i; double resid; complex *x, *b; #ifdef DEBUG extern int num_drop_L, num_drop_U; #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Enter main()"); #endif /* Defaults */ lwork = 0; nrhs = 1; equil = YES; u = 0.1; /* u=1.0 for complete factorization */ trans = NOTRANS; /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; options.ColPerm = COLAMD; options.DiagPivotThresh = 0.1; //different from complete LU options.Trans = NOTRANS; options.IterRefine = NOREFINE; options.SymmetricMode = NO; options.PivotGrowth = NO; options.ConditionNumber = NO; options.PrintStat = YES; options.RowPerm = LargeDiag; options.ILU_DropTol = 1e-4; options.ILU_FillTol = 1e-2; options.ILU_FillFactor = 10.0; options.ILU_DropRule = DROP_BASIC | DROP_AREA; options.ILU_Norm = INF_NORM; options.ILU_MILU = SMILU_2; */ ilu_set_default_options(&options); /* Modify the defaults. */ options.PivotGrowth = YES; /* Compute reciprocal pivot growth */ options.ConditionNumber = YES;/* Compute reciprocal condition number */ if ( lwork > 0 ) { work = SUPERLU_MALLOC(lwork); if ( !work ) ABORT("Malloc fails for work[]."); } /* Read matrix A from a file in Harwell-Boeing format.*/ if (argc < 2) { printf("Usage:\n%s [OPTION] < [INPUT] > [OUTPUT]\nOPTION:\n" "-h -hb:\n\t[INPUT] is a Harwell-Boeing format matrix.\n" "-r -rb:\n\t[INPUT] is a Rutherford-Boeing format matrix.\n" "-t -triplet:\n\t[INPUT] is a triplet format matrix.\n", argv[0]); return 0; } else { switch (argv[1][1]) { case 'H': case 'h': printf("Input a Harwell-Boeing format matrix:\n"); creadhb(&m, &n, &nnz, &a, &asub, &xa); break; case 'R': case 'r': printf("Input a Rutherford-Boeing format matrix:\n"); creadrb(&m, &n, &nnz, &a, &asub, &xa); break; case 'T': case 't': printf("Input a triplet format matrix:\n"); creadtriple(&m, &n, &nnz, &a, &asub, &xa); break; default: printf("Unrecognized format.\n"); return 0; } } cCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, SLU_NC, SLU_C, SLU_GE); Astore = A.Store; cfill_diag(n, Astore); printf("Dimension %dx%d; # nonzeros %d\n", A.nrow, A.ncol, Astore->nnz); fflush(stdout); if ( !(rhsb = complexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsb[]."); if ( !(rhsx = complexMalloc(m * nrhs)) ) ABORT("Malloc fails for rhsx[]."); cCreate_Dense_Matrix(&B, m, nrhs, rhsb, m, SLU_DN, SLU_C, SLU_GE); cCreate_Dense_Matrix(&X, m, nrhs, rhsx, m, SLU_DN, SLU_C, SLU_GE); xact = complexMalloc(n * nrhs); ldx = n; cGenXtrue(n, nrhs, xact, ldx); cFillRHS(trans, nrhs, xact, ldx, &A, &B); if ( !(etree = intMalloc(n)) ) ABORT("Malloc fails for etree[]."); if ( !(perm_r = intMalloc(m)) ) ABORT("Malloc fails for perm_r[]."); if ( !(perm_c = intMalloc(n)) ) ABORT("Malloc fails for perm_c[]."); if ( !(R = (float *) SUPERLU_MALLOC(A.nrow * sizeof(float))) ) ABORT("SUPERLU_MALLOC fails for R[]."); if ( !(C = (float *) SUPERLU_MALLOC(A.ncol * sizeof(float))) ) ABORT("SUPERLU_MALLOC fails for C[]."); info = 0; #ifdef DEBUG num_drop_L = 0; num_drop_U = 0; #endif /* Initialize the statistics variables. */ StatInit(&stat); /* Compute the incomplete factorization and compute the condition number and pivot growth using dgsisx. */ cgsisx(&options, &A, perm_c, perm_r, etree, equed, R, C, &L, &U, work, lwork, &B, &X, &rpg, &rcond, &mem_usage, &stat, &info); Lstore = (SCformat *) L.Store; Ustore = (NCformat *) U.Store; printf("cgsisx(): info %d\n", info); if (info > 0 || rcond < 1e-8 || rpg > 1e8) printf("WARNING: This preconditioner might be unstable.\n"); if ( info == 0 || info == n+1 ) { if ( options.PivotGrowth == YES ) printf("Recip. pivot growth = %e\n", rpg); if ( options.ConditionNumber == YES ) printf("Recip. condition number = %e\n", rcond); } else if ( info > 0 && lwork == -1 ) { printf("** Estimated memory: %d bytes\n", info - n); } printf("n(A) = %d, nnz(A) = %d\n", n, Astore->nnz); printf("No of nonzeros in factor L = %d\n", Lstore->nnz); printf("No of nonzeros in factor U = %d\n", Ustore->nnz); printf("No of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz - n); printf("Fill ratio: nnz(F)/nnz(A) = %.3f\n", ((double)(Lstore->nnz) + (double)(Ustore->nnz) - (double)n) / (double)Astore->nnz); printf("L\\U MB %.3f\ttotal MB needed %.3f\n", mem_usage.for_lu/1e6, mem_usage.total_needed/1e6); fflush(stdout); /* Set the global variables. */ GLOBAL_A = &A; GLOBAL_L = &L; GLOBAL_U = &U; GLOBAL_STAT = &stat; GLOBAL_PERM_C = perm_c; GLOBAL_PERM_R = perm_r; /* Set the variables used by GMRES. */ restrt = SUPERLU_MIN(n / 3 + 1, 50); maxit = 1000; iter = maxit; resid = 1e-8; if (!(b = complexMalloc(m))) ABORT("Malloc fails for b[]."); if (!(x = complexMalloc(n))) ABORT("Malloc fails for x[]."); sp_cgemv("N", one, &A, xact, 1, zero, b, 1); if (info <= n + 1) { int i_1 = 1; double maxferr = 0.0, nrmA, nrmB, res, t; complex temp; extern float scnrm2_(int *, complex [], int *); extern void caxpy_(int *, complex *, complex [], int *, complex [], int *); /* Call GMRES. */ /*double *sol = (double*) ((DNformat*) X.Store)->nzval; for (i = 0; i < n; i++) x[i] = sol[i];*/ for (i = 0; i < n; i++) x[i] = zero; t = SuperLU_timer_(); cfgmr(n, cmatvec_mult, cpsolve, b, x, resid, restrt, &iter, stdout); t = SuperLU_timer_() - t; /* Output the result. */ nrmA = scnrm2_(&(Astore->nnz), (complex *)((DNformat *)A.Store)->nzval, &i_1); nrmB = scnrm2_(&m, b, &i_1); sp_cgemv("N", none, &A, x, 1, one, b, 1); res = scnrm2_(&m, b, &i_1); resid = res / nrmB; printf("||A||_F = %.1e, ||B||_2 = %.1e, ||B-A*X||_2 = %.1e, " "relres = %.1e\n", nrmA, nrmB, res, resid); if (iter >= maxit) { if (resid >= 1.0) iter = -180; else if (resid > 1e-8) iter = -111; } printf("iteration: %d\nresidual: %.1e\nGMRES time: %.2f seconds.\n", iter, resid, t); for (i = 0; i < m; i++) c_sub(&temp, &x[i], &xact[i]); maxferr = SUPERLU_MAX(maxferr, c_abs1(&temp)); printf("||X-X_true||_oo = %.1e\n", maxferr); } #ifdef DEBUG printf("%d entries in L and %d entries in U dropped.\n", num_drop_L, num_drop_U); #endif fflush(stdout); if ( options.PrintStat ) StatPrint(&stat); StatFree(&stat); SUPERLU_FREE (rhsb); SUPERLU_FREE (rhsx); SUPERLU_FREE (xact); SUPERLU_FREE (etree); SUPERLU_FREE (perm_r); SUPERLU_FREE (perm_c); SUPERLU_FREE (R); SUPERLU_FREE (C); Destroy_CompCol_Matrix(&A); Destroy_SuperMatrix_Store(&B); Destroy_SuperMatrix_Store(&X); if ( lwork >= 0 ) { Destroy_SuperNode_Matrix(&L); Destroy_CompCol_Matrix(&U); } SUPERLU_FREE(b); SUPERLU_FREE(x); #if ( DEBUGlevel>=1 ) CHECK_MALLOC("Exit main()"); #endif return 0; }
void polydiv (header *hd) { header *st=hd,*hd1,*result,*rest; int c1,c2,i,r,j; double *m1,*m2,*mr,*mh,x,l; complex *mc1,*mc2,*mcr,*mch,xc,lc,hc; interval *mi1,*mi2,*mir,*mih,xi,li,hi; hd1=next_param(st); equal_params_2(&hd,&hd1); if (error) return; getmatrix(hd,&r,&c1,&m1); if (r!=1) wrong_arg(); getmatrix(hd1,&r,&c2,&m2); if (r!=1) wrong_arg(); if (c1<c2) { result=new_real(0.0,""); rest=(header *)newram; moveresult(rest,hd1); } else if (iscomplex(hd)) { mc1=(complex *)m1; mc2=(complex *)m2; result=new_cmatrix(1,c1-c2+1,""); if (error) return; mcr=(complex *)matrixof(result); rest=new_cmatrix(1,c2,""); if (error) return; mch=(complex *)newram; if (!freeram(c1*sizeof(complex))) { output("Out of memory!\n"); error=190; return; } memmove((char *)mch,(char *)mc1,c1*sizeof(complex)); c_copy(lc,mc2[c2-1]); if (lc[0]==0.0 && lc[1]==0.0) wrong_arg(); for (i=c1-c2; i>=0; i--) { c_div(mch[c2+i-1],lc,xc); c_copy(mcr[i],xc); for(j=0; j<c2; j++) { c_mult(mc2[j],xc,hc); c_sub(mch[i+j],hc,mch[i+j]); } } memmove((char *)matrixof(rest),(char *)mch,c2*sizeof(complex)); } else if (isinterval(hd)) { mi1=(interval *)m1; mi2=(interval *)m2; result=new_imatrix(1,c1-c2+1,""); if (error) return; mir=(interval *)matrixof(result); rest=new_imatrix(1,c2,""); if (error) return; mih=(complex *)newram; if (!freeram(c1*sizeof(complex))) { output("Out of memory!\n"); error=190; return; } memmove((char *)mih,(char *)mi1,c1*sizeof(interval)); i_copy(li,mi2[c2-1]); if (li[0]<=0.0 && li[1]>=0.0) wrong_arg(); for (i=c1-c2; i>=0; i--) { i_div(mih[c2+i-1],li,xi); c_copy(mir[i],xi); for(j=0; j<c2; j++) { i_mult(mi2[j],xi,hi); i_sub(mih[i+j],hi,mih[i+j]); } } memmove((char *)matrixof(rest),(char *)mih,c2*sizeof(interval)); } else if (isreal(hd)) { result=new_matrix(1,c1-c2+1,""); if (error) return; mr=matrixof(result); rest=new_matrix(1,c2,""); if (error) return; mh=(double *)newram; if (!freeram(c1*sizeof(double))) { output("Out of memory!\n"); error=190; return; } memmove((char *)mh,(char *)m1,c1*sizeof(double)); l=m2[c2-1]; if (l==0.0) wrong_arg(); for (i=c1-c2; i>=0; i--) { x=mh[c2+i-1]/l; mr[i]=x; for(j=0; j<c2; j++) mh[i+j]-=m2[j]*x; } memmove((char *)matrixof(rest),(char *)mh,c2*sizeof(double)); } else wrong_arg(); moveresult(st,result); moveresult(nextof(st),rest); }
void recalculate (int plane, number_t * x1, number_t * y1) { number_t x = *x1, y = *y1; switch (plane) { case 1: { /* 1/mu */ number_t t; if (myabs (x) + myabs (y) < 0.000001) t = INT_MAX, y = INT_MAX; else { c_div (1, 0, x, y, t, y); } x = t; } break; case 2: { /* 1/(mu + 0.25) */ number_t t; if (myabs (x) + myabs (y) < 0.000001) t = INT_MAX, y = INT_MAX; else { c_div (1, 0, x, y, t, y); } x = t; x += 0.25; } break; case 3: /* lambda */ { number_t tr, ti, mr, mi; mr = x, mi = y; c_pow2 (x, y, tr, ti); c_div (tr, ti, 4, 0, x, y); c_div (mr, mi, 2, 0, tr, ti); c_sub (tr, ti, x, y, mr, mi); x = mr, y = mi; } break; case 4: /* 1/lambda */ { number_t tr, ti, mr, mi; c_div (1, 0, x, y, tr, y); x = tr; mr = x, mi = y; c_pow2 (x, y, tr, ti); c_div (tr, ti, 4, 0, x, y); c_div (mr, mi, 2, 0, tr, ti); c_sub (tr, ti, x, y, mr, mi); x = mr, y = mi; } break; case 5: /* 1/(lambda-1) */ { number_t tr, ti, mr, mi; c_div (1, 0, x, y, tr, y); x = tr + 1; mr = x, mi = y; c_pow2 (x, y, tr, ti); c_div (tr, ti, 4, 0, x, y); c_div (mr, mi, 2, 0, tr, ti); c_sub (tr, ti, x, y, mr, mi); x = mr, y = mi; } break; case 6: { /* 1/(mu + 0.25) */ number_t t; if (myabs (x) + myabs (y) < 0.000001) t = INT_MAX, y = INT_MAX; else { c_div (1, 0, x, y, t, y); } x = t; x -= 1.40115; } break; default: break; } *x1 = x; *y1 = y; }