예제 #1
0
int
f2c_ztrsv(char* uplo, char* trans, char* diag, integer* N,
          doublecomplex* A, integer* lda,
          doublecomplex* X, integer* incX)
{
    ztrsv_(uplo, trans, diag,
           N, A, lda, X, incX);
    return 0;
}
예제 #2
0
/* Subroutine */ int zggglm_(integer *n, integer *m, integer *p, 
	doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, 
	doublecomplex *d, doublecomplex *x, doublecomplex *y, doublecomplex *
	work, integer *lwork, integer *info)
{
/*  -- LAPACK driver routine (version 2.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       September 30, 1994   


    Purpose   
    =======   

    ZGGGLM solves a general Gauss-Markov linear model (GLM) problem:   

            minimize || y ||_2   subject to   d = A*x + B*y   
                x   

    where A is an N-by-M matrix, B is an N-by-P matrix, and d is a   
    given N-vector. It is assumed that M <= N <= M+P, and   

               rank(A) = M    and    rank( A B ) = N.   

    Under these assumptions, the constrained equation is always   
    consistent, and there is a unique solution x and a minimal 2-norm   
    solution y, which is obtained using a generalized QR factorization   
    of A and B.   

    In particular, if matrix B is square nonsingular, then the problem   
    GLM is equivalent to the following weighted linear least squares   
    problem   

                 minimize || inv(B)*(d-A*x) ||_2   
                     x   

    where inv(B) denotes the inverse of B.   

    Arguments   
    =========   

    N       (input) INTEGER   
            The number of rows of the matrices A and B.  N >= 0.   

    M       (input) INTEGER   
            The number of columns of the matrix A.  0 <= M <= N.   

    P       (input) INTEGER   
            The number of columns of the matrix B.  P >= N-M.   

    A       (input/output) COMPLEX*16 array, dimension (LDA,M)   
            On entry, the N-by-M matrix A.   
            On exit, A is destroyed.   

    LDA     (input) INTEGER   
            The leading dimension of the array A. LDA >= max(1,N).   

    B       (input/output) COMPLEX*16 array, dimension (LDB,P)   
            On entry, the N-by-P matrix B.   
            On exit, B is destroyed.   

    LDB     (input) INTEGER   
            The leading dimension of the array B. LDB >= max(1,N).   

    D       (input/output) COMPLEX*16 array, dimension (N)   
            On entry, D is the left hand side of the GLM equation.   
            On exit, D is destroyed.   

    X       (output) COMPLEX*16 array, dimension (M)   
    Y       (output) COMPLEX*16 array, dimension (P)   
            On exit, X and Y are the solutions of the GLM problem.   

    WORK    (workspace/output) COMPLEX*16 array, dimension (LWORK)   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK. LWORK >= max(1,N+M+P).   
            For optimum performance, LWORK >= M+min(N,P)+max(N,P)*NB,   
            where NB is an upper bound for the optimal blocksizes for   
            ZGEQRF, CGERQF, ZUNMQR and CUNMRQ.   

    INFO    (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    ===================================================================   


       Test the input parameters   

    
   Parameter adjustments   
       Function Body */
    /* Table of constant values */
    static doublecomplex c_b2 = {1.,0.};
    static integer c__1 = 1;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
    doublereal d__1;
    doublecomplex z__1;
    /* Local variables */
    static integer lopt, i;
    extern /* Subroutine */ int zgemv_(char *, integer *, integer *, 
	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
	    integer *, doublecomplex *, doublecomplex *, integer *), 
	    zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, 
	    integer *), ztrsv_(char *, char *, char *, integer *, 
	    doublecomplex *, integer *, doublecomplex *, integer *);
    static integer np;
    extern /* Subroutine */ int xerbla_(char *, integer *), zggqrf_(
	    integer *, integer *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
	    doublecomplex *, integer *, integer *), zunmqr_(char *, char *, 
	    integer *, integer *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
	    integer *, integer *), zunmrq_(char *, char *, 
	    integer *, integer *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
	    integer *, integer *);



#define D(I) d[(I)-1]
#define X(I) x[(I)-1]
#define Y(I) y[(I)-1]
#define WORK(I) work[(I)-1]

#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]

    *info = 0;
    np = min(*n,*p);
    if (*n < 0) {
	*info = -1;
    } else if (*m < 0 || *m > *n) {
	*info = -2;
    } else if (*p < 0 || *p < *n - *m) {
	*info = -3;
    } else if (*lda < max(1,*n)) {
	*info = -5;
    } else if (*ldb < max(1,*n)) {
	*info = -7;
    } else /* if(complicated condition) */ {
/* Computing MAX */
	i__1 = 1, i__2 = *n + *m + *p;
	if (*lwork < max(i__1,i__2)) {
	    *info = -12;
	}
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("ZGGGLM", &i__1);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Compute the GQR factorization of matrices A and B:   

              Q'*A = ( R11 ) M,    Q'*B*Z' = ( T11   T12 ) M   
                     (  0  ) N-M             (  0    T22 ) N-M   
                        M                     M+P-N  N-M   

       where R11 and T22 are upper triangular, and Q and Z are   
       unitary. */

    i__1 = *lwork - *m - np;
    zggqrf_(n, m, p, &A(1,1), lda, &WORK(1), &B(1,1), ldb, &WORK(*m 
	    + 1), &WORK(*m + np + 1), &i__1, info);
    i__1 = *m + np + 1;
    lopt = (integer) WORK(*m+np+1).r;

/*     Update left-hand-side vector d = Q'*d = ( d1 ) M   
                                               ( d2 ) N-M */

    i__1 = max(1,*n);
    i__2 = *lwork - *m - np;
    zunmqr_("Left", "Conjugate transpose", n, &c__1, m, &A(1,1), lda, &
	    WORK(1), &D(1), &i__1, &WORK(*m + np + 1), &i__2, info);
/* Computing MAX */
    i__3 = *m + np + 1;
    i__1 = lopt, i__2 = (integer) WORK(*m+np+1).r;
    lopt = max(i__1,i__2);

/*     Solve T22*y2 = d2 for y2 */

    i__1 = *n - *m;
    ztrsv_("Upper", "No transpose", "Non unit", &i__1, &B(*m+1,*m+*p-*n+1), ldb, &D(*m + 1), &c__1);
    i__1 = *n - *m;
    zcopy_(&i__1, &D(*m + 1), &c__1, &Y(*m + *p - *n + 1), &c__1);

/*     Set y1 = 0 */

    i__1 = *m + *p - *n;
    for (i = 1; i <= *m+*p-*n; ++i) {
	i__2 = i;
	Y(i).r = 0., Y(i).i = 0.;
/* L10: */
    }

/*     Update d1 = d1 - T12*y2 */

    i__1 = *n - *m;
    z__1.r = -1., z__1.i = 0.;
    zgemv_("No transpose", m, &i__1, &z__1, &B(1,*m+*p-*n+1), ldb, &Y(*m + *p - *n + 1), &c__1, &c_b2, &D(1), &c__1);

/*     Solve triangular system: R11*x = d1 */

    ztrsv_("Upper", "No Transpose", "Non unit", m, &A(1,1), lda, &D(1), &
	    c__1);

/*     Copy D to X */

    zcopy_(m, &D(1), &c__1, &X(1), &c__1);

/*     Backward transformation y = Z'*y   

   Computing MAX */
    i__1 = 1, i__2 = *n - *p + 1;
    i__3 = max(1,*p);
    i__4 = *lwork - *m - np;
    zunmrq_("Left", "Conjugate transpose", p, &c__1, &np, &B(max(1,*n-*p+1),1), ldb, &WORK(*m + 1), &Y(1), &i__3, &WORK(*m + np + 1), &
	    i__4, info);
/* Computing MAX */
    i__3 = *m + np + 1;
    i__1 = lopt, i__2 = (integer) WORK(*m+np+1).r;
    d__1 = (doublereal) max(i__1,i__2);
    WORK(1).r = d__1, WORK(1).i = 0.;

    return 0;

/*     End of ZGGGLM */

} /* zggglm_ */
예제 #3
0
int
pzgstrf_column_bmod(
		    const int  pnum,   /* process number */
		    const int  jcol,   /* current column in the panel */
		    const int  fpanelc,/* first column in the panel */
		    const int  nseg,   /* number of s-nodes to update jcol */
		    int        *segrep,/* in */
		    int        *repfnz,/* in */
		    doublecomplex     *dense, /* modified */
		    doublecomplex     *tempv, /* working array */
		    pxgstrf_shared_t *pxgstrf_shared, /* modified */
		    Gstat_t *Gstat     /* modified */
		    )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j].
 *
 * Return value:
 * =============
 *      0 - successful return
 *    > 0 - number of bytes allocated when run out of space
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    
#ifdef USE_VENDOR_BLAS    
    int         incx = 1, incy = 1;
    doublecomplex      alpha, beta;
#endif
    GlobalLU_t *Glu = pxgstrf_shared->Glu;   /* modified */
    
    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    doublecomplex	  ukj, ukj1, ukj2;
    register int lptr, kfnz, isub, irow, i, no_zeros;
    register int luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
    int          jsupno, k, ksub, krep, krep_ind, ksupno;
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
			     panel and the first column of the current snode.*/
    int          *xsup, *supno;
    int          *lsub, *xlsub, *xlsub_end;
    doublecomplex       *lusup;
    int          *xlusup, *xlusup_end;
    doublecomplex       *tempv1;
    int          mem_error;
    register float flopcnt;

    doublecomplex      zero = {0.0, 0.0};
    doublecomplex      one = {1.0, 0.0};
    doublecomplex      none = {-1.0, 0.0};
    doublecomplex      comp_temp, comp_temp1;

    xsup       = Glu->xsup;
    supno      = Glu->supno;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;
    jsupno     = supno[jcol];

    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

	krep = segrep[k];
	k--;
	ksupno = supno[krep];
#if ( DEBUGlvel>=2 )
if (jcol==BADCOL)
printf("(%d) pzgstrf_column_bmod[1]: %d, nseg %d, krep %d, jsupno %d, ksupno %d\n",
       pnum, jcol, nseg, krep, jsupno, ksupno);
#endif    
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

	    fsupc = xsup[ksupno];
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

  	    /* Distance from the current supernode to the current panel; 
	       d_fsupc=0 if fsupc >= fpanelc. */
  	    d_fsupc = fst_col - fsupc; 

	    luptr = xlusup[fst_col] + d_fsupc;
	    lptr = xlsub[fsupc] + d_fsupc;
	    kfnz = repfnz[krep];
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
	    segsze = krep - kfnz + 1;
	    nsupc = krep - fst_col + 1;
	    nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	    nrow = nsupr - d_fsupc - nsupc;
	    krep_ind = lptr + nsupc - 1;

            flopcnt = 4 * segsze * (segsze - 1) + 8 * nrow * segsze;
	    Gstat->procstat[pnum].fcops += flopcnt;

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)	    
printf("(%d) pzgstrf_column_bmod[2]: %d, krep %d, kfnz %d, segsze %d, d_fsupc %d,\
fsupc %d, nsupr %d, nsupc %d\n",
       pnum, jcol, krep, kfnz, segsze, d_fsupc, fsupc, nsupr, nsupc);

#endif



            /*
             * Case 1: Update U-segment of size 1 -- col-col update
             */
            if ( segsze == 1 ) {
                ukj = dense[lsub[krep_ind]];
                luptr += nsupr*(nsupc-1) + nsupc;

                for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                    irow = lsub[i];
                    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                    z_sub(&dense[irow], &dense[irow], &comp_temp);
                    luptr++;
                }

            } else if ( segsze <= 3 ) {
                ukj = dense[lsub[krep_ind]];
                luptr += nsupr*(nsupc-1) + nsupc-1;
                ukj1 = dense[lsub[krep_ind - 1]];
                luptr1 = luptr - nsupr;

                if ( segsze == 2 ) { /* Case 2: 2cols-col update */
                    zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    z_sub(&ukj, &ukj, &comp_temp);
                    dense[lsub[krep_ind]] = ukj;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                        zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        z_add(&comp_temp, &comp_temp, &comp_temp1);
                        z_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                } else { /* Case 3: 3cols-col update */
                    ukj2 = dense[lsub[krep_ind - 2]];
                    luptr2 = luptr1 - nsupr;
                    zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                    z_sub(&ukj1, &ukj1, &comp_temp);

                    zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                    z_add(&comp_temp, &comp_temp, &comp_temp1);
                    z_sub(&ukj, &ukj, &comp_temp);

                    dense[lsub[krep_ind]] = ukj;
                    dense[lsub[krep_ind-1]] = ukj1;
                    for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
                        irow = lsub[i];
                        luptr++;
                        luptr1++;
                        luptr2++;
                        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                        zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                        z_add(&comp_temp, &comp_temp, &comp_temp1);
                        zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        z_add(&comp_temp, &comp_temp, &comp_temp1);
                        z_sub(&dense[irow], &dense[irow], &comp_temp);
                    }
                }


	    } else {
	  	/*
		 * Case: sup-col update
		 * Perform a triangular solve and block update,
		 * then scatter the result of sup-col update to dense
		 */
		no_zeros = kfnz - fst_col;

	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
	        isub = lptr + no_zeros;
	        for (i = 0; i < segsze; i++) {
	  	    irow = lsub[isub];
		    tempv[i] = dense[irow];
		    ++isub; 
	        }

	        /* Dense triangular solve -- start effective triangle */
		luptr += nsupr * no_zeros + no_zeros; 
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
		CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#else
		ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#endif
		
 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		alpha = one;
		beta = zero;
#if ( MACH==CRAY_PVP )
		CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		zlsolve ( nsupr, segsze, &lusup[luptr], tempv );

 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		zmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif
                /* Scatter tempv[] into SPA dense[*] */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i]; /* Scatter */
                    tempv[i] = zero;
                    isub++;
                }

		/* Scatter tempv1[] into SPA dense[*] */
		for (i = 0; i < nrow; i++) {
		    irow = lsub[isub];
                    z_sub(&dense[irow], &dense[irow], &tempv1[i]);
		    tempv1[i] = zero;
		    ++isub;
		}
	    } /* else segsze >= 4 */
	    
	} /* if jsupno ... */

    } /* for each segment... */

    
    /* ------------------------------------------
       Process the supernodal portion of L\U[*,j]
       ------------------------------------------ */
    
    fsupc = SUPER_FSUPC (jsupno);
    nsupr = xlsub_end[fsupc] - xlsub[fsupc];
    if ( (mem_error = Glu_alloc(pnum, jcol, nsupr, LUSUP, &nextlu, 
			       pxgstrf_shared)) )
	return mem_error;
    xlusup[jcol] = nextlu;
    lusup = Glu->lusup;
    
    /* Gather the nonzeros from SPA dense[*,j] into L\U[*,j] */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; ++isub) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
	dense[irow] = zero;
#ifdef DEBUG
if (jcol == -1)
    printf("(%d) pzgstrf_column_bmod[lusup] jcol %d, irow %d, lusup %.10e\n",
	   pnum, jcol, irow, lusup[nextlu]);
#endif	
	++nextlu;
    }
    xlusup_end[jcol] = nextlu; /* close L\U[*,jcol] */

#if ( DEBUGlevel>=2 )
if (jcol == -1) {
    nrow = xlusup_end[jcol] - xlusup[jcol];
    print_double_vec("before sup-col update", nrow, &lsub[xlsub[fsupc]],
		     &lusup[xlusup[jcol]]);
}
#endif    
    
    /*
     * For more updates within the panel (also within the current supernode), 
     * should start from the first column of the panel, or the first column 
     * of the supernode, whichever is bigger. There are 2 cases:
     *    (1) fsupc < fpanelc,  then fst_col := fpanelc
     *    (2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

  	/* distance between the current supernode and the current panel;
	   d_fsupc=0 if fsupc >= fpanelc. */
  	d_fsupc = fst_col - fsupc;

	lptr = xlsub[fsupc] + d_fsupc;
	luptr = xlusup[fst_col] + d_fsupc;
	nsupr = xlsub_end[fsupc] - xlsub[fsupc]; /* Leading dimension */
	nsupc = jcol - fst_col;	/* Excluding jcol */
	nrow = nsupr - d_fsupc - nsupc;

	/* points to the beginning of jcol in supernode L\U[*,jsupno] */
	ufirst = xlusup[jcol] + d_fsupc;	

#if ( DEBUGlevel>=2 )
if (jcol==BADCOL)
printf("(%d) pzgstrf_column_bmod[3] jcol %d, fsupc %d, nsupr %d, nsupc %d, nrow %d\n",
       pnum, jcol, fsupc, nsupr, nsupc, nrow);
#endif    

        flopcnt = 4 * nsupc * (nsupc - 1) + 8 * nrow * nsupc;
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;    */
	
#ifdef USE_VENDOR_BLAS
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
#if ( MACH==CRAY_PVP )
	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
	zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

	zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
		 &lusup[ufirst], tempv );
	
        /* Copy updates from tempv[*] into lusup[*] */
	isub = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
            z_sub(&lusup[isub], &lusup[isub], &tempv[i]);
            tempv[i] = zero;
	    ++isub;
	}
#endif
    } /* if fst_col < jcol ... */ 

    return 0;
}
예제 #4
0
/* Subroutine */ int zlatrs_(char *uplo, char *trans, char *diag, char *
	normin, integer *n, doublecomplex *a, integer *lda, doublecomplex *x, 
	doublereal *scale, doublereal *cnorm, integer *info)
{
    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
    doublereal d__1, d__2, d__3, d__4;
    doublecomplex z__1, z__2, z__3, z__4;

    /* Local variables */
    integer i__, j;
    doublereal xj, rec, tjj;
    integer jinc;
    doublereal xbnd;
    integer imax;
    doublereal tmax;
    doublecomplex tjjs;
    doublereal xmax, grow;
    doublereal tscal;
    doublecomplex uscal;
    integer jlast;
    doublecomplex csumj;
    logical upper;
    doublereal bignum;
    logical notran;
    integer jfirst;
    doublereal smlnum;
    logical nounit;

/*  -- LAPACK auxiliary routine (version 3.2) -- */
/*     November 2006 */

/*  Purpose */
/*  ======= */

/*  ZLATRS solves one of the triangular systems */

/*     A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b, */

/*  with scaling to prevent overflow.  Here A is an upper or lower */
/*  triangular matrix, A**T denotes the transpose of A, A**H denotes the */
/*  conjugate transpose of A, x and b are n-element vectors, and s is a */
/*  scaling factor, usually less than or equal to 1, chosen so that the */
/*  components of x will be less than the overflow threshold.  If the */
/*  unscaled problem will not cause overflow, the Level 2 BLAS routine */
/*  ZTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j), */
/*  then s is set to 0 and a non-trivial solution to A*x = 0 is returned. */

/*  Arguments */
/*  ========= */

/*  UPLO    (input) CHARACTER*1 */
/*          Specifies whether the matrix A is upper or lower triangular. */
/*          = 'U':  Upper triangular */
/*          = 'L':  Lower triangular */

/*  TRANS   (input) CHARACTER*1 */
/*          Specifies the operation applied to A. */
/*          = 'N':  Solve A * x = s*b     (No transpose) */
/*          = 'T':  Solve A**T * x = s*b  (Transpose) */
/*          = 'C':  Solve A**H * x = s*b  (Conjugate transpose) */

/*  DIAG    (input) CHARACTER*1 */
/*          Specifies whether or not the matrix A is unit triangular. */
/*          = 'N':  Non-unit triangular */
/*          = 'U':  Unit triangular */

/*  NORMIN  (input) CHARACTER*1 */
/*          Specifies whether CNORM has been set or not. */
/*          = 'Y':  CNORM contains the column norms on entry */
/*          = 'N':  CNORM is not set on entry.  On exit, the norms will */
/*                  be computed and stored in CNORM. */

/*  N       (input) INTEGER */
/*          The order of the matrix A.  N >= 0. */

/*  A       (input) COMPLEX*16 array, dimension (LDA,N) */
/*          The triangular matrix A.  If UPLO = 'U', the leading n by n */
/*          upper triangular part of the array A contains the upper */
/*          triangular matrix, and the strictly lower triangular part of */
/*          A is not referenced.  If UPLO = 'L', the leading n by n lower */
/*          triangular part of the array A contains the lower triangular */
/*          matrix, and the strictly upper triangular part of A is not */
/*          referenced.  If DIAG = 'U', the diagonal elements of A are */
/*          also not referenced and are assumed to be 1. */

/*  LDA     (input) INTEGER */
/*          The leading dimension of the array A.  LDA >= max (1,N). */

/*  X       (input/output) COMPLEX*16 array, dimension (N) */
/*          On entry, the right hand side b of the triangular system. */
/*          On exit, X is overwritten by the solution vector x. */

/*  SCALE   (output) DOUBLE PRECISION */
/*          The scaling factor s for the triangular system */
/*             A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b. */
/*          If SCALE = 0, the matrix A is singular or badly scaled, and */
/*          the vector x is an exact or approximate solution to A*x = 0. */

/*  CNORM   (input or output) DOUBLE PRECISION array, dimension (N) */

/*          If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */
/*          contains the norm of the off-diagonal part of the j-th column */
/*          of A.  If TRANS = 'N', CNORM(j) must be greater than or equal */
/*          to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */
/*          must be greater than or equal to the 1-norm. */

/*          If NORMIN = 'N', CNORM is an output argument and CNORM(j) */
/*          returns the 1-norm of the offdiagonal part of the j-th column */
/*          of A. */

/*  INFO    (output) INTEGER */
/*          = 0:  successful exit */
/*          < 0:  if INFO = -k, the k-th argument had an illegal value */

/*  Further Details */
/*  ======= ======= */

/*  A rough bound on x is computed; if that is less than overflow, ZTRSV */
/*  is called, otherwise, specific code is used which checks for possible */
/*  overflow or divide-by-zero at every operation. */

/*  A columnwise scheme is used for solving A*x = b.  The basic algorithm */
/*  if A is lower triangular is */

/*       x[1:n] := b[1:n] */
/*            x(j) := x(j) / A(j,j) */
/*            x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] */
/*       end */

/*  Define bounds on the components of x after j iterations of the loop: */
/*     M(j) = bound on x[1:j] */
/*     G(j) = bound on x[j+1:n] */

/*  Then for iteration j+1 we have */
/*     M(j+1) <= G(j) / | A(j+1,j+1) | */
/*     G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | */
/*            <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) */

/*  where CNORM(j+1) is greater than or equal to the infinity-norm of */
/*  column j+1 of A, not counting the diagonal.  Hence */

/*     G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) */
/*                  1<=i<=j */
/*  and */

/*     |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) */
/*                                   1<=i< j */

/*  Since |x(j)| <= M(j), we use the Level 2 BLAS routine ZTRSV if the */
/*  max(underflow, 1/overflow). */

/*  The bound on x(j) is also used to determine when a step in the */
/*  columnwise method can be performed without fear of overflow.  If */
/*  the computed bound is greater than a large constant, x is scaled to */
/*  prevent overflow, but if the bound overflows, x is set to 0, x(j) to */
/*  1, and scale to 0, and a non-trivial solution to A*x = 0 is found. */

/*  Similarly, a row-wise scheme is used to solve A**T *x = b  or */
/*  A**H *x = b.  The basic algorithm for A upper triangular is */

/*            x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) */
/*       end */

/*  We simultaneously compute two bounds */
/*       G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j */
/*       M(j) = bound on x(i), 1<=i<=j */

/*  add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. */
/*  Then the bound on x(j) is */

/*       M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | */

/*            <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) */
/*                      1<=i<=j */

/*  and we can safely call ZTRSV if 1/M(n) and 1/G(n) are both greater */
/*  than max(underflow, 1/overflow). */

/*  ===================================================================== */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --x;
    --cnorm;

    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U");
    notran = lsame_(trans, "N");
    nounit = lsame_(diag, "N");

/*     Test the input parameters. */

    if (! upper && ! lsame_(uplo, "L")) {
	*info = -1;
    } else if (! notran && ! lsame_(trans, "T") && ! 
	    lsame_(trans, "C")) {
	*info = -2;
    } else if (! nounit && ! lsame_(diag, "U")) {
	*info = -3;
    } else if (! lsame_(normin, "Y") && ! lsame_(normin, 
	     "N")) {
	*info = -4;
    } else if (*n < 0) {
	*info = -5;
    } else if (*lda < max(1,*n)) {
	*info = -7;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("ZLATRS", &i__1);
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Determine machine dependent parameters to control overflow. */

    smlnum = dlamch_("Safe minimum");
    bignum = 1. / smlnum;
    dlabad_(&smlnum, &bignum);
    smlnum /= dlamch_("Precision");
    bignum = 1. / smlnum;
    *scale = 1.;

    if (lsame_(normin, "N")) {

/*        Compute the 1-norm of each column, not including the diagonal. */

	if (upper) {

/*           A is upper triangular. */

	    i__1 = *n;
	    for (j = 1; j <= i__1; ++j) {
		i__2 = j - 1;
		cnorm[j] = dzasum_(&i__2, &a[j * a_dim1 + 1], &c__1);
	    }
	} else {

/*           A is lower triangular. */

	    i__1 = *n - 1;
	    for (j = 1; j <= i__1; ++j) {
		i__2 = *n - j;
		cnorm[j] = dzasum_(&i__2, &a[j + 1 + j * a_dim1], &c__1);
	    }
	    cnorm[*n] = 0.;
	}
    }

/*     Scale the column norms by TSCAL if the maximum element in CNORM is */
/*     greater than BIGNUM/2. */

    imax = idamax_(n, &cnorm[1], &c__1);
    tmax = cnorm[imax];
    if (tmax <= bignum * .5) {
	tscal = 1.;
    } else {
	tscal = .5 / (smlnum * tmax);
	dscal_(n, &tscal, &cnorm[1], &c__1);
    }

/*     Compute a bound on the computed solution vector to see if the */
/*     Level 2 BLAS routine ZTRSV can be used. */

    xmax = 0.;
    i__1 = *n;
    for (j = 1; j <= i__1; ++j) {
/* Computing MAX */
	i__2 = j;
	d__3 = xmax, d__4 = (d__1 = x[i__2].r / 2., abs(d__1)) + (d__2 = 
		d_imag(&x[j]) / 2., abs(d__2));
	xmax = max(d__3,d__4);
    }
    xbnd = xmax;

    if (notran) {

/*        Compute the growth in A * x = b. */

	if (upper) {
	    jfirst = *n;
	    jlast = 1;
	    jinc = -1;
	} else {
	    jfirst = 1;
	    jlast = *n;
	    jinc = 1;
	}

	if (tscal != 1.) {
	    grow = 0.;
	    goto L60;
	}

	if (nounit) {

/*           A is non-unit triangular. */

/*           Compute GROW = 1/G(j) and XBND = 1/M(j). */

	    grow = .5 / max(xbnd,smlnum);
	    xbnd = grow;
	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L60;
		}

		i__3 = j + j * a_dim1;
		tjjs.r = a[i__3].r, tjjs.i = a[i__3].i;
		tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), abs(
			d__2));

		if (tjj >= smlnum) {

/*                 M(j) = G(j-1) / abs(A(j,j)) */

/* Computing MIN */
		    d__1 = xbnd, d__2 = min(1.,tjj) * grow;
		    xbnd = min(d__1,d__2);
		} else {

/*                 M(j) could overflow, set XBND to 0. */

		    xbnd = 0.;
		}

		if (tjj + cnorm[j] >= smlnum) {

/*                 G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */

		    grow *= tjj / (tjj + cnorm[j]);
		} else {

/*                 G(j) could overflow, set GROW to 0. */

		    grow = 0.;
		}
	    }
	    grow = xbnd;
	} else {

/*           A is unit triangular. */

/* Computing MIN */
	    d__1 = 1., d__2 = .5 / max(xbnd,smlnum);
	    grow = min(d__1,d__2);
	    i__2 = jlast;
	    i__1 = jinc;
	    for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L60;
		}

/*              G(j) = G(j-1)*( 1 + CNORM(j) ) */

		grow *= 1. / (cnorm[j] + 1.);
	    }
	}
L60:

	;
    } else {

/*        Compute the growth in A**T * x = b  or  A**H * x = b. */

	if (upper) {
	    jfirst = 1;
	    jlast = *n;
	    jinc = 1;
	} else {
	    jfirst = *n;
	    jlast = 1;
	    jinc = -1;
	}

	if (tscal != 1.) {
	    grow = 0.;
	    goto L90;
	}

	if (nounit) {

/*           A is non-unit triangular. */

/*           Compute GROW = 1/G(j) and XBND = 1/M(j). */

	    grow = .5 / max(xbnd,smlnum);
	    xbnd = grow;
	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L90;
		}

/*              G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */

		xj = cnorm[j] + 1.;
/* Computing MIN */
		d__1 = grow, d__2 = xbnd / xj;
		grow = min(d__1,d__2);

		i__3 = j + j * a_dim1;
		tjjs.r = a[i__3].r, tjjs.i = a[i__3].i;
		tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), abs(
			d__2));

		if (tjj >= smlnum) {

/*                 M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */

		    if (xj > tjj) {
			xbnd *= tjj / xj;
		    }
		} else {

/*                 M(j) could overflow, set XBND to 0. */

		    xbnd = 0.;
		}
	    }
	    grow = min(grow,xbnd);
	} else {

/*           A is unit triangular. */

/* Computing MIN */
	    d__1 = 1., d__2 = .5 / max(xbnd,smlnum);
	    grow = min(d__1,d__2);
	    i__2 = jlast;
	    i__1 = jinc;
	    for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {

/*              Exit the loop if the growth factor is too small. */

		if (grow <= smlnum) {
		    goto L90;
		}

/*              G(j) = ( 1 + CNORM(j) )*G(j-1) */

		xj = cnorm[j] + 1.;
		grow /= xj;
	    }
	}
L90:
	;
    }

    if (grow * tscal > smlnum) {

/*        Use the Level 2 BLAS solve if the reciprocal of the bound on */
/*        elements of X is not too small. */

	ztrsv_(uplo, trans, diag, n, &a[a_offset], lda, &x[1], &c__1);
    } else {

/*        Use a Level 1 BLAS solve, scaling intermediate results. */

	if (xmax > bignum * .5) {

/*           Scale X so that its components are less than or equal to */
/*           BIGNUM in absolute value. */

	    *scale = bignum * .5 / xmax;
	    zdscal_(n, scale, &x[1], &c__1);
	    xmax = bignum;
	} else {
	    xmax *= 2.;
	}

	if (notran) {

/*           Solve A * x = b */

	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Compute x(j) = b(j) / A(j,j), scaling x if necessary. */

		i__3 = j;
		xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j]), 
			abs(d__2));
		if (nounit) {
		    i__3 = j + j * a_dim1;
		    z__1.r = tscal * a[i__3].r, z__1.i = tscal * a[i__3].i;
		    tjjs.r = z__1.r, tjjs.i = z__1.i;
		} else {
		    tjjs.r = tscal, tjjs.i = 0.;
		    if (tscal == 1.) {
			goto L110;
		    }
		}
		tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), abs(
			d__2));
		if (tjj > smlnum) {

/*                    abs(A(j,j)) > SMLNUM: */

		    if (tjj < 1.) {
			if (xj > tjj * bignum) {

/*                          Scale x by 1/b(j). */

			    rec = 1. / xj;
			    zdscal_(n, &rec, &x[1], &c__1);
			    *scale *= rec;
			    xmax *= rec;
			}
		    }
		    i__3 = j;
		    zladiv_(&z__1, &x[j], &tjjs);
		    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    i__3 = j;
		    xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
			    , abs(d__2));
		} else if (tjj > 0.) {

/*                    0 < abs(A(j,j)) <= SMLNUM: */

		    if (xj > tjj * bignum) {

/*                       Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */
/*                       to avoid overflow when dividing by A(j,j). */

			rec = tjj * bignum / xj;
			if (cnorm[j] > 1.) {

/*                          Scale by 1/CNORM(j) to avoid overflow when */
/*                          multiplying x(j) times column j. */

			    rec /= cnorm[j];
			}
			zdscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
			xmax *= rec;
		    }
		    i__3 = j;
		    zladiv_(&z__1, &x[j], &tjjs);
		    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    i__3 = j;
		    xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
			    , abs(d__2));
		} else {

/*                    A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
/*                    scale = 0, and compute a solution to A*x = 0. */

		    i__3 = *n;
		    for (i__ = 1; i__ <= i__3; ++i__) {
			i__4 = i__;
			x[i__4].r = 0., x[i__4].i = 0.;
		    }
		    i__3 = j;
		    x[i__3].r = 1., x[i__3].i = 0.;
		    xj = 1.;
		    *scale = 0.;
		    xmax = 0.;
		}
L110:

/*              Scale x if necessary to avoid overflow when adding a */
/*              multiple of column j of A. */

		if (xj > 1.) {
		    rec = 1. / xj;
		    if (cnorm[j] > (bignum - xmax) * rec) {

/*                    Scale x by 1/(2*abs(x(j))). */

			rec *= .5;
			zdscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
		    }
		} else if (xj * cnorm[j] > bignum - xmax) {

/*                 Scale x by 1/2. */

		    zdscal_(n, &c_b36, &x[1], &c__1);
		    *scale *= .5;
		}

		if (upper) {
		    if (j > 1) {

/*                    Compute the update */
/*                       x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */

			i__3 = j - 1;
			i__4 = j;
			z__2.r = -x[i__4].r, z__2.i = -x[i__4].i;
			z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
			zaxpy_(&i__3, &z__1, &a[j * a_dim1 + 1], &c__1, &x[1], 
				 &c__1);
			i__3 = j - 1;
			i__ = izamax_(&i__3, &x[1], &c__1);
			i__3 = i__;
			xmax = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(
				&x[i__]), abs(d__2));
		    }
		} else {
		    if (j < *n) {

/*                    Compute the update */
/*                       x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */

			i__3 = *n - j;
			i__4 = j;
			z__2.r = -x[i__4].r, z__2.i = -x[i__4].i;
			z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
			zaxpy_(&i__3, &z__1, &a[j + 1 + j * a_dim1], &c__1, &
				x[j + 1], &c__1);
			i__3 = *n - j;
			i__ = j + izamax_(&i__3, &x[j + 1], &c__1);
			i__3 = i__;
			xmax = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(
				&x[i__]), abs(d__2));
		    }
		}
	    }

	} else if (lsame_(trans, "T")) {

/*           Solve A**T * x = b */

	    i__2 = jlast;
	    i__1 = jinc;
	    for (j = jfirst; i__1 < 0 ? j >= i__2 : j <= i__2; j += i__1) {

/*              Compute x(j) = b(j) - sum A(k,j)*x(k). */
/*                                    k<>j */

		i__3 = j;
		xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j]), 
			abs(d__2));
		uscal.r = tscal, uscal.i = 0.;
		rec = 1. / max(xmax,1.);
		if (cnorm[j] > (bignum - xj) * rec) {

/*                 If x(j) could overflow, scale x by 1/(2*XMAX). */

		    rec *= .5;
		    if (nounit) {
			i__3 = j + j * a_dim1;
			z__1.r = tscal * a[i__3].r, z__1.i = tscal * a[i__3]
				.i;
			tjjs.r = z__1.r, tjjs.i = z__1.i;
		    } else {
			tjjs.r = tscal, tjjs.i = 0.;
		    }
		    tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), 
			    abs(d__2));
		    if (tjj > 1.) {

/*                       Divide by A(j,j) when scaling x if A(j,j) > 1. */

/* Computing MIN */
			d__1 = 1., d__2 = rec * tjj;
			rec = min(d__1,d__2);
			zladiv_(&z__1, &uscal, &tjjs);
			uscal.r = z__1.r, uscal.i = z__1.i;
		    }
		    if (rec < 1.) {
			zdscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
			xmax *= rec;
		    }
		}

		csumj.r = 0., csumj.i = 0.;
		if (uscal.r == 1. && uscal.i == 0.) {

/*                 If the scaling needed for A in the dot product is 1, */
/*                 call ZDOTU to perform the dot product. */

		    if (upper) {
			i__3 = j - 1;
			zdotu_(&z__1, &i__3, &a[j * a_dim1 + 1], &c__1, &x[1], 
				 &c__1);
			csumj.r = z__1.r, csumj.i = z__1.i;
		    } else if (j < *n) {
			i__3 = *n - j;
			zdotu_(&z__1, &i__3, &a[j + 1 + j * a_dim1], &c__1, &
				x[j + 1], &c__1);
			csumj.r = z__1.r, csumj.i = z__1.i;
		    }
		} else {

/*                 Otherwise, use in-line code for the dot product. */

		    if (upper) {
			i__3 = j - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    i__4 = i__ + j * a_dim1;
			    z__3.r = a[i__4].r * uscal.r - a[i__4].i * 
				    uscal.i, z__3.i = a[i__4].r * uscal.i + a[
				    i__4].i * uscal.r;
			    i__5 = i__;
			    z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, 
				    z__2.i = z__3.r * x[i__5].i + z__3.i * x[
				    i__5].r;
			    z__1.r = csumj.r + z__2.r, z__1.i = csumj.i + 
				    z__2.i;
			    csumj.r = z__1.r, csumj.i = z__1.i;
			}
		    } else if (j < *n) {
			i__3 = *n;
			for (i__ = j + 1; i__ <= i__3; ++i__) {
			    i__4 = i__ + j * a_dim1;
			    z__3.r = a[i__4].r * uscal.r - a[i__4].i * 
				    uscal.i, z__3.i = a[i__4].r * uscal.i + a[
				    i__4].i * uscal.r;
			    i__5 = i__;
			    z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, 
				    z__2.i = z__3.r * x[i__5].i + z__3.i * x[
				    i__5].r;
			    z__1.r = csumj.r + z__2.r, z__1.i = csumj.i + 
				    z__2.i;
			    csumj.r = z__1.r, csumj.i = z__1.i;
			}
		    }
		}

		z__1.r = tscal, z__1.i = 0.;
		if (uscal.r == z__1.r && uscal.i == z__1.i) {

/*                 Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */
/*                 was not used to scale the dotproduct. */

		    i__3 = j;
		    i__4 = j;
		    z__1.r = x[i__4].r - csumj.r, z__1.i = x[i__4].i - 
			    csumj.i;
		    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    i__3 = j;
		    xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
			    , abs(d__2));
		    if (nounit) {
			i__3 = j + j * a_dim1;
			z__1.r = tscal * a[i__3].r, z__1.i = tscal * a[i__3]
				.i;
			tjjs.r = z__1.r, tjjs.i = z__1.i;
		    } else {
			tjjs.r = tscal, tjjs.i = 0.;
			if (tscal == 1.) {
			    goto L160;
			}
		    }

/*                    Compute x(j) = x(j) / A(j,j), scaling if necessary. */

		    tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), 
			    abs(d__2));
		    if (tjj > smlnum) {

/*                       abs(A(j,j)) > SMLNUM: */

			if (tjj < 1.) {
			    if (xj > tjj * bignum) {

/*                             Scale X by 1/abs(x(j)). */

				rec = 1. / xj;
				zdscal_(n, &rec, &x[1], &c__1);
				*scale *= rec;
				xmax *= rec;
			    }
			}
			i__3 = j;
			zladiv_(&z__1, &x[j], &tjjs);
			x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    } else if (tjj > 0.) {

/*                       0 < abs(A(j,j)) <= SMLNUM: */

			if (xj > tjj * bignum) {

/*                          Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */

			    rec = tjj * bignum / xj;
			    zdscal_(n, &rec, &x[1], &c__1);
			    *scale *= rec;
			    xmax *= rec;
			}
			i__3 = j;
			zladiv_(&z__1, &x[j], &tjjs);
			x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    } else {

/*                       A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
/*                       scale = 0 and compute a solution to A**T *x = 0. */

			i__3 = *n;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    i__4 = i__;
			    x[i__4].r = 0., x[i__4].i = 0.;
			}
			i__3 = j;
			x[i__3].r = 1., x[i__3].i = 0.;
			*scale = 0.;
			xmax = 0.;
		    }
L160:
		    ;
		} else {

/*                 Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */
/*                 product has already been divided by 1/A(j,j). */

		    i__3 = j;
		    zladiv_(&z__2, &x[j], &tjjs);
		    z__1.r = z__2.r - csumj.r, z__1.i = z__2.i - csumj.i;
		    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		}
/* Computing MAX */
		i__3 = j;
		d__3 = xmax, d__4 = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = 
			d_imag(&x[j]), abs(d__2));
		xmax = max(d__3,d__4);
	    }

	} else {

/*           Solve A**H * x = b */

	    i__1 = jlast;
	    i__2 = jinc;
	    for (j = jfirst; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {

/*              Compute x(j) = b(j) - sum A(k,j)*x(k). */
/*                                    k<>j */

		i__3 = j;
		xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j]), 
			abs(d__2));
		uscal.r = tscal, uscal.i = 0.;
		rec = 1. / max(xmax,1.);
		if (cnorm[j] > (bignum - xj) * rec) {

/*                 If x(j) could overflow, scale x by 1/(2*XMAX). */

		    rec *= .5;
		    if (nounit) {
			d_cnjg(&z__2, &a[j + j * a_dim1]);
			z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
			tjjs.r = z__1.r, tjjs.i = z__1.i;
		    } else {
			tjjs.r = tscal, tjjs.i = 0.;
		    }
		    tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), 
			    abs(d__2));
		    if (tjj > 1.) {

/*                       Divide by A(j,j) when scaling x if A(j,j) > 1. */

/* Computing MIN */
			d__1 = 1., d__2 = rec * tjj;
			rec = min(d__1,d__2);
			zladiv_(&z__1, &uscal, &tjjs);
			uscal.r = z__1.r, uscal.i = z__1.i;
		    }
		    if (rec < 1.) {
			zdscal_(n, &rec, &x[1], &c__1);
			*scale *= rec;
			xmax *= rec;
		    }
		}

		csumj.r = 0., csumj.i = 0.;
		if (uscal.r == 1. && uscal.i == 0.) {

/*                 If the scaling needed for A in the dot product is 1, */
/*                 call ZDOTC to perform the dot product. */

		    if (upper) {
			i__3 = j - 1;
			zdotc_(&z__1, &i__3, &a[j * a_dim1 + 1], &c__1, &x[1], 
				 &c__1);
			csumj.r = z__1.r, csumj.i = z__1.i;
		    } else if (j < *n) {
			i__3 = *n - j;
			zdotc_(&z__1, &i__3, &a[j + 1 + j * a_dim1], &c__1, &
				x[j + 1], &c__1);
			csumj.r = z__1.r, csumj.i = z__1.i;
		    }
		} else {

/*                 Otherwise, use in-line code for the dot product. */

		    if (upper) {
			i__3 = j - 1;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    d_cnjg(&z__4, &a[i__ + j * a_dim1]);
			    z__3.r = z__4.r * uscal.r - z__4.i * uscal.i, 
				    z__3.i = z__4.r * uscal.i + z__4.i * 
				    uscal.r;
			    i__4 = i__;
			    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, 
				    z__2.i = z__3.r * x[i__4].i + z__3.i * x[
				    i__4].r;
			    z__1.r = csumj.r + z__2.r, z__1.i = csumj.i + 
				    z__2.i;
			    csumj.r = z__1.r, csumj.i = z__1.i;
			}
		    } else if (j < *n) {
			i__3 = *n;
			for (i__ = j + 1; i__ <= i__3; ++i__) {
			    d_cnjg(&z__4, &a[i__ + j * a_dim1]);
			    z__3.r = z__4.r * uscal.r - z__4.i * uscal.i, 
				    z__3.i = z__4.r * uscal.i + z__4.i * 
				    uscal.r;
			    i__4 = i__;
			    z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, 
				    z__2.i = z__3.r * x[i__4].i + z__3.i * x[
				    i__4].r;
			    z__1.r = csumj.r + z__2.r, z__1.i = csumj.i + 
				    z__2.i;
			    csumj.r = z__1.r, csumj.i = z__1.i;
			}
		    }
		}

		z__1.r = tscal, z__1.i = 0.;
		if (uscal.r == z__1.r && uscal.i == z__1.i) {

/*                 Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */
/*                 was not used to scale the dotproduct. */

		    i__3 = j;
		    i__4 = j;
		    z__1.r = x[i__4].r - csumj.r, z__1.i = x[i__4].i - 
			    csumj.i;
		    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    i__3 = j;
		    xj = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = d_imag(&x[j])
			    , abs(d__2));
		    if (nounit) {
			d_cnjg(&z__2, &a[j + j * a_dim1]);
			z__1.r = tscal * z__2.r, z__1.i = tscal * z__2.i;
			tjjs.r = z__1.r, tjjs.i = z__1.i;
		    } else {
			tjjs.r = tscal, tjjs.i = 0.;
			if (tscal == 1.) {
			    goto L210;
			}
		    }

/*                    Compute x(j) = x(j) / A(j,j), scaling if necessary. */

		    tjj = (d__1 = tjjs.r, abs(d__1)) + (d__2 = d_imag(&tjjs), 
			    abs(d__2));
		    if (tjj > smlnum) {

/*                       abs(A(j,j)) > SMLNUM: */

			if (tjj < 1.) {
			    if (xj > tjj * bignum) {

/*                             Scale X by 1/abs(x(j)). */

				rec = 1. / xj;
				zdscal_(n, &rec, &x[1], &c__1);
				*scale *= rec;
				xmax *= rec;
			    }
			}
			i__3 = j;
			zladiv_(&z__1, &x[j], &tjjs);
			x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    } else if (tjj > 0.) {

/*                       0 < abs(A(j,j)) <= SMLNUM: */

			if (xj > tjj * bignum) {

/*                          Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */

			    rec = tjj * bignum / xj;
			    zdscal_(n, &rec, &x[1], &c__1);
			    *scale *= rec;
			    xmax *= rec;
			}
			i__3 = j;
			zladiv_(&z__1, &x[j], &tjjs);
			x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		    } else {

/*                       A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
/*                       scale = 0 and compute a solution to A**H *x = 0. */

			i__3 = *n;
			for (i__ = 1; i__ <= i__3; ++i__) {
			    i__4 = i__;
			    x[i__4].r = 0., x[i__4].i = 0.;
			}
			i__3 = j;
			x[i__3].r = 1., x[i__3].i = 0.;
			*scale = 0.;
			xmax = 0.;
		    }
L210:
		    ;
		} else {

/*                 Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */
/*                 product has already been divided by 1/A(j,j). */

		    i__3 = j;
		    zladiv_(&z__2, &x[j], &tjjs);
		    z__1.r = z__2.r - csumj.r, z__1.i = z__2.i - csumj.i;
		    x[i__3].r = z__1.r, x[i__3].i = z__1.i;
		}
/* Computing MAX */
		i__3 = j;
		d__3 = xmax, d__4 = (d__1 = x[i__3].r, abs(d__1)) + (d__2 = 
			d_imag(&x[j]), abs(d__2));
		xmax = max(d__3,d__4);
	    }
	}
	*scale /= tscal;
    }

/*     Scale the column norms by 1/TSCAL for return. */

    if (tscal != 1.) {
	d__1 = 1. / tscal;
	dscal_(n, &d__1, &cnorm[1], &c__1);
    }

    return 0;

/*     End of ZLATRS */

} /* zlatrs_ */
int
sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
	 SuperMatrix *U, doublecomplex *x, int *info)
{
/*
 *   Purpose
 *   =======
 *
 *   sp_ztrsv() solves one of the systems of equations   
 *       A*x = b,   or   A'*x = b,
 *   where b and x are n element vectors and A is a sparse unit , or   
 *   non-unit, upper or lower triangular matrix.   
 *   No test for singularity or near-singularity is included in this   
 *   routine. Such tests must be performed before calling this routine.   
 *
 *   Parameters   
 *   ==========   
 *
 *   uplo   - (input) char*
 *            On entry, uplo specifies whether the matrix is an upper or   
 *             lower triangular matrix as follows:   
 *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
 *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
 *
 *   trans  - (input) char*
 *             On entry, trans specifies the equations to be solved as   
 *             follows:   
 *                trans = 'N' or 'n'   A*x = b.   
 *                trans = 'T' or 't'   A'*x = b.   
 *                trans = 'C' or 'c'   A'*x = b.   
 *
 *   diag   - (input) char*
 *             On entry, diag specifies whether or not A is unit   
 *             triangular as follows:   
 *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
 *                diag = 'N' or 'n'   A is not assumed to be unit   
 *                                    triangular.   
 *	     
 *   L       - (input) SuperMatrix*
 *	       The factor L from the factorization Pr*A*Pc=L*U. Use
 *             compressed row subscripts storage for supernodes,
 *             i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
 *
 *   U       - (input) SuperMatrix*
 *	        The factor U from the factorization Pr*A*Pc=L*U.
 *	        U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU.
 *    
 *   x       - (input/output) doublecomplex*
 *             Before entry, the incremented array X must contain the n   
 *             element right-hand side vector b. On exit, X is overwritten 
 *             with the solution vector x.
 *
 *   info    - (output) int*
 *             If *info = -i, the i-th argument had an illegal value.
 *
 */
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
	 ftcs2 = _cptofcd("N", strlen("N")),
	 ftcs3 = _cptofcd("U", strlen("U"));
#endif
    SCformat *Lstore;
    NCformat *Ustore;
    doublecomplex   *Lval, *Uval;
    int incx = 1, incy = 1;
    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
    doublecomplex comp_zero = {0.0, 0.0};
    int nrow;
    int fsupc, nsupr, nsupc, luptr, istart, irow;
    int i, k, iptr, jcol;
    doublecomplex *work;
    flops_t solve_ops;
    extern SuperLUStat_t SuperLUStat;

    /* Test the input parameters */
    *info = 0;
    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") ) *info = -2;
    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
	i = -(*info);
	xerbla_("sp_ztrsv", &i);
	return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( !(work = doublecomplexCalloc(L->nrow)) )
	ABORT("Malloc fails for work in sp_ztrsv().");
    
    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
	
	if ( lsame_(uplo, "L") ) {
	    /* Form x := inv(L)*x */
    	    if ( L->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = 0; k <= Lstore->nsuper; k++) {
		fsupc = L_FST_SUPC(k);
		istart = L_SUB_START(fsupc);
		nsupr = L_SUB_START(fsupc+1) - istart;
		nsupc = L_FST_SUPC(k+1) - fsupc;
		luptr = L_NZ_START(fsupc);
		nrow = nsupr - nsupc;

	        solve_ops += 4 * nsupc * (nsupc - 1);
	        solve_ops += 8 * nrow * nsupc;

		if ( nsupc == 1 ) {
		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
			irow = L_SUB(iptr);
			++luptr;
			zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
			z_sub(&x[irow], &x[irow], &comp_zero);
		    }
		} else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
		       	&x[fsupc], &incx);
		
		    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
		    ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
		       	&x[fsupc], &incx);
		
		    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
		    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
		
		    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
			&x[fsupc], &work[0] );
#endif		
		
		    iptr = istart + nsupc;
		    for (i = 0; i < nrow; ++i, ++iptr) {
			irow = L_SUB(iptr);
			z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
			work[i] = comp_zero;

		    }
	 	}
	    } /* for k ... */
	    
	} else {
	    /* Form x := inv(U)*x */
	    
	    if ( U->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = Lstore->nsuper; k >= 0; k--) {
	    	fsupc = L_FST_SUPC(k);
	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	    	nsupc = L_FST_SUPC(k+1) - fsupc;
	    	luptr = L_NZ_START(fsupc);
		
    	        solve_ops += 4 * nsupc * (nsupc + 1);

		if ( nsupc == 1 ) {
		    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
			irow = U_SUB(i);
			zz_mult(&comp_zero, &x[fsupc], &Uval[i]);
			z_sub(&x[irow], &x[irow], &comp_zero);
		    }
		} else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
		       &x[fsupc], &incx);
#else
		    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
		       &x[fsupc], &incx);
#endif
#else		
		    zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif		

		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
		        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
				i++) {
			    irow = U_SUB(i);
			zz_mult(&comp_zero, &x[jcol], &Uval[i]);
			z_sub(&x[irow], &x[irow], &comp_zero);
		    	}
                    }
		}
	    } /* for k ... */
	    
	}
    } else { /* Form x := inv(A')*x */
	
	if ( lsame_(uplo, "L") ) {
	    /* Form x := inv(L')*x */
    	    if ( L->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = Lstore->nsuper; k >= 0; --k) {
	    	fsupc = L_FST_SUPC(k);
	    	istart = L_SUB_START(fsupc);
	    	nsupr = L_SUB_START(fsupc+1) - istart;
	    	nsupc = L_FST_SUPC(k+1) - fsupc;
	    	luptr = L_NZ_START(fsupc);

		solve_ops += 8 * (nsupr - nsupc) * nsupc;

		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
		    iptr = istart + nsupc;
		    for (i = L_NZ_START(jcol) + nsupc; 
				i < L_NZ_START(jcol+1); i++) {
			irow = L_SUB(iptr);
			zz_mult(&comp_zero, &x[irow], &Lval[i]);
		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
			iptr++;
		    }
		}
		
		if ( nsupc > 1 ) {
		    solve_ops += 4 * nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
		    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
			&x[fsupc], &incx);
#else
		    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
			&x[fsupc], &incx);
#endif
		}
	    }
	} else {
	    /* Form x := inv(U')*x */
	    if ( U->nrow == 0 ) return 0; /* Quick return */
	    
	    for (k = 0; k <= Lstore->nsuper; k++) {
	    	fsupc = L_FST_SUPC(k);
	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
	    	nsupc = L_FST_SUPC(k+1) - fsupc;
	    	luptr = L_NZ_START(fsupc);

		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
		    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
			irow = U_SUB(i);
			zz_mult(&comp_zero, &x[irow], &Uval[i]);
		    	z_sub(&x[jcol], &x[jcol], &comp_zero);
		    }
		}

		solve_ops += 4 * nsupc * (nsupc + 1);

		if ( nsupc == 1 ) {
		    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
		} else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
		    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
			    &x[fsupc], &incx);
#else
		    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
			    &x[fsupc], &incx);
#endif
		}
	    } /* for k ... */
	}
    }

    SuperLUStat.ops[SOLVE] += solve_ops;
    SUPERLU_FREE(work);
    return 0;
}
예제 #6
0
void
ztrsv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx)
{
   ztrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
}
예제 #7
0
/* Return value:   0 - successful return
 *               > 0 - number of bytes allocated when run out of space
 */
int
zcolumn_bmod (
	     const int  jcol,	  /* in */
	     const int  nseg,	  /* in */
	     doublecomplex     *dense,	  /* in */
	     doublecomplex     *tempv,	  /* working array */
	     int        *segrep,  /* in */
	     int        *repfnz,  /* in */
	     int        fpanelc,  /* in -- first column in the current panel */
	     GlobalLU_t *Glu,     /* modified */
	     SuperLUStat_t *stat  /* output */
	     )
{
/*
 * Purpose:
 * ========
 *    Performs numeric block updates (sup-col) in topological order.
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
 *    Special processing on the supernodal portion of L\U[*,j]
 *
 */
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int         incx = 1, incy = 1;
    doublecomplex      alpha, beta;
    
    /* krep = representative of current k-th supernode
     * fsupc = first supernodal column
     * nsupc = no of columns in supernode
     * nsupr = no of rows in supernode (used as leading dimension)
     * luptr = location of supernodal LU-block in storage
     * kfnz = first nonz in the k-th supernodal segment
     * no_zeros = no of leading zeros in a supernodal U-segment
     */
    doublecomplex       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          fsupc, nsupc, nsupr, segsze;
    int          nrow;	  /* No of rows in the matrix of matrix-vector */
    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
    register int lptr, kfnz, isub, irow, i;
    register int no_zeros, new_next; 
    int          ufirst, nextlu;
    int          fst_col; /* First column within small LU update */
    int          d_fsupc; /* Distance between the first column of the current
			     panel and the first column of the current snode. */
    int          *xsup, *supno;
    int          *lsub, *xlsub;
    doublecomplex       *lusup;
    int          *xlusup;
    int          nzlumax;
    doublecomplex       *tempv1;
    doublecomplex      zero = {0.0, 0.0};
    doublecomplex      one = {1.0, 0.0};
    doublecomplex      none = {-1.0, 0.0};
    doublecomplex	 comp_temp, comp_temp1;
    int          mem_error;
    flops_t      *ops = stat->ops;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;
    nzlumax = Glu->nzlumax;
    jcolp1 = jcol + 1;
    jsupno = supno[jcol];
    
    /* 
     * For each nonz supernode segment of U[*,j] in topological order 
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) {

	krep = segrep[k];
	k--;
	ksupno = supno[krep];
	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */

	    fsupc = xsup[ksupno];
	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

  	    /* Distance from the current supernode to the current panel; 
	       d_fsupc=0 if fsupc > fpanelc. */
  	    d_fsupc = fst_col - fsupc; 

	    luptr = xlusup[fst_col] + d_fsupc;
	    lptr = xlsub[fsupc] + d_fsupc;

	    kfnz = repfnz[krep];
	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );

	    segsze = krep - kfnz + 1;
	    nsupc = krep - fst_col + 1;
	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
	    nrow = nsupr - d_fsupc - nsupc;
	    krep_ind = lptr + nsupc - 1;

	    ops[TRSV] += 4 * segsze * (segsze - 1);
	    ops[GEMV] += 8 * nrow * segsze;



	    /* 
	     * Case 1: Update U-segment of size 1 -- col-col update 
	     */
	    if ( segsze == 1 ) {
	  	ukj = dense[lsub[krep_ind]];
		luptr += nsupr*(nsupc-1) + nsupc;

		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
		    irow = lsub[i];
		    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
		    z_sub(&dense[irow], &dense[irow], &comp_temp);
		    luptr++;
		}

	    } else if ( segsze <= 3 ) {
		ukj = dense[lsub[krep_ind]];
		luptr += nsupr*(nsupc-1) + nsupc-1;
		ukj1 = dense[lsub[krep_ind - 1]];
		luptr1 = luptr - nsupr;

		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
		    zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
		    z_sub(&ukj, &ukj, &comp_temp);
		    dense[lsub[krep_ind]] = ukj;
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
		    	irow = lsub[i];
		    	luptr++;
		    	luptr1++;
			zz_mult(&comp_temp, &ukj, &lusup[luptr]);
			zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
			z_add(&comp_temp, &comp_temp, &comp_temp1);
			z_sub(&dense[irow], &dense[irow], &comp_temp);
		    }
		} else { /* Case 3: 3cols-col update */
		    ukj2 = dense[lsub[krep_ind - 2]];
		    luptr2 = luptr1 - nsupr;
  		    zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
		    z_sub(&ukj1, &ukj1, &comp_temp);

		    zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
		    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
		    z_add(&comp_temp, &comp_temp, &comp_temp1);
		    z_sub(&ukj, &ukj, &comp_temp);

		    dense[lsub[krep_ind]] = ukj;
		    dense[lsub[krep_ind-1]] = ukj1;
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
		    	irow = lsub[i];
		    	luptr++;
		    	luptr1++;
			luptr2++;
			zz_mult(&comp_temp, &ukj, &lusup[luptr]);
			zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
			z_add(&comp_temp, &comp_temp, &comp_temp1);
			zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
			z_add(&comp_temp, &comp_temp, &comp_temp1);
			z_sub(&dense[irow], &dense[irow], &comp_temp);
		    }
		}


	    } else {
	  	/*
		 * Case: sup-col update
		 * Perform a triangular solve and block update,
		 * then scatter the result of sup-col update to dense
		 */

		no_zeros = kfnz - fst_col;

	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
	        isub = lptr + no_zeros;
	        for (i = 0; i < segsze; i++) {
	  	    irow = lsub[isub];
		    tempv[i] = dense[irow];
		    ++isub; 
	        }

	        /* Dense triangular solve -- start effective triangle */
		luptr += nsupr * no_zeros + no_zeros; 
		
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
		CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#else		
		ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		       &nsupr, tempv, &incx );
#endif		
 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
                alpha = one;
                beta = zero;
#ifdef _CRAY
		CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
		zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
		zlsolve ( nsupr, segsze, &lusup[luptr], tempv );

 		luptr += segsze;  /* Dense matrix-vector */
		tempv1 = &tempv[segsze];
		zmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
#endif
		
		
                /* Scatter tempv[] into SPA dense[] as a temporary storage */
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense[irow] = tempv[i];
                    tempv[i] = zero;
                    ++isub;
                }

		/* Scatter tempv1[] into SPA dense[] */
		for (i = 0; i < nrow; i++) {
		    irow = lsub[isub];
		    z_sub(&dense[irow], &dense[irow], &tempv1[i]);
		    tempv1[i] = zero;
		    ++isub;
		}
	    }
	    
	} /* if jsupno ... */

    } /* for each segment... */

    /*
     *	Process the supernodal portion of L\U[*,j]
     */
    nextlu = xlusup[jcol];
    fsupc = xsup[jsupno];

    /* Copy the SPA dense into L\U[*,j] */
    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
    while ( new_next > nzlumax ) {
	if (mem_error = zLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
	    return (mem_error);
	lusup = Glu->lusup;
	lsub = Glu->lsub;
    }

    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
        dense[irow] = zero;
	++nextlu;
    }

    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */

    /* For more updates within the panel (also within the current supernode), 
     * should start from the first column of the panel, or the first column 
     * of the supernode, whichever is bigger. There are 2 cases:
     *    1) fsupc < fpanelc, then fst_col := fpanelc
     *    2) fsupc >= fpanelc, then fst_col := fsupc
     */
    fst_col = SUPERLU_MAX ( fsupc, fpanelc );

    if ( fst_col < jcol ) {

  	/* Distance between the current supernode and the current panel.
	   d_fsupc=0 if fsupc >= fpanelc. */
  	d_fsupc = fst_col - fsupc;

	lptr = xlsub[fsupc] + d_fsupc;
	luptr = xlusup[fst_col] + d_fsupc;
	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
	nsupc = jcol - fst_col;	/* Excluding jcol */
	nrow = nsupr - d_fsupc - nsupc;

	/* Points to the beginning of jcol in snode L\U(jsupno) */
	ufirst = xlusup[jcol] + d_fsupc;	

	ops[TRSV] += 4 * nsupc * (nsupc - 1);
	ops[GEMV] += 8 * nrow * nsupc;
	
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
#else
	ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
	       &nsupr, &lusup[ufirst], &incx );
#endif
	
	alpha = none; beta = one; /* y := beta*y + alpha*A*x */

#ifdef _CRAY
	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );

	zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
		&lusup[ufirst], tempv );
	
        /* Copy updates from tempv[*] into lusup[*] */
	isub = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
	    z_sub(&lusup[isub], &lusup[isub], &tempv[i]);
	    tempv[i] = zero;
	    ++isub;
	}

#endif
	
	
    } /* if fst_col < jcol ... */ 

    return 0;
}
예제 #8
0
static void pzgstrs2
/************************************************************************/
#ifdef _CRAY
(
 int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
 LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
 )
#else
(
 int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
 LocalLU_t *Llu, SuperLUStat_t *stat
 )
#endif
/* 
 * Purpose
 * =======
 *   Perform parallel triangular solves
 *           U(k,:) := A(k,:) \ L(k,k). 
 *   Only the process column that owns block column *k* participates
 *   in the work.
 * 
 * Arguments
 * =========
 *
 * m      (input) int (global)
 *        Number of rows in the matrix.
 *
 * k      (input) int (global)
 *        The row number of the block row to be factorized.
 *
 * Glu_persist (input) Glu_persist_t*
 *        Global data structures (xsup, supno) replicated on all processes.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh.
 *
 * Llu    (input/output) LocalLU_t*
 *        Local data structures to store distributed L and U matrices.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics about the factorization; 
 *        See SuperLUStat_t structure defined in util.h.
 *
 */
{
    int    iam, pkk;
    int    incx = 1;
    int    nsupr; /* number of rows in the block L(:,k) (LDA) */
    int    segsize;
    int_t  nsupc; /* number of columns in the block */
    int_t  luptr, iukp, rukp;
    int_t  b, gb, j, klst, knsupc, lk, nb;
    int_t  *xsup = Glu_persist->xsup;
    int_t  *usub;
    doublecomplex *lusup, *uval;

    /* Quick return. */
    lk = LBi( k, grid ); /* Local block number */
    if ( !Llu->Unzval_br_ptr[lk] ) return;

    /* Initialization. */
    iam  = grid->iam;
    pkk  = PNUM( PROW(k, grid), PCOL(k, grid), grid );
    klst = FstBlockC( k+1 );
    knsupc = SuperSize( k );
    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
    uval = Llu->Unzval_br_ptr[lk];
    nb = usub[0];
    iukp = BR_HEADER;
    rukp = 0;
    if ( iam == pkk ) {
	lk = LBj( k, grid );
	nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
	lusup = Llu->Lnzval_bc_ptr[lk];
    } else {
	nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
	lusup = Llu->Lval_buf_2[k%2];
    }

    /* Loop through all the row blocks. */
    for (b = 0; b < nb; ++b) {
	gb = usub[iukp];
	nsupc = SuperSize( gb );
	iukp += UB_DESCRIPTOR;

	/* Loop through all the segments in the block. */
	for (j = 0; j < nsupc; ++j) {
	    segsize = klst - usub[iukp++]; 
	    if ( segsize ) { /* Nonzero segment. */
		luptr = (knsupc - segsize) * (nsupr + 1);
#ifdef _CRAY
		CTRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, 
		      &uval[rukp], &incx);
#else
		ztrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, 
		       &uval[rukp], &incx);
#endif
		stat->ops[FACT] += 4 * segsize * (segsize + 1)
		    + 10 * segsize;  /* complex division */
		rukp += segsize;
	    }
	}
    } /* for b ... */

} /* PZGSTRS2 */
예제 #9
0
void
zpanel_bmod (
            const int  m,          /* in - number of rows in the matrix */
            const int  w,          /* in */
            const int  jcol,       /* in */
            const int  nseg,       /* in */
            doublecomplex     *dense,     /* out, of size n by w */
            doublecomplex     *tempv,     /* working array */
            int        *segrep,    /* in */
            int        *repfnz,    /* in, of size n by w */
            GlobalLU_t *Glu,       /* modified */
            SuperLUStat_t *stat    /* output */
            )
{


#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int          incx = 1, incy = 1;
    doublecomplex       alpha, beta;
#endif

    register int k, ksub;
    int          fsupc, nsupc, nsupr, nrow;
    int          krep, krep_ind;
    doublecomplex       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    int          block_nrow;  /* no of rows in a block row */
    register int lptr;        /* Points to the row subscripts of a supernode */
    int          kfnz, irow, no_zeros;
    register int isub, isub1, i;
    register int jj;          /* Index through each column in the panel */
    int          *xsup, *supno;
    int          *lsub, *xlsub;
    doublecomplex       *lusup;
    int          *xlusup;
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    doublecomplex       *dense_col;  /* dense[] for a column in the panel */
    doublecomplex       *tempv1;             /* Used in 1-D update */
    doublecomplex       *TriTmp, *MatvecTmp; /* used in 2-D update */
    doublecomplex      zero = {0.0, 0.0};
    doublecomplex      one = {1.0, 0.0};
    doublecomplex      comp_temp, comp_temp1;
    register int ldaTmp;
    register int r_ind, r_hi;
    static   int first = 1, maxsuper, rowblk, colblk;
    flops_t  *ops = stat->ops;

    xsup    = Glu->xsup;
    supno   = Glu->supno;
    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;

    if ( first ) {
        maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) );
        rowblk   = sp_ienv(4);
        colblk   = sp_ienv(5);
        first = 0;
    }
    ldaTmp = maxsuper + rowblk;

    /*
     * For each nonz supernode segment of U[*,j] in topological order
     */
    k = nseg - 1;
    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */

        /* krep = representative of current k-th supernode
         * fsupc = first supernodal column
         * nsupc = no of columns in a supernode
         * nsupr = no of rows in a supernode
         */
        krep = segrep[k--];
        fsupc = xsup[supno[krep]];
        nsupc = krep - fsupc + 1;
        nsupr = xlsub[fsupc+1] - xlsub[fsupc];
        nrow = nsupr - nsupc;
        lptr = xlsub[fsupc];
        krep_ind = lptr + nsupc - 1;

        repfnz_col = repfnz;
        dense_col = dense;

        if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */

            TriTmp = tempv;

            /* Sequence through each column in panel -- triangular solves */
            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {

                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                luptr = xlusup[fsupc];

                ops[TRSV] += 4 * segsze * (segsze - 1);
                ops[GEMV] += 8 * nrow * segsze;

                /* Case 1: Update U-segment of size 1 -- col-col update */
                if ( segsze == 1 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc;

                    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
                        irow = lsub[i];
                        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                        z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        ++luptr;
                    }

                } else if ( segsze <= 3 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    ukj1 = dense_col[lsub[krep_ind - 1]];
                    luptr += nsupr*(nsupc-1) + nsupc-1;
                    luptr1 = luptr - nsupr;

                    if ( segsze == 2 ) {
                        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        z_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            luptr++; luptr1++;
                            zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                            zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            z_add(&comp_temp, &comp_temp, &comp_temp1);
                            z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    } else {
                        ukj2 = dense_col[lsub[krep_ind - 2]];
                        luptr2 = luptr1 - nsupr;
                        zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                        z_sub(&ukj1, &ukj1, &comp_temp);

                        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        z_add(&comp_temp, &comp_temp, &comp_temp1);
                        z_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        dense_col[lsub[krep_ind-1]] = ukj1;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            luptr++; luptr1++; luptr2++;
                            zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                            zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            z_add(&comp_temp, &comp_temp, &comp_temp1);
                            zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                            z_add(&comp_temp, &comp_temp, &comp_temp1);
                            z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    }

                } else  {       /* segsze >= 4 */

                    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
                       holds the result of triangular solves.    */
                    no_zeros = kfnz - fsupc;
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; ++i) {
                        irow = lsub[isub];
                        TriTmp[i] = dense_col[irow]; /* Gather */
                        ++isub;
                    }

                    /* start effective triangle */
                    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                           &nsupr, TriTmp, &incx );
#else
                    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                           &nsupr, TriTmp, &incx );
#endif
#else
                    zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
#endif


                } /* else ... */

            }  /* for jj ... end tri-solves */

            /* Block row updates; push all the way into dense[*] block */
            for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {

                r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
                block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
                luptr = xlusup[fsupc] + nsupc + r_ind;
                isub1 = lptr + nsupc + r_ind;

                repfnz_col = repfnz;
                TriTmp = tempv;
                dense_col = dense;

                /* Sequence through each column in panel -- matrix-vector */
                for (jj = jcol; jj < jcol + w; jj++,
                     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {

                    kfnz = repfnz_col[krep];
                    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */

                    segsze = krep - kfnz + 1;
                    if ( segsze <= 3 ) continue;   /* skip unrolled cases */

                    /* Perform a block update, and scatter the result of
                       matrix-vector to dense[].                 */
                    no_zeros = kfnz - fsupc;
                    luptr1 = luptr + nsupr * no_zeros;
                    MatvecTmp = &TriTmp[maxsuper];

#ifdef USE_VENDOR_BLAS
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
                    CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#else
                    zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
                           &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
#endif
#else
                    zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
                           TriTmp, MatvecTmp);
#endif

                    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
                     * such that MatvecTmp[*] can be re-used for the
                     * the next blok row update. dense[] will be copied into
                     * global store after the whole panel has been finished.
                     */
                    isub = isub1;
                    for (i = 0; i < block_nrow; i++) {
                        irow = lsub[isub];
                        z_sub(&dense_col[irow], &dense_col[irow],
                              &MatvecTmp[i]);
                        MatvecTmp[i] = zero;
                        ++isub;
                    }

                } /* for jj ... */

            } /* for each block row ... */

            /* Scatter the triangular solves into SPA dense[*] */
            repfnz_col = repfnz;
            TriTmp = tempv;
            dense_col = dense;

            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue; /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                if ( segsze <= 3 ) continue; /* skip unrolled cases */

                no_zeros = kfnz - fsupc;
                isub = lptr + no_zeros;
                for (i = 0; i < segsze; i++) {
                    irow = lsub[isub];
                    dense_col[irow] = TriTmp[i];
                    TriTmp[i] = zero;
                    ++isub;
                }

            } /* for jj ... */

        } else { /* 1-D block modification */


            /* Sequence through each column in the panel */
            for (jj = jcol; jj < jcol + w; jj++,
                 repfnz_col += m, dense_col += m) {

                kfnz = repfnz_col[krep];
                if ( kfnz == EMPTY ) continue;  /* Skip any zero segment */

                segsze = krep - kfnz + 1;
                luptr = xlusup[fsupc];

                ops[TRSV] += 4 * segsze * (segsze - 1);
                ops[GEMV] += 8 * nrow * segsze;

                /* Case 1: Update U-segment of size 1 -- col-col update */
                if ( segsze == 1 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc;

                    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
                        irow = lsub[i];
                        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                        z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        ++luptr;
                    }

                } else if ( segsze <= 3 ) {
                    ukj = dense_col[lsub[krep_ind]];
                    luptr += nsupr*(nsupc-1) + nsupc-1;
                    ukj1 = dense_col[lsub[krep_ind - 1]];
                    luptr1 = luptr - nsupr;

                    if ( segsze == 2 ) {
                        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        z_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            ++luptr;  ++luptr1;
                            zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                            zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            z_add(&comp_temp, &comp_temp, &comp_temp1);
                            z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    } else {
                        ukj2 = dense_col[lsub[krep_ind - 2]];
                        luptr2 = luptr1 - nsupr;
                        zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                        z_sub(&ukj1, &ukj1, &comp_temp);

                        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                        zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                        z_add(&comp_temp, &comp_temp, &comp_temp1);
                        z_sub(&ukj, &ukj, &comp_temp);
                        dense_col[lsub[krep_ind]] = ukj;
                        dense_col[lsub[krep_ind-1]] = ukj1;
                        for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
                            irow = lsub[i];
                            ++luptr; ++luptr1; ++luptr2;
                            zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                            zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                            z_add(&comp_temp, &comp_temp, &comp_temp1);
                            zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                            z_add(&comp_temp, &comp_temp, &comp_temp1);
                            z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
                        }
                    }

                } else  { /* segsze >= 4 */
                    /*
                     * Perform a triangular solve and block update,
                     * then scatter the result of sup-col update to dense[].
                     */
                    no_zeros = kfnz - fsupc;

                    /* Copy U[*,j] segment from dense[*] to tempv[*]:
                     *    The result of triangular solve is in tempv[*];
                     *    The result of matrix vector update is in dense_col[*]
                     */
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; ++i) {
                        irow = lsub[isub];
                        tempv[i] = dense_col[irow]; /* Gather */
                        ++isub;
                    }

                    /* start effective triangle */
                    luptr += nsupr * no_zeros + no_zeros;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
                           &nsupr, tempv, &incx );
#else
                    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
                           &nsupr, tempv, &incx );
#endif

                    luptr += segsze;    /* Dense matrix-vector */
                    tempv1 = &tempv[segsze];
                    alpha = one;
                    beta = zero;
#ifdef _CRAY
                    CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
#else
                    zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
                           &nsupr, tempv, &incx, &beta, tempv1, &incy );
#endif
#else
                    zlsolve ( nsupr, segsze, &lusup[luptr], tempv );

                    luptr += segsze;        /* Dense matrix-vector */
                    tempv1 = &tempv[segsze];
                    zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
#endif

                    /* Scatter tempv[*] into SPA dense[*] temporarily, such
                     * that tempv[*] can be used for the triangular solve of
                     * the next column of the panel. They will be copied into
                     * ucol[*] after the whole panel has been finished.
                     */
                    isub = lptr + no_zeros;
                    for (i = 0; i < segsze; i++) {
                        irow = lsub[isub];
                        dense_col[irow] = tempv[i];
                        tempv[i] = zero;
                        isub++;
                    }

                    /* Scatter the update from tempv1[*] into SPA dense[*] */
                    /* Start dense rectangular L */
                    for (i = 0; i < nrow; i++) {
                        irow = lsub[isub];
                        z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
                        tempv1[i] = zero;
                        ++isub;
                    }

                } /* else segsze>=4 ... */

            } /* for each column in the panel... */

        } /* else 1-D update ... */

    } /* for each updating supernode ... */

}
예제 #10
0
/* Subroutine */ int zhegs2_(integer *itype, char *uplo, integer *n,
                             doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb,
                             integer *info)
{
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
    doublereal d__1, d__2;
    doublecomplex z__1;

    /* Local variables */
    integer k;
    doublecomplex ct;
    doublereal akk, bkk;
    logical upper;

    /*  -- LAPACK routine (version 3.2) -- */
    /*     November 2006 */

    /*  Purpose */
    /*  ======= */

    /*  ZHEGS2 reduces a complex Hermitian-definite generalized */
    /*  eigenproblem to standard form. */

    /*  If ITYPE = 1, the problem is A*x = lambda*B*x, */
    /*  and A is overwritten by inv(U')*A*inv(U) or inv(L)*A*inv(L') */

    /*  If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or */
    /*  B*A*x = lambda*x, and A is overwritten by U*A*U` or L'*A*L. */

    /*  B must have been previously factorized as U'*U or L*L' by ZPOTRF. */

    /*  Arguments */
    /*  ========= */

    /*  ITYPE   (input) INTEGER */
    /*          = 1: compute inv(U')*A*inv(U) or inv(L)*A*inv(L'); */
    /*          = 2 or 3: compute U*A*U' or L'*A*L. */

    /*  UPLO    (input) CHARACTER*1 */
    /*          Specifies whether the upper or lower triangular part of the */
    /*          Hermitian matrix A is stored, and how B has been factorized. */
    /*          = 'U':  Upper triangular */
    /*          = 'L':  Lower triangular */

    /*  N       (input) INTEGER */
    /*          The order of the matrices A and B.  N >= 0. */

    /*  A       (input/output) COMPLEX*16 array, dimension (LDA,N) */
    /*          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading */
    /*          n by n upper triangular part of A contains the upper */
    /*          triangular part of the matrix A, and the strictly lower */
    /*          triangular part of A is not referenced.  If UPLO = 'L', the */
    /*          leading n by n lower triangular part of A contains the lower */
    /*          triangular part of the matrix A, and the strictly upper */
    /*          triangular part of A is not referenced. */

    /*          On exit, if INFO = 0, the transformed matrix, stored in the */
    /*          same format as A. */

    /*  LDA     (input) INTEGER */
    /*          The leading dimension of the array A.  LDA >= max(1,N). */

    /*  B       (input) COMPLEX*16 array, dimension (LDB,N) */
    /*          The triangular factor from the Cholesky factorization of B, */
    /*          as returned by ZPOTRF. */

    /*  LDB     (input) INTEGER */
    /*          The leading dimension of the array B.  LDB >= max(1,N). */

    /*  INFO    (output) INTEGER */
    /*          = 0:  successful exit. */
    /*          < 0:  if INFO = -i, the i-th argument had an illegal value. */

    /*  ===================================================================== */

    /*     Test the input parameters. */

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;

    /* Function Body */
    *info = 0;
    upper = lsame_(uplo, "U");
    if (*itype < 1 || *itype > 3) {
        *info = -1;
    } else if (! upper && ! lsame_(uplo, "L")) {
        *info = -2;
    } else if (*n < 0) {
        *info = -3;
    } else if (*lda < max(1,*n)) {
        *info = -5;
    } else if (*ldb < max(1,*n)) {
        *info = -7;
    }
    if (*info != 0) {
        i__1 = -(*info);
        xerbla_("ZHEGS2", &i__1);
        return 0;
    }

    if (*itype == 1) {
        if (upper) {

            /*           Compute inv(U')*A*inv(U) */

            i__1 = *n;
            for (k = 1; k <= i__1; ++k) {

                /*              Update the upper triangle of A(k:n,k:n) */

                i__2 = k + k * a_dim1;
                akk = a[i__2].r;
                i__2 = k + k * b_dim1;
                bkk = b[i__2].r;
                /* Computing 2nd power */
                d__1 = bkk;
                akk /= d__1 * d__1;
                i__2 = k + k * a_dim1;
                a[i__2].r = akk, a[i__2].i = 0.;
                if (k < *n) {
                    i__2 = *n - k;
                    d__1 = 1. / bkk;
                    zdscal_(&i__2, &d__1, &a[k + (k + 1) * a_dim1], lda);
                    d__1 = akk * -.5;
                    ct.r = d__1, ct.i = 0.;
                    i__2 = *n - k;
                    zlacgv_(&i__2, &a[k + (k + 1) * a_dim1], lda);
                    i__2 = *n - k;
                    zlacgv_(&i__2, &b[k + (k + 1) * b_dim1], ldb);
                    i__2 = *n - k;
                    zaxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + (
                                k + 1) * a_dim1], lda);
                    i__2 = *n - k;
                    z__1.r = -1., z__1.i = -0.;
                    zher2_(uplo, &i__2, &z__1, &a[k + (k + 1) * a_dim1], lda,
                           &b[k + (k + 1) * b_dim1], ldb, &a[k + 1 + (k + 1)
                                   * a_dim1], lda);
                    i__2 = *n - k;
                    zaxpy_(&i__2, &ct, &b[k + (k + 1) * b_dim1], ldb, &a[k + (
                                k + 1) * a_dim1], lda);
                    i__2 = *n - k;
                    zlacgv_(&i__2, &b[k + (k + 1) * b_dim1], ldb);
                    i__2 = *n - k;
                    ztrsv_(uplo, "Conjugate transpose", "Non-unit", &i__2, &b[
                               k + 1 + (k + 1) * b_dim1], ldb, &a[k + (k + 1) *
                                       a_dim1], lda);
                    i__2 = *n - k;
                    zlacgv_(&i__2, &a[k + (k + 1) * a_dim1], lda);
                }
            }
        } else {

            /*           Compute inv(L)*A*inv(L') */

            i__1 = *n;
            for (k = 1; k <= i__1; ++k) {

                /*              Update the lower triangle of A(k:n,k:n) */

                i__2 = k + k * a_dim1;
                akk = a[i__2].r;
                i__2 = k + k * b_dim1;
                bkk = b[i__2].r;
                /* Computing 2nd power */
                d__1 = bkk;
                akk /= d__1 * d__1;
                i__2 = k + k * a_dim1;
                a[i__2].r = akk, a[i__2].i = 0.;
                if (k < *n) {
                    i__2 = *n - k;
                    d__1 = 1. / bkk;
                    zdscal_(&i__2, &d__1, &a[k + 1 + k * a_dim1], &c__1);
                    d__1 = akk * -.5;
                    ct.r = d__1, ct.i = 0.;
                    i__2 = *n - k;
                    zaxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k +
                            1 + k * a_dim1], &c__1);
                    i__2 = *n - k;
                    z__1.r = -1., z__1.i = -0.;
                    zher2_(uplo, &i__2, &z__1, &a[k + 1 + k * a_dim1], &c__1,
                           &b[k + 1 + k * b_dim1], &c__1, &a[k + 1 + (k + 1)
                                   * a_dim1], lda);
                    i__2 = *n - k;
                    zaxpy_(&i__2, &ct, &b[k + 1 + k * b_dim1], &c__1, &a[k +
                            1 + k * a_dim1], &c__1);
                    i__2 = *n - k;
                    ztrsv_(uplo, "No transpose", "Non-unit", &i__2, &b[k + 1
                            + (k + 1) * b_dim1], ldb, &a[k + 1 + k * a_dim1],
                           &c__1);
                }
            }
        }
    } else {
        if (upper) {

            /*           Compute U*A*U' */

            i__1 = *n;
            for (k = 1; k <= i__1; ++k) {

                /*              Update the upper triangle of A(1:k,1:k) */

                i__2 = k + k * a_dim1;
                akk = a[i__2].r;
                i__2 = k + k * b_dim1;
                bkk = b[i__2].r;
                i__2 = k - 1;
                ztrmv_(uplo, "No transpose", "Non-unit", &i__2, &b[b_offset],
                       ldb, &a[k * a_dim1 + 1], &c__1);
                d__1 = akk * .5;
                ct.r = d__1, ct.i = 0.;
                i__2 = k - 1;
                zaxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 +
                        1], &c__1);
                i__2 = k - 1;
                zher2_(uplo, &i__2, &c_b1, &a[k * a_dim1 + 1], &c__1, &b[k *
                        b_dim1 + 1], &c__1, &a[a_offset], lda);
                i__2 = k - 1;
                zaxpy_(&i__2, &ct, &b[k * b_dim1 + 1], &c__1, &a[k * a_dim1 +
                        1], &c__1);
                i__2 = k - 1;
                zdscal_(&i__2, &bkk, &a[k * a_dim1 + 1], &c__1);
                i__2 = k + k * a_dim1;
                /* Computing 2nd power */
                d__2 = bkk;
                d__1 = akk * (d__2 * d__2);
                a[i__2].r = d__1, a[i__2].i = 0.;
            }
        } else {

            /*           Compute L'*A*L */

            i__1 = *n;
            for (k = 1; k <= i__1; ++k) {

                /*              Update the lower triangle of A(1:k,1:k) */

                i__2 = k + k * a_dim1;
                akk = a[i__2].r;
                i__2 = k + k * b_dim1;
                bkk = b[i__2].r;
                i__2 = k - 1;
                zlacgv_(&i__2, &a[k + a_dim1], lda);
                i__2 = k - 1;
                ztrmv_(uplo, "Conjugate transpose", "Non-unit", &i__2, &b[
                           b_offset], ldb, &a[k + a_dim1], lda);
                d__1 = akk * .5;
                ct.r = d__1, ct.i = 0.;
                i__2 = k - 1;
                zlacgv_(&i__2, &b[k + b_dim1], ldb);
                i__2 = k - 1;
                zaxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda);
                i__2 = k - 1;
                zher2_(uplo, &i__2, &c_b1, &a[k + a_dim1], lda, &b[k + b_dim1]
                       , ldb, &a[a_offset], lda);
                i__2 = k - 1;
                zaxpy_(&i__2, &ct, &b[k + b_dim1], ldb, &a[k + a_dim1], lda);
                i__2 = k - 1;
                zlacgv_(&i__2, &b[k + b_dim1], ldb);
                i__2 = k - 1;
                zdscal_(&i__2, &bkk, &a[k + a_dim1], lda);
                i__2 = k - 1;
                zlacgv_(&i__2, &a[k + a_dim1], lda);
                i__2 = k + k * a_dim1;
                /* Computing 2nd power */
                d__2 = bkk;
                d__1 = akk * (d__2 * d__2);
                a[i__2].r = d__1, a[i__2].i = 0.;
            }
        }
    }
    return 0;

    /*     End of ZHEGS2 */

} /* zhegs2_ */
예제 #11
0
/*! \brief Solves one of the systems of equations A*x = b,   or   A'*x = b
 *
 * <pre>
 *   Purpose
 *   =======
 *
 *   sp_ztrsv() solves one of the systems of equations
 *       A*x = b,   or   A'*x = b,
 *   where b and x are n element vectors and A is a sparse unit , or
 *   non-unit, upper or lower triangular matrix.
 *   No test for singularity or near-singularity is included in this
 *   routine. Such tests must be performed before calling this routine.
 *
 *   Parameters
 *   ==========
 *
 *   uplo   - (input) char*
 *            On entry, uplo specifies whether the matrix is an upper or
 *             lower triangular matrix as follows:
 *                uplo = 'U' or 'u'   A is an upper triangular matrix.
 *                uplo = 'L' or 'l'   A is a lower triangular matrix.
 *
 *   trans  - (input) char*
 *             On entry, trans specifies the equations to be solved as
 *             follows:
 *                trans = 'N' or 'n'   A*x = b.
 *                trans = 'T' or 't'   A'*x = b.
 *                trans = 'C' or 'c'   A^H*x = b.
 *
 *   diag   - (input) char*
 *             On entry, diag specifies whether or not A is unit
 *             triangular as follows:
 *                diag = 'U' or 'u'   A is assumed to be unit triangular.
 *                diag = 'N' or 'n'   A is not assumed to be unit
 *                                    triangular.
 *
 *   L       - (input) SuperMatrix*
 *             The factor L from the factorization Pr*A*Pc=L*U. Use
 *             compressed row subscripts storage for supernodes,
 *             i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
 *
 *   U       - (input) SuperMatrix*
 *              The factor U from the factorization Pr*A*Pc=L*U.
 *              U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU.
 *
 *   x       - (input/output) doublecomplex*
 *             Before entry, the incremented array X must contain the n
 *             element right-hand side vector b. On exit, X is overwritten
 *             with the solution vector x.
 *
 *   info    - (output) int*
 *             If *info = -i, the i-th argument had an illegal value.
 * </pre>
 */
int
sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
         SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info)
{
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
    SCformat *Lstore;
    NCformat *Ustore;
    doublecomplex   *Lval, *Uval;
    int incx = 1, incy = 1;
    doublecomplex temp;
    doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
    doublecomplex comp_zero = {0.0, 0.0};
    int nrow;
    int fsupc, nsupr, nsupc, luptr, istart, irow;
    int i, k, iptr, jcol;
    doublecomplex *work;
    flops_t solve_ops;

    /* Test the input parameters */
    *info = 0;
    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") &&
              !lsame_(trans, "C")) *info = -2;
    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
    if ( *info ) {
        i = -(*info);
        xerbla_("sp_ztrsv", &i);
        return 0;
    }

    Lstore = L->Store;
    Lval = Lstore->nzval;
    Ustore = U->Store;
    Uval = Ustore->nzval;
    solve_ops = 0;

    if ( !(work = doublecomplexCalloc(L->nrow)) )
        ABORT("Malloc fails for work in sp_ztrsv().");

    if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */

        if ( lsame_(uplo, "L") ) {
            /* Form x := inv(L)*x */
            if ( L->nrow == 0 ) return 0; /* Quick return */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);
                nrow = nsupr - nsupc;

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc;
                solve_ops += 8 * nrow * nsupc;

                if ( nsupc == 1 ) {
                    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
                        irow = L_SUB(iptr);
                        ++luptr;
                        zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
                        z_sub(&x[irow], &x[irow], &comp_zero);
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);

                    CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#else
                    ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);

                    zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
                        &nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
#endif
#else
                    zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);

                    zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
                             &x[fsupc], &work[0] );
#endif

                    iptr = istart + nsupc;
                    for (i = 0; i < nrow; ++i, ++iptr) {
                        irow = L_SUB(iptr);
                        z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
                        work[i] = comp_zero;

                    }
                }
            } /* for k ... */

        } else {
            /* Form x := inv(U)*x */

            if ( U->nrow == 0 ) return 0; /* Quick return */

            for (k = Lstore->nsuper; k >= 0; k--) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;

                if ( nsupc == 1 ) {
                    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
                    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
                        irow = U_SUB(i);
                        zz_mult(&comp_zero, &x[fsupc], &Uval[i]);
                        z_sub(&x[irow], &x[irow], &comp_zero);
                    }
                } else {
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
                    CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
                       &x[fsupc], &incx);
#else
                    ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
#else
                    zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
#endif

                    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                        solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                        for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
                                i++) {
                            irow = U_SUB(i);
                        zz_mult(&comp_zero, &x[jcol], &Uval[i]);
                        z_sub(&x[irow], &x[irow], &comp_zero);
                        }
                    }
                }
            } /* for k ... */

        }
    } else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */

        if ( lsame_(uplo, "L") ) {
            /* Form x := inv(L')*x */
            if ( L->nrow == 0 ) return 0; /* Quick return */

            for (k = Lstore->nsuper; k >= 0; --k) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += 8 * (nsupr - nsupc) * nsupc;

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    iptr = istart + nsupc;
                    for (i = L_NZ_START(jcol) + nsupc;
                                i < L_NZ_START(jcol+1); i++) {
                        irow = L_SUB(iptr);
                        zz_mult(&comp_zero, &x[irow], &Lval[i]);
                        z_sub(&x[jcol], &x[jcol], &comp_zero);
                        iptr++;
                    }
                }

                if ( nsupc > 1 ) {
                    solve_ops += 4 * nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
                    CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);
#else
                    ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);
#endif
                }
            }
        } else {
            /* Form x := inv(U')*x */
            if ( U->nrow == 0 ) return 0; /* Quick return */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
                        irow = U_SUB(i);
                        zz_mult(&comp_zero, &x[irow], &Uval[i]);
                        z_sub(&x[jcol], &x[jcol], &comp_zero);
                    }
                }

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;

                if ( nsupc == 1 ) {
                    z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
                } else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd("T", strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
                    CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                            &x[fsupc], &incx);
#else
                    ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
                            &x[fsupc], &incx);
#endif
                }
            } /* for k ... */
        }
    } else { /* Form x := conj(inv(A'))*x */

        if ( lsame_(uplo, "L") ) {
            /* Form x := conj(inv(L'))*x */
            if ( L->nrow == 0 ) return 0; /* Quick return */

            for (k = Lstore->nsuper; k >= 0; --k) {
                fsupc = L_FST_SUPC(k);
                istart = L_SUB_START(fsupc);
                nsupr = L_SUB_START(fsupc+1) - istart;
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                solve_ops += 8 * (nsupr - nsupc) * nsupc;

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    iptr = istart + nsupc;
                    for (i = L_NZ_START(jcol) + nsupc;
                                i < L_NZ_START(jcol+1); i++) {
                        irow = L_SUB(iptr);
                        zz_conj(&temp, &Lval[i]);
                        zz_mult(&comp_zero, &x[irow], &temp);
                        z_sub(&x[jcol], &x[jcol], &comp_zero);
                        iptr++;
                    }
                }

                if ( nsupc > 1 ) {
                    solve_ops += 4 * nsupc * (nsupc - 1);
#ifdef _CRAY
                    ftcs1 = _cptofcd("L", strlen("L"));
                    ftcs2 = _cptofcd(trans, strlen("T"));
                    ftcs3 = _cptofcd("U", strlen("U"));
                    ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                        &x[fsupc], &incx);
#else
                    ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr,
                           &x[fsupc], &incx);
#endif
                }
            }
        } else {
            /* Form x := conj(inv(U'))*x */
            if ( U->nrow == 0 ) return 0; /* Quick return */

            for (k = 0; k <= Lstore->nsuper; k++) {
                fsupc = L_FST_SUPC(k);
                nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
                nsupc = L_FST_SUPC(k+1) - fsupc;
                luptr = L_NZ_START(fsupc);

                for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
                    solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
                    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
                        irow = U_SUB(i);
                        zz_conj(&temp, &Uval[i]);
                        zz_mult(&comp_zero, &x[irow], &temp);
                        z_sub(&x[jcol], &x[jcol], &comp_zero);
                    }
                }

                /* 1 z_div costs 10 flops */
                solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;

                if ( nsupc == 1 ) {
                    zz_conj(&temp, &Lval[luptr]);
                    z_div(&x[fsupc], &x[fsupc], &temp);
                } else {
#ifdef _CRAY
                    ftcs1 = _cptofcd("U", strlen("U"));
                    ftcs2 = _cptofcd(trans, strlen("T"));
                    ftcs3 = _cptofcd("N", strlen("N"));
                    ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
                            &x[fsupc], &incx);
#else
                    ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr,
                               &x[fsupc], &incx);
#endif
                }
            } /* for k ... */
        }
    }

    stat->ops[SOLVE] += solve_ops;
    SUPERLU_FREE(work);
    return 0;
}
예제 #12
0
파일: zsnode_bmod.c 프로젝트: 7924102/scipy
/*! \brief Performs numeric block updates within the relaxed snode. 
 */
int
zsnode_bmod (
	    const int  jcol,	  /* in */
	    const int  jsupno,    /* in */
	    const int  fsupc,     /* in */
	    doublecomplex     *dense,    /* in */
	    doublecomplex     *tempv,    /* working array */
	    GlobalLU_t *Glu,      /* modified */
	    SuperLUStat_t *stat   /* output */
	    )
{
#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
	 ftcs2 = _cptofcd("N", strlen("N")),
	 ftcs3 = _cptofcd("U", strlen("U"));
#endif
    int            incx = 1, incy = 1;
    doublecomplex         alpha = {-1.0, 0.0},  beta = {1.0, 0.0};
#endif

    doublecomplex   comp_zero = {0.0, 0.0};
    int            luptr, nsupc, nsupr, nrow;
    int            isub, irow, i, iptr; 
    register int   ufirst, nextlu;
    int            *lsub, *xlsub;
    doublecomplex         *lusup;
    int            *xlusup;
    flops_t *ops = stat->ops;

    lsub    = Glu->lsub;
    xlsub   = Glu->xlsub;
    lusup   = Glu->lusup;
    xlusup  = Glu->xlusup;

    nextlu = xlusup[jcol];
    
    /*
     *	Process the supernodal portion of L\U[*,j]
     */
    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
        dense[irow] = comp_zero;
	++nextlu;
    }

    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */
    
    if ( fsupc < jcol ) {

	luptr = xlusup[fsupc];
	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
	nsupc = jcol - fsupc;	/* Excluding jcol */
	ufirst = xlusup[jcol];	/* Points to the beginning of column
				   jcol in supernode L\U(jsupno). */
	nrow = nsupr - nsupc;

	ops[TRSV] += 4 * nsupc * (nsupc - 1);
	ops[GEMV] += 8 * nrow * nsupc;

#ifdef USE_VENDOR_BLAS
#ifdef _CRAY
	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
	      &lusup[ufirst], &incx );
	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
#if SCIPY_FIX
	if (nsupr < nsupc) {
	    /* Invalid input to LAPACK: fail more gracefully */
	    ABORT("superlu failure (singular matrix?)");
	}
#endif
	ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
	      &lusup[ufirst], &incx );
	zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
	zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
			&lusup[ufirst], &tempv[0] );

        /* Scatter tempv[*] into lusup[*] */
	iptr = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
	    z_sub(&lusup[iptr], &lusup[iptr], &tempv[i]);
            ++iptr;
	    tempv[i] = comp_zero;
	}
#endif

    }

    return 0;
}
예제 #13
0
/* Subroutine */ int zgglse_(integer *m, integer *n, integer *p, 
	doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, 
	doublecomplex *c__, doublecomplex *d__, doublecomplex *x, 
	doublecomplex *work, integer *lwork, integer *info)
{
/*  -- LAPACK driver routine (version 3.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       June 30, 1999   


    Purpose   
    =======   

    ZGGLSE solves the linear equality-constrained least squares (LSE)   
    problem:   

            minimize || c - A*x ||_2   subject to   B*x = d   

    where A is an M-by-N matrix, B is a P-by-N matrix, c is a given   
    M-vector, and d is a given P-vector. It is assumed that   
    P <= N <= M+P, and   

             rank(B) = P and  rank( ( A ) ) = N.   
                                  ( ( B ) )   

    These conditions ensure that the LSE problem has a unique solution,   
    which is obtained using a GRQ factorization of the matrices B and A.   

    Arguments   
    =========   

    M       (input) INTEGER   
            The number of rows of the matrix A.  M >= 0.   

    N       (input) INTEGER   
            The number of columns of the matrices A and B. N >= 0.   

    P       (input) INTEGER   
            The number of rows of the matrix B. 0 <= P <= N <= M+P.   

    A       (input/output) COMPLEX*16 array, dimension (LDA,N)   
            On entry, the M-by-N matrix A.   
            On exit, A is destroyed.   

    LDA     (input) INTEGER   
            The leading dimension of the array A. LDA >= max(1,M).   

    B       (input/output) COMPLEX*16 array, dimension (LDB,N)   
            On entry, the P-by-N matrix B.   
            On exit, B is destroyed.   

    LDB     (input) INTEGER   
            The leading dimension of the array B. LDB >= max(1,P).   

    C       (input/output) COMPLEX*16 array, dimension (M)   
            On entry, C contains the right hand side vector for the   
            least squares part of the LSE problem.   
            On exit, the residual sum of squares for the solution   
            is given by the sum of squares of elements N-P+1 to M of   
            vector C.   

    D       (input/output) COMPLEX*16 array, dimension (P)   
            On entry, D contains the right hand side vector for the   
            constrained equation.   
            On exit, D is destroyed.   

    X       (output) COMPLEX*16 array, dimension (N)   
            On exit, X is the solution of the LSE problem.   

    WORK    (workspace/output) COMPLEX*16 array, dimension (LWORK)   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK. LWORK >= max(1,M+N+P).   
            For optimum performance LWORK >= P+min(M,N)+max(M,N)*NB,   
            where NB is an upper bound for the optimal blocksizes for   
            ZGEQRF, CGERQF, ZUNMQR and CUNMRQ.   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    INFO    (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    =====================================================================   


       Test the input parameters   

       Parameter adjustments */
    /* Table of constant values */
    static doublecomplex c_b1 = {1.,0.};
    static integer c__1 = 1;
    static integer c_n1 = -1;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
    doublecomplex z__1;
    /* Local variables */
    static integer lopt;
    extern /* Subroutine */ int zgemv_(char *, integer *, integer *, 
	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
	    integer *, doublecomplex *, doublecomplex *, integer *), 
	    zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, 
	    integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, 
	    integer *, doublecomplex *, integer *), ztrmv_(char *, char *, 
	    char *, integer *, doublecomplex *, integer *, doublecomplex *, 
	    integer *), ztrsv_(char *, char *, char *,
	     integer *, doublecomplex *, integer *, doublecomplex *, integer *
	    );
    static integer nb, mn, nr;
    extern /* Subroutine */ int xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
	    integer *, integer *, ftnlen, ftnlen);
    extern /* Subroutine */ int zggrqf_(integer *, integer *, integer *, 
	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
	    integer *, doublecomplex *, doublecomplex *, integer *, integer *)
	    ;
    static integer nb1, nb2, nb3, nb4, lwkopt;
    static logical lquery;
    extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, 
	    integer *, doublecomplex *, integer *, doublecomplex *, 
	    doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmrq_(char *, char *, integer *, integer *, 
	    integer *, doublecomplex *, integer *, doublecomplex *, 
	    doublecomplex *, integer *, doublecomplex *, integer *, integer *);
#define a_subscr(a_1,a_2) (a_2)*a_dim1 + a_1
#define a_ref(a_1,a_2) a[a_subscr(a_1,a_2)]
#define b_subscr(a_1,a_2) (a_2)*b_dim1 + a_1
#define b_ref(a_1,a_2) b[b_subscr(a_1,a_2)]


    a_dim1 = *lda;
    a_offset = 1 + a_dim1 * 1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1 * 1;
    b -= b_offset;
    --c__;
    --d__;
    --x;
    --work;

    /* Function Body */
    *info = 0;
    mn = min(*m,*n);
    nb1 = ilaenv_(&c__1, "ZGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb2 = ilaenv_(&c__1, "ZGERQF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb3 = ilaenv_(&c__1, "ZUNMQR", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1);
    nb4 = ilaenv_(&c__1, "ZUNMRQ", " ", m, n, p, &c_n1, (ftnlen)6, (ftnlen)1);
/* Computing MAX */
    i__1 = max(nb1,nb2), i__1 = max(i__1,nb3);
    nb = max(i__1,nb4);
    lwkopt = *p + mn + max(*m,*n) * nb;
    work[1].r = (doublereal) lwkopt, work[1].i = 0.;
    lquery = *lwork == -1;
    if (*m < 0) {
	*info = -1;
    } else if (*n < 0) {
	*info = -2;
    } else if (*p < 0 || *p > *n || *p < *n - *m) {
	*info = -3;
    } else if (*lda < max(1,*m)) {
	*info = -5;
    } else if (*ldb < max(1,*p)) {
	*info = -7;
    } else /* if(complicated condition) */ {
/* Computing MAX */
	i__1 = 1, i__2 = *m + *n + *p;
	if (*lwork < max(i__1,i__2) && ! lquery) {
	    *info = -12;
	}
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("ZGGLSE", &i__1);
	return 0;
    } else if (lquery) {
	return 0;
    }

/*     Quick return if possible */

    if (*n == 0) {
	return 0;
    }

/*     Compute the GRQ factorization of matrices B and A:   

              B*Q' = (  0  T12 ) P   Z'*A*Q' = ( R11 R12 ) N-P   
                       N-P  P                  (  0  R22 ) M+P-N   
                                                 N-P  P   

       where T12 and R11 are upper triangular, and Q and Z are   
       unitary. */

    i__1 = *lwork - *p - mn;
    zggrqf_(p, m, n, &b[b_offset], ldb, &work[1], &a[a_offset], lda, &work[*p 
	    + 1], &work[*p + mn + 1], &i__1, info);
    i__1 = *p + mn + 1;
    lopt = (integer) work[i__1].r;

/*     Update c = Z'*c = ( c1 ) N-P   
                         ( c2 ) M+P-N */

    i__1 = max(1,*m);
    i__2 = *lwork - *p - mn;
    zunmqr_("Left", "Conjugate Transpose", m, &c__1, &mn, &a[a_offset], lda, &
	    work[*p + 1], &c__[1], &i__1, &work[*p + mn + 1], &i__2, info);
/* Computing MAX */
    i__3 = *p + mn + 1;
    i__1 = lopt, i__2 = (integer) work[i__3].r;
    lopt = max(i__1,i__2);

/*     Solve T12*x2 = d for x2 */

    ztrsv_("Upper", "No transpose", "Non unit", p, &b_ref(1, *n - *p + 1), 
	    ldb, &d__[1], &c__1);

/*     Update c1 */

    i__1 = *n - *p;
    z__1.r = -1., z__1.i = 0.;
    zgemv_("No transpose", &i__1, p, &z__1, &a_ref(1, *n - *p + 1), lda, &d__[
	    1], &c__1, &c_b1, &c__[1], &c__1);

/*     Sovle R11*x1 = c1 for x1 */

    i__1 = *n - *p;
    ztrsv_("Upper", "No transpose", "Non unit", &i__1, &a[a_offset], lda, &
	    c__[1], &c__1);

/*     Put the solutions in X */

    i__1 = *n - *p;
    zcopy_(&i__1, &c__[1], &c__1, &x[1], &c__1);
    zcopy_(p, &d__[1], &c__1, &x[*n - *p + 1], &c__1);

/*     Compute the residual vector: */

    if (*m < *n) {
	nr = *m + *p - *n;
	i__1 = *n - *m;
	z__1.r = -1., z__1.i = 0.;
	zgemv_("No transpose", &nr, &i__1, &z__1, &a_ref(*n - *p + 1, *m + 1),
		 lda, &d__[nr + 1], &c__1, &c_b1, &c__[*n - *p + 1], &c__1);
    } else {
	nr = *p;
    }
    ztrmv_("Upper", "No transpose", "Non unit", &nr, &a_ref(*n - *p + 1, *n - 
	    *p + 1), lda, &d__[1], &c__1);
    z__1.r = -1., z__1.i = 0.;
    zaxpy_(&nr, &z__1, &d__[1], &c__1, &c__[*n - *p + 1], &c__1);

/*     Backward transformation x = Q'*x */

    i__1 = *lwork - *p - mn;
    zunmrq_("Left", "Conjugate Transpose", n, &c__1, p, &b[b_offset], ldb, &
	    work[1], &x[1], n, &work[*p + mn + 1], &i__1, info);
/* Computing MAX */
    i__4 = *p + mn + 1;
    i__2 = lopt, i__3 = (integer) work[i__4].r;
    i__1 = *p + mn + max(i__2,i__3);
    work[1].r = (doublereal) i__1, work[1].i = 0.;

    return 0;

/*     End of ZGGLSE */

} /* zgglse_ */
예제 #14
0
int
pzgstrf_snode_bmod(
		   const int  pnum,   /* process number */
		   const int  jcol,   /* in - current column in the s-node */
		   const int  jsupno, /* in */
		   const int  fsupc,  /* in - first column in the s-node */
		   doublecomplex     *dense, /* in */
		   doublecomplex     *tempv, /* working array */
		   GlobalLU_t *Glu,   /* modified */
		   Gstat_t *Gstat     /* modified */
		   )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Performs numeric block updates within the relaxed supernode. 
 */

    doublecomplex      zero = {0.0, 0.0};
    doublecomplex      one = {1.0, 0.0};
    doublecomplex      none = {-1.0, 0.0};

#if ( MACH==CRAY_PVP )
    _fcd ftcs1, ftcs2, ftcs3;
#endif
#ifdef USE_VENDOR_BLAS    
    int            incx = 1, incy = 1;
    doublecomplex         alpha = none, beta = one;
#endif
    
    int            luptr, nsupc, nsupr, nrow;
    int            isub, irow, i, iptr; 
    register int   ufirst, nextlu;
    doublecomplex         *lusup;
    int            *lsub, *xlsub, *xlsub_end, *xlusup, *xlusup_end;
    register float flopcnt;

    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;
    lusup      = Glu->lusup;
    xlusup     = Glu->xlusup;
    xlusup_end = Glu->xlusup_end;

    nextlu = xlusup[jcol];
    
    /*
     *	Process the supernodal portion of L\U[*,j]
     */
    for (isub = xlsub[fsupc]; isub < xlsub_end[fsupc]; isub++) {
  	irow = lsub[isub];
	lusup[nextlu] = dense[irow];
	dense[irow] = zero;
	++nextlu;
    }

    xlusup_end[jcol] = nextlu;
    
    if ( fsupc < jcol ) {

	luptr = xlusup[fsupc];
	nsupr = xlsub_end[fsupc] - xlsub[fsupc];
	nsupc = jcol - fsupc;	/* Excluding jcol */
	ufirst = xlusup[jcol];	/* Points to the beginning of column
				   jcol in supernode L\U(jsupno). */
	nrow = nsupr - nsupc;
	
        flopcnt = 4 * nsupc * (nsupc - 1) + 8 * nrow * nsupc;
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += nsupc * (nsupc - 1);
	ops[GEMV] += 2 * nrow * nsupc;    */

#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
	ftcs1 = _cptofcd("L", strlen("L"));
	ftcs2 = _cptofcd("N", strlen("N"));
	ftcs3 = _cptofcd("U", strlen("U"));
	CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
	      &lusup[ufirst], &incx );
	CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
	      &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#else
	ztrsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
	      &lusup[ufirst], &incx );
	zgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
#endif
#else
	zlsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
	zmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
		 &lusup[ufirst], &tempv[0] );

        /* Scatter tempv[*] into lusup[*] */
	iptr = ufirst + nsupc;
	for (i = 0; i < nrow; i++) {
            z_sub(&lusup[iptr], &lusup[iptr], &tempv[i]);
            ++iptr;
            tempv[i] = zero;
	}
#endif

    }

    return 0;
}
예제 #15
0
void
pzgstrf_bmod2D(
	       const int pnum,   /* process number */
	       const int m,      /* number of columns in the matrix */
	       const int w,      /* current panel width */
	       const int jcol,   /* leading column of the current panel */
	       const int fsupc,  /* leading column of the updating supernode */
	       const int krep,   /* last column of the updating supernode */
	       const int nsupc,  /* number of columns in the updating s-node */
	       int nsupr,        /* number of rows in the updating s-node */
	       int nrow,         /* number of rows below the diagonal block of
				    the updating supernode */
	       int *repfnz,      /* in */
	       int *panel_lsub,  /* modified */
	       int *w_lsub_end,  /* modified */
	       int *spa_marker,  /* modified; size n-by-w */
	       doublecomplex *dense,    /* modified */
	       doublecomplex *tempv,    /* working array - zeros on entry/exit */
	       GlobalLU_t *Glu,  /* modified */
	       Gstat_t *Gstat    /* modified */
	       )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *    Performs numeric 2-D block updates (sup-panel) in topological order.
 *    Results are returned in SPA dense[*,w].
 *
 */
#if ( MACH==CRAY_PVP )
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
         ftcs2 = _cptofcd("N", strlen("N")),
         ftcs3 = _cptofcd("U", strlen("U"));
#endif
#ifdef USE_VENDOR_BLAS
    int          incx = 1, incy = 1;
    doublecomplex      alpha, beta;
#endif
    doublecomplex      zero = {0.0, 0.0};
    doublecomplex      one = {1.0, 0.0};
    doublecomplex      comp_temp, comp_temp1;

    doublecomplex       ukj, ukj1, ukj2;
    int          luptr, luptr1, luptr2;
    int          segsze;
    int          block_nrow;  /* no of rows in a block row */
    register int lptr;	      /* points to the row subscripts of a supernode */
    int          kfnz, irow, no_zeros; 
    register int isub, isub1, i;
    register int jj;	      /* index through each column in the panel */
    int          krep_ind;
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    int          *col_marker; /* each column of the spa_marker[*,w] */
    int          *col_lsub;   /* each column of the panel_lsub[*,w] */
    doublecomplex       *dense_col;  /* dense[] for a column in the panel */
    doublecomplex       *TriTmp, *MatvecTmp;
    register int ldaTmp;
    register int r_ind, r_hi;
    static   int first = 1, maxsuper, rowblk;
    int          *lsub, *xlsub_end;
    doublecomplex       *lusup;
    int          *xlusup;
    register float flopcnt;
    
#ifdef TIMING    
    double *utime = Gstat->utime;
    double f_time;
#endif    
    
    if ( first ) {
	maxsuper = sp_ienv(3);
	rowblk   = sp_ienv(4);
	first = 0;
    }
    ldaTmp = maxsuper + rowblk;

    lsub      = Glu->lsub;
    xlsub_end = Glu->xlsub_end;
    lusup     = Glu->lusup;
    xlusup    = Glu->xlusup;
    lptr      = Glu->xlsub[fsupc];
    krep_ind  = lptr + nsupc - 1;
    repfnz_col= repfnz;
    dense_col = dense;
    TriTmp    = tempv;
    col_marker= spa_marker;
    col_lsub  = panel_lsub;
	
	
    /* ---------------------------------------------------------------
     * Sequence through each column in the panel -- triangular solves.
     * The results of the triangular solves of all columns in the
     * panel are temporaroly stored in TriTemp array.
     * For the unrolled small supernodes of size <= 3, we also perform
     * matrix-vector updates from below the diagonal block.
     * ---------------------------------------------------------------
     */
    for (jj = jcol; jj < jcol + w; ++jj, col_marker += m, col_lsub += m,
	 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {

	kfnz = repfnz_col[krep];
	if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
	    
	segsze = krep - kfnz + 1;
	luptr = xlusup[fsupc];

        flopcnt = 4 * segsze * (segsze - 1) + 8 * nrow * segsze;
	Gstat->procstat[pnum].fcops += flopcnt;

/*	ops[TRSV] += segsze * (segsze - 1);
	ops[GEMV] += 2 * nrow * segsze;        */
	
#ifdef TIMING	    
	f_time = SuperLU_timer_();
#endif
	
	/* Case 1: Update U-segment of size 1 -- col-col update */
	if ( segsze == 1 ) {
	    ukj = dense_col[lsub[krep_ind]];
	    luptr += nsupr*(nsupc-1) + nsupc;
	    for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) {
		irow = lsub[i];
                zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
		++luptr;
#ifdef SCATTER_FOUND		
		if ( col_marker[irow] != jj ) {
		    col_marker[irow] = jj;
		    col_lsub[w_lsub_end[jj-jcol]++] = irow;
		}
#endif		
	    }
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    
	} else if ( segsze <= 3 ) {
	    ukj = dense_col[lsub[krep_ind]];
	    ukj1 = dense_col[lsub[krep_ind - 1]];
	    luptr += nsupr*(nsupc-1) + nsupc-1;
	    luptr1 = luptr - nsupr;
	    if ( segsze == 2 ) {
                zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                z_sub(&ukj, &ukj, &comp_temp);
		dense_col[lsub[krep_ind]] = ukj;
		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    luptr++; luptr1++;
                    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                    z_add(&comp_temp, &comp_temp, &comp_temp1);
                    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
#ifdef SCATTER_FOUND		
		    if ( col_marker[irow] != jj ) {
			col_marker[irow] = jj;
			col_lsub[w_lsub_end[jj-jcol]++] = irow;
		    }
#endif		
		}
#ifdef TIMING
		utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    
	    } else {
		ukj2 = dense_col[lsub[krep_ind - 2]];
		luptr2 = luptr1 - nsupr;
                zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
                z_sub(&ukj1, &ukj1, &comp_temp);

                zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
                zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                z_add(&comp_temp, &comp_temp, &comp_temp1);
                z_sub(&ukj, &ukj, &comp_temp);
		dense_col[lsub[krep_ind]] = ukj;
		dense_col[lsub[krep_ind-1]] = ukj1;
		for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) {
		    irow = lsub[i];
		    luptr++; luptr1++; luptr2++;
                    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
                    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
                    z_add(&comp_temp, &comp_temp, &comp_temp1);
                    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
                    z_add(&comp_temp, &comp_temp, &comp_temp1);
                    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
#ifdef SCATTER_FOUND		
		    if ( col_marker[irow] != jj ) {
			col_marker[irow] = jj;
			col_lsub[w_lsub_end[jj-jcol]++] = irow;
		    }
#endif		
		}
	    }
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif
	} else  { /* segsze >= 4 */
	    /* Copy A[*,j] segment from dense[*] to TriTmp[*], which
	       holds the result of triangular solve.    */
	    no_zeros = kfnz - fsupc;
	    isub = lptr + no_zeros;
	    for (i = 0; i < segsze; ++i) {
		irow = lsub[isub];
		TriTmp[i] = dense_col[irow]; /* Gather */
		++isub;
	    }

	    /* start effective triangle */
	    luptr += nsupr * no_zeros + no_zeros;
	    
#ifdef TIMING	    
	    f_time = SuperLU_timer_();
#endif
	    
#ifdef USE_VENDOR_BLAS
#if ( MACH==CRAY_PVP )
	    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
		   &nsupr, TriTmp, &incx );
#else
	    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
		   &nsupr, TriTmp, &incx );
#endif
#else		
	    zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
#endif
		
#ifdef TIMING	    
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    
	} /* else ... */
	    
    }  /* for jj ... end tri-solves */

    /* --------------------------------------------------------
     * Perform block row updates from below the diagonal block.
     * Push each block all the way into SPA dense[*].
     * --------------------------------------------------------
     */
    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
	    
	r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
	block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
	luptr = xlusup[fsupc] + nsupc + r_ind;
	isub1 = lptr + nsupc + r_ind;
	    
	repfnz_col = repfnz;
	TriTmp = tempv;
	dense_col = dense;
	col_marker= spa_marker;
	col_lsub  = panel_lsub;
	
	/* Sequence through each column in the panel -- matrix-vector */
	for (jj = jcol; jj < jcol + w; ++jj, col_marker += m, col_lsub += m,
	     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {

	    kfnz = repfnz_col[krep];
	    if ( kfnz == EMPTY ) continue; /* skip any zero segment */
	    
	    segsze = krep - kfnz + 1;
	    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
		
	    /* Perform a block update, and scatter the result of
	       matrix-vector into SPA dense[*].		 */
	    no_zeros = kfnz - fsupc;
	    luptr1 = luptr + nsupr * no_zeros;
	    MatvecTmp = &TriTmp[maxsuper];
	    
#ifdef TIMING
	    f_time = SuperLU_timer_();
#endif	    
	    
#ifdef USE_VENDOR_BLAS
            alpha = one;
            beta = zero;
#if ( MACH==CRAY_PVP )
	    CGEMV( ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr], 
		  &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy );
#else
	    zgemv_( "N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
		   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy );
#endif /* _CRAY_PVP */
#else
	    zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
		    TriTmp, MatvecTmp);
#endif
		
#ifdef TIMING
	    utime[FLOAT] += SuperLU_timer_() - f_time;
#endif	    

	    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily,
	     * such that MatvecTmp[*] can be re-used for the
	     * the next block row update. dense[] will be copied into 
	     * global store after the whole panel has been finished.
	     */
	    isub = isub1;
	    for (i = 0; i < block_nrow; i++) {
		irow = lsub[isub];
                z_sub(&dense_col[irow], &dense_col[irow],
                              &MatvecTmp[i]); /* Scatter-add */
#ifdef SCATTER_FOUND		
		if ( col_marker[irow] != jj ) {
		    col_marker[irow] = jj;
		    col_lsub[w_lsub_end[jj-jcol]++] = irow;
		}
#endif		
		MatvecTmp[i] = zero;
		++isub;
	    }
	    
	} /* for jj ... */

    } /* for each block row ... */

    
    /* ------------------------------------------------
       Scatter the triangular solves into SPA dense[*].
       ------------------------------------------------ */
    repfnz_col = repfnz;
    TriTmp = tempv;
    dense_col = dense;
    
    for (jj = 0; jj < w; ++jj, repfnz_col += m, dense_col += m, 
	 TriTmp += ldaTmp) {
	kfnz = repfnz_col[krep];
	if ( kfnz == EMPTY ) continue; /* skip any zero segment */
	
	segsze = krep - kfnz + 1;
	if ( segsze <= 3 ) continue; /* skip unrolled cases */
	
	no_zeros = kfnz - fsupc;		
	isub = lptr + no_zeros;
	for (i = 0; i < segsze; i++) {
	    irow = lsub[isub];
	    dense_col[irow] = TriTmp[i]; /* Scatter */
	    TriTmp[i] = zero;
	    ++isub;
	}
    } /* for jj ... */
	
}
예제 #16
0
/* Subroutine */ int zhegs2_(integer *itype, char *uplo, integer *n, 
	doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, 
	integer *info)
{
/*  -- LAPACK routine (version 2.0) --   
       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
       Courant Institute, Argonne National Lab, and Rice University   
       September 30, 1994   


    Purpose   
    =======   

    ZHEGS2 reduces a complex Hermitian-definite generalized   
    eigenproblem to standard form.   

    If ITYPE = 1, the problem is A*x = lambda*B*x,   
    and A is overwritten by inv(U')*A*inv(U) or inv(L)*A*inv(L')   

    If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or   
    B*A*x = lambda*x, and A is overwritten by U*A*U` or L'*A*L.   

    B must have been previously factorized as U'*U or L*L' by ZPOTRF.   

    Arguments   
    =========   

    ITYPE   (input) INTEGER   
            = 1: compute inv(U')*A*inv(U) or inv(L)*A*inv(L');   
            = 2 or 3: compute U*A*U' or L'*A*L.   

    UPLO    (input) CHARACTER   
            Specifies whether the upper or lower triangular part of the   
            Hermitian matrix A is stored, and how B has been factorized. 
  
            = 'U':  Upper triangular   
            = 'L':  Lower triangular   

    N       (input) INTEGER   
            The order of the matrices A and B.  N >= 0.   

    A       (input/output) COMPLEX*16 array, dimension (LDA,N)   
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading 
  
            n by n upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading n by n lower triangular part of A contains the lower 
  
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the transformed matrix, stored in the   
            same format as A.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    B       (input) COMPLEX*16 array, dimension (LDB,N)   
            The triangular factor from the Cholesky factorization of B,   
            as returned by ZPOTRF.   

    LDB     (input) INTEGER   
            The leading dimension of the array B.  LDB >= max(1,N).   

    INFO    (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    ===================================================================== 
  


       Test the input parameters.   

    
   Parameter adjustments   
       Function Body */
    /* Table of constant values */
    static doublecomplex c_b1 = {1.,0.};
    static integer c__1 = 1;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
    doublereal d__1, d__2;
    doublecomplex z__1;
    /* Local variables */
    extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, 
	    doublecomplex *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, integer *);
    static integer k;
    extern logical lsame_(char *, char *);
    static logical upper;
    extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, 
	    doublecomplex *, integer *, doublecomplex *, integer *), ztrmv_(
	    char *, char *, char *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, integer *), ztrsv_(char *
	    , char *, char *, integer *, doublecomplex *, integer *, 
	    doublecomplex *, integer *);
    static doublecomplex ct;
    extern /* Subroutine */ int xerbla_(char *, integer *), zdscal_(
	    integer *, doublereal *, doublecomplex *, integer *), zlacgv_(
	    integer *, doublecomplex *, integer *);
    static doublereal akk, bkk;




#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]

    *info = 0;
    upper = lsame_(uplo, "U");
    if (*itype < 1 || *itype > 3) {
	*info = -1;
    } else if (! upper && ! lsame_(uplo, "L")) {
	*info = -2;
    } else if (*n < 0) {
	*info = -3;
    } else if (*lda < max(1,*n)) {
	*info = -5;
    } else if (*ldb < max(1,*n)) {
	*info = -7;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("ZHEGS2", &i__1);
	return 0;
    }

    if (*itype == 1) {
	if (upper) {

/*           Compute inv(U')*A*inv(U) */

	    i__1 = *n;
	    for (k = 1; k <= *n; ++k) {

/*              Update the upper triangle of A(k:n,k:n) */

		i__2 = k + k * a_dim1;
		akk = A(k,k).r;
		i__2 = k + k * b_dim1;
		bkk = B(k,k).r;
/* Computing 2nd power */
		d__1 = bkk;
		akk /= d__1 * d__1;
		i__2 = k + k * a_dim1;
		A(k,k).r = akk, A(k,k).i = 0.;
		if (k < *n) {
		    i__2 = *n - k;
		    d__1 = 1. / bkk;
		    zdscal_(&i__2, &d__1, &A(k,k+1), lda);
		    d__1 = akk * -.5;
		    ct.r = d__1, ct.i = 0.;
		    i__2 = *n - k;
		    zlacgv_(&i__2, &A(k,k+1), lda);
		    i__2 = *n - k;
		    zlacgv_(&i__2, &B(k,k+1), ldb);
		    i__2 = *n - k;
		    zaxpy_(&i__2, &ct, &B(k,k+1), ldb, &A(k,k+1), lda);
		    i__2 = *n - k;
		    z__1.r = -1., z__1.i = 0.;
		    zher2_(uplo, &i__2, &z__1, &A(k,k+1), lda, 
			    &B(k,k+1), ldb, &A(k+1,k+1), lda);
		    i__2 = *n - k;
		    zaxpy_(&i__2, &ct, &B(k,k+1), ldb, &A(k,k+1), lda);
		    i__2 = *n - k;
		    zlacgv_(&i__2, &B(k,k+1), ldb);
		    i__2 = *n - k;
		    ztrsv_(uplo, "Conjugate transpose", "Non-unit", &i__2, &B(k+1,k+1), ldb, &A(k,k+1), lda);
		    i__2 = *n - k;
		    zlacgv_(&i__2, &A(k,k+1), lda);
		}
/* L10: */
	    }
	} else {

/*           Compute inv(L)*A*inv(L') */

	    i__1 = *n;
	    for (k = 1; k <= *n; ++k) {

/*              Update the lower triangle of A(k:n,k:n) */

		i__2 = k + k * a_dim1;
		akk = A(k,k).r;
		i__2 = k + k * b_dim1;
		bkk = B(k,k).r;
/* Computing 2nd power */
		d__1 = bkk;
		akk /= d__1 * d__1;
		i__2 = k + k * a_dim1;
		A(k,k).r = akk, A(k,k).i = 0.;
		if (k < *n) {
		    i__2 = *n - k;
		    d__1 = 1. / bkk;
		    zdscal_(&i__2, &d__1, &A(k+1,k), &c__1);
		    d__1 = akk * -.5;
		    ct.r = d__1, ct.i = 0.;
		    i__2 = *n - k;
		    zaxpy_(&i__2, &ct, &B(k+1,k), &c__1, &A(k+1,k), &c__1);
		    i__2 = *n - k;
		    z__1.r = -1., z__1.i = 0.;
		    zher2_(uplo, &i__2, &z__1, &A(k+1,k), &c__1, 
			    &B(k+1,k), &c__1, &A(k+1,k+1), lda);
		    i__2 = *n - k;
		    zaxpy_(&i__2, &ct, &B(k+1,k), &c__1, &A(k+1,k), &c__1);
		    i__2 = *n - k;
		    ztrsv_(uplo, "No transpose", "Non-unit", &i__2, &B(k+1,k+1), ldb, &A(k+1,k), 
			    &c__1);
		}
/* L20: */
	    }
	}
    } else {
	if (upper) {

/*           Compute U*A*U' */

	    i__1 = *n;
	    for (k = 1; k <= *n; ++k) {

/*              Update the upper triangle of A(1:k,1:k) */

		i__2 = k + k * a_dim1;
		akk = A(k,k).r;
		i__2 = k + k * b_dim1;
		bkk = B(k,k).r;
		i__2 = k - 1;
		ztrmv_(uplo, "No transpose", "Non-unit", &i__2, &B(1,1), 
			ldb, &A(1,k), &c__1);
		d__1 = akk * .5;
		ct.r = d__1, ct.i = 0.;
		i__2 = k - 1;
		zaxpy_(&i__2, &ct, &B(1,k), &c__1, &A(1,k), &c__1);
		i__2 = k - 1;
		zher2_(uplo, &i__2, &c_b1, &A(1,k), &c__1, &B(1,k), &c__1, &A(1,1), lda);
		i__2 = k - 1;
		zaxpy_(&i__2, &ct, &B(1,k), &c__1, &A(1,k), &c__1);
		i__2 = k - 1;
		zdscal_(&i__2, &bkk, &A(1,k), &c__1);
		i__2 = k + k * a_dim1;
/* Computing 2nd power */
		d__2 = bkk;
		d__1 = akk * (d__2 * d__2);
		A(k,k).r = d__1, A(k,k).i = 0.;
/* L30: */
	    }
	} else {

/*           Compute L'*A*L */

	    i__1 = *n;
	    for (k = 1; k <= *n; ++k) {

/*              Update the lower triangle of A(1:k,1:k) */

		i__2 = k + k * a_dim1;
		akk = A(k,k).r;
		i__2 = k + k * b_dim1;
		bkk = B(k,k).r;
		i__2 = k - 1;
		zlacgv_(&i__2, &A(k,1), lda);
		i__2 = k - 1;
		ztrmv_(uplo, "Conjugate transpose", "Non-unit", &i__2, &B(1,1), ldb, &A(k,1), lda);
		d__1 = akk * .5;
		ct.r = d__1, ct.i = 0.;
		i__2 = k - 1;
		zlacgv_(&i__2, &B(k,1), ldb);
		i__2 = k - 1;
		zaxpy_(&i__2, &ct, &B(k,1), ldb, &A(k,1), lda);
		i__2 = k - 1;
		zher2_(uplo, &i__2, &c_b1, &A(k,1), lda, &B(k,1)
			, ldb, &A(1,1), lda);
		i__2 = k - 1;
		zaxpy_(&i__2, &ct, &B(k,1), ldb, &A(k,1), lda);
		i__2 = k - 1;
		zlacgv_(&i__2, &B(k,1), ldb);
		i__2 = k - 1;
		zdscal_(&i__2, &bkk, &A(k,1), lda);
		i__2 = k - 1;
		zlacgv_(&i__2, &A(k,1), lda);
		i__2 = k + k * a_dim1;
/* Computing 2nd power */
		d__2 = bkk;
		d__1 = akk * (d__2 * d__2);
		A(k,k).r = d__1, A(k,k).i = 0.;
/* L40: */
	    }
	}
    }
    return 0;

/*     End of ZHEGS2 */

} /* zhegs2_ */