Пример #1
0
/**
    Purpose
    -------
    ZLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but no factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX_16 array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    COMPLEX_16 array, dimension (NB)
            Auxiliar vector.

    @param[in,out]
    F       COMPLEX_16 array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @ingroup magma_zgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_zlaqps(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaDoubleComplex     *A, magma_int_t lda,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2,
    magmaDoubleComplex *auxv,
    magmaDoubleComplex     *F, magma_int_t ldf,
    magmaDoubleComplex_ptr dF, magma_int_t lddf)
{
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    magmaDoubleComplex c_zero    = MAGMA_Z_MAKE( 0.,0.);
    magmaDoubleComplex c_one     = MAGMA_Z_MAKE( 1.,0.);
    magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    double d__1;
    magmaDoubleComplex z__1;
    
    magma_int_t j, k, rk;
    magmaDoubleComplex Akk;
    magma_int_t pvt;
    double temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon"));

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran idamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_idamax( &i__1, &vn1[k], &ione ) - 1;
        
        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_zgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, queue );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( queue );

                /* 2. Swap as usual on CPU                          */
                blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_zsetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, queue );
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            #ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CONJ( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #ifdef COMPLEX
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_Z_CONJ( *F(k,j) );
            }
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_zlarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_zlarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }
        
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;
        
            /* Send the vector to the GPU */
            magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue );
        
            /* Multiply on GPU */
            // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione, queue );
            
            magma_zgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, queue );
            
            blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );
            
            magma_queue_sync( queue );
            blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_Z_NEGATE( tau[k] );
            blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );
            
            i__1 = k;
            blasf77_zgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }
        
        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );
        
                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);
        
                    if (temp2 <= tol3z) {
                        vn2[j] = (double) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_dsqrt(temp);
                    }
                }
            }
        }
        
        *A(rk, k) = Akk;
        
        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU */
        magma_zsetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2, queue );

        magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda, queue );
    }
    
    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb) {
            vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione );
        }
        else {
            /* Where is the data, CPU or GPU ? */
            double r1, r2;
            
            r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_dznrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue );
            
            //vn1[lsticc] = magma_dznrm2( i__1, dA(rk + 1, lsticc), ione, queue );
            vn1[lsticc] = magma_dsqrt(r1*r1 + r2*r2);
        }
        
        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }
    
    magma_queue_destroy( queue );

    return MAGMA_SUCCESS;
} /* magma_zlaqps */
Пример #2
0
magma_int_t magma_zlatrsd(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_bool_t normin,
    magma_int_t n, const magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex lambda,
    magmaDoubleComplex *x,
    double *scale, double *cnorm,
    magma_int_t *info)
{
#define A(i,j) (A + (i) + (j)*lda)

    /* constants */
    const magma_int_t ione = 1;
    const double d_half = 0.5;
    const magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    /* System generated locals */
    magma_int_t len;
    magmaDoubleComplex ztmp;

    /* Local variables */
    magma_int_t i, j;
    double xj, rec, tjj;
    magma_int_t jinc;
    double xbnd;
    magma_int_t imax;
    double tmax;
    magmaDoubleComplex tjjs;
    double xmax, grow;

    double tscal;
    magmaDoubleComplex uscal;
    magma_int_t jlast;
    magmaDoubleComplex csumj;

    double bignum;
    magma_int_t jfirst;
    double smlnum;

    /* Function Body */
    *info = 0;
    magma_int_t upper  = (uplo  == MagmaUpper);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t nounit = (diag  == MagmaNonUnit);

    /* Test the input parameters. */
    if ( ! upper && uplo != MagmaLower ) {
        *info = -1;
    }
    else if (! notran &&
             trans != MagmaTrans &&
             trans != MagmaConjTrans) {
        *info = -2;
    }
    else if ( ! nounit && diag != MagmaUnit ) {
        *info = -3;
    }
    else if ( ! (normin == MagmaTrue) &&
              ! (normin == MagmaFalse) ) {
        *info = -4;
    }
    else if ( n < 0 ) {
        *info = -5;
    }
    else if ( lda < max(1,n) ) {
        *info = -7;
    }
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( n == 0 ) {
        return *info;
    }

    /* Determine machine dependent parameters to control overflow. */
    smlnum = lapackf77_dlamch( "Safe minimum" );
    bignum = 1. / smlnum;
    lapackf77_dlabad( &smlnum, &bignum );
    smlnum /= lapackf77_dlamch( "Precision" );
    bignum = 1. / smlnum;
    *scale = 1.;

    if ( normin == MagmaFalse ) {
        /* Compute the 1-norm of each column, not including the diagonal. */
        if ( upper ) {
            /* A is upper triangular. */
            cnorm[0] = 0.;
            for( j = 1; j < n; ++j ) {
                cnorm[j] = magma_cblas_dzasum( j, A(0,j), ione );
            }
        }
        else {
            /* A is lower triangular. */
            for( j = 0; j < n-1; ++j ) {
                cnorm[j] = magma_cblas_dzasum( n-(j+1), A(j+1,j), ione );
            }
            cnorm[n-1] = 0.;
        }
    }

    /* Scale the column norms by TSCAL if the maximum element in CNORM is */
    /* greater than BIGNUM/2. */
    imax = blasf77_idamax( &n, &cnorm[0], &ione ) - 1;
    tmax = cnorm[imax];
    if ( tmax <= bignum * 0.5 ) {
        tscal = 1.;
    }
    else {
        tscal = 0.5 / (smlnum * tmax);
        blasf77_dscal( &n, &tscal, &cnorm[0], &ione );
    }

    /* ================================================================= */
    /* Compute a bound on the computed solution vector to see if the */
    /* Level 2 BLAS routine ZTRSV can be used. */
    xmax = 0.;
    for( j = 0; j < n; ++j ) {
        xmax = max( xmax, 0.5*MAGMA_Z_ABS1( x[j] ));
    }
    xbnd = xmax;

    if ( notran ) {
        /* ---------------------------------------- */
        /* Compute the growth in A * x = b. */
        if ( upper ) {
            jfirst = n-1;
            jlast  = 0;
            jinc   = -1;
        }
        else {
            jfirst = 0;
            jlast  = n;
            jinc   = 1;
        }

        if ( tscal != 1. ) {
            grow = 0.;
            goto L60;
        }

        /* A is non-unit triangular. */
        /* Compute GROW = 1/G(j) and XBND = 1/M(j). */
        /* Initially, G(0) = max{x(i), i=1,...,n}. */
        grow = 0.5 / max( xbnd, smlnum );
        xbnd = grow;
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Exit the loop if the growth factor is too small. */
            if ( grow <= smlnum ) {
                goto L60;
            }

            if ( nounit ) {
                tjjs = *A(j,j) - lambda;
            }
            else {
                tjjs = c_one - lambda;
            }
            tjj = MAGMA_Z_ABS1( tjjs );

            if ( tjj >= smlnum ) {
                /* M(j) = G(j-1) / abs(A(j,j)) */
                xbnd = min( xbnd, min(1.,tjj)*grow );
            }
            else {
                /* M(j) could overflow, set XBND to 0. */
                xbnd = 0.;
            }

            if ( tjj + cnorm[j] >= smlnum ) {
                /* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */
                grow *= (tjj / (tjj + cnorm[j]));
            }
            else {
                /* G(j) could overflow, set GROW to 0. */
                grow = 0.;
            }
        }
        grow = xbnd;
L60:
        ;
    }
    else {
        /* ---------------------------------------- */
        /* Compute the growth in A**T * x = b  or  A**H * x = b. */
        if ( upper ) {
            jfirst = 0;
            jlast  = n;
            jinc   = 1;
        }
        else {
            jfirst = n-1;
            jlast  = 0;
            jinc   = -1;
        }

        if ( tscal != 1. ) {
            grow = 0.;
            goto L90;
        }

        /* A is non-unit triangular. */
        /* Compute GROW = 1/G(j) and XBND = 1/M(j). */
        /* Initially, M(0) = max{x(i), i=1,...,n}. */
        grow = 0.5 / max( xbnd, smlnum );
        xbnd = grow;
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Exit the loop if the growth factor is too small. */
            if ( grow <= smlnum ) {
                goto L90;
            }

            /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */
            xj = 1. + cnorm[j];
            grow = min( grow, xbnd / xj );

            if ( nounit ) {
                tjjs = *A(j,j) - lambda;
            }
            else {
                tjjs = c_one - lambda;
            }
            tjj = MAGMA_Z_ABS1( tjjs );

            if ( tjj >= smlnum ) {
                /* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */
                if ( xj > tjj ) {
                    xbnd *= (tjj / xj);
                }
            }
            else {
                /* M(j) could overflow, set XBND to 0. */
                xbnd = 0.;
            }
        }
        grow = min( grow, xbnd );
L90:
        ;
    }
        
    /* ================================================================= */
    /* Due to modified diagonal, we can't use regular BLAS ztrsv. */
    
    /* Use a Level 1 BLAS solve, scaling intermediate results. */
    if ( xmax > bignum * 0.5 ) {
        /* Scale X so that its components are less than or equal to */
        /* BIGNUM in absolute value. */
        *scale = (bignum * 0.5) / xmax;
        blasf77_zdscal( &n, scale, &x[0], &ione );
        xmax = bignum;
    }
    else {
        xmax *= 2.;
    }

    if ( notran ) {
        /* ---------------------------------------- */
        /* Solve A * x = b */
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */
            xj = MAGMA_Z_ABS1( x[j] );
            if ( nounit ) {
                tjjs = (*A(j,j) - lambda ) * tscal;
            }
            else {
                tjjs = (c_one - lambda) * tscal;
                if ( tscal == 1. ) {
                    goto L110;
                }
            }
            tjj = MAGMA_Z_ABS1( tjjs );
            if ( tjj > smlnum ) {
                /* abs(A(j,j)) > SMLNUM: */
                if ( tjj < 1. ) {
                    if ( xj > tjj * bignum ) {
                        /* Scale x by 1/b(j). */
                        rec = 1. / xj;
                        blasf77_zdscal( &n, &rec, &x[0], &ione );
                        *scale *= rec;
                        xmax *= rec;
                    }
                }
                x[j] = x[j] / tjjs;
                xj   = MAGMA_Z_ABS1( x[j] );
            }
            else if ( tjj > 0. ) {
                /* 0 < abs(A(j,j)) <= SMLNUM: */
                if ( xj > tjj * bignum ) {
                    /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */
                    /* to avoid overflow when dividing by A(j,j). */
                    rec = (tjj * bignum) / xj;
                    if ( cnorm[j] > 1. ) {
                        /* Scale by 1/CNORM(j) to avoid overflow when */
                        /* multiplying x(j) times column j. */
                        rec /= cnorm[j];
                    }
                    blasf77_zdscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                    xmax *= rec;
                }
                x[j] = x[j] / tjjs;
                xj   = MAGMA_Z_ABS1( x[j] );
            }
            else {
                /* A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
                /* scale = 0, and compute a solution to A*x = 0. */
                for( i = 0; i < n; ++i ) {
                    x[i] = c_zero;
                }
                x[j]   = c_one;
                xj     = 1.;
                *scale = 0.;
                xmax   = 0.;
            }
L110:

            /* Scale x if necessary to avoid overflow when adding a */
            /* multiple of column j of A. */
            if ( xj > 1. ) {
                rec = 1. / xj;
                if ( cnorm[j] > (bignum - xmax) * rec ) {
                    /* Scale x by 1/(2*abs(x(j))). */
                    rec *= 0.5;
                    blasf77_zdscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                }
            }
            else if ( xj * cnorm[j] > bignum - xmax ) {
                /* Scale x by 1/2. */
                blasf77_zdscal( &n, &d_half, &x[0], &ione );
                *scale *= 0.5;
            }

            if ( upper ) {
                if ( j > 0 ) {
                    /* Compute the update */
                    /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */
                    len = j;
                    ztmp = -tscal * x[j];
                    blasf77_zaxpy( &len, &ztmp, A(0,j), &ione, &x[0], &ione );
                    i = blasf77_izamax( &len, &x[0], &ione ) - 1;
                    xmax = MAGMA_Z_ABS1( x[i] );
                }
            }
            else {
                if ( j < n-1 ) {
                    /* Compute the update */
                    /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */
                    len = n - (j+1);
                    ztmp = -tscal * x[j];
                    blasf77_zaxpy( &len, &ztmp, A(j+1,j), &ione, &x[j + 1], &ione );
                    i = j + blasf77_izamax( &len, &x[j + 1], &ione );
                    xmax = MAGMA_Z_ABS1( x[i] );
                }
            }
        }
    }
    else if ( trans == MagmaTrans ) {
        /* ---------------------------------------- */
        /* Solve A**T * x = b */
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Compute x(j) = b(j) - sum A(k,j)*x(k). */
            /*                       k<>j             */
            xj = MAGMA_Z_ABS1( x[j] );
            uscal = MAGMA_Z_MAKE( tscal, 0. );
            rec = 1. / max( xmax, 1. );
            if ( cnorm[j] > (bignum - xj) * rec ) {
                /* If x(j) could overflow, scale x by 1/(2*XMAX). */
                rec *= 0.5;
                if ( nounit ) {
                    tjjs = (*A(j,j) - lambda) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                }
                tjj = MAGMA_Z_ABS1( tjjs );
                if ( tjj > 1. ) {
                    /* Divide by A(j,j) when scaling x if A(j,j) > 1. */
                    rec = min( 1., rec * tjj );
                    uscal = uscal / tjjs;
                }
                if ( rec < 1. ) {
                    blasf77_zdscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                    xmax *= rec;
                }
            }

            csumj = c_zero;
            if ( uscal == c_one ) {
                /* If the scaling needed for A in the dot product is 1, */
                /* call ZDOTU to perform the dot product. */
                if ( upper ) {
                    csumj = magma_cblas_zdotu( j, A(0,j), ione, &x[0], ione );
                }
                else if ( j < n-1 ) {
                    csumj = magma_cblas_zdotu( n-(j+1), A(j+1,j), ione, &x[j+1], ione );
                }
            }
            else {
                /* Otherwise, use in-line code for the dot product. */
                if ( upper ) {
                    for( i = 0; i < j; ++i ) {
                        csumj += (*A(i,j) * uscal) * x[i];
                    }
                }
                else if ( j < n-1 ) {
                    for( i = j+1; i < n; ++i ) {
                        csumj += (*A(i,j) * uscal) * x[i];
                    }
                }
            }

            if ( uscal == MAGMA_Z_MAKE( tscal, 0. )) {
                /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */
                /* was not used to scale the dotproduct. */
                x[j] -= csumj;
                xj = MAGMA_Z_ABS1( x[j] );
                if ( nounit ) {
                    tjjs = (*A(j,j) - lambda) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                    if ( tscal == 1. ) {
                        goto L160;
                    }
                }

                /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
                tjj = MAGMA_Z_ABS1( tjjs );
                if ( tjj > smlnum ) {
                    /* abs(A(j,j)) > SMLNUM: */
                    if ( tjj < 1. ) {
                        if ( xj > tjj * bignum ) {
                            /* Scale X by 1/abs(x(j)). */
                            rec = 1. / xj;
                            blasf77_zdscal( &n, &rec, &x[0], &ione );
                            *scale *= rec;
                            xmax   *= rec;
                        }
                    }
                    x[j] = x[j] / tjjs;
                }
                else if ( tjj > 0. ) {
                    /* 0 < abs(A(j,j)) <= SMLNUM: */
                    if ( xj > tjj * bignum ) {
                        /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
                        rec = (tjj * bignum) / xj;
                        blasf77_zdscal( &n, &rec, &x[0], &ione );
                        *scale *= rec;
                        xmax   *= rec;
                    }
                    x[j] = x[j] / tjjs;
                }
                else {
                    /* A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
                    /* scale = 0 and compute a solution to A**T *x = 0. */
                    for( i = 0; i < n; ++i ) {
                        x[i] = c_zero;
                    }
                    x[j]   = c_one;
                    *scale = 0.;
                    xmax   = 0.;
                }
L160:
                ;
            }
            else {
                /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */
                /* product has already been divided by 1/A(j,j). */
                x[j] = (x[j] / tjjs) - csumj;
            }
            xmax = max( xmax, MAGMA_Z_ABS1( x[j] ));
        }
    }
    else {
        /* ---------------------------------------- */
        /* Solve A**H * x = b */
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Compute x(j) = b(j) - sum A(k,j)*x(k). */
            /*                       k<>j             */
            xj = MAGMA_Z_ABS1( x[j] );
            uscal = MAGMA_Z_MAKE( tscal, 0. );
            rec = 1. / max(xmax, 1.);
            if ( cnorm[j] > (bignum - xj) * rec ) {
                /* If x(j) could overflow, scale x by 1/(2*XMAX). */
                rec *= 0.5;
                if ( nounit ) {
                    tjjs = MAGMA_Z_CONJ( *A(j,j) - lambda ) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                }
                tjj = MAGMA_Z_ABS1( tjjs );
                if ( tjj > 1. ) {
                    /* Divide by A(j,j) when scaling x if A(j,j) > 1. */
                    rec = min( 1., rec * tjj );
                    uscal = uscal / tjjs;
                }
                if ( rec < 1. ) {
                    blasf77_zdscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                    xmax   *= rec;
                }
            }

            csumj = c_zero;
            if ( uscal == c_one ) {
                /* If the scaling needed for A in the dot product is 1, */
                /* call ZDOTC to perform the dot product. */
                if ( upper ) {
                    csumj = magma_cblas_zdotc( j, A(0,j), ione, &x[0], ione );
                }
                else if ( j < n-1 ) {
                    csumj = magma_cblas_zdotc( n-(j+1), A(j+1,j), ione, &x[j+1], ione );
                }
            }
            else {
                /* Otherwise, use in-line code for the dot product. */
                if ( upper ) {
                    for( i = 0; i < j; ++i ) {
                        csumj += (MAGMA_Z_CONJ( *A(i,j) ) * uscal) * x[i];
                    }
                }
                else if ( j < n-1 ) {
                    for( i = j + 1; i < n; ++i ) {
                        csumj += (MAGMA_Z_CONJ( *A(i,j) ) * uscal) * x[i];
                    }
                }
            }

            if ( uscal == tscal ) {
                /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */
                /* was not used to scale the dotproduct. */
                x[j] -= csumj;
                xj = MAGMA_Z_ABS1( x[j] );
                if ( nounit ) {
                    tjjs = MAGMA_Z_CONJ( *A(j,j) - lambda ) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                    if ( tscal == 1. ) {
                        goto L210;
                    }
                }

                /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
                tjj = MAGMA_Z_ABS1( tjjs );
                if ( tjj > smlnum ) {
                    /* abs(A(j,j)) > SMLNUM: */
                    if ( tjj < 1. ) {
                        if ( xj > tjj * bignum ) {
                            /* Scale X by 1/abs(x(j)). */
                            rec = 1. / xj;
                            blasf77_zdscal( &n, &rec, &x[0], &ione );
                            *scale *= rec;
                            xmax   *= rec;
                        }
                    }
                    x[j] = x[j] / tjjs;
                }
                else if ( tjj > 0. ) {
                    /* 0 < abs(A(j,j)) <= SMLNUM: */
                    if ( xj > tjj * bignum ) {
                        /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
                        rec = (tjj * bignum) / xj;
                        blasf77_zdscal( &n, &rec, &x[0], &ione );
                        *scale *= rec;
                        xmax   *= rec;
                    }
                    x[j] = x[j] / tjjs;
                }
                else {
                    /* A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
                    /* scale = 0 and compute a solution to A**H *x = 0. */
                    for( i = 0; i < n; ++i ) {
                        x[i] = c_zero;
                    }
                    x[j] = c_one;
                    *scale = 0.;
                    xmax   = 0.;
                }
L210:
                ;
            }
            else {
                /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */
                /* product has already been divided by 1/A(j,j). */
                x[j] = (x[j] / tjjs) - csumj;
            }
            xmax = max( xmax, MAGMA_Z_ABS1( x[j] ));
        }
    }
    *scale /= tscal;
    
    /* Scale the column norms by 1/TSCAL for return. */
    if ( tscal != 1. ) {
        double d = 1. / tscal;
        blasf77_dscal( &n, &d, &cnorm[0], &ione );
    }

    return *info;
} /* end zlatrsd */
Пример #3
0
extern "C" magma_int_t
magma_dgeev(
    magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n,
    double *a, magma_int_t lda,
    double *WR, double *WI,
    double *vl, magma_int_t ldvl,
    double *vr, magma_int_t ldvr,
    double *work, magma_int_t lwork,
    magma_queue_t queue,
    magma_int_t *info)
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    DGEEV computes for an N-by-N real nonsymmetric matrix A, the
    eigenvalues and, optionally, the left and/or right eigenvectors.

    The right eigenvector v(j) of A satisfies
                     A * v(j) = lambda(j) * v(j)
    where lambda(j) is its eigenvalue.
    The left eigenvector u(j) of A satisfies
                  u(j)**T * A = lambda(j) * u(j)**T
    where u(j)**T denotes the transpose of u(j).

    The computed eigenvectors are normalized to have Euclidean norm
    equal to 1 and largest component real.

    Arguments
    =========
    JOBVL   (input) CHARACTER*1
            = 'N': left eigenvectors of A are not computed;
            = 'V': left eigenvectors of are computed.

    JOBVR   (input) CHARACTER*1
            = 'N': right eigenvectors of A are not computed;
            = 'V': right eigenvectors of A are computed.

    N       (input) INTEGER
            The order of the matrix A. N >= 0.

    A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the N-by-N matrix A.
            On exit, A has been overwritten.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    WR      (output) DOUBLE PRECISION array, dimension (N)
    WI      (output) DOUBLE PRECISION array, dimension (N)
            WR and WI contain the real and imaginary parts,
            respectively, of the computed eigenvalues.  Complex
            conjugate pairs of eigenvalues appear consecutively
            with the eigenvalue having the positive imaginary part
            first.

    VL      (output) DOUBLE PRECISION array, dimension (LDVL,N)
            If JOBVL = 'V', the left eigenvectors u(j) are stored one
            after another in the columns of VL, in the same order
            as their eigenvalues.
            If JOBVL = 'N', VL is not referenced.
            u(j) = VL(:,j), the j-th column of VL.

    LDVL    (input) INTEGER
            The leading dimension of the array VL.  LDVL >= 1; if
            JOBVL = 'V', LDVL >= N.

    VR      (output) DOUBLE PRECISION array, dimension (LDVR,N)
            If JOBVR = 'V', the right eigenvectors v(j) are stored one
            after another in the columns of VR, in the same order
            as their eigenvalues.
            If JOBVR = 'N', VR is not referenced.
            v(j) = VR(:,j), the j-th column of VR.

    LDVR    (input) INTEGER
            The leading dimension of the array VR.  LDVR >= 1; if
            JOBVR = 'V', LDVR >= N.

    WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= (1+nb)*N.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.
            > 0:  if INFO = i, the QR algorithm failed to compute all the
                  eigenvalues, and no eigenvectors have been computed;
                  elements and i+1:N of W contain eigenvalues which have
                  converged.
    =====================================================================    */

    magma_int_t ione = 1;
    magma_int_t c__1 = 1;
    magma_int_t c__0 = 0;
    magma_int_t c_n1 = -1;
    
    magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1,
            i__2, i__3;
    double d__1, d__2;

    magma_int_t i__, k, ihi, ilo;
    double      r__, cs, sn, scl;
    double dum[1], eps;
    magma_int_t ibal;
    double anrm;
    magma_int_t ierr, itau, iwrk, nout;
    magma_int_t scalea;
    double cscale;
    double bignum;
    magma_int_t minwrk;
    magma_int_t wantvl;
    double smlnum;
    magma_int_t lquery, wantvr, select[1];

    magma_int_t nb = 0;
    magmaDouble_ptr dT;
    //magma_timestr_t start, end;

    const char* side_ = NULL;

    *info = 0;
    lquery = lwork == -1;
    wantvl = (jobvl == MagmaVec);
    wantvr = (jobvr == MagmaVec);
    if (! wantvl && jobvl != MagmaNoVec) {
        *info = -1;
    } else if (! wantvr && jobvr != MagmaNoVec) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -9;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -11;
    }

    /*  Compute workspace   */
    if (*info == 0) {

        nb = magma_get_dgehrd_nb(n);
        minwrk = (2+nb)*n;
        work[0] = (double) minwrk;
        
        if (lwork < minwrk && ! lquery) {
            *info = -13;
        }

    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
   
    // if eigenvectors are needed
#if defined(VERSION3)
    if (MAGMA_SUCCESS != magma_dmalloc( &dT, nb*n )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
#endif

    // subtract row and col for 1-based indexing
    a_dim1   = lda;
    a_offset = 1 + a_dim1;
    a       -= a_offset;
    vl_dim1   = ldvl;
    vl_offset = 1 + vl_dim1;
    vl       -= vl_offset;
    vr_dim1   = ldvr;
    vr_offset = 1 + vr_dim1;
    vr       -= vr_offset;
    --work;

    /* Get machine constants */
    eps    = lapackf77_dlamch("P");
    smlnum = lapackf77_dlamch("S");
    bignum = 1. / smlnum;
    lapackf77_dlabad(&smlnum, &bignum);
    smlnum = magma_dsqrt(smlnum) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_dlange("M", &n, &n, &a[a_offset], &lda, dum);
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_dlascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n,
                &a[a_offset], &lda, &ierr);
    }

    /* Balance the matrix
       (Workspace: need N) */
    ibal = 1;
    lapackf77_dgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr);

    /* Reduce to upper Hessenberg form
       (Workspace: need 3*N, prefer 2*N+N*NB) */
    itau = ibal + n;
    iwrk = itau + n;
    i__1 = lwork - iwrk + 1;

    //start = get_current_time();
#if defined(VERSION1)
    /*
     * Version 1 - LAPACK
     */
    lapackf77_dgehrd(&n, &ilo, &ihi, &a[a_offset], &lda,
                     &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION2)
    /*
     *  Version 2 - LAPACK consistent HRD
     */
    magma_dgehrd2(n, ilo, ihi, &a[a_offset], lda,
                  &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
    /*
     * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored,
     */
    magma_dgehrd(n, ilo, ihi, &a[a_offset], lda,
                 &work[itau], &work[iwrk], i__1, dT, 0, queue, &ierr);
#endif
    //end = get_current_time();
    //printf("    Time for dgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.);

    if (wantvl) {
      /*        Want left eigenvectors
                Copy Householder vectors to VL */
        side_ = "Left";
        lapackf77_dlacpy(MagmaLowerStr, &n, &n,
                         &a[a_offset], &lda, &vl[vl_offset], &ldvl);

        /*
         * Generate orthogonal matrix in VL
         *   (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
         */
        i__1 = lwork - iwrk + 1;

        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_dorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl,
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_dorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau],
                     dT, 0, nb, queue, &ierr);
#endif
        //end = get_current_time();
        //printf("    Time for dorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /*
         * Perform QR iteration, accumulating Schur vectors in VL
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_dhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                         &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info);

        if (wantvr) {
          /* Want left and right eigenvectors
             Copy Schur vectors to VR */
            side_ = "Both";
            lapackf77_dlacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr);
        }

    } else if (wantvr) {
        /*  Want right eigenvectors
            Copy Householder vectors to VR */
        side_ = "Right";
        lapackf77_dlacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr);

        /*
         * Generate orthogonal matrix in VR
         *   (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB)
         */
        i__1 = lwork - iwrk + 1;
        //start = get_current_time();
#if defined(VERSION1) || defined(VERSION2)
        /*
         * Version 1 & 2 - LAPACK
         */
        lapackf77_dorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr,
                         &work[itau], &work[iwrk], &i__1, &ierr);
#elif defined(VERSION3)
        /*
         * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored
         */
        magma_dorghr(n, ilo, ihi, &vr[vr_offset], ldvr,
                     &work[itau], dT, 0, nb, queue, &ierr);
#endif
        //end = get_current_time();
        //printf("    Time for dorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.);

        /*
         * Perform QR iteration, accumulating Schur vectors in VR
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_dhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    } else {
        /*
         * Compute eigenvalues only
         *   (Workspace: need N+1, prefer N+HSWORK (see comments) )
         */
        iwrk = itau;
        i__1 = lwork - iwrk + 1;
        lapackf77_dhseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI,
                &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info);
    }

    /* If INFO > 0 from DHSEQR, then quit */
    if (*info > 0) {
        fprintf(stderr, "DHSEQR returned with info = %d\n", (int) *info);
        goto L50;
    }

    if (wantvl || wantvr) {
        /*
         * Compute left and/or right eigenvectors
         *   (Workspace: need 4*N)
         */
        lapackf77_dtrevc(side_, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl,
                &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &ierr);
    }

    if (wantvl) {
        /*
         * Undo balancing of left eigenvectors
         *   (Workspace: need N)
         */
        lapackf77_dgebak("B", "L", &n, &ilo, &ihi,
                         &work[ibal], &n, &vl[vl_offset], &ldvl, &ierr);

        /* Normalize left eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            if ( WI[i__-1] == 0.) {
                scl = magma_cblas_dnrm2(n, &vl[i__ * vl_dim1 + 1], 1);
                scl = 1. / scl;
                blasf77_dscal( &n, &scl, &vl[i__ * vl_dim1 + 1], &ione );
            } else if (WI[i__-1] > 0.) {
                d__1 = magma_cblas_dnrm2(n, &vl[ i__      * vl_dim1 + 1], 1);
                d__2 = magma_cblas_dnrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1);
                scl = lapackf77_dlapy2(&d__1, &d__2);
                scl = 1. / scl;
                blasf77_dscal( &n, &scl, &vl[ i__      * vl_dim1 + 1], &ione );
                blasf77_dscal( &n, &scl, &vl[(i__ + 1) * vl_dim1 + 1], &ione );
                i__2 = n;
                for (k = 1; k <= i__2; ++k) {
                    /* Computing 2nd power */
                    d__1 = vl[k + i__ * vl_dim1];
                    /* Computing 2nd power */
                    d__2 = vl[k + (i__ + 1) * vl_dim1];
                    work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
                }
                /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_idamax */
                k = blasf77_idamax( &n, &work[iwrk], &ione );  //+1;
                lapackf77_dlartg(&vl[k +  i__      * vl_dim1],
                                 &vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__);
                blasf77_drot( &n, &vl[ i__      * vl_dim1 + 1], &ione,
                                  &vl[(i__ + 1) * vl_dim1 + 1], &ione, &cs, &sn );
                vl[k + (i__ + 1) * vl_dim1] = 0.;
            }
        }
    }

    if (wantvr) {
        /*
         * Undo balancing of right eigenvectors
         *   (Workspace: need N)
         */
        lapackf77_dgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n,
                         &vr[vr_offset], &ldvr, &ierr);

        /* Normalize right eigenvectors and make largest component real */
        for (i__ = 1; i__ <= n; ++i__) {
            if (WI[i__-1] == 0.) {
                scl = 1. / magma_cblas_dnrm2(n, &vr[i__ * vr_dim1 + 1], 1);
                blasf77_dscal( &n, &scl, &vr[i__ * vr_dim1 + 1], &ione );
            } else if (WI[i__-1] > 0.) {
                d__1 = magma_cblas_dnrm2(n, &vr[ i__      * vr_dim1 + 1], 1);
                d__2 = magma_cblas_dnrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1);
                scl = lapackf77_dlapy2(&d__1, &d__2);
                scl = 1. / scl;
                blasf77_dscal( &n, &scl, &vr[ i__      * vr_dim1 + 1], &ione );
                blasf77_dscal( &n, &scl, &vr[(i__ + 1) * vr_dim1 + 1], &ione );
                i__2 = n;
                for (k = 1; k <= i__2; ++k) {
                    /* Computing 2nd power */
                    d__1 = vr[k + i__ * vr_dim1];
                    /* Computing 2nd power */
                    d__2 = vr[k + (i__ + 1) * vr_dim1];
                    work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2;
                }
                /* Comment:
                   Fortran BLAS does not have to add 1
                   C       BLAS must add one to cblas_idamax */
                k = blasf77_idamax( &n, &work[iwrk], &ione );  //+1;
                lapackf77_dlartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1],
                        &cs, &sn, &r__);
                blasf77_drot( &n, &vr[ i__      * vr_dim1 + 1], &ione,
                                  &vr[(i__ + 1) * vr_dim1 + 1], &ione, &cs, &sn );
                vr[k + (i__ + 1) * vr_dim1] = 0.;
            }
        }
    }

    /*  Undo scaling if necessary */
L50:
    if (scalea) {
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                         WR + (*info), &i__2, &ierr);
        i__1 = n - *info;
        /* Computing MAX */
        i__3 = n - *info;
        i__2 = max(i__3,1);
        lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                WI + (*info), &i__2, &ierr);
        if (*info > 0) {
            i__1 = ilo - 1;
            lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                    WR, &n, &ierr);
            i__1 = ilo - 1;
            lapackf77_dlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1,
                    WI, &n, &ierr);
        }
    }

#if defined(VERSION3)
    magma_free( dT );
#endif
    return *info;
} /* magma_dgeev */
Пример #4
0
/**
    Purpose
    -------
    ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the
    eigenvalues and, optionally, the left and/or right eigenvectors.

    The right eigenvector v(j) of A satisfies
        A * v(j) = lambda(j) * v(j)
    where lambda(j) is its eigenvalue.
    The left eigenvector u(j) of A satisfies
        u(j)**H * A = lambda(j) * u(j)**H
    where u(j)**H denotes the conjugate transpose of u(j).

    The computed eigenvectors are normalized to have Euclidean norm
    equal to 1 and largest component real.

    Arguments
    ---------
    @param[in]
    jobvl   magma_vec_t
      -     = MagmaNoVec:        left eigenvectors of A are not computed;
      -     = MagmaVec:          left eigenvectors of are computed.

    @param[in]
    jobvr   magma_vec_t
      -     = MagmaNoVec:        right eigenvectors of A are not computed;
      -     = MagmaVec:          right eigenvectors of A are computed.

    @param[in]
    n       INTEGER
            The order of the matrix A. N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the N-by-N matrix A.
            On exit, A has been overwritten.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    w       COMPLEX_16 array, dimension (N)
            w contains the computed eigenvalues.

    @param[out]
    VL      COMPLEX_16 array, dimension (LDVL,N)
            If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one
            after another in the columns of VL, in the same order
            as their eigenvalues.
            If JOBVL = MagmaNoVec, VL is not referenced.
            u(j) = VL(:,j), the j-th column of VL.

    @param[in]
    ldvl    INTEGER
            The leading dimension of the array VL.  LDVL >= 1; if
            JOBVL = MagmaVec, LDVL >= N.

    @param[out]
    VR      COMPLEX_16 array, dimension (LDVR,N)
            If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one
            after another in the columns of VR, in the same order
            as their eigenvalues.
            If JOBVR = MagmaNoVec, VR is not referenced.
            v(j) = VR(:,j), the j-th column of VR.

    @param[in]
    ldvr    INTEGER
            The leading dimension of the array VR.  LDVR >= 1; if
            JOBVR = MagmaVec, LDVR >= N.

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= (1+nb)*N.
            For optimal performance, LWORK >= (1+2*nb)*N.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param
    rwork   (workspace) DOUBLE PRECISION array, dimension (2*N)

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = i, the QR algorithm failed to compute all the
                  eigenvalues, and no eigenvectors have been computed;
                  elements and i+1:N of w contain eigenvalues which have
                  converged.

    @ingroup magma_zgeev_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zgeev(
    magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    #ifdef COMPLEX
    magmaDoubleComplex *w,
    #else
    double *wr, double *wi,
    #endif
    magmaDoubleComplex *VL, magma_int_t ldvl,
    magmaDoubleComplex *VR, magma_int_t ldvr,
    magmaDoubleComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork,
    #endif
    magma_int_t *info )
{
    #define VL(i,j)  (VL + (i) + (j)*ldvl)
    #define VR(i,j)  (VR + (i) + (j)*ldvr)
    
    const magma_int_t ione  = 1;
    const magma_int_t izero = 0;
    
    double d__1, d__2;
    magmaDoubleComplex tmp;
    double scl;
    double dum[1], eps;
    double anrm, cscale, bignum, smlnum;
    magma_int_t i, k, ilo, ihi;
    magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb;
    magma_int_t scalea, minwrk, optwrk, irwork, lquery, wantvl, wantvr, select[1];

    magma_side_t side = MagmaRight;

    magma_timer_t time_total=0, time_gehrd=0, time_unghr=0, time_hseqr=0, time_trevc=0, time_sum=0;
    magma_flops_t flop_total=0, flop_gehrd=0, flop_unghr=0, flop_hseqr=0, flop_trevc=0, flop_sum=0;
    timer_start( time_total );
    flops_start( flop_total );
    
    irwork = 0;
    *info = 0;
    lquery = (lwork == -1);
    wantvl = (jobvl == MagmaVec);
    wantvr = (jobvr == MagmaVec);
    if (! wantvl && jobvl != MagmaNoVec) {
        *info = -1;
    } else if (! wantvr && jobvr != MagmaNoVec) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) {
        *info = -8;
    } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) {
        *info = -10;
    }

    /* Compute workspace */
    nb = magma_get_zgehrd_nb( n );
    if (*info == 0) {
        minwrk = (1+  nb)*n;
        optwrk = (1+2*nb)*n;
        work[0] = MAGMA_Z_MAKE( optwrk, 0 );

        if (lwork < minwrk && ! lquery) {
            *info = -12;
        }
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    }
    
    #if defined(VERSION3)
    magmaDoubleComplex_ptr dT;
    if (MAGMA_SUCCESS != magma_zmalloc( &dT, nb*n )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    #endif

    /* Get machine constants */
    eps    = lapackf77_dlamch( "P" );
    smlnum = lapackf77_dlamch( "S" );
    bignum = 1. / smlnum;
    lapackf77_dlabad( &smlnum, &bignum );
    smlnum = magma_dsqrt( smlnum ) / eps;
    bignum = 1. / smlnum;

    /* Scale A if max element outside range [SMLNUM,BIGNUM] */
    anrm = lapackf77_zlange( "M", &n, &n, A, &lda, dum );
    scalea = 0;
    if (anrm > 0. && anrm < smlnum) {
        scalea = 1;
        cscale = smlnum;
    } else if (anrm > bignum) {
        scalea = 1;
        cscale = bignum;
    }
    if (scalea) {
        lapackf77_zlascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr );
    }

    /* Balance the matrix
     * (CWorkspace: none)
     * (RWorkspace: need N)
     *  - this space is reserved until after gebak */
    ibal = 0;
    lapackf77_zgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr );

    /* Reduce to upper Hessenberg form
     * (CWorkspace: need 2*N, prefer N + N*NB)
     * (RWorkspace: N)
     *  - including N reserved for gebal/gebak, unused by zgehrd */
    itau = 0;
    iwrk = itau + n;
    liwrk = lwork - iwrk;

    timer_start( time_gehrd );
    flops_start( flop_gehrd );
    #if defined(VERSION1)
        // Version 1 - LAPACK
        lapackf77_zgehrd( &n, &ilo, &ihi, A, &lda,
                          &work[itau], &work[iwrk], &liwrk, &ierr );
    #elif defined(VERSION2)
        // Version 2 - LAPACK consistent HRD
        magma_zgehrd2( n, ilo, ihi, A, lda,
                       &work[itau], &work[iwrk], liwrk, &ierr );
    #elif defined(VERSION3)
        // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored,
        magma_zgehrd( n, ilo, ihi, A, lda,
                      &work[itau], &work[iwrk], liwrk, dT, &ierr );
    #endif
    time_sum += timer_stop( time_gehrd );
    flop_sum += flops_stop( flop_gehrd );

    if (wantvl) {
        /* Want left eigenvectors
         * Copy Householder vectors to VL */
        side = MagmaLeft;
        lapackf77_zlacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl );

        /* Generate unitary matrix in VL
         * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB)
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zunghr */
        timer_start( time_unghr );
        flops_start( flop_unghr );
        #if defined(VERSION1) || defined(VERSION2)
            // Version 1 & 2 - LAPACK
            lapackf77_zunghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(VERSION3)
            // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored
            magma_zunghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr );
        #endif
        time_sum += timer_stop( time_unghr );
        flop_sum += flops_stop( flop_unghr );
        
        timer_start( time_hseqr );
        flops_start( flop_hseqr );
        /* Perform QR iteration, accumulating Schur vectors in VL
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zhseqr */
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w,
                          VL, &ldvl, &work[iwrk], &liwrk, info );
        time_sum += timer_stop( time_hseqr );
        flop_sum += flops_stop( flop_hseqr );

        if (wantvr) {
            /* Want left and right eigenvectors
             * Copy Schur vectors to VR */
            side = MagmaBothSides;
            lapackf77_zlacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr );
        }
    }
    else if (wantvr) {
        /* Want right eigenvectors
         * Copy Householder vectors to VR */
        side = MagmaRight;
        lapackf77_zlacpy( "L", &n, &n, A, &lda, VR, &ldvr );

        /* Generate unitary matrix in VR
         * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB)
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zunghr */
        timer_start( time_unghr );
        flops_start( flop_unghr );
        #if defined(VERSION1) || defined(VERSION2)
            // Version 1 & 2 - LAPACK
            lapackf77_zunghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau],
                              &work[iwrk], &liwrk, &ierr );
        #elif defined(VERSION3)
            // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored
            magma_zunghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr );
        #endif
        time_sum += timer_stop( time_unghr );
        flop_sum += flops_stop( flop_unghr );

        /* Perform QR iteration, accumulating Schur vectors in VR
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zhseqr */
        timer_start( time_hseqr );
        flops_start( flop_hseqr );
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w,
                          VR, &ldvr, &work[iwrk], &liwrk, info );
        time_sum += timer_stop( time_hseqr );
        flop_sum += flops_stop( flop_hseqr );
    }
    else {
        /* Compute eigenvalues only
         * (CWorkspace: need 1, prefer HSWORK (see comments) )
         * (RWorkspace: N)
         *  - including N reserved for gebal/gebak, unused by zhseqr */
        timer_start( time_hseqr );
        flops_start( flop_hseqr );
        iwrk = itau;
        liwrk = lwork - iwrk;
        lapackf77_zhseqr( "E", "N", &n, &ilo, &ihi, A, &lda, w,
                          VR, &ldvr, &work[iwrk], &liwrk, info );
        time_sum += timer_stop( time_hseqr );
        flop_sum += flops_stop( flop_hseqr );
    }

    /* If INFO > 0 from ZHSEQR, then quit */
    if (*info > 0) {
        goto CLEANUP;
    }

    timer_start( time_trevc );
    flops_start( flop_trevc );
    if (wantvl || wantvr) {
        /* Compute left and/or right eigenvectors
         * (CWorkspace: need 2*N)
         * (RWorkspace: need 2*N)
         *  - including N reserved for gebal/gebak, unused by ztrevc */
        irwork = ibal + n;
        #if TREVC_VERSION == 1
        lapackf77_ztrevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl,
                          VR, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 2
        liwrk = lwork - iwrk;
        lapackf77_ztrevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl,
                           VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 3
        magma_ztrevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl,
                       VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 4
        magma_ztrevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl,
                          VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr );
        #elif TREVC_VERSION == 5
        magma_ztrevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl,
                              VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr );
        #else
        #error Unknown TREVC_VERSION
        #endif
    }
    time_sum += timer_stop( time_trevc );
    flop_sum += flops_stop( flop_trevc );

    if (wantvl) {
        /* Undo balancing of left eigenvectors
         * (CWorkspace: none)
         * (RWorkspace: need N) */
        lapackf77_zgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n,
                          VL, &ldvl, &ierr );

        /* Normalize left eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            scl = 1. / magma_cblas_dznrm2( n, VL(0,i), 1 );
            blasf77_zdscal( &n, &scl, VL(0,i), &ione );
            for (k = 0; k < n; ++k) {
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL( *VL(k,i) );
                d__2 = MAGMA_Z_IMAG( *VL(k,i) );
                rwork[irwork + k] = d__1*d__1 + d__2*d__2;
            }
            k = blasf77_idamax( &n, &rwork[irwork], &ione ) - 1;  // subtract 1; k is 0-based
            tmp = MAGMA_Z_CNJG( *VL(k,i) ) / magma_dsqrt( rwork[irwork + k] );
            blasf77_zscal( &n, &tmp, VL(0,i), &ione );
            *VL(k,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL( *VL(k,i) ), 0 );
        }
    }

    if (wantvr) {
        /* Undo balancing of right eigenvectors
         * (CWorkspace: none)
         * (RWorkspace: need N) */
        lapackf77_zgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n,
                          VR, &ldvr, &ierr );

        /* Normalize right eigenvectors and make largest component real */
        for (i = 0; i < n; ++i) {
            scl = 1. / magma_cblas_dznrm2( n, VR(0,i), 1 );
            blasf77_zdscal( &n, &scl, VR(0,i), &ione );
            for (k = 0; k < n; ++k) {
                /* Computing 2nd power */
                d__1 = MAGMA_Z_REAL( *VR(k,i) );
                d__2 = MAGMA_Z_IMAG( *VR(k,i) );
                rwork[irwork + k] = d__1*d__1 + d__2*d__2;
            }
            k = blasf77_idamax( &n, &rwork[irwork], &ione ) - 1;  // subtract 1; k is 0-based
            tmp = MAGMA_Z_CNJG( *VR(k,i) ) / magma_dsqrt( rwork[irwork + k] );
            blasf77_zscal( &n, &tmp, VR(0,i), &ione );
            *VR(k,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL( *VR(k,i) ), 0 );
        }
    }

CLEANUP:
    /* Undo scaling if necessary */
    if (scalea) {
        // converged eigenvalues, stored in WR[i+1:n] and WI[i+1:n] for i = INFO
        magma_int_t nval = n - (*info);
        magma_int_t ld   = max( nval, 1 );
        lapackf77_zlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w + (*info), &ld, &ierr );
        if (*info > 0) {
            // first ilo columns were already upper triangular,
            // so the corresponding eigenvalues are also valid.
            nval = ilo - 1;
            lapackf77_zlascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w, &n, &ierr );
        }
    }

    #if defined(VERSION3)
    magma_free( dT );
    #endif
    
    timer_stop( time_total );
    flops_stop( flop_total );
    timer_printf( "dgeev times n %5d, gehrd %7.3f, unghr %7.3f, hseqr %7.3f, trevc %7.3f, total %7.3f, sum %7.3f\n",
                  (int) n, time_gehrd, time_unghr, time_hseqr, time_trevc, time_total, time_sum );
    timer_printf( "dgeev flops n %5d, gehrd %7lld, unghr %7lld, hseqr %7lld, trevc %7lld, total %7lld, sum %7lld\n",
                  (int) n, flop_gehrd, flop_unghr, flop_hseqr, flop_trevc, flop_total, flop_sum );

    work[0] = MAGMA_Z_MAKE( (double) optwrk, 0. );

    return *info;
} /* magma_zgeev */