Ejemplo n.º 1
0
magma_int_t magma_clatrsd(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_bool_t normin,
    magma_int_t n, const magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex lambda,
    magmaFloatComplex *x,
    float *scale, float *cnorm,
    magma_int_t *info)
{
#define A(i,j) (A + (i) + (j)*lda)

    /* constants */
    const magma_int_t ione = 1;
    const float d_half = 0.5;
    const magmaFloatComplex c_zero = MAGMA_C_ZERO;
    const magmaFloatComplex c_one  = MAGMA_C_ONE;

    /* System generated locals */
    magma_int_t len;
    magmaFloatComplex ztmp;

    /* Local variables */
    magma_int_t i, j;
    float xj, rec, tjj;
    magma_int_t jinc;
    float xbnd;
    magma_int_t imax;
    float tmax;
    magmaFloatComplex tjjs;
    float xmax, grow;

    float tscal;
    magmaFloatComplex uscal;
    magma_int_t jlast;
    magmaFloatComplex csumj;

    float bignum;
    magma_int_t jfirst;
    float smlnum;

    /* Function Body */
    *info = 0;
    magma_int_t upper  = (uplo  == MagmaUpper);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t nounit = (diag  == MagmaNonUnit);

    /* Test the input parameters. */
    if ( ! upper && uplo != MagmaLower ) {
        *info = -1;
    }
    else if (! notran &&
             trans != MagmaTrans &&
             trans != MagmaConjTrans) {
        *info = -2;
    }
    else if ( ! nounit && diag != MagmaUnit ) {
        *info = -3;
    }
    else if ( ! (normin == MagmaTrue) &&
              ! (normin == MagmaFalse) ) {
        *info = -4;
    }
    else if ( n < 0 ) {
        *info = -5;
    }
    else if ( lda < max(1,n) ) {
        *info = -7;
    }
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( n == 0 ) {
        return *info;
    }

    /* Determine machine dependent parameters to control overflow. */
    smlnum = lapackf77_slamch( "Safe minimum" );
    bignum = 1. / smlnum;
    lapackf77_slabad( &smlnum, &bignum );
    smlnum /= lapackf77_slamch( "Precision" );
    bignum = 1. / smlnum;
    *scale = 1.;

    if ( normin == MagmaFalse ) {
        /* Compute the 1-norm of each column, not including the diagonal. */
        if ( upper ) {
            /* A is upper triangular. */
            cnorm[0] = 0.;
            for( j = 1; j < n; ++j ) {
                cnorm[j] = magma_cblas_scasum( j, A(0,j), ione );
            }
        }
        else {
            /* A is lower triangular. */
            for( j = 0; j < n-1; ++j ) {
                cnorm[j] = magma_cblas_scasum( n-(j+1), A(j+1,j), ione );
            }
            cnorm[n-1] = 0.;
        }
    }

    /* Scale the column norms by TSCAL if the maximum element in CNORM is */
    /* greater than BIGNUM/2. */
    imax = blasf77_isamax( &n, &cnorm[0], &ione ) - 1;
    tmax = cnorm[imax];
    if ( tmax <= bignum * 0.5 ) {
        tscal = 1.;
    }
    else {
        tscal = 0.5 / (smlnum * tmax);
        blasf77_sscal( &n, &tscal, &cnorm[0], &ione );
    }

    /* ================================================================= */
    /* Compute a bound on the computed solution vector to see if the */
    /* Level 2 BLAS routine CTRSV can be used. */
    xmax = 0.;
    for( j = 0; j < n; ++j ) {
        xmax = max( xmax, 0.5*MAGMA_C_ABS1( x[j] ));
    }
    xbnd = xmax;

    if ( notran ) {
        /* ---------------------------------------- */
        /* Compute the growth in A * x = b. */
        if ( upper ) {
            jfirst = n-1;
            jlast  = 0;
            jinc   = -1;
        }
        else {
            jfirst = 0;
            jlast  = n;
            jinc   = 1;
        }

        if ( tscal != 1. ) {
            grow = 0.;
            goto L60;
        }

        /* A is non-unit triangular. */
        /* Compute GROW = 1/G(j) and XBND = 1/M(j). */
        /* Initially, G(0) = max{x(i), i=1,...,n}. */
        grow = 0.5 / max( xbnd, smlnum );
        xbnd = grow;
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Exit the loop if the growth factor is too small. */
            if ( grow <= smlnum ) {
                goto L60;
            }

            if ( nounit ) {
                tjjs = *A(j,j) - lambda;
            }
            else {
                tjjs = c_one - lambda;
            }
            tjj = MAGMA_C_ABS1( tjjs );

            if ( tjj >= smlnum ) {
                /* M(j) = G(j-1) / abs(A(j,j)) */
                xbnd = min( xbnd, min(1.,tjj)*grow );
            }
            else {
                /* M(j) could overflow, set XBND to 0. */
                xbnd = 0.;
            }

            if ( tjj + cnorm[j] >= smlnum ) {
                /* G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) */
                grow *= (tjj / (tjj + cnorm[j]));
            }
            else {
                /* G(j) could overflow, set GROW to 0. */
                grow = 0.;
            }
        }
        grow = xbnd;
L60:
        ;
    }
    else {
        /* ---------------------------------------- */
        /* Compute the growth in A**T * x = b  or  A**H * x = b. */
        if ( upper ) {
            jfirst = 0;
            jlast  = n;
            jinc   = 1;
        }
        else {
            jfirst = n-1;
            jlast  = 0;
            jinc   = -1;
        }

        if ( tscal != 1. ) {
            grow = 0.;
            goto L90;
        }

        /* A is non-unit triangular. */
        /* Compute GROW = 1/G(j) and XBND = 1/M(j). */
        /* Initially, M(0) = max{x(i), i=1,...,n}. */
        grow = 0.5 / max( xbnd, smlnum );
        xbnd = grow;
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Exit the loop if the growth factor is too small. */
            if ( grow <= smlnum ) {
                goto L90;
            }

            /* G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) */
            xj = 1. + cnorm[j];
            grow = min( grow, xbnd / xj );

            if ( nounit ) {
                tjjs = *A(j,j) - lambda;
            }
            else {
                tjjs = c_one - lambda;
            }
            tjj = MAGMA_C_ABS1( tjjs );

            if ( tjj >= smlnum ) {
                /* M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) */
                if ( xj > tjj ) {
                    xbnd *= (tjj / xj);
                }
            }
            else {
                /* M(j) could overflow, set XBND to 0. */
                xbnd = 0.;
            }
        }
        grow = min( grow, xbnd );
L90:
        ;
    }
        
    /* ================================================================= */
    /* Due to modified diagonal, we can't use regular BLAS ctrsv. */
    
    /* Use a Level 1 BLAS solve, scaling intermediate results. */
    if ( xmax > bignum * 0.5 ) {
        /* Scale X so that its components are less than or equal to */
        /* BIGNUM in absolute value. */
        *scale = (bignum * 0.5) / xmax;
        blasf77_csscal( &n, scale, &x[0], &ione );
        xmax = bignum;
    }
    else {
        xmax *= 2.;
    }

    if ( notran ) {
        /* ---------------------------------------- */
        /* Solve A * x = b */
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Compute x(j) = b(j) / A(j,j), scaling x if necessary. */
            xj = MAGMA_C_ABS1( x[j] );
            if ( nounit ) {
                tjjs = (*A(j,j) - lambda ) * tscal;
            }
            else {
                tjjs = (c_one - lambda) * tscal;
                if ( tscal == 1. ) {
                    goto L110;
                }
            }
            tjj = MAGMA_C_ABS1( tjjs );
            if ( tjj > smlnum ) {
                /* abs(A(j,j)) > SMLNUM: */
                if ( tjj < 1. ) {
                    if ( xj > tjj * bignum ) {
                        /* Scale x by 1/b(j). */
                        rec = 1. / xj;
                        blasf77_csscal( &n, &rec, &x[0], &ione );
                        *scale *= rec;
                        xmax *= rec;
                    }
                }
                x[j] = x[j] / tjjs;
                xj   = MAGMA_C_ABS1( x[j] );
            }
            else if ( tjj > 0. ) {
                /* 0 < abs(A(j,j)) <= SMLNUM: */
                if ( xj > tjj * bignum ) {
                    /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM */
                    /* to avoid overflow when dividing by A(j,j). */
                    rec = (tjj * bignum) / xj;
                    if ( cnorm[j] > 1. ) {
                        /* Scale by 1/CNORM(j) to avoid overflow when */
                        /* multiplying x(j) times column j. */
                        rec /= cnorm[j];
                    }
                    blasf77_csscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                    xmax *= rec;
                }
                x[j] = x[j] / tjjs;
                xj   = MAGMA_C_ABS1( x[j] );
            }
            else {
                /* A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
                /* scale = 0, and compute a solution to A*x = 0. */
                for( i = 0; i < n; ++i ) {
                    x[i] = c_zero;
                }
                x[j]   = c_one;
                xj     = 1.;
                *scale = 0.;
                xmax   = 0.;
            }
L110:

            /* Scale x if necessary to avoid overflow when adding a */
            /* multiple of column j of A. */
            if ( xj > 1. ) {
                rec = 1. / xj;
                if ( cnorm[j] > (bignum - xmax) * rec ) {
                    /* Scale x by 1/(2*abs(x(j))). */
                    rec *= 0.5;
                    blasf77_csscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                }
            }
            else if ( xj * cnorm[j] > bignum - xmax ) {
                /* Scale x by 1/2. */
                blasf77_csscal( &n, &d_half, &x[0], &ione );
                *scale *= 0.5;
            }

            if ( upper ) {
                if ( j > 0 ) {
                    /* Compute the update */
                    /* x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) */
                    len = j;
                    ztmp = -tscal * x[j];
                    blasf77_caxpy( &len, &ztmp, A(0,j), &ione, &x[0], &ione );
                    i = blasf77_icamax( &len, &x[0], &ione ) - 1;
                    xmax = MAGMA_C_ABS1( x[i] );
                }
            }
            else {
                if ( j < n-1 ) {
                    /* Compute the update */
                    /* x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) */
                    len = n - (j+1);
                    ztmp = -tscal * x[j];
                    blasf77_caxpy( &len, &ztmp, A(j+1,j), &ione, &x[j + 1], &ione );
                    i = j + blasf77_icamax( &len, &x[j + 1], &ione );
                    xmax = MAGMA_C_ABS1( x[i] );
                }
            }
        }
    }
    else if ( trans == MagmaTrans ) {
        /* ---------------------------------------- */
        /* Solve A**T * x = b */
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Compute x(j) = b(j) - sum A(k,j)*x(k). */
            /*                       k<>j             */
            xj = MAGMA_C_ABS1( x[j] );
            uscal = MAGMA_C_MAKE( tscal, 0. );
            rec = 1. / max( xmax, 1. );
            if ( cnorm[j] > (bignum - xj) * rec ) {
                /* If x(j) could overflow, scale x by 1/(2*XMAX). */
                rec *= 0.5;
                if ( nounit ) {
                    tjjs = (*A(j,j) - lambda) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                }
                tjj = MAGMA_C_ABS1( tjjs );
                if ( tjj > 1. ) {
                    /* Divide by A(j,j) when scaling x if A(j,j) > 1. */
                    rec = min( 1., rec * tjj );
                    uscal = uscal / tjjs;
                }
                if ( rec < 1. ) {
                    blasf77_csscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                    xmax *= rec;
                }
            }

            csumj = c_zero;
            if ( uscal == c_one ) {
                /* If the scaling needed for A in the dot product is 1, */
                /* call CDOTU to perform the dot product. */
                if ( upper ) {
                    csumj = magma_cblas_cdotu( j, A(0,j), ione, &x[0], ione );
                }
                else if ( j < n-1 ) {
                    csumj = magma_cblas_cdotu( n-(j+1), A(j+1,j), ione, &x[j+1], ione );
                }
            }
            else {
                /* Otherwise, use in-line code for the dot product. */
                if ( upper ) {
                    for( i = 0; i < j; ++i ) {
                        csumj += (*A(i,j) * uscal) * x[i];
                    }
                }
                else if ( j < n-1 ) {
                    for( i = j+1; i < n; ++i ) {
                        csumj += (*A(i,j) * uscal) * x[i];
                    }
                }
            }

            if ( uscal == MAGMA_C_MAKE( tscal, 0. )) {
                /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */
                /* was not used to scale the dotproduct. */
                x[j] -= csumj;
                xj = MAGMA_C_ABS1( x[j] );
                if ( nounit ) {
                    tjjs = (*A(j,j) - lambda) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                    if ( tscal == 1. ) {
                        goto L160;
                    }
                }

                /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
                tjj = MAGMA_C_ABS1( tjjs );
                if ( tjj > smlnum ) {
                    /* abs(A(j,j)) > SMLNUM: */
                    if ( tjj < 1. ) {
                        if ( xj > tjj * bignum ) {
                            /* Scale X by 1/abs(x(j)). */
                            rec = 1. / xj;
                            blasf77_csscal( &n, &rec, &x[0], &ione );
                            *scale *= rec;
                            xmax   *= rec;
                        }
                    }
                    x[j] = x[j] / tjjs;
                }
                else if ( tjj > 0. ) {
                    /* 0 < abs(A(j,j)) <= SMLNUM: */
                    if ( xj > tjj * bignum ) {
                        /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
                        rec = (tjj * bignum) / xj;
                        blasf77_csscal( &n, &rec, &x[0], &ione );
                        *scale *= rec;
                        xmax   *= rec;
                    }
                    x[j] = x[j] / tjjs;
                }
                else {
                    /* A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
                    /* scale = 0 and compute a solution to A**T *x = 0. */
                    for( i = 0; i < n; ++i ) {
                        x[i] = c_zero;
                    }
                    x[j]   = c_one;
                    *scale = 0.;
                    xmax   = 0.;
                }
L160:
                ;
            }
            else {
                /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */
                /* product has already been divided by 1/A(j,j). */
                x[j] = (x[j] / tjjs) - csumj;
            }
            xmax = max( xmax, MAGMA_C_ABS1( x[j] ));
        }
    }
    else {
        /* ---------------------------------------- */
        /* Solve A**H * x = b */
        for( j = jfirst; (jinc < 0 ? j >= jlast : j < jlast); j += jinc ) {
            /* Compute x(j) = b(j) - sum A(k,j)*x(k). */
            /*                       k<>j             */
            xj = MAGMA_C_ABS1( x[j] );
            uscal = MAGMA_C_MAKE( tscal, 0. );
            rec = 1. / max(xmax, 1.);
            if ( cnorm[j] > (bignum - xj) * rec ) {
                /* If x(j) could overflow, scale x by 1/(2*XMAX). */
                rec *= 0.5;
                if ( nounit ) {
                    tjjs = MAGMA_C_CONJ( *A(j,j) - lambda ) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                }
                tjj = MAGMA_C_ABS1( tjjs );
                if ( tjj > 1. ) {
                    /* Divide by A(j,j) when scaling x if A(j,j) > 1. */
                    rec = min( 1., rec * tjj );
                    uscal = uscal / tjjs;
                }
                if ( rec < 1. ) {
                    blasf77_csscal( &n, &rec, &x[0], &ione );
                    *scale *= rec;
                    xmax   *= rec;
                }
            }

            csumj = c_zero;
            if ( uscal == c_one ) {
                /* If the scaling needed for A in the dot product is 1, */
                /* call CDOTC to perform the dot product. */
                if ( upper ) {
                    csumj = magma_cblas_cdotc( j, A(0,j), ione, &x[0], ione );
                }
                else if ( j < n-1 ) {
                    csumj = magma_cblas_cdotc( n-(j+1), A(j+1,j), ione, &x[j+1], ione );
                }
            }
            else {
                /* Otherwise, use in-line code for the dot product. */
                if ( upper ) {
                    for( i = 0; i < j; ++i ) {
                        csumj += (MAGMA_C_CONJ( *A(i,j) ) * uscal) * x[i];
                    }
                }
                else if ( j < n-1 ) {
                    for( i = j + 1; i < n; ++i ) {
                        csumj += (MAGMA_C_CONJ( *A(i,j) ) * uscal) * x[i];
                    }
                }
            }

            if ( uscal == tscal ) {
                /* Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) */
                /* was not used to scale the dotproduct. */
                x[j] -= csumj;
                xj = MAGMA_C_ABS1( x[j] );
                if ( nounit ) {
                    tjjs = MAGMA_C_CONJ( *A(j,j) - lambda ) * tscal;
                }
                else {
                    tjjs = (c_one - lambda) * tscal;
                    if ( tscal == 1. ) {
                        goto L210;
                    }
                }

                /* Compute x(j) = x(j) / A(j,j), scaling if necessary. */
                tjj = MAGMA_C_ABS1( tjjs );
                if ( tjj > smlnum ) {
                    /* abs(A(j,j)) > SMLNUM: */
                    if ( tjj < 1. ) {
                        if ( xj > tjj * bignum ) {
                            /* Scale X by 1/abs(x(j)). */
                            rec = 1. / xj;
                            blasf77_csscal( &n, &rec, &x[0], &ione );
                            *scale *= rec;
                            xmax   *= rec;
                        }
                    }
                    x[j] = x[j] / tjjs;
                }
                else if ( tjj > 0. ) {
                    /* 0 < abs(A(j,j)) <= SMLNUM: */
                    if ( xj > tjj * bignum ) {
                        /* Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. */
                        rec = (tjj * bignum) / xj;
                        blasf77_csscal( &n, &rec, &x[0], &ione );
                        *scale *= rec;
                        xmax   *= rec;
                    }
                    x[j] = x[j] / tjjs;
                }
                else {
                    /* A(j,j) = 0:  Set x(1:n) = 0, x(j) = 1, and */
                    /* scale = 0 and compute a solution to A**H *x = 0. */
                    for( i = 0; i < n; ++i ) {
                        x[i] = c_zero;
                    }
                    x[j] = c_one;
                    *scale = 0.;
                    xmax   = 0.;
                }
L210:
                ;
            }
            else {
                /* Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot */
                /* product has already been divided by 1/A(j,j). */
                x[j] = (x[j] / tjjs) - csumj;
            }
            xmax = max( xmax, MAGMA_C_ABS1( x[j] ));
        }
    }
    *scale /= tscal;
    
    /* Scale the column norms by 1/TSCAL for return. */
    if ( tscal != 1. ) {
        float d = 1. / tscal;
        blasf77_sscal( &n, &d, &cnorm[0], &ione );
    }

    return *info;
} /* end clatrsd */
Ejemplo n.º 2
0
magma_int_t magma_ctrevc3_mt(
    magma_side_t side, magma_vec_t howmany,
    magma_int_t *select,  // logical in Fortran
    magma_int_t n,
    magmaFloatComplex *T,  magma_int_t ldt,
    magmaFloatComplex *VL, magma_int_t ldvl,
    magmaFloatComplex *VR, magma_int_t ldvr,
    magma_int_t mm, magma_int_t *mout,
    magmaFloatComplex *work, magma_int_t lwork,
    float *rwork, magma_int_t *info )
{
    #define  T(i,j)  ( T + (i) + (j)*ldt )
    #define VL(i,j)  (VL + (i) + (j)*ldvl)
    #define VR(i,j)  (VR + (i) + (j)*ldvr)
    #define work(i,j) (work + (i) + (j)*n)

    // .. Parameters ..
    const magmaFloatComplex c_zero = MAGMA_C_ZERO;
    const magmaFloatComplex c_one  = MAGMA_C_ONE;
    const magma_int_t  nbmin = 16, nbmax = 128;
    const magma_int_t  ione = 1;
    
    // .. Local Scalars ..
    magma_int_t            allv, bothv, leftv, over, rightv, somev;
    magma_int_t            i, ii, is, j, k, ki, iv, n2, nb, nb2, version;
    float                 ovfl, remax, smin, smlnum, ulp, unfl;
    
    // Decode and test the input parameters
    bothv  = (side == MagmaBothSides);
    rightv = (side == MagmaRight) || bothv;
    leftv  = (side == MagmaLeft ) || bothv;

    allv  = (howmany == MagmaAllVec);
    over  = (howmany == MagmaBacktransVec);
    somev = (howmany == MagmaSomeVec);

    // Set mout to the number of columns required to store the selected
    // eigenvectors.
    if ( somev ) {
        *mout = 0;
        for( j=0; j < n; ++j ) {
            if ( select[j] ) {
                *mout += 1;
            }
        }
    }
    else {
        *mout = n;
    }

    *info = 0;
    if ( ! rightv && ! leftv )
        *info = -1;
    else if ( ! allv && ! over && ! somev )
        *info = -2;
    else if ( n < 0 )
        *info = -4;
    else if ( ldt < max( 1, n ) )
        *info = -6;
    else if ( ldvl < 1 || ( leftv && ldvl < n ) )
        *info = -8;
    else if ( ldvr < 1 || ( rightv && ldvr < n ) )
        *info = -10;
    else if ( mm < *mout )
        *info = -11;
    else if ( lwork < max( 1, 2*n ) )
        *info = -14;
    
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    // Quick return if possible.
    if ( n == 0 ) {
        return *info;
    }
    
    // Use blocked version (2) if sufficient workspace.
    // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x.
    // (Compared to dtrevc3, rwork stores 1-norms.)
    // Zero-out the workspace to avoid potential NaN propagation.
    nb = 2;
    if ( lwork >= n + 2*n*nbmin ) {
        version = 2;
        nb = (lwork - n) / (2*n);
        nb = min( nb, nbmax );
        nb2 = 1 + 2*nb;
        lapackf77_claset( "F", &n, &nb2, &c_zero, &c_zero, work, &n );
    }
    else {
        version = 1;
    }

    // Set the constants to control overflow.
    unfl = lapackf77_slamch( "Safe minimum" );
    ovfl = 1. / unfl;
    lapackf77_slabad( &unfl, &ovfl );
    ulp = lapackf77_slamch( "Precision" );
    smlnum = unfl*( n / ulp );

    // Store the diagonal elements of T in working array work.
    for( i=0; i < n; ++i ) {
        *work(i,0) = *T(i,i);
    }

    // Compute 1-norm of each column of strictly upper triangular
    // part of T to control overflow in triangular solver.
    rwork[0] = 0.;
    for( j=1; j < n; ++j ) {
        rwork[j] = cblas_scasum( j, T(0,j), ione );
    }

    // launch threads -- each single-threaded MKL
    magma_int_t nthread = magma_get_parallel_numthreads();
    magma_int_t lapack_nthread = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads( 1 );
    magma_queue queue;
    queue.launch( nthread );
    //printf( "nthread %d, %d\n", nthread, lapack_nthread );
    
    // NB = N/thread, rounded up to multiple of 16,
    // but avoid multiples of page size, e.g., 512*8 bytes = 4096.
    magma_int_t NB = magma_int_t( ceil( ceil( ((float)n) / nthread ) / 16. ) * 16. );
    if ( NB % 512 == 0 ) {
        NB += 32;
    }
    
    magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0;
    timer_start( time_total );

    if ( rightv ) {
        // ============================================================
        // Compute right eigenvectors.
        // iv is index of column in current block.
        // Non-blocked version always uses iv=1;
        // blocked     version starts with iv=nb, goes down to 1.
        // (Note the "0-th" column is used to store the original diagonal.)
        iv = 1;
        if ( version == 2 ) {
            iv = nb;
        }
        
        timer_start( time_trsv );
        is = *mout - 1;
        for( ki=n-1; ki >= 0; --ki ) {
            if ( somev ) {
                if ( ! select[ki] ) {
                    continue;
                }
            }
            smin = max( ulp*( MAGMA_C_ABS1( *T(ki,ki) ) ), smlnum );

            // --------------------------------------------------------
            // Complex right eigenvector
            *work(ki,iv) = c_one;

            // Form right-hand side.
            for( k=0; k < ki; ++k ) {
                *work(k,iv) = -(*T(k,ki));
            }

            // Solve upper triangular system:
            // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work.
            if ( ki > 0 ) {
                queue.push_task( new magma_clatrsd_task(
                    MagmaUpper, MagmaNoTrans, MagmaNonUnit, MagmaTrue,
                    ki, T, ldt, *T(ki,ki),
                    work(0,iv), work(ki,iv), rwork ));
            }

            // Copy the vector x or Q*x to VR and normalize.
            if ( ! over ) {
                // ------------------------------
                // no back-transform: copy x to VR and normalize
                queue.sync();
                n2 = ki+1;
                blasf77_ccopy( &n2, work(0,iv), &ione, VR(0,is), &ione );

                ii = blasf77_icamax( &n2, VR(0,is), &ione ) - 1;
                remax = 1. / MAGMA_C_ABS1( *VR(ii,is) );
                blasf77_csscal( &n2, &remax, VR(0,is), &ione );

                for( k=ki+1; k < n; ++k ) {
                    *VR(k,is) = c_zero;
                }
            }
            else if ( version == 1 ) {
                // ------------------------------
                // version 1: back-transform each vector with GEMV, Q*x.
                queue.sync();
                time_trsv_sum += timer_stop( time_trsv );
                timer_start( time_gemv );
                if ( ki > 0 ) {
                    blasf77_cgemv( "n", &n, &ki, &c_one,
                                   VR, &ldvr,
                                   work(0, iv), &ione,
                                   work(ki,iv), VR(0,ki), &ione );
                }
                time_gemv_sum += timer_stop( time_gemv );
                ii = blasf77_icamax( &n, VR(0,ki), &ione ) - 1;
                remax = 1. / MAGMA_C_ABS1( *VR(ii,ki) );
                blasf77_csscal( &n, &remax, VR(0,ki), &ione );
                timer_start( time_trsv );
            }
            else if ( version == 2 ) {
                // ------------------------------
                // version 2: back-transform block of vectors with GEMM
                // zero out below vector
                for( k=ki+1; k < n; ++k ) {
                    *work(k,iv) = c_zero;
                }

                // Columns iv:nb of work are valid vectors.
                // When the number of vectors stored reaches nb,
                // or if this was last vector, do the GEMM
                if ( (iv == 1) || (ki == 0) ) {
                    queue.sync();
                    time_trsv_sum += timer_stop( time_trsv );
                    timer_start( time_gemm );
                    nb2 = nb-iv+1;
                    n2  = ki+nb-iv+1;
                    
                    // split gemm into multiple tasks, each doing one block row
                    for( i=0; i < n; i += NB ) {
                        magma_int_t ib = min( NB, n-i );
                        queue.push_task( new cgemm_task(
                            MagmaNoTrans, MagmaNoTrans, ib, nb2, n2, c_one,
                            VR(i,0), ldvr,
                            work(0,iv   ), n, c_zero,
                            work(i,nb+iv), n ));
                    }
                    queue.sync();
                    time_gemm_sum += timer_stop( time_gemm );
                    
                    // normalize vectors
                    // TODO if somev, should copy vectors individually to correct location.
                    for( k = iv; k <= nb; ++k ) {
                        ii = blasf77_icamax( &n, work(0,nb+k), &ione ) - 1;
                        remax = 1. / MAGMA_C_ABS1( *work(ii,nb+k) );
                        blasf77_csscal( &n, &remax, work(0,nb+k), &ione );
                    }
                    lapackf77_clacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr );
                    iv = nb;
                    timer_start( time_trsv );
                }
                else {
                    iv -= 1;
                }
            } // blocked back-transform

            is -= 1;
        }
    }
    timer_stop( time_trsv );
    
    timer_stop( time_total );
    timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n",
                  time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total );

    if ( leftv ) {
        // ============================================================
        // Compute left eigenvectors.
        // iv is index of column in current block.
        // Non-blocked version always uses iv=1;
        // blocked     version starts with iv=1, goes up to nb.
        // (Note the "0-th" column is used to store the original diagonal.)
        iv = 1;
        is = 0;
        for( ki=0; ki < n; ++ki ) {
            if ( somev ) {
                if ( ! select[ki] ) {
                    continue;
                }
            }
            smin = max( ulp*MAGMA_C_ABS1( *T(ki,ki) ), smlnum );
        
            // --------------------------------------------------------
            // Complex left eigenvector
            *work(ki,iv) = c_one;
        
            // Form right-hand side.
            for( k = ki + 1; k < n; ++k ) {
                *work(k,iv) = -MAGMA_C_CNJG( *T(ki,k) );
            }
            
            // Solve conjugate-transposed triangular system:
            // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work.
            // TODO what happens with T(k,k) - lambda is small? Used to have < smin test.
            if ( ki < n-1 ) {
                n2 = n-ki-1;
                queue.push_task( new magma_clatrsd_task(
                    MagmaUpper, MagmaConjTrans, MagmaNonUnit, MagmaTrue,
                    n2, T(ki+1,ki+1), ldt, *T(ki,ki),
                    work(ki+1,iv), work(ki,iv), rwork ));
            }
            
            // Copy the vector x or Q*x to VL and normalize.
            if ( ! over ) {
                // ------------------------------
                // no back-transform: copy x to VL and normalize
                queue.sync();
                n2 = n-ki;
                blasf77_ccopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione );
        
                ii = blasf77_icamax( &n2, VL(ki,is), &ione ) + ki - 1;
                remax = 1. / MAGMA_C_ABS1( *VL(ii,is) );
                blasf77_csscal( &n2, &remax, VL(ki,is), &ione );
        
                for( k=0; k < ki; ++k ) {
                    *VL(k,is) = c_zero;
                }
            }
            else if ( version == 1 ) {
                // ------------------------------
                // version 1: back-transform each vector with GEMV, Q*x.
                queue.sync();
                if ( ki < n-1 ) {
                    n2 = n-ki-1;
                    blasf77_cgemv( "n", &n, &n2, &c_one,
                                   VL(0,ki+1), &ldvl,
                                   work(ki+1,iv), &ione,
                                   work(ki,  iv), VL(0,ki), &ione );
                }
                ii = blasf77_icamax( &n, VL(0,ki), &ione ) - 1;
                remax = 1. / MAGMA_C_ABS1( *VL(ii,ki) );
                blasf77_csscal( &n, &remax, VL(0,ki), &ione );
            }
            else if ( version == 2 ) {
                // ------------------------------
                // version 2: back-transform block of vectors with GEMM
                // zero out above vector
                // could go from (ki+1)-NV+1 to ki
                for( k=0; k < ki; ++k ) {
                    *work(k,iv) = c_zero;
                }
        
                // Columns 1:iv of work are valid vectors.
                // When the number of vectors stored reaches nb,
                // or if this was last vector, do the GEMM
                if ( (iv == nb) || (ki == n-1) ) {
                    queue.sync();
                    n2 = n-(ki+1)+iv;
                    
                    // split gemm into multiple tasks, each doing one block row
                    for( i=0; i < n; i += NB ) {
                        magma_int_t ib = min( NB, n-i );
                        queue.push_task( new cgemm_task(
                            MagmaNoTrans, MagmaNoTrans, ib, iv, n2, c_one,
                            VL(i,ki-iv+1), ldvl,
                            work(ki-iv+1,1), n, c_zero,
                            work(i,nb+1), n ));
                    }
                    queue.sync();
                    // normalize vectors
                    for( k=1; k <= iv; ++k ) {
                        ii = blasf77_icamax( &n, work(0,nb+k), &ione ) - 1;
                        remax = 1. / MAGMA_C_ABS1( *work(ii,nb+k) );
                        blasf77_csscal( &n, &remax, work(0,nb+k), &ione );
                    }
                    lapackf77_clacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl );
                    iv = 1;
                }
                else {
                    iv += 1;
                }
            } // blocked back-transform
        
            is += 1;
        }
    }
    
    // close down threads
    queue.quit();
    magma_set_lapack_numthreads( lapack_nthread );
    
    return *info;
}  // End of CTREVC
Ejemplo n.º 3
0
/**
    Purpose
    =======

    CLAHEF computes a partial factorization of a complex Hermitian
    matrix A using the Bunch-Kaufman diagonal pivoting method. The
    partial factorization has the form:

    A  =  ( I  U12 ) ( A11  0  ) (  I    0   )  if UPLO = 'U', or:
          ( 0  U22 ) (  0   D  ) ( U12' U22' )

    A  =  ( L11  0 ) (  D   0  ) ( L11' L21' )  if UPLO = 'L'
          ( L21  I ) (  0  A22 ) (  0    I   )

    where the order of D is at most NB. The actual order is returned in
    the argument KB, and is either NB or NB-1, or N if N <= NB.
    Note that U' denotes the conjugate transpose of U.

    CLAHEF is an auxiliary routine called by CHETRF. It uses blocked code
    (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
    A22 (if UPLO = 'L').

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER
            Specifies whether the upper or lower triangular part of the
            Hermitian matrix A is stored:
      -     = 'U':  Upper triangular
      -     = 'L':  Lower triangular

    @param[in]
    N       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    NB      INTEGER
            The maximum number of columns of the matrix A that should be
            factored.  NB should be at least 2 to allow for 2-by-2 pivot
            blocks.

    @param[out]
    KB      INTEGER
            The number of columns of A that were actually factored.
            KB is either NB-1 or NB, or N if N <= NB.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, A contains details of the partial factorization.

    @param[in]
    LDA     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If UPLO = 'U', only the last KB elements of ipiv are set;
            if UPLO = 'L', only the first KB elements are set.
    \n
            If ipiv(k) > 0, then rows and columns k and ipiv(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If UPLO = 'U' and ipiv(k) = ipiv(k-1) < 0, then rows and
            columns k-1 and -ipiv(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.  If UPLO = 'L' and ipiv(k) =
            ipiv(k+1) < 0, then rows and columns k+1 and -ipiv(k) were
            interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block.

    @param[out]
    W       (workspace) COMPLEX array, dimension (LDW,NB)

    @param[in]
    LDW     INTEGER
            The leading dimension of the array W.  LDW >= max(1,N).

    @param[out]
    INFO    INTEGER
      -     = 0: successful exit
      -     > 0: if INFO = k, D(k,k) is exactly zero.  The factorization
                 has been completed, but the block diagonal matrix D is
                 exactly singular.

    @ingroup magma_chetrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_clahef_gpu(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t *kb,
    magmaFloatComplex *hA, magma_int_t lda,
    magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magma_int_t *ipiv,
    magmaFloatComplex_ptr dW, size_t dW_offset, magma_int_t lddw,
    magma_queue_t queue,
    magma_int_t *info)
{
    /* .. Parameters .. */
    float d_one   = 1.0;
    float d_zero  = 0.0;
    float d_eight = 8.0;
    float d_seven = 7.0;
#if defined(PRECISION_c)
    float  f_zero =  0.0;
#endif
    magmaFloatComplex c_one  =  MAGMA_C_ONE;
    magmaFloatComplex c_mone = -MAGMA_C_ONE;
    magma_int_t upper = (uplo == MagmaUpper);
    magma_int_t ione = 1;

    /* .. Local Scalars .. */
    magma_int_t imax = 0, jmax = 0, kk, kkW, kp, kstep, iinfo;
    float   abs_akk, alpha, colmax, R1, rowmax;
    magmaFloatComplex Zimax, Z;

#define dA(i, j)  dA, dA_offset + (j)*ldda  + (i)
#define dW(i, j)  dW, dW_offset + (j)*lddw  + (i)
#define  A(i, j) (hA + (j)*lda   + (i))

    /* .. Executable Statements .. */
    *info = 0;

    /* Initialize alpha for use in choosing pivot block size. */
    alpha = ( d_one+sqrt( d_seven ) ) / d_eight;

    magma_event_t event = NULL;
    if( upper ) {
        /* Factorize the trailing columns of A using the upper triangle
           of A and working backwards, and compute the matrix W = U12*D
           for use in updating A11 (note that conjg(W) is actually stored)

           K is the main loop index, decreasing from N in steps of 1 or 2

           KW is the column of W which corresponds to column K of A   */
        int k, kw = 0;
        for (k = n-1; k+1 > max(n-nb+1, nb); k -= kstep) {
            kw = nb - (n-k);
            /* Copy column K of A to column KW of W and update it */

            magma_ccopy( k+1, dA( 0, k ), 1, dW( 0, kw ), 1, queue );

            // set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
            magma_dsetvector_async( 1, &d_zero, 1,
                                    dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#elif defined(PRECISION_c)
            magma_ssetvector_async( 1, &f_zero, 1,
                                    dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#endif

            if (k+1 < n) {
                magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda,
                             dW( k, kw+1 ), lddw, c_one, dW( 0, kw ), ione, queue );

                // set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#endif
            }

            kstep = 1;

            /* Determine rows and columns to be interchanged and whether
               a 1-by-1 or 2-by-2 pivot block will be used */
            magma_cgetvector_async( 1, dW( k, kw ), 1, &Z, 1, queue, &event );
            magma_queue_sync( queue );
            abs_akk = fabs( MAGMA_C_REAL( Z ) );

            /* imax is the row-index of the largest off-diagonal element in
               column K, and colmax is its absolute value */
            if( k > 0 ) {
                // magma is one-base
                imax = magma_icamax( k, dW( 0, kw ), 1, queue ) - 1;
                magma_cgetvector( 1, dW( imax, kw ), 1, &Z, 1, queue );
                colmax = MAGMA_C_ABS1( Z );
            } else {
                colmax = d_zero;
            }
            if( max( abs_akk, colmax ) == 0.0 ) {

                /* Column K is zero: set INFO and continue */
                if ( *info == 0 ) *info = k;

                kp = k;

#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#endif
            } else {
                if( abs_akk >= alpha*colmax ) {

                    /* no interchange, use 1-by-1 pivot block */
                    kp = k;
                } else {

                    /* Copy column imax to column KW-1 of W and update it */
                    magma_ccopy( imax+1, dA( 0, imax ), 1, dW( 0, kw-1 ), 1, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#endif

#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue );
#else
                    magma_ccopy( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue );
#endif
                    if( k+1 < n ) {
                        magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone,
                                     dA( 0, k+1 ), ldda, dW( imax, kw+1 ), lddw,
                                     c_one, dW( 0, kw-1 ), ione, queue );

#if defined(PRECISION_z)
                        magma_dsetvector_async( 1, &d_zero, 1,
                                                dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                        magma_ssetvector_async( 1, &f_zero, 1,
                                                dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#endif
                    }
                    magma_cgetvector_async( 1, dW( imax, kw-1 ), 1, &Zimax, 1, queue, &event );
                    magma_queue_sync( queue );

                    /* jmax is the column-index of the largest off-diagonal
                      element in row imax, and rowmax is its absolute value */
                    jmax = imax + magma_icamax( k-imax, dW( imax+1, kw-1 ), 1, queue );
                    magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue );
                    rowmax = MAGMA_C_ABS1( Z );
                    if ( imax > 0 ) {
                        // magma is one-base
                        jmax = magma_icamax( imax, dW( 0, kw-1 ), 1, queue ) - 1;
                        magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue );
                        rowmax = max( rowmax, MAGMA_C_ABS1( Z  ) );
                    }

                    if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) {

                        /* no interchange, use 1-by-1 pivot block */
                        kp = k;
                    } else if ( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) {

                        /* interchange rows and columns K and imax, use 1-by-1
                           pivot block */
                        kp = imax;

                        /* copy column KW-1 of W to column KW */
                        magma_ccopy( k+1, dW( 0, kw-1 ), 1, dW( 0, kw ), 1, queue );
                    } else {

                        /* interchange rows and columns K-1 and imax, use 2-by-2
                           pivot block */
                        kp = imax;
                        kstep = 2;
                    }
                }
                kk = k - kstep + 1;
                kkW = nb - (n - kk);

                /* Updated column kp is already stored in column kkW of W */
                if( kp != kk ) {

                    /* Interchange rows kk and kp in last kk columns of A and W */
                    // note: row-swap A(:,kk)
                    magmablas_cswap( n-kk, dA( kk, kk ), ldda, dA( kp, kk ), ldda, queue );
                    magmablas_cswap( n-kk, dW( kk, kkW), lddw, dW( kp, kkW), lddw, queue );

                    /* Copy non-updated column kk to column kp */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue );
#else
                    magma_ccopy( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue );
#endif

                    // now A(kp,kk) should be A(kk,kk), and copy to A(kp,kp)
                    magma_ccopy( kp+1, dA( 0, kk ), 1, dA( 0, kp ), 1, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event );
                    magma_queue_sync( queue );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event );
#endif
                }
                if( kstep == 1 ) {

                    /* 1-by-1 pivot block D(k): column KW of W now holds
                          W(k) = U(k)*D(k)
                          where U(k) is the k-th column of U
                          Store U(k) in column k of A */
                    magma_ccopy( k+1, dW( 0, kw ), 1, dA( 0, k ), 1, queue );
                    if ( k > 0 ) {
                        magma_cgetvector_async( 1, dA( k, k ), 1, &Z, 1, queue, &event );
                        magma_queue_sync( queue );
                        R1 = d_one / MAGMA_C_REAL( Z );
                        magma_csscal( k, R1, dA( 0, k ), 1, queue );

                        /* Conjugate W(k) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                        magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue );
#endif
                    }
                } else {

                    /* 2-by-2 pivot block D(k): columns KW and KW-1 of W now hold
                      ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k)
                      where U(k) and U(k-1) are the k-th and (k-1)-th columns of U */
                    if( k > 1 ) {
                        /* Store U(k) and U(k-1) in columns k and k-1 of A */
                        magmablas_clascl_2x2( MagmaUpper,
                                              k-1, dW(0, kw-1), lddw, dA(0,k-1), ldda, &iinfo, queue );
                    }

                    /* Copy D(k) to A */
                    magma_ccopymatrix( 2, 2, dW( k-1, kw-1 ), lddw, dA( k-1, k-1 ), ldda, queue );

                    /* Conjugate W(k) and W(k-1) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( k,   dW( 0, kw ),   1, dW( 0, kw ),   1, queue );
                    magmablas_clacpy_cnjg( k-1, dW( 0, kw-1 ), 1, dW( 0, kw-1 ), 1, queue );
#endif
                }
            }

            /* Store details of the interchanges in ipiv */
            if( kstep == 1 ) {
                ipiv[ k ] = 1+kp;
            } else {
                ipiv[ k ] = -(1+kp);
                ipiv[ k-1 ] = -(1+kp);
            }
        }
        /* Update the upper triangle of A11 (= A(1:k,1:k)) as
            A11 := A11 - U12*D*U12' = A11 - U12*W'
           computing blocks of NB columns at a time (note that conjg(W) is
           actually stored) */
        kw = nb - (n-k);
        for (int j = ( k / nb )*nb; j >= 0; j -= nb ) {
            int jb = min( nb, k-j+1 );

#ifdef SYMMETRIC_UPDATE
            /* Update the upper triangle of the diagonal block */
            for (int jj = j; jj < j + jb; jj++) {
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#endif
                magma_cgemv( MagmaNoTrans, jj-j+1, n-(k+1), c_mone,
                             dA( j, k+1 ), ldda, dW( jj, kw+1 ), lddw, c_one,
                             dA( j, jj ), 1, queue );
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#endif
            }
            /* Update the rectangular superdiagonal block */
            magma_cgemm( MagmaNoTrans, MagmaTrans, j, jb, n-(k+1),
                         c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw,
                         c_one, dA( 0, j ), ldda, queue );
#else
#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#endif
            magma_cgemm( MagmaNoTrans, MagmaTrans, j+jb, jb, n-(k+1),
                         c_mone, dA( 0, k+1 ),  ldda,
                         dW( j, kw+1 ), lddw,
                         c_one,  dA( 0, j ),    ldda, queue );
#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#endif
#endif
        }

        /* Put U12 in standard form by partially undoing the interchanges in columns k+1:n */
        for (int j = k+1; j < n;)
        {
            int jj = j;
            int jp = ipiv[ j ];
            if( jp < 0 ) {
                jp = -jp;
                j = j + 1;
            }
            j = j + 1;
            jp = jp - 1;
            if( jp != jj && j < n )
                magmablas_cswap( n-j, dA( jp, j ), ldda, dA( jj, j ), ldda, queue );
        }

        // copying the panel back to CPU
        magma_cgetmatrix_async( n, n-(k+1), dA(0,k+1), ldda, A(0,k+1), lda, queue, &event );
        magma_queue_sync( queue );

        /* Set KB to the number of columns factorized */
        *kb = n - (k+1);

    } else {
        /* Factorize the leading columns of A using the lower triangle
           of A and working forwards, and compute the matrix W = L21*D
           for use in updating A22 (note that conjg(W) is actually stored)

           K is the main loop index, increasing from 1 in steps of 1 or 2 */

        int k;
        for (k = 0; k < min(nb-1,n); k += kstep) {

            /* Copy column K of A to column K of W and update it */
            /* -------------------------------------------------------------- */
            magma_ccopy( n-k, dA( k, k ), 1, dW( k, k ), 1, queue );

            // set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
            magma_dsetvector_async( 1, &d_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#elif defined(PRECISION_c)
            magma_ssetvector_async( 1, &f_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#endif
            /* -------------------------------------------------------------- */

            magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda,
                         dW( k, 0 ), lddw, c_one, dW( k, k ), ione, queue );
            // re-set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
            magma_dsetvector_async( 1, &d_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event );
            magma_queue_sync( queue );
#elif defined(PRECISION_c)
            magma_ssetvector_async( 1, &f_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event );
            magma_queue_sync( queue );
#endif

            kstep = 1;

            /* Determine rows and columns to be interchanged and whether
               a 1-by-1 or 2-by-2 pivot block will be used */
            magma_cgetvector_async( 1, dW( k, k ), 1, &Z, 1, queue, &event );
            magma_queue_sync( queue );
            abs_akk = fabs( MAGMA_C_REAL( Z ) );

            /* imax is the row-index of the largest off-diagonal element in
               column K, and colmax is its absolute value */
            if( k < n-1 ) {
                // magmablas is one-base
                imax = k + magma_icamax( n-k-1, dW(k+1,k), 1, queue );

                magma_cgetvector( 1, dW( imax,k ), 1, &Z, 1, queue );
                colmax = MAGMA_C_ABS1( Z );

            } else {
                colmax = d_zero;
            }

            if ( max( abs_akk, colmax ) == 0.0 ) {

                /* Column K is zero: set INFO and continue */
                if( *info == 0 ) *info = k;
                kp = k;

                // make sure the imaginary part of diagonal is zero
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#endif
            } else {
                if ( abs_akk >= alpha*colmax ) {

                    /* no interchange, use 1-by-1 pivot block */

                    kp = k;
                } else {
                    /* Copy column imax to column K+1 of W and update it */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( imax-k, dA(imax,k), ldda, dW(k,k+1), 1, queue );
#else
                    magma_ccopy( imax-k, dA( imax, k ), ldda, dW( k, k+1 ), 1, queue );
#endif

                    magma_ccopy( n-imax, dA( imax, imax ), 1, dW( imax, k+1 ), 1, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#endif

                    magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda,
                                 dW( imax, 0 ), lddw, c_one, dW( k, k+1 ), ione, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#endif

                    magma_cgetvector_async( 1, dW(imax,k+1), 1, &Zimax, 1, queue, &event);
                    magma_queue_sync( queue );

                    /* jmax is the column-index of the largest off-diagonal
                       element in row imax, and rowmax is its absolute value */

                    // magmablas is one-base
                    jmax = k-1 + magma_icamax( imax-k, dW(k, k+1), 1, queue );

                    magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue );
                    rowmax = MAGMA_C_ABS1( Z );
                    if( imax < n-1 ) {
                        // magmablas is one-base
                        jmax = imax + magma_icamax( (n-1)-imax, dW(imax+1,k+1), 1, queue);
                        magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue );
                        rowmax = max( rowmax, MAGMA_C_ABS1( Z ) );
                    }

                    if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) {

                        /* no interchange, use 1-by-1 pivot block */
                        kp = k;
                    } else if( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) {

                        /* interchange rows and columns K and imax, use 1-by-1
                           pivot block */
                        kp = imax;

                        /* copy column K+1 of W to column K */
                        magma_ccopy( n-k, dW( k, k+1 ), 1, dW( k, k ), 1, queue );
                    } else {

                        /* interchange rows and columns K+1 and imax, use 2-by-2
                           pivot block */
                        kp = imax;
                        kstep = 2;
                    }
                }

                kk = k + kstep - 1;

                /* Updated column kp is already stored in column kk of W */
                if( kp != kk ) {

                    /* Copy non-updated column kk to column kp */
                    /* ------------------------------------------------------------------ */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue );
#else
                    magma_ccopy( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue );
#endif
                    if ( kp < n ) {
                        magma_ccopy( n-kp, dA( kp, kk), 1, dA( kp, kp ), 1, queue );
                    }
                    /* ------------------------------------------------------------------ */

                    /* Interchange rows kk and kp in first kk columns of A and W */
                    magmablas_cswap( kk+1, dA( kk, 0 ), ldda, dA( kp, 0 ), ldda, queue );
                    magmablas_cswap( kk+1, dW( kk, 0 ), lddw, dW( kp, 0 ), lddw, queue );
                }

                if ( kstep == 1 ) {

                    /* 1-by-1 pivot block D(k): column k of W now holds

                       W(k) = L(k)*D(k)

                       where L(k) is the k-th column of L

                       Store L(k) in column k of A */
                    magma_ccopy( n-k, dW( k, k ), 1, dA( k, k ), 1, queue );

                    if ( k < n-1 ) {
                        magma_cgetvector_async( 1, dA(k,k), 1, &Z, 1, queue, &event );
                        magma_queue_sync( queue );
                        R1 = d_one / MAGMA_C_REAL( Z );
                        magma_csscal((n-1)-k, R1, dA( k+1,k ), 1, queue);

                        /* Conjugate W(k) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                        magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue );
#endif
                    }
                } else {

                    /* 2-by-2 pivot block D(k): columns k and k+1 of W now hold

                    ( W(k) W(k+1) ) = ( L(k) L(k+1) )*D(k)

                    where L(k) and L(k+1) are the k-th and (k+1)-th columns
                    of L */
                    magmablas_clascl_2x2( MagmaLower,
                                          n-(k+2), dW(k,k), lddw, dA(k+2,k), ldda, &iinfo,
                                          queue );

                    /* Copy D(k) to A */
                    magma_ccopymatrix( 2, 2, dW( k, k ), lddw, dA( k, k ), ldda, queue );

                    /* Conjugate W(k) and W(k+1) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( (n-1)-k,   dW( k+1,k ),  1, dW( k+1,k ),   1, queue );
                    magmablas_clacpy_cnjg( (n-1)-k-1, dW( k+2,k+1), 1, dW( k+2,k+1 ), 1, queue );
#endif
                }
            }

            /* Store details of the interchanges in ipiv */
            if ( kstep == 1 ) {
                ipiv[k] = kp+1;
            } else {
                ipiv[k] = -kp-1;
                ipiv[k+1] = -kp-1;
            }
        }

        /* Update the lower triangle of A22 (= A(k:n,k:n)) as

           A22 := A22 - L21*D*L21' = A22 - L21*W'

           computing blocks of NB columns at a time (note that conjg(W) is
           actually stored) */
        for( int j = k; j < n; j += nb ) {
            int jb = min( nb, n-j );

            /* Update the lower triangle of the diagonal block */

#ifdef SYMMETRIC_UPDATE
            for (int jj = j; jj < j + jb; jj++) {
                int jnb = j + jb - jj;

                /* -------------------------------------------------------- */
                magma_cgemv( MagmaNoTrans, jnb, k, c_mone, dA( jj, 0 ), ldda,
                             dW( jj, 0 ), lddw, c_one, dA( jj, jj ), ione, queue );
                /* -------------------------------------------------------- */
            }

            /* Update the rectangular subdiagonal block */

            if( j+jb < n ) {
                int nk = n - (j+jb);

                /* -------------------------------------------- */
                magma_cgemm( MagmaNoTrans, MagmaTrans, nk, jb, k,
                             c_mone, dA( j+jb, 0 ), ldda,
                             dW( j, 0 ),    lddw,
                             c_one,  dA( j+jb, j ), ldda, queue );
                /* ------------------------------------------- */
            }
#else

#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#endif
            magma_cgemm( MagmaNoTrans, MagmaTrans, n-j, jb, k,
                         c_mone, dA( j, 0 ), ldda,
                         dW( j, 0 ), lddw,
                         c_one,  dA( j, j ), ldda, queue );
#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#endif
#endif
        }

        /* Put L21 in standard form by partially undoing the interchanges
           in columns 1:k-1 */
        for (int j = k; j > 0;) {
            int jj = j;
            int jp = ipiv[j-1];
            if( jp < 0 ) {
                jp = -jp;
                j--;
            }
            j--;
            if ( jp != jj && j >= 1 ) {
                magmablas_cswap( j, dA( jp-1,0 ), ldda, dA( jj-1,0 ), ldda, queue );
            }
        }
        // copying the panel back to CPU
        magma_cgetmatrix_async( n, k, dA(0,0), ldda, A(0,0), lda, queue, &event );
        magma_queue_sync( queue );

        /* Set KB to the number of columns factorized */
        *kb = k;
    }

    return *info;
    /* End of CLAHEF */
}