C++ (Cpp) magma_dsetvector_async Examples

Programming Language: C++ (Cpp)

Method/Function: magma_dsetvector_async

Examples at hotexamples.com: 5

C++ (Cpp) magma_dsetvector_async - 5 examples found. These are the top rated real world C++ (Cpp) examples of magma_dsetvector_async extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: dlahr2_m.cpp Project: EmergentOrder/magma

/**
    Purpose
    -------
    DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by DGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       DOUBLE_PRECISION array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_dgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlahr2_m(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    double *A, magma_int_t lda,
    double *tau,
    double *T, magma_int_t ldt,
    double *Y, magma_int_t ldy,
    struct dgehrd_data* data )
{
    #define  A(  i, j ) ( A + (i) + (j)*lda)
    #define  Y(  i, j ) ( Y + (i) + (j)*ldy)
    #define  T(  i, j ) ( T + (i) + (j)*ldt)
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)

    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double tmp;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t ione = 1;
    
    magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
    magma_int_t n_k_i_1, n_k;
    double scale;

    magma_int_t i;
    double ei = MAGMA_D_ZERO;

    magma_int_t info_data = 0;
    magma_int_t *info = &info_data;
    if (n < 0) {
        *info = -1;
    } else if (k < 0 || k >= n) {
        *info = -2;
    } else if (nb < 1 || nb > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldt < nb) {
        *info = -8;
    } else if (ldy < max(1,n)) {
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return 0;
    
    // zero out current top block of V on all GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv );
    }
    
    // set all Y=0
    lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Finish applying I - V * T * V' on right
            tmp = MAGMA_D_NEGATE( tau[i-1] );
            blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_dcopy( &i,
                           A(k+1,i), &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_dtrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_dgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1), &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_dtrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_daxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i), &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_dlarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // compute yi = A vi = sum_g A{d} vi{d}
        nblocks = (n-1) / nb / ngpu + 1;
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magmablasSetKernelStream( data->streams[d] );
            
            // dV(k+i+1:n-1, i) = VA(k+i:n, i)
            magma_dsetvector_async( n_k_i_1,
                                    A(k+i+1,i), 1,
                                    dV(d, k+i+1, i), 1, data->streams[d] );
            
            // copy column of dV -> dVd, using block cyclic distribution.
            // This assumes V and Vd have been padded so that
            // a 2D matrix copy doesn't access them out-of-bounds
            gblock = k / nb;
            lblock = gblock / ngpu;
            lgid   = gblock % ngpu;
            if ( d < lgid ) {
                lblock += 1;
            }
            // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
            magmablas_dlacpy( MagmaFull, nb, nblocks-lblock,
                              dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
                              dVd(d, 0    + lblock*nb,      i), nb );
            
            // convert global indices (k) to local indices (dk)
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            
            // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
            // skip if matrix is empty
            // each GPU copies to different temporary vector in Y,
            // which are summed in separate loop below
            if ( dn-dki1 > 0 ) {
                magma_dgemv( MagmaNoTrans, n-k, dn-dki1,
                             c_one,  dA (d, k,    dki1), ldda,
                                     dVd(d, dki1,    i), 1,
                             c_zero, dY (d, k,       i), 1 );
                
                // copy vector to host, storing in column nb+d of Y
                // as temporary space (Y has >= nb+ngpu columns)
                magma_dgetvector_async( n-k,
                                        dY(d, k, i), 1,
                                        Y(k, nb+d),  1, data->streams[d] );
            }
        }
        
        // while GPU is doing above Ag*v...
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_D_NEGATE( tau[i] );
        blasf77_dgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i), &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];
        
        // apply reflectors to next column, A(i+1), on right only.
        // one axpy will be required to finish this, in the next iteration above
        if ( i > 0 && i+1 < nb ) {
            // Update next column, A(k:n,i+1), applying Q on right.
            // One axpy will be required to finish this, in the next iteration
            // above, after yi is computed.
            // This updates one more row than LAPACK does (row k),
            // making block above panel an even multiple of nb.
            // Use last column of T as workspace, w.
            magma_int_t i1 = i+1;
            
            // If real, conjugate row of V, and undo afterwards
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
            // T is now rectangular, so we use gemv instead of trmv as in lapack.
            blasf77_dgemv( "No trans", &i, &i1,
                           &c_one,  T(0,0), &ldt,
                                    A(k+i1,0), &lda,
                           &c_zero, T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            
            // A(k:n, i+1) -= Y(k:n, 0:i) * w
            blasf77_dgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0), &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i1), &ione );
        }
        
        // yi = sum_g yi{d}
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magma_queue_sync( data->streams[d] );
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            if ( dn-dki1 > 0 ) {
                // yi = yi + yi{d}
                blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
            }
        }
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;
    
    // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        
        // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
        // skip if matrix is empty
        // each GPU copies to different temporary block in Y,
        // which are summed in separate loop below
        if ( dn-dki1 > 0 ) {
            magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1,
                         c_one,  dA (d, 0,    dki1), ldda,
                                 dVd(d, dki1,    0), ldvd,
                         c_zero, dY (d, 0,       0), ldda );
            
            // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
            // as temporary space (Y has nb + nb*ngpu columns)
            magma_dgetmatrix_async( k, nb,
                                    dY(d, 0, 0),  ldda,
                                    Y(0,nb+nb*d), ldy, data->streams[d] );
        }
    }
    
    // Y = sum_g Y{d}
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_queue_sync( 0 );
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        if ( dn-dki1 > 0 ) {
            // Y = Y + Am V
            for( i = 0; i < nb; ++i ) {
                blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
            }
        }
    }
    
    // copy Y and T matrices to GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] );
        magma_dsetmatrix_async( nb, nb, T, nb, dTi(d),      nb,   data->streams[d] );
    }

    return 0;
} /* magma_dlahr2 */

Example #2

Show file

File: dlaex3.cpp Project: XapaJIaMnu/magma

/**
    Purpose
    -------
    DLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to DLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required.
    It compute only the required part of the eigenvectors and the rest
    is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            DLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       DOUBLE PRECISION array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       DOUBLE PRECISION array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     DOUBLE PRECISION
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  DOUBLE PRECISION array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      DOUBLE PRECISION array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.
            TODO what is LDQ2?

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see DLAED2).
            The rows of the eigenvectors found by DLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       DOUBLE PRECISION array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) DOUBLE PRECISION array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    @param
    dwork   (workspace) DOUBLE PRECISION array, dimension (3*N*N/2+3*N)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
            TODO verify range, vl, vu, il, iu -- copied from dlaex1.

    @param[in]
    vl      DOUBLE PRECISION
    @param[in]
    vu      DOUBLE PRECISION
            if RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            if RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ---------------
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    @ingroup magma_dsyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlaex3(magma_int_t k, magma_int_t n, magma_int_t n1, double* d,
             double* Q, magma_int_t ldq, double rho,
             double* dlamda, double* Q2, magma_int_t* indx,
             magma_int_t* ctot, double* w, double* s, magma_int_t* indxq,
             double* dwork,
             magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu,
             magma_int_t* info )
{
#define Q(i_,j_) (Q + (i_) + (j_)*ldq)

    double d_one  = 1.;
    double d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;

    magma_int_t iil, iiu, rk;

    double* dq2= dwork;
    double* ds = dq2  + n*(n/2+1);
    double* dq = ds   + n*(n/2+1);
    magma_int_t lddq = n/2 + 1;

    magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2;
    double temp;
    magma_int_t alleig, valeig, indeig;

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }


    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return *info;
    }

    // Quick return if possible
    if (k == 0)
        return *info;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
     this code.*/

    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;

    magma_dsetvector_async( lq2, Q2, 1, dq2, 1, NULL );

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

#pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
#pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

#pragma omp barrier

        if (*info == 0) {
#pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_dlamrg( &k, &nk, d, &ione, &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig)
                    magma_dvrange(k, d, &iil, &iiu, vl, vu);
                else if (indeig)
                    magma_dirange(k, indxq, &iil, &iiu, il, iu);
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
#pragma omp single
                {
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
                // Compute updated W.
                blasf77_dcopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_dcopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for (j = 0; j < k; ++j) {
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

#pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = magma_cblas_dnrm2( k, s+id*k, 1 );
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }
    if (*info != 0)
        return *info;

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    magma_timer_t time=0;
    timer_start( time );

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0)
            *info=iinfo;
    }
    if (*info != 0)
        return *info;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_dlamrg( &k, &nk, d, &ione, &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig)
        magma_dvrange(k, d, &iil, &iiu, vl, vu);
    else if (indeig)
        magma_dirange(k, indxq, &iil, &iiu, il, iu);
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        blasf77_dcopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_dcopy( &k, Q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = magma_cblas_dnrm2( k, s, 1 );
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

    timer_stop( time );
    timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#endif //_OPENMP
    // Compute the updated eigenvectors.

    timer_start( time );
    magma_queue_sync( NULL );

    if (rk != 0) {
        if ( n23 != 0 ) {
            if (rk < magma_get_dlaed3_k()) {
                lapackf77_dlacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_dgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            } else {
                magma_dsetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq);
                magma_dgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq );
            }
        } else
            lapackf77_dlaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

        if ( n12 != 0 ) {
            if (rk < magma_get_dlaed3_k()) {
                lapackf77_dlacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_dgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            } else {
                magma_dsetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq);
                magma_dgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq );
            }
        } else
            lapackf77_dlaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
    }
    timer_stop( time );
    timer_printf( "gemms = %6.2f\n", time );

    return *info;
} /* magma_dlaex3 */

Example #3

Show file

File: dlatrd_mgpu.cpp Project: EmergentOrder/magma

/**
    Purpose
    -------
    DLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_dsyev_aux
    ********************************************************************/
extern "C" double
magma_dlatrd_mgpu(magma_int_t num_gpus, magma_uplo_t uplo,
                  magma_int_t n0, magma_int_t n, magma_int_t nb, magma_int_t nb0,
                  double *A,  magma_int_t lda,
                  double *e, double *tau,
                  double *W,   magma_int_t ldw,
                  double **dA, magma_int_t ldda, magma_int_t offset,
                  double **dW, magma_int_t lddw,
                  double *dwork[MagmaMaxGPUs], magma_int_t ldwork,
                  magma_int_t k,
                  double *dx[MagmaMaxGPUs],
                  double *dy[MagmaMaxGPUs],
                  double *work,
                  magma_queue_t stream[][10],
                  double *times)
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(id, i, j)  (dA[(id)] + ((j)+loffset)*ldda + (i) + offset)
#define dW(id, i, j)  (dW[(id)] + (j)          *lddw + (i))
#define dW1(id, i, j) (dW[(id)] + ((j)+nb)     *lddw + (i))

    double mv_time = 0.0;
    magma_int_t i;
#ifndef MAGMABLAS_DSYMV_MGPU
    magma_int_t loffset = nb0*((offset/nb0)/num_gpus);
#endif

    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;
    double value     = MAGMA_D_ZERO;
    magma_int_t id, idw, i_one = 1;

    //magma_int_t kk;
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;

    double alpha;

    double *dx2[MagmaMaxGPUs];
    double *f;
    magma_dmalloc_cpu( &f, n );

    if (n <= 0) {
        return 0;
    }

//#define PROFILE_SYMV
#ifdef PROFILE_SYMV
    magma_event_t start, stop;
    float etime;
    magma_timestr_t cpu_start, cpu_end;
    magma_setdevice(0);
    magma_event_create( &start );
    magma_event_create( &stop  );
#endif

    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb ; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                double wii = *W(i, iw+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy(&i_1, &wii, A(0, i+1), &i_one, A(0, i), &ione);

                wii = *A(i, i+1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy(&i_1, &wii, W(0, iw+1), &i_one, A(0, i), &ione);
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);

                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_MAKE( 1, 0 );
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    dx2[id] = dW1(id, 0, iw);
                    magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, iw), 1, stream[id][0]);
#ifndef  MAGMABLAS_DSYMV_MGPU
                    magma_dsetvector_async( i, A(0,i), 1, dx[id], 1, stream[id][0] );
#endif
                }
                magmablas_dsymv_mgpu(num_gpus, k, MagmaUpper, i, nb0, c_one, dA, ldda, 0,
                                     dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                     work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }

                /* overlap update */
                if ( i < n-1 && i-1 >= n - nb ) {
                    magma_int_t im1_1 = i_1 - 1;
                    magma_int_t im1   = i-1;
                    /* Update A(1:i,i) */
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        magma_int_t im1_n = i_n + 1;
                        lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw);
                    #endif
                    blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(im1, iw+1), &ldw, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&im1_n, W(im1, iw+1), &ldw);
                        lapackf77_dlacgv(&im1_n, A(im1, i +1), &lda);
                    #endif
                    blasf77_dgemv("No transpose", &im1_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  A(im1, i+1), &lda, &c_one, A(0, i-1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&im1_n, A(im1, i+1), &lda);
                    #endif
                }

                // 3. Here is where we need it // TODO find the right place
                magmablas_dsymv_sync(num_gpus, k, i, work, W(0, iw), stream );

                if (i < n-1) {
                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

                    blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

                    blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }

                blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -.5f * value;
                blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione);

                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    if ( k > 1 ) {
                        magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][1] );
                    } else {
                        magma_dsetvector_async( n, W(0,iw), 1, dW(id, 0, iw), 1, stream[id][0] );
                    }
                }
            }
        }
    } else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            idw = ((offset+i)/nb)%num_gpus;
            if ( i > 0 ) {
                trace_cpu_start( 0, "gemv", "gemv" );
                double wii = *W(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &ldw);
                #endif
                wii = -wii;
                blasf77_daxpy( &i_n, &wii, A(i, i-1), &ione, A(i, i), &ione);

                wii = *A(i, i-1);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                    lapackf77_dlacgv(&i_one, &wii, &lda);
                #endif
                wii = -wii;
                blasf77_daxpy( &i_n, &wii, W(i, i-1), &ione, A(i, i), &ione);
            }

            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                trace_cpu_start( 0, "larfg", "larfg" );
                alpha = *A(i+1, i);
#ifdef PROFILE_SYMV
                cpu_start = get_current_time();
#endif
                lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
#ifdef PROFILE_SYMV
                cpu_end = get_current_time();
                times[0] += GetTimerValue(cpu_start,cpu_end)/1000.0;
#endif
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_MAKE( 1, 0 );
                trace_cpu_end( 0 );

                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                //trace_gpu_start(  idw, 0, "comm", "comm1" );
#ifndef  MAGMABLAS_DSYMV_MGPU
                magma_setdevice(idw);
                magma_dsetvector( i_n, A(i+1,i), 1, dA(idw, i+1, i), 1 );
#endif
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    trace_gpu_start( id, 0, "comm", "comm" );
#ifdef MAGMABLAS_DSYMV_MGPU
                    dx2[id] = dW1(id, 0, i)-offset;
#else
                    dx2[id] = dx[id];
                    magma_dsetvector( i_n, A(i+1,i), 1, dx[id], 1 );
#endif
                    magma_dsetvector_async( n, A(0,i), 1, dW1(id, 0, i), 1, stream[id][0] );
                    trace_gpu_end( id, 0 );
                }
                /* mat-vec on multiple GPUs */
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(start, stream[0][0]);
#endif
                magmablas_dsymv_mgpu(num_gpus, k, MagmaLower, i_n, nb0, c_one, dA, ldda, offset+i+1,
                                       dx2, ione, c_zero, dy, ione, dwork, ldwork,
                                       work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                magma_setdevice(0);
                magma_event_record(stop, stream[0][0]);
#endif
                trace_cpu_start( 0, "gemv", "gemv" );
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
                trace_cpu_end( 0 );

                /* overlap update */
                if ( i > 0 && i+1 < n ) {
                    magma_int_t ip1 = i+1;
                    trace_cpu_start( 0, "gemv", "gemv" );
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, W(ip1, 0), &ldw);
                    #endif
                    blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(ip1, 0), &lda,
                                  W(ip1, 0), &ldw, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, W(ip1, 0), &ldw);
                        lapackf77_dlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(ip1, 0), &ldw,
                                  A(ip1, 0), &lda, &c_one, A(ip1, ip1), &ione);
                    #if defined(PRECISION_z) || defined(PRECISION_c)
                        lapackf77_dlacgv(&i, A(ip1, 0), &lda);
                    #endif
                    trace_cpu_end( 0 );
                }

                /* synchronize */
                magmablas_dsymv_sync(num_gpus, k, i_n, work, W(i+1,i), stream );
#ifdef PROFILE_SYMV
                cudaEventElapsedTime(&etime, start, stop);
                mv_time += (etime/1000.0);
                times[1+(i_n/(n0/10))] += (etime/1000.0);
#endif
                trace_cpu_start( 0, "axpy", "axpy" );
                if (i != 0)
                    blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);

                blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione);

                #if defined(PRECISION_z) || defined(PRECISION_c)
                    cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                    value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i]* -.5f * value;
                blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
                trace_cpu_end( 0 );
                for( id=0; id < num_gpus; id++ ) {
                    magma_setdevice(id);
                    if ( k > 1 ) {
                        magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][1] );
                    } else {
                        magma_dsetvector_async( n, W(0,i), 1, dW(id, 0, i), 1, stream[id][0] );
                    }
                }
            }
        }
    }

#ifdef PROFILE_SYMV
    magma_setdevice(0);
    magma_event_destory( start );
    magma_event_destory( stop  );
#endif
    for( id=0; id < num_gpus; id++ ) {
        magma_setdevice(id);
        if ( k > 1 )
            magma_queue_sync(stream[id][1]);
    }
    magma_free_cpu(f);

    return mv_time;
} /* magma_dlatrd_mgpu */

Example #4

Show file

File: dlatrd2.cpp Project: cjy7117/FT-MAGMA

/**
    Purpose
    -------
    DLATRD2 reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, DLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, DLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by DSYTRD2_GPU. It uses an
    accelerated HEMV that needs extra memory.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       DOUBLE_PRECISION array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).
    
    @param
    dA      TODO: dimension (ldda, n) ??
    
    @param
    ldda    TODO: ldda >= n ??
    
    @param
    dW      TODO: dimension (lddw, 2*nb) ??
    
    @param
    lddw    TODO: lddw >= n ??
    
    @param
    dwork   TODO: dimension (ldwork) ??
    
    @param
    ldwork  TODO: ldwork >= ceil(n/64)*ldda ??

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

        (  a   a   a   v4  v5 )              (  d                  )
        (      a   a   v4  v5 )              (  1   d              )
        (          a   1   v5 )              (  v1  1   a          )
        (              d   1  )              (  v1  v2  a   a      )
        (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_dsyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlatrd2(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
    double *A,  magma_int_t lda,
    double *e, double *tau,
    double *W,  magma_int_t ldw,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dW, magma_int_t lddw,
    magmaDouble_ptr dwork, magma_int_t ldwork)
{
    #define A(i_, j_) (A + (i_) + (j_)*lda)
    #define W(i_, j_) (W + (i_) + (j_)*ldw)
    
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #define dW(i_, j_) (dW + (i_) + (j_)*lddw)

    const double c_neg_one = MAGMA_D_NEG_ONE;
    const double c_one     = MAGMA_D_ONE;
    const double c_zero    = MAGMA_D_ZERO;
    const magma_int_t ione = 1;

    double alpha, value;
    magma_int_t i, i_n, i_1, iw;

    /* Check arguments */
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper ) {
        info = -1;
    } else if ( n < 0 ) {
        info = -2;
    } else if ( nb < 1 ) {
        info = -3;
    } else if ( lda < max(1,n) ) {
        info = -5;
    } else if ( ldw < max(1,n) ) {
        info = -9;
    } else if ( ldda < max(1,n) ) {
        info = -11;
    } else if ( lddw < max(1,n) ) {
        info = -13;
    } else if ( ldwork < ldda*ceildiv(n,64) ) {
        info = -15;
    }
    
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }
    
    /* Quick return if possible */
    if (n == 0) {
        return info;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    
    double *f;
    magma_dmalloc_cpu( &f, n );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }
    
    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i_n, W(i, iw+1), &ldw );
                #endif
                blasf77_dgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                               W(i, iw+1), &ldw, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i_n, W(i, iw+1), &ldw );
                lapackf77_dlacgv( &i_n, A(i, i+1),  &lda );
                #endif
                blasf77_dgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                               A(i, i+1), &lda, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i_n, A(i, i+1), &lda );
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                
                lapackf77_dlarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] );
                
                e[i-1] = MAGMA_D_REAL( alpha );
                *A(i-1,i) = MAGMA_D_ONE;
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_dsetvector_async( i, A(0, i), 1, dA(0, i), 1, stream );
                
                magmablas_dsymv_work( MagmaUpper, i, c_one, dA(0, 0), ldda,
                                      dA(0, i), ione, c_zero, dW(0, iw), ione,
                                      dwork, ldwork, stream );
                
                // 2. Start getting the result back (asynchronously)
                magma_dgetmatrix_async( i, 1,
                                        dW(0, iw), lddw,
                                        W(0, iw),  ldw, stream );
                
                if (i < n-1) {
                    blasf77_dgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                }
                
                // 3. Here we need dsymv result W(0, iw)
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_dgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                    
                    blasf77_dgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                    
                    blasf77_dgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                }
                
                blasf77_dscal( &i, &tau[i - 1], W(0, iw), &ione );
                
                value = magma_cblas_ddot( i, W(0,iw), ione, A(0,i), ione );
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_daxpy( &i, &alpha, A(0, i), &ione,
                               W(0, iw), &ione );
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i, W(i, 0), &ldw );
            #endif
            blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                           W(i, 0), &ldw, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i, W(i, 0), &ldw );
            lapackf77_dlacgv( &i, A(i, 0), &lda );
            #endif
            blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                           A(i, 0), &lda, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i, A(i, 0), &lda );
            #endif
            
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_dlarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = MAGMA_D_ONE;
                
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_dsetvector_async( i_n, A(i+1, i), 1, dA(i+1, i), 1, stream );
                
                magmablas_dsymv_work( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda,
                                      dA(i+1, i), ione, c_zero, dW(i+1, i), ione,
                                      dwork, ldwork, stream );
                
                // 2. Start getting the result back (asynchronously)
                magma_dgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
                
                blasf77_dgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                               W(0, i), &ione, &c_zero, f, &ione );
                
                blasf77_dgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                // 3. Here we need dsymv result W(i+1, i)
                magma_queue_sync( stream );
                
                if (i != 0)
                    blasf77_daxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione );
                
                blasf77_dgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                               W(0, i), &ione, &c_one, W(i+1, i), &ione );
                blasf77_dscal( &i_n, &tau[i], W(i+1,i), &ione );
                
                value = magma_cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                alpha = tau[i] * -0.5f * value;
                blasf77_daxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione );
            }
        }
    }

    magma_free_cpu( f );
    magma_queue_destroy( stream );

    return info;
} /* magma_dlatrd */

Example #5

Show file

File: clahef_gpu.cpp Project: kjbartel/clmagma

/**
    Purpose
    =======

    CLAHEF computes a partial factorization of a complex Hermitian
    matrix A using the Bunch-Kaufman diagonal pivoting method. The
    partial factorization has the form:

    A  =  ( I  U12 ) ( A11  0  ) (  I    0   )  if UPLO = 'U', or:
          ( 0  U22 ) (  0   D  ) ( U12' U22' )

    A  =  ( L11  0 ) (  D   0  ) ( L11' L21' )  if UPLO = 'L'
          ( L21  I ) (  0  A22 ) (  0    I   )

    where the order of D is at most NB. The actual order is returned in
    the argument KB, and is either NB or NB-1, or N if N <= NB.
    Note that U' denotes the conjugate transpose of U.

    CLAHEF is an auxiliary routine called by CHETRF. It uses blocked code
    (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
    A22 (if UPLO = 'L').

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER
            Specifies whether the upper or lower triangular part of the
            Hermitian matrix A is stored:
      -     = 'U':  Upper triangular
      -     = 'L':  Lower triangular

    @param[in]
    N       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    NB      INTEGER
            The maximum number of columns of the matrix A that should be
            factored.  NB should be at least 2 to allow for 2-by-2 pivot
            blocks.

    @param[out]
    KB      INTEGER
            The number of columns of A that were actually factored.
            KB is either NB-1 or NB, or N if N <= NB.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit, A contains details of the partial factorization.

    @param[in]
    LDA     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If UPLO = 'U', only the last KB elements of ipiv are set;
            if UPLO = 'L', only the first KB elements are set.
    \n
            If ipiv(k) > 0, then rows and columns k and ipiv(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If UPLO = 'U' and ipiv(k) = ipiv(k-1) < 0, then rows and
            columns k-1 and -ipiv(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.  If UPLO = 'L' and ipiv(k) =
            ipiv(k+1) < 0, then rows and columns k+1 and -ipiv(k) were
            interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block.

    @param[out]
    W       (workspace) COMPLEX array, dimension (LDW,NB)

    @param[in]
    LDW     INTEGER
            The leading dimension of the array W.  LDW >= max(1,N).

    @param[out]
    INFO    INTEGER
      -     = 0: successful exit
      -     > 0: if INFO = k, D(k,k) is exactly zero.  The factorization
                 has been completed, but the block diagonal matrix D is
                 exactly singular.

    @ingroup magma_chetrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_clahef_gpu(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t *kb,
    magmaFloatComplex *hA, magma_int_t lda,
    magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magma_int_t *ipiv,
    magmaFloatComplex_ptr dW, size_t dW_offset, magma_int_t lddw,
    magma_queue_t queue,
    magma_int_t *info)
{
    /* .. Parameters .. */
    float d_one   = 1.0;
    float d_zero  = 0.0;
    float d_eight = 8.0;
    float d_seven = 7.0;
#if defined(PRECISION_c)
    float  f_zero =  0.0;
#endif
    magmaFloatComplex c_one  =  MAGMA_C_ONE;
    magmaFloatComplex c_mone = -MAGMA_C_ONE;
    magma_int_t upper = (uplo == MagmaUpper);
    magma_int_t ione = 1;

    /* .. Local Scalars .. */
    magma_int_t imax = 0, jmax = 0, kk, kkW, kp, kstep, iinfo;
    float   abs_akk, alpha, colmax, R1, rowmax;
    magmaFloatComplex Zimax, Z;

#define dA(i, j)  dA, dA_offset + (j)*ldda  + (i)
#define dW(i, j)  dW, dW_offset + (j)*lddw  + (i)
#define  A(i, j) (hA + (j)*lda   + (i))

    /* .. Executable Statements .. */
    *info = 0;

    /* Initialize alpha for use in choosing pivot block size. */
    alpha = ( d_one+sqrt( d_seven ) ) / d_eight;

    magma_event_t event = NULL;
    if( upper ) {
        /* Factorize the trailing columns of A using the upper triangle
           of A and working backwards, and compute the matrix W = U12*D
           for use in updating A11 (note that conjg(W) is actually stored)

           K is the main loop index, decreasing from N in steps of 1 or 2

           KW is the column of W which corresponds to column K of A   */
        int k, kw = 0;
        for (k = n-1; k+1 > max(n-nb+1, nb); k -= kstep) {
            kw = nb - (n-k);
            /* Copy column K of A to column KW of W and update it */

            magma_ccopy( k+1, dA( 0, k ), 1, dW( 0, kw ), 1, queue );

            // set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
            magma_dsetvector_async( 1, &d_zero, 1,
                                    dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#elif defined(PRECISION_c)
            magma_ssetvector_async( 1, &f_zero, 1,
                                    dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#endif

            if (k+1 < n) {
                magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda,
                             dW( k, kw+1 ), lddw, c_one, dW( 0, kw ), ione, queue );

                // set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#endif
            }

            kstep = 1;

            /* Determine rows and columns to be interchanged and whether
               a 1-by-1 or 2-by-2 pivot block will be used */
            magma_cgetvector_async( 1, dW( k, kw ), 1, &Z, 1, queue, &event );
            magma_queue_sync( queue );
            abs_akk = fabs( MAGMA_C_REAL( Z ) );

            /* imax is the row-index of the largest off-diagonal element in
               column K, and colmax is its absolute value */
            if( k > 0 ) {
                // magma is one-base
                imax = magma_icamax( k, dW( 0, kw ), 1, queue ) - 1;
                magma_cgetvector( 1, dW( imax, kw ), 1, &Z, 1, queue );
                colmax = MAGMA_C_ABS1( Z );
            } else {
                colmax = d_zero;
            }
            if( max( abs_akk, colmax ) == 0.0 ) {

                /* Column K is zero: set INFO and continue */
                if ( *info == 0 ) *info = k;

                kp = k;

#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#endif
            } else {
                if( abs_akk >= alpha*colmax ) {

                    /* no interchange, use 1-by-1 pivot block */
                    kp = k;
                } else {

                    /* Copy column imax to column KW-1 of W and update it */
                    magma_ccopy( imax+1, dA( 0, imax ), 1, dW( 0, kw-1 ), 1, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#endif

#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue );
#else
                    magma_ccopy( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue );
#endif
                    if( k+1 < n ) {
                        magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone,
                                     dA( 0, k+1 ), ldda, dW( imax, kw+1 ), lddw,
                                     c_one, dW( 0, kw-1 ), ione, queue );

#if defined(PRECISION_z)
                        magma_dsetvector_async( 1, &d_zero, 1,
                                                dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                        magma_ssetvector_async( 1, &f_zero, 1,
                                                dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event );
#endif
                    }
                    magma_cgetvector_async( 1, dW( imax, kw-1 ), 1, &Zimax, 1, queue, &event );
                    magma_queue_sync( queue );

                    /* jmax is the column-index of the largest off-diagonal
                      element in row imax, and rowmax is its absolute value */
                    jmax = imax + magma_icamax( k-imax, dW( imax+1, kw-1 ), 1, queue );
                    magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue );
                    rowmax = MAGMA_C_ABS1( Z );
                    if ( imax > 0 ) {
                        // magma is one-base
                        jmax = magma_icamax( imax, dW( 0, kw-1 ), 1, queue ) - 1;
                        magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue );
                        rowmax = max( rowmax, MAGMA_C_ABS1( Z  ) );
                    }

                    if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) {

                        /* no interchange, use 1-by-1 pivot block */
                        kp = k;
                    } else if ( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) {

                        /* interchange rows and columns K and imax, use 1-by-1
                           pivot block */
                        kp = imax;

                        /* copy column KW-1 of W to column KW */
                        magma_ccopy( k+1, dW( 0, kw-1 ), 1, dW( 0, kw ), 1, queue );
                    } else {

                        /* interchange rows and columns K-1 and imax, use 2-by-2
                           pivot block */
                        kp = imax;
                        kstep = 2;
                    }
                }
                kk = k - kstep + 1;
                kkW = nb - (n - kk);

                /* Updated column kp is already stored in column kkW of W */
                if( kp != kk ) {

                    /* Interchange rows kk and kp in last kk columns of A and W */
                    // note: row-swap A(:,kk)
                    magmablas_cswap( n-kk, dA( kk, kk ), ldda, dA( kp, kk ), ldda, queue );
                    magmablas_cswap( n-kk, dW( kk, kkW), lddw, dW( kp, kkW), lddw, queue );

                    /* Copy non-updated column kk to column kp */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue );
#else
                    magma_ccopy( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue );
#endif

                    // now A(kp,kk) should be A(kk,kk), and copy to A(kp,kp)
                    magma_ccopy( kp+1, dA( 0, kk ), 1, dA( 0, kp ), 1, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event );
                    magma_queue_sync( queue );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event );
#endif
                }
                if( kstep == 1 ) {

                    /* 1-by-1 pivot block D(k): column KW of W now holds
                          W(k) = U(k)*D(k)
                          where U(k) is the k-th column of U
                          Store U(k) in column k of A */
                    magma_ccopy( k+1, dW( 0, kw ), 1, dA( 0, k ), 1, queue );
                    if ( k > 0 ) {
                        magma_cgetvector_async( 1, dA( k, k ), 1, &Z, 1, queue, &event );
                        magma_queue_sync( queue );
                        R1 = d_one / MAGMA_C_REAL( Z );
                        magma_csscal( k, R1, dA( 0, k ), 1, queue );

                        /* Conjugate W(k) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                        magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue );
#endif
                    }
                } else {

                    /* 2-by-2 pivot block D(k): columns KW and KW-1 of W now hold
                      ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k)
                      where U(k) and U(k-1) are the k-th and (k-1)-th columns of U */
                    if( k > 1 ) {
                        /* Store U(k) and U(k-1) in columns k and k-1 of A */
                        magmablas_clascl_2x2( MagmaUpper,
                                              k-1, dW(0, kw-1), lddw, dA(0,k-1), ldda, &iinfo, queue );
                    }

                    /* Copy D(k) to A */
                    magma_ccopymatrix( 2, 2, dW( k-1, kw-1 ), lddw, dA( k-1, k-1 ), ldda, queue );

                    /* Conjugate W(k) and W(k-1) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( k,   dW( 0, kw ),   1, dW( 0, kw ),   1, queue );
                    magmablas_clacpy_cnjg( k-1, dW( 0, kw-1 ), 1, dW( 0, kw-1 ), 1, queue );
#endif
                }
            }

            /* Store details of the interchanges in ipiv */
            if( kstep == 1 ) {
                ipiv[ k ] = 1+kp;
            } else {
                ipiv[ k ] = -(1+kp);
                ipiv[ k-1 ] = -(1+kp);
            }
        }
        /* Update the upper triangle of A11 (= A(1:k,1:k)) as
            A11 := A11 - U12*D*U12' = A11 - U12*W'
           computing blocks of NB columns at a time (note that conjg(W) is
           actually stored) */
        kw = nb - (n-k);
        for (int j = ( k / nb )*nb; j >= 0; j -= nb ) {
            int jb = min( nb, k-j+1 );

#ifdef SYMMETRIC_UPDATE
            /* Update the upper triangle of the diagonal block */
            for (int jj = j; jj < j + jb; jj++) {
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#endif
                magma_cgemv( MagmaNoTrans, jj-j+1, n-(k+1), c_mone,
                             dA( j, k+1 ), ldda, dW( jj, kw+1 ), lddw, c_one,
                             dA( j, jj ), 1, queue );
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event );
#endif
            }
            /* Update the rectangular superdiagonal block */
            magma_cgemm( MagmaNoTrans, MagmaTrans, j, jb, n-(k+1),
                         c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw,
                         c_one, dA( 0, j ), ldda, queue );
#else
#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#endif
            magma_cgemm( MagmaNoTrans, MagmaTrans, j+jb, jb, n-(k+1),
                         c_mone, dA( 0, k+1 ),  ldda,
                         dW( j, kw+1 ), lddw,
                         c_one,  dA( 0, j ),    ldda, queue );
#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue );
#endif
#endif
        }

        /* Put U12 in standard form by partially undoing the interchanges in columns k+1:n */
        for (int j = k+1; j < n;)
        {
            int jj = j;
            int jp = ipiv[ j ];
            if( jp < 0 ) {
                jp = -jp;
                j = j + 1;
            }
            j = j + 1;
            jp = jp - 1;
            if( jp != jj && j < n )
                magmablas_cswap( n-j, dA( jp, j ), ldda, dA( jj, j ), ldda, queue );
        }

        // copying the panel back to CPU
        magma_cgetmatrix_async( n, n-(k+1), dA(0,k+1), ldda, A(0,k+1), lda, queue, &event );
        magma_queue_sync( queue );

        /* Set KB to the number of columns factorized */
        *kb = n - (k+1);

    } else {
        /* Factorize the leading columns of A using the lower triangle
           of A and working forwards, and compute the matrix W = L21*D
           for use in updating A22 (note that conjg(W) is actually stored)

           K is the main loop index, increasing from 1 in steps of 1 or 2 */

        int k;
        for (k = 0; k < min(nb-1,n); k += kstep) {

            /* Copy column K of A to column K of W and update it */
            /* -------------------------------------------------------------- */
            magma_ccopy( n-k, dA( k, k ), 1, dW( k, k ), 1, queue );

            // set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
            magma_dsetvector_async( 1, &d_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#elif defined(PRECISION_c)
            magma_ssetvector_async( 1, &f_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event);
            magma_queue_sync( queue );
#endif
            /* -------------------------------------------------------------- */

            magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda,
                         dW( k, 0 ), lddw, c_one, dW( k, k ), ione, queue );
            // re-set imaginary part of diagonal to be zero
#if defined(PRECISION_z)
            magma_dsetvector_async( 1, &d_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event );
            magma_queue_sync( queue );
#elif defined(PRECISION_c)
            magma_ssetvector_async( 1, &f_zero, 1,
                                    dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event );
            magma_queue_sync( queue );
#endif

            kstep = 1;

            /* Determine rows and columns to be interchanged and whether
               a 1-by-1 or 2-by-2 pivot block will be used */
            magma_cgetvector_async( 1, dW( k, k ), 1, &Z, 1, queue, &event );
            magma_queue_sync( queue );
            abs_akk = fabs( MAGMA_C_REAL( Z ) );

            /* imax is the row-index of the largest off-diagonal element in
               column K, and colmax is its absolute value */
            if( k < n-1 ) {
                // magmablas is one-base
                imax = k + magma_icamax( n-k-1, dW(k+1,k), 1, queue );

                magma_cgetvector( 1, dW( imax,k ), 1, &Z, 1, queue );
                colmax = MAGMA_C_ABS1( Z );

            } else {
                colmax = d_zero;
            }

            if ( max( abs_akk, colmax ) == 0.0 ) {

                /* Column K is zero: set INFO and continue */
                if( *info == 0 ) *info = k;
                kp = k;

                // make sure the imaginary part of diagonal is zero
#if defined(PRECISION_z)
                magma_dsetvector_async( 1, &d_zero, 1,
                                        dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#elif defined(PRECISION_c)
                magma_ssetvector_async( 1, &f_zero, 1,
                                        dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event );
                magma_queue_sync( queue );
#endif
            } else {
                if ( abs_akk >= alpha*colmax ) {

                    /* no interchange, use 1-by-1 pivot block */

                    kp = k;
                } else {
                    /* Copy column imax to column K+1 of W and update it */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( imax-k, dA(imax,k), ldda, dW(k,k+1), 1, queue );
#else
                    magma_ccopy( imax-k, dA( imax, k ), ldda, dW( k, k+1 ), 1, queue );
#endif

                    magma_ccopy( n-imax, dA( imax, imax ), 1, dW( imax, k+1 ), 1, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#endif

                    magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda,
                                 dW( imax, 0 ), lddw, c_one, dW( k, k+1 ), ione, queue );
#if defined(PRECISION_z)
                    magma_dsetvector_async( 1, &d_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#elif defined(PRECISION_c)
                    magma_ssetvector_async( 1, &f_zero, 1,
                                            dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event);
                    magma_queue_sync( queue );
#endif

                    magma_cgetvector_async( 1, dW(imax,k+1), 1, &Zimax, 1, queue, &event);
                    magma_queue_sync( queue );

                    /* jmax is the column-index of the largest off-diagonal
                       element in row imax, and rowmax is its absolute value */

                    // magmablas is one-base
                    jmax = k-1 + magma_icamax( imax-k, dW(k, k+1), 1, queue );

                    magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue );
                    rowmax = MAGMA_C_ABS1( Z );
                    if( imax < n-1 ) {
                        // magmablas is one-base
                        jmax = imax + magma_icamax( (n-1)-imax, dW(imax+1,k+1), 1, queue);
                        magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue );
                        rowmax = max( rowmax, MAGMA_C_ABS1( Z ) );
                    }

                    if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) {

                        /* no interchange, use 1-by-1 pivot block */
                        kp = k;
                    } else if( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) {

                        /* interchange rows and columns K and imax, use 1-by-1
                           pivot block */
                        kp = imax;

                        /* copy column K+1 of W to column K */
                        magma_ccopy( n-k, dW( k, k+1 ), 1, dW( k, k ), 1, queue );
                    } else {

                        /* interchange rows and columns K+1 and imax, use 2-by-2
                           pivot block */
                        kp = imax;
                        kstep = 2;
                    }
                }

                kk = k + kstep - 1;

                /* Updated column kp is already stored in column kk of W */
                if( kp != kk ) {

                    /* Copy non-updated column kk to column kp */
                    /* ------------------------------------------------------------------ */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue );
#else
                    magma_ccopy( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue );
#endif
                    if ( kp < n ) {
                        magma_ccopy( n-kp, dA( kp, kk), 1, dA( kp, kp ), 1, queue );
                    }
                    /* ------------------------------------------------------------------ */

                    /* Interchange rows kk and kp in first kk columns of A and W */
                    magmablas_cswap( kk+1, dA( kk, 0 ), ldda, dA( kp, 0 ), ldda, queue );
                    magmablas_cswap( kk+1, dW( kk, 0 ), lddw, dW( kp, 0 ), lddw, queue );
                }

                if ( kstep == 1 ) {

                    /* 1-by-1 pivot block D(k): column k of W now holds

                       W(k) = L(k)*D(k)

                       where L(k) is the k-th column of L

                       Store L(k) in column k of A */
                    magma_ccopy( n-k, dW( k, k ), 1, dA( k, k ), 1, queue );

                    if ( k < n-1 ) {
                        magma_cgetvector_async( 1, dA(k,k), 1, &Z, 1, queue, &event );
                        magma_queue_sync( queue );
                        R1 = d_one / MAGMA_C_REAL( Z );
                        magma_csscal((n-1)-k, R1, dA( k+1,k ), 1, queue);

                        /* Conjugate W(k) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                        magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue );
#endif
                    }
                } else {

                    /* 2-by-2 pivot block D(k): columns k and k+1 of W now hold

                    ( W(k) W(k+1) ) = ( L(k) L(k+1) )*D(k)

                    where L(k) and L(k+1) are the k-th and (k+1)-th columns
                    of L */
                    magmablas_clascl_2x2( MagmaLower,
                                          n-(k+2), dW(k,k), lddw, dA(k+2,k), ldda, &iinfo,
                                          queue );

                    /* Copy D(k) to A */
                    magma_ccopymatrix( 2, 2, dW( k, k ), lddw, dA( k, k ), ldda, queue );

                    /* Conjugate W(k) and W(k+1) */
#if defined(PRECISION_z) || defined(PRECISION_c)
                    magmablas_clacpy_cnjg( (n-1)-k,   dW( k+1,k ),  1, dW( k+1,k ),   1, queue );
                    magmablas_clacpy_cnjg( (n-1)-k-1, dW( k+2,k+1), 1, dW( k+2,k+1 ), 1, queue );
#endif
                }
            }

            /* Store details of the interchanges in ipiv */
            if ( kstep == 1 ) {
                ipiv[k] = kp+1;
            } else {
                ipiv[k] = -kp-1;
                ipiv[k+1] = -kp-1;
            }
        }

        /* Update the lower triangle of A22 (= A(k:n,k:n)) as

           A22 := A22 - L21*D*L21' = A22 - L21*W'

           computing blocks of NB columns at a time (note that conjg(W) is
           actually stored) */
        for( int j = k; j < n; j += nb ) {
            int jb = min( nb, n-j );

            /* Update the lower triangle of the diagonal block */

#ifdef SYMMETRIC_UPDATE
            for (int jj = j; jj < j + jb; jj++) {
                int jnb = j + jb - jj;

                /* -------------------------------------------------------- */
                magma_cgemv( MagmaNoTrans, jnb, k, c_mone, dA( jj, 0 ), ldda,
                             dW( jj, 0 ), lddw, c_one, dA( jj, jj ), ione, queue );
                /* -------------------------------------------------------- */
            }

            /* Update the rectangular subdiagonal block */

            if( j+jb < n ) {
                int nk = n - (j+jb);

                /* -------------------------------------------- */
                magma_cgemm( MagmaNoTrans, MagmaTrans, nk, jb, k,
                             c_mone, dA( j+jb, 0 ), ldda,
                             dW( j, 0 ),    lddw,
                             c_one,  dA( j+jb, j ), ldda, queue );
                /* ------------------------------------------- */
            }
#else

#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#endif
            magma_cgemm( MagmaNoTrans, MagmaTrans, n-j, jb, k,
                         c_mone, dA( j, 0 ), ldda,
                         dW( j, 0 ), lddw,
                         c_one,  dA( j, j ), ldda, queue );
#if defined(PRECISION_z)
            magmablas_dlaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#elif defined(PRECISION_c)
            magmablas_slaset(MagmaUpperLower, 1, jb,
                             0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue );
#endif
#endif
        }

        /* Put L21 in standard form by partially undoing the interchanges
           in columns 1:k-1 */
        for (int j = k; j > 0;) {
            int jj = j;
            int jp = ipiv[j-1];
            if( jp < 0 ) {
                jp = -jp;
                j--;
            }
            j--;
            if ( jp != jj && j >= 1 ) {
                magmablas_cswap( j, dA( jp-1,0 ), ldda, dA( jj-1,0 ), ldda, queue );
            }
        }
        // copying the panel back to CPU
        magma_cgetmatrix_async( n, k, dA(0,0), ldda, A(0,0), lda, queue, &event );
        magma_queue_sync( queue );

        /* Set KB to the number of columns factorized */
        *kb = k;
    }

    return *info;
    /* End of CLAHEF */
}