Example #1
0
extern "C" void
magma_strdtype1cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    float *A, magma_int_t LDA,
    float *V, float *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    //magma_int_t    J1, J2, J3, i, j;
    magma_int_t    len, LDX;
    magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    //float conjtmp;
    float Z_ONE  =  MAGMA_S_ONE;
    float *WORK;
    magma_smalloc_cpu( &WORK, N );
    
    
    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    //printf("voici vpos %d taupos %d  tpos %d  blkid %d \n", vpos, taupos, tpos, blkid);
    LDX     = LDA-1;
    len     = ed-st+1;
    *V(vpos)  = Z_ONE;
    memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(float));
    memset(A(st+1, st-1), 0, (len-1)*sizeof(float));
    /* Eliminate the col  at st-1 */
    lapackf77_slarfg( &len, A(st, st-1), V(vpos+1), &IONE, TAU(taupos) );
    /* apply left and right on A(st:ed,st:ed)*/
    magma_slarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
    //conjtmp = MAGMA_S_CNJG(*TAU(taupos));
    //lapackf77_slarfy("L", &len, V(vpos), &IONE, &conjtmp, A(st,st), &LDX, WORK); //&(MAGMA_S_CNJG(*TAU(taupos)))
    magma_free_cpu(WORK);
}
Example #2
0
extern "C" void
magma_strdtype1cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb,
                                float *A, magma_int_t lda,
                                float *V, magma_int_t ldv,
                                float *TAU,
                                magma_int_t st, magma_int_t ed,
                                magma_int_t sweep, magma_int_t Vblksiz,
                                float *work)
{
/*
    WORK (workspace) float real array, dimension N
*/

    magma_int_t ione = 1;
    magma_int_t vpos, taupos, len, len2;

    float c_one    =  MAGMA_S_ONE;

    magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, st-1, ldv, &vpos, &taupos);
    //printf("voici vpos %d taupos %d  tpos %d  blkid %d \n", vpos, taupos, tpos, blkid);

    len     = ed-st+1;
    *V(vpos)  = c_one;

    len2 = len-1;
    blasf77_scopy( &len2, A(st+1, st-1), &ione, V(vpos+1), &ione );
    //memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(float));
    memset(A(st+1, st-1), 0, (len-1)*sizeof(float));

    /* Eliminate the col  at st-1 */
    lapackf77_slarfg( &len, A(st, st-1), V(vpos+1), &ione, TAU(taupos) );

    /* apply left and right on A(st:ed,st:ed)*/
    magma_slarfxsym_v2(len, A(st,st), lda-1, V(vpos), TAU(taupos), work);
}
Example #3
0
extern "C" void
magma_strdtype2cbHLsym_withQ_v2(
    magma_int_t n, magma_int_t nb,
    float *A, magma_int_t lda,
    float *V, magma_int_t ldv,
    float *TAU,
    magma_int_t st, magma_int_t ed,
    magma_int_t sweep, magma_int_t Vblksiz,
    float *work)
{
    /*
     WORK (workspace) float real array, dimension NB
    */

    magma_int_t ione = 1;
    magma_int_t vpos, taupos;

    float conjtmp;

    float c_one = MAGMA_S_ONE;

    magma_int_t ldx = lda-1;
    magma_int_t len = ed - st + 1;
    magma_int_t lem = min(ed+nb, n) - ed;
    magma_int_t lem2;
    
    if (lem > 0) {
        magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, st-1, ldv, &vpos, &taupos);
        /* apply remaining right coming from the top block */
        lapackf77_slarfx("R", &lem, &len, V(vpos), TAU(taupos), A(ed+1, st), &ldx, work);
    }
    if (lem > 1) {
        magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, ed, ldv, &vpos, &taupos);

        /* remove the first column of the created bulge */
        *V(vpos)  = c_one;
        //memcpy(V(vpos+1), A(ed+2, st), (lem-1)*sizeof(float));
        lem2 = lem-1;
        blasf77_scopy( &lem2, A(ed+2, st), &ione, V(vpos+1), &ione );
        memset(A(ed+2, st),0,(lem-1)*sizeof(float));

        /* Eliminate the col at st */
        lapackf77_slarfg( &lem, A(ed+1, st), V(vpos+1), &ione, TAU(taupos) );

        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been removed; */
        conjtmp = MAGMA_S_CNJG(*TAU(taupos));
        lapackf77_slarfx("L", &lem, &len, V(vpos),  &conjtmp, A(ed+1, st+1), &ldx, work);
    }
}
Example #4
0
extern "C" void
magma_strdtype2cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    float *A, magma_int_t LDA,
    float *V, float *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    magma_int_t    J1, J2, len, lem, LDX;
    //magma_int_t    i, j;
    magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    float conjtmp;
    float Z_ONE  =  MAGMA_S_ONE;
    //float WORK[NB];
    float *WORK;
    magma_smalloc_cpu( &WORK, NB );
    
    
    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    LDX    = LDA-1;
    J1     = ed+1;
    J2     = min(ed+NB,N);
    len    = ed-st+1;
    lem    = J2-J1+1;
    if (lem > 0) {
        /* apply remaining right commming from the top block */
        lapackf77_slarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);
    }
    if (lem > 1) {
        findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid);
        /* remove the first column of the created bulge */
        *V(vpos)  = Z_ONE;
        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(float));
        memset(A(J1+1, st),0,(lem-1)*sizeof(float));
        /* Eliminate the col at st */
        lapackf77_slarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );
        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */
        conjtmp = MAGMA_S_CNJG(*TAU(taupos));
        lapackf77_slarfx("L", &lem, &len, V(vpos),  &conjtmp, A(J1, st+1), &LDX, WORK);
    }
    magma_free_cpu(WORK);
}
Example #5
0
/**
    Purpose
    -------
    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by SGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements in rows K:N of the first NB columns are
            overwritten with the matrix Y.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[out]
    dV      REAL array on the GPU, dimension (LDDV, NB)
            On exit this n-by-nb array contains the Householder vectors of the transformation.

    @param[in]
    lddv    INTEGER
            The leading dimension of the array dV.  LDDV >= max(1,N).

    @param[in,out]
    A       REAL array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     REAL array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       REAL array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_sgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaFloat_ptr dA, magma_int_t ldda,
    magmaFloat_ptr dV, magma_int_t lddv,
    float *A,     magma_int_t lda,
    float *tau,
    float *T,     magma_int_t ldt,
    float *Y,     magma_int_t ldy )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define  T(i_,j_) ( T + (i_) + (j_)*ldt)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dV(i_,j_) (dV + (i_) + (j_)*lddv)
    
    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t ione = 1;
    
    magma_int_t n_k_i_1, n_k;
    float scale;

    magma_int_t i;
    float ei = MAGMA_S_ZERO;

    magma_int_t info = 0;
    if (n < 0) {
        info = -1;
    } else if (k < 0 || k > n) {
        info = -2;
    } else if (nb < 1 || nb > n) {
        info = -3;
    } else if (ldda < max(1,n)) {
        info = -5;
    } else if (lddv < max(1,n)) {
        info = -7;
    } else if (lda < max(1,n)) {
        info = -9;
    } else if (ldt < max(1,nb)) {
        info = -12;
    } else if (ldy < max(1,n)) {
        info = -13;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // adjust from 1-based indexing
    k -= 1;

    if (n <= 1)
        return info;
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Update A(k:n-1,i); Update i-th column of A - Y * T * V'
            // This updates one more row than LAPACK does (row k),
            // making the block above the panel an even multiple of nb.
            // Use last column of T as workspace, w.
            // w(0:i-1, nb-1) = VA(k+i, 0:i-1)'
            blasf77_scopy( &i,
                           A(k+i,0),  &lda,
                           T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            // If real, conjugate row of V.
            lapackf77_slacgv(&i, T(0,nb-1), &ione);
            #endif
            
            // w = T(0:i-1, 0:i-1) * w
            blasf77_strmv( "Upper", "No trans", "No trans", &i,
                           T(0,0),    &ldt,
                           T(0,nb-1), &ione );
            
            // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w
            blasf77_sgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0),    &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i),    &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_scopy( &i,
                           A(k+1,i),  &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_strmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_sgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1),  &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_strmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_sgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1),  &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_strmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_saxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i),  &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_slarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i)
        magma_ssetvector( n_k_i_1,
                          A(k+i+1,i), 1,
                          dV(i+1,i),  1 );
        
        // Compute Y(k+1:n,i) = A vi
        // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i)
        magma_sgemv( MagmaNoTrans, n_k, n_k_i_1,
                     c_one,  dA(k,i+1), ldda,
                             dV(i+1,i), ione,
                     c_zero, dA(k,i),   ione );
        
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_S_NEGATE( tau[i]);
        blasf77_sgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i),     &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_strmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];

        // Y(k:n-1, i) = dA(k:n-1, i)
        magma_sgetvector( n-k,
                          dA(k,i), 1,
                          Y(k,i),  1 );
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;

    return info;
} /* magma_slahr2 */
Example #6
0
/**
    Purpose
    -------
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but no factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    REAL array, dimension (NB)
            Auxiliar vector.

    @param[in,out]
    F       REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @ingroup magma_sgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    float d__1;
    float z__1;
    
    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran isamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1;
        
        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU                          */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }
        
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;
        
            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );
        
            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_sgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );
            
            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );
            
            blasf77_sgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );
            
            magma_queue_sync( stream );
            blasf77_sgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );
            
            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }
        
        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );
        
                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);
        
                    if (temp2 <= tol3z) {
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }
        
        *A(rk, k) = Akk;
        
        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }
    
    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = magma_cblas_snrm2( i__1, A(rk+1,lsticc), ione );
        else {
            /* Where is the data, CPU or GPU ? */
            float r1, r2;
            
            r1 = magma_cblas_snrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);
            
            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2);
        }
        
        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }
    
    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */
Example #7
0
extern "C" magma_int_t
magma_slahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaFloat_ptr da, size_t da_offset, magma_int_t ldda,
    magmaFloat_ptr dv, size_t dv_offset, magma_int_t lddv,
    float *a, magma_int_t lda,
    float *tau,
    float *t, magma_int_t ldt,
    float *y, magma_int_t ldy,
    magma_queue_t queue)
{
/*  -- clMAGMA auxiliary routine (version 0.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======

    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.

    This is an auxiliary routine called by SGEHRD.

    Arguments
    =========

    N       (input) INTEGER
            The order of the matrix A.

    K       (input) INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    NB      (input) INTEGER
            The number of columns to be reduced.

    DA      (input/output) REAL array on the GPU, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    DV      (output) REAL array on the GPU, dimension (N, NB)
            On exit this contains the Householder vectors of the transformation.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    TAU     (output) REAL array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    T       (output) REAL array, dimension (LDT,NB)
            The upper triangular matrix T.

    LDT     (input) INTEGER
            The leading dimension of the array T.  LDT >= NB.

    Y       (output) REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    LDY     (input) INTEGER
            The leading dimension of the array Y. LDY >= N.

    Further Details
    ===============
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    =====================================================================    */


    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t c__1 = 1;
    
    magma_int_t a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__2, i__3;
    float d__1;

    magma_int_t i__;
    float ei;

    --tau;
    a_dim1 = lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    t_dim1 = ldt;
    t_offset = 1 + t_dim1;
    t -= t_offset;
    y_dim1 = ldy;
    y_offset = 1 + y_dim1;
    y -= y_offset;

    if (n <= 1)
      return MAGMA_SUCCESS;
    
    for (i__ = 1; i__ <= nb; ++i__) {
        if (i__ > 1) {

          /* Update A(K+1:N,I); Update I-th column of A - Y * V' */
          i__2 = n - k + 1;
          i__3 = i__ - 1;
          #if defined(PRECISION_z) || defined(PRECISION_c)
             lapackf77_slacgv(&i__3, &a[k+i__-1+a_dim1], &lda);
          #endif
          blasf77_scopy(&i__3, &a[k+i__-1+a_dim1], &lda, &t[nb*t_dim1+1], &c__1);
          blasf77_strmv("u","n","n",&i__3,&t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1);

          blasf77_sgemv("NO TRANSPOSE", &i__2, &i__3, &c_neg_one, &y[k + y_dim1],
                        &ldy, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__*a_dim1],&c__1);

          #if defined(PRECISION_z) || defined(PRECISION_c)
             lapackf77_slacgv(&i__3, &a[k+i__-1+a_dim1], &lda);
          #endif

          /* Apply I - V * T' * V' to this column (call it b) from the
             left, using the last column of T as workspace

             Let  V = ( V1 )   and   b = ( b1 )   (first I-1 rows)
                      ( V2 )             ( b2 )
             where V1 is unit lower triangular
             w := V1' * b1                                                 */
          
          i__2 = i__ - 1;
          blasf77_scopy(&i__2, &a[k+1+i__*a_dim1], &c__1, &t[nb*t_dim1+1], &c__1);
          blasf77_strmv("Lower", MagmaConjTransStr, "UNIT", &i__2,
                        &a[k + 1 + a_dim1], &lda, &t[nb * t_dim1 + 1], &c__1);

          /* w := w + V2'*b2 */
          i__2 = n - k - i__ + 1;
          i__3 = i__ - 1;
          blasf77_sgemv(MagmaConjTransStr, &i__2, &i__3, &c_one,
                        &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1,
                        &c_one, &t[nb*t_dim1+1], &c__1);

          /* w := T'*w */
          i__2 = i__ - 1;
          blasf77_strmv("U", MagmaConjTransStr, "N", &i__2, &t[t_offset], &ldt,
                        &t[nb*t_dim1+1], &c__1);
          
          /* b2 := b2 - V2*w */
          i__2 = n - k - i__ + 1;
          i__3 = i__ - 1;
          blasf77_sgemv("N", &i__2, &i__3, &c_neg_one, &a[k + i__ + a_dim1], &lda,
                 &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__+i__*a_dim1], &c__1);

          /* b1 := b1 - V1*w */
          i__2 = i__ - 1;
          blasf77_strmv("L","N","U",&i__2,&a[k+1+a_dim1],&lda,&t[nb*t_dim1+1],&c__1);
          blasf77_saxpy(&i__2, &c_neg_one, &t[nb * t_dim1 + 1], &c__1,
                 &a[k + 1 + i__ * a_dim1], &c__1);
          
          a[k + i__ - 1 + (i__ - 1) * a_dim1] = ei;
        }
        
        /* Generate the elementary reflector H(I) to annihilate A(K+I+1:N,I) */
        i__2 = n - k - i__ + 1;
        i__3 = k + i__ + 1;
        lapackf77_slarfg(&i__2, &a[k + i__ + i__ * a_dim1],
                         &a[min(i__3,n) + i__ * a_dim1], &c__1, &tau[i__]);
        ei = a[k + i__ + i__ * a_dim1];
        a[k + i__ + i__ * a_dim1] = c_one;

        /* Compute  Y(K+1:N,I) */
        i__2 = n - k;
        i__3 = n - k - i__ + 1;
        magma_ssetvector( i__3, &a[k + i__ + i__*a_dim1], 1, dv, dv_offset+(i__-1)*(lddv+1),      1, queue );

        magma_sgemv(MagmaNoTrans, i__2+1, i__3, c_one,
                    da, da_offset + (-1 + k + i__ * ldda), ldda,
                    dv, dv_offset + (i__-1)*(lddv+1), c__1, c_zero,
                    da, da_offset + (-1 + k + (i__-1)*ldda), c__1, queue);
        
        i__2 = n - k - i__ + 1;
        i__3 = i__ - 1;
        blasf77_sgemv(MagmaConjTransStr, &i__2, &i__3, &c_one,
                      &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1,
                      &c_zero, &t[i__*t_dim1+1], &c__1);

        /* Compute T(1:I,I) */
        i__2 = i__ - 1;
        d__1 = MAGMA_S_NEGATE( tau[i__] );
        blasf77_sscal(&i__2, &d__1, &t[i__ * t_dim1 + 1], &c__1);
        blasf77_strmv("U","N","N", &i__2, &t[t_offset], &ldt, &t[i__*t_dim1+1], &c__1);
        t[i__ + i__ * t_dim1] = tau[i__];

        magma_sgetvector( n - k + 1, da, da_offset+(-1+ k+(i__-1)*ldda), 1, y+ k + i__*y_dim1, 1, queue );
    }
    a[k + nb + nb * a_dim1] = ei;

    return MAGMA_SUCCESS;
} /* magma_slahr2 */
Example #8
0
extern "C" magma_err_t
magma_slatrd(char uplo, magma_int_t n, magma_int_t nb,
             float *a,  magma_int_t lda,
             float *e, float *tau,
             float *w,  magma_int_t ldw,
             magmaFloat_ptr da, size_t da_offset, magma_int_t ldda,
             magmaFloat_ptr dw, size_t dw_offset, magma_int_t lddw, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======
    SLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = 'U', SLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = 'L', SLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by SSYTRD.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
            = 'U': Upper triangular
            = 'L': Lower triangular

    N       (input) INTEGER
            The order of the matrix A.

    NB      (input) INTEGER
            The number of rows and columns to be reduced.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
            if UPLO = 'U', the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
            if UPLO = 'L', the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    E       (output) REAL array, dimension (N-1)
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    TAU     (output) REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
            See Further Details.

    W       (output) REAL array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    LDW     (input) INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = 'U':                       if UPLO = 'L':

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).
    =====================================================================    */
  
    char uplo_[2]  = {uplo, 0};

    magma_int_t i;
  
    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float c_zero    = MAGMA_S_ZERO;

    float value = MAGMA_S_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
  
    float alpha;

    float *f;
    magma_smalloc_cpu( &f, n );

    if (n <= 0) {
      return 0;
    }

    magma_event_t event = NULL;

    if (lapackf77_lsame(uplo_, "U")) {

      /* Reduce last NB columns of upper triangle */
      for (i = n-1; i >= n - nb ; --i) {
        i_1 = i + 1;
        i_n = n - i - 1;
        
        iw = i - n + nb;
        if (i < n-1) {
          /* Update A(1:i,i) */
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
          #endif
          blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                        W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
              lapackf77_slacgv(&i_n, A(i, i+1), &lda);
          #endif
          blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                        A(i, i+1), &lda, &c_one, A(0, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i_n, A(i, i+1), &lda);
          #endif
        }
        if (i > 0) {
          /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
          
          alpha = *A(i-1, i);
          
          lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
          
          e[i-1] = MAGMA_S_REAL( alpha );
          MAGMA_S_SET2REAL(*A(i-1, i), 1.);
          
          /* Compute W(1:i-1,i) */
          // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
          magma_ssetvector( i, A(0, i), 0, 1, dA(0, i), 1, queue );
          
          magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                      dA(0, i), ione, c_zero, dW(0, iw), ione, queue);
          
          // 2. Start putting the result back (asynchronously)
          magma_sgetmatrix_async( i, 1,
                                  dW(0, iw),         lddw,
                                  W(0, iw)/*test*/, 0, ldw, queue, &event );
          
          if (i < n-1) {

            blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                          A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
          }
          
            // 3. Here is where we need it // TODO find the right place
            magma_event_sync(event);

          if (i < n-1) {
          
            blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                          W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

            blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                          A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

            blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                          W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
          }

          blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione);

          #if defined(PRECISION_z) || defined(PRECISION_c)
          cblas_sdot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
          #else
          value = cblas_sdot( i, W(0,iw), ione, A(0,i), ione );
          #endif
          alpha = tau[i - 1] * -0.5f * value;
          blasf77_saxpy(&i, &alpha, A(0, i), &ione,
                        W(0, iw), &ione);
        }
      }

    } else {

      /*  Reduce first NB columns of lower triangle */
      for (i = 0; i < nb; ++i)
        {

          /* Update A(i:n,i) */
          i_n = n - i;
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i, W(i, 0), &ldw);
          #endif
          blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                        W(i, 0), &ldw, &c_one, A(i, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i, W(i, 0), &ldw);
              lapackf77_slacgv(&i, A(i ,0), &lda);
          #endif
          blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                        A(i, 0), &lda, &c_one, A(i, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i, A(i, 0), &lda);
          #endif

          if (i < n-1)
            {
              /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
              i_n = n - i - 1;
              alpha = *A(i+1, i);
              lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
              e[i] = MAGMA_S_REAL( alpha );
              MAGMA_S_SET2REAL(*A(i+1, i), 1.);

              /* Compute W(i+1:n,i) */
              // 1. Send the block reflector  A(i+1:n,i) to the GPU
              magma_ssetvector( i_n, A(i+1, i), 0, 1, dA(i+1, i), 1, queue );
          
              magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                          dW(i+1, i), ione, queue);
          
              // 2. Start putting the result back (asynchronously)
              magma_sgetmatrix_async( i_n, 1,
                                      dW(i+1, i), lddw,
                                      W(i+1, i), 0, ldw, queue, &event );

              blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                            A(i+1, i), &ione, &c_zero, W(0, i), &ione);

              blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                            W(0, i), &ione, &c_zero, f, &ione);
              
              blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                            A(i+1, i), &ione, &c_zero, W(0, i), &ione);

              // 3. Here is where we need it
              magma_event_sync(event);

              if (i!=0)
                blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
     
              blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                            W(0, i), &ione, &c_one, W(i+1, i), &ione);
              blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione);
              
              #if defined(PRECISION_z) || defined(PRECISION_c)
              cblas_sdot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
              #else
              value = cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione );
              #endif
              alpha = tau[i] * -0.5f * value;
              blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu(f);

    return 0;
} /* slatrd_ */
Example #9
0
/**
    Purpose
    -------
    SLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by SGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors; and
              elements above the diagonal in the first NB rows, with the
              array TAUP, represent the orthogonal matrix P as a product
              of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors, and
              elements on and above the diagonal in the first NB rows,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[in,out]
    dA      REAL array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    d       REAL array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix.  D(i) = A(i,i).

    @param[out]
    e       REAL array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    REAL array dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    REAL array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       REAL array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      REAL array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      REAL array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_sgesvd_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                  float *A,  magma_int_t lda,
                  float *dA, magma_int_t ldda,
                  float *d, float *e, float *tauq, float *taup,
                  float *X,  magma_int_t ldx,
                  float *dX, magma_int_t lddx,
                  float *Y,  magma_int_t ldy,
                  float *dY, magma_int_t lddy)
{
    #define A(i_,j_) (A + (i_) + (j_)*lda)
    #define X(i_,j_) (X + (i_) + (j_)*ldx)
    #define Y(i_,j_) (Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)
    
    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float c_zero    = MAGMA_S_ZERO;
    magma_int_t ione = 1;
    
    magma_int_t i__2, i__3;
    magma_int_t i;
    float alpha;

    A  -= 1 + lda;
    X  -= 1 + ldx;
    dX -= 1 + lddx;
    Y  -= 1 + ldy;
    dY -= 1 + lddy;
    --d;
    --e;
    --tauq;
    --taup;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    float *f;
    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_smalloc_cpu( &f, max(n,m) );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }
    
    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /*  Update A(i:m,i) */
            i__2 = m - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           A(i,1), &lda,
                           Y(i,1), &ldy, &c_one,
                           A(i,i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           X(i,1), &ldx,
                           A(1,i), &ione, &c_one,
                           A(i,i), &ione );
            
            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            i__2 = m - i + 1;
            i__3 = i + 1;
            lapackf77_slarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i + 1;
                i__3 = n - i;

                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( i__2,
                                  A(i,i), 1,
                                  dA(i-1,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i-1,i),   ldda,
                             dA(i-1,i-1), ione, c_zero,
                             dY(i+1,i),   ione );
                
                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, stream );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i,1), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                i__2 = n - i;
                i__3 = i - 1;
                blasf77_sgemv( "N", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               X(i,1), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_saxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = i - 1;
                i__3 = n - i;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_sscal( &i__2, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                i__2 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__2, A(i,i+1), &lda );
                lapackf77_slacgv( &i,  A(i,1), &lda );
                #endif
                blasf77_sgemv( "No transpose", &i__2, &i, &c_neg_one,
                               Y(i+1,1), &ldy,
                               A(i,1),   &lda, &c_one,
                               A(i,i+1), &lda );
                i__2 = i - 1;
                i__3 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i,  A(i,1), &lda );
                lapackf77_slacgv( &i__2, X(i,1), &ldx );
                #endif
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               X(i,1),   &ldx, &c_one,
                               A(i,i+1), &lda );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__2, X(i,1), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                i__2 = n - i;
                i__3 = i + 2;
                alpha = *A(i,i+1);
                lapackf77_slarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( i__3,
                                  A(i,i+1), lda,
                                  dA(i-1,i), ldda );
                // 2. Multiply ---------------------------------------------
                //magma_scopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 );
                magma_sgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i), ldda,
                             dA(i-1,i), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );

                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, stream );

                i__2 = n - i;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               Y(i+1,1), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                i__2 = m - i;
                blasf77_sgemv( "N", &i__2, &i, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = i - 1;
                i__3 = n - i;
                blasf77_sgemv( "N", &i__2, &i__3, &c_one,
                               A(1,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i != 0) {
                    i__2 = m - i;
                    blasf77_saxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione );
                }


                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_sscal( &i__2, &taup[i], X(i+1,i), &ione );

                #if defined(PRECISION_z) || defined(PRECISION_c)
                i__2 = n - i;
                lapackf77_slacgv( &i__2,  A(i,i+1), &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after SLACGV()
                magma_ssetvector( i__2,
                                  A(i,i+1),  lda,
                                  dA(i-1,i), ldda );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i = 1; i <= nb; ++i) {
        
            /* Update A(i,i:n) */
            i__2 = n - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__2, A(i,i), &lda );
            lapackf77_slacgv( &i__3, A(i,1), &lda );
            #endif
            blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           Y(i,1), &ldy,
                           A(i,1), &lda, &c_one,
                           A(i,i), &lda );
            i__2 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__3, A(i,1), &lda );
            lapackf77_slacgv( &i__3, X(i,1), &ldx );
            #endif
            i__3 = n - i + 1;
            blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                           A(1,i), &lda,
                           X(i,1), &ldx, &c_one,
                           A(i,i), &lda );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__2, X(i,1), &ldx );
            #endif
            
            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            i__2 = n - i + 1;
            i__3 = i + 1;
            alpha = *A(i,i);
            lapackf77_slarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i < m) {
                *A(i,i) = c_one;
                
                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i + 1;
                
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_ssetvector( i__3,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                
                // 2. Multiply ---------------------------------------------
                //magma_scopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 );
                magma_sgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i-1), ldda,
                             dA(i-1,i-1), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );
                
                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, stream );
                
                i__2 = n - i + 1;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               Y(i,1), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                
                i__2 = i - 1;
                i__3 = n - i + 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_one,
                               A(1,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__2 != 0) {
                    i__3 = m - i;
                    blasf77_saxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione );
                }
                
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_sscal( &i__2, &taup[i], X(i+1,i), &ione );
                i__2 = n - i + 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__2, A(i,i), &lda );
                magma_ssetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                #endif
                
                /* Update A(i+1:m,i) */
                i__2 = m - i;
                i__3 = i - 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               Y(i,1),   &ldy, &c_one,
                               A(i+1,i), &ione );
                i__2 = m - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &i__2, &i, &c_neg_one,
                               X(i+1,1), &ldx,
                               A(1,i),   &ione, &c_one,
                               A(i+1,i), &ione );
                
                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                i__2 = m - i;
                i__3 = i + 2;
                alpha = *A(i+1,i);
                lapackf77_slarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = c_one;
                
                /* Compute Y(i+1:n,i) */
                i__2 = m - i;
                i__3 = n - i;
                
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( i__2,
                                  A(i+1,i), 1,
                                  dA(i,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i,i),   ldda,
                             dA(i,i-1), ione, c_zero,
                             dY(i+1,i), ione );
                
                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, stream );
                
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i+1,1), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                i__2 = n - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                
                i__2 = m - i;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               X(i+1,1), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_saxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }
                
                i__2 = n - i;
                blasf77_sgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_sscal( &i__2, &tauq[i], Y(i+1,i), &ione );
            }
            #if defined(PRECISION_z) || defined(PRECISION_c)
            else {
                i__2 = n - i + 1;
                lapackf77_slacgv( &i__2, A(i,i), &lda );
                magma_ssetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
            }
            #endif
        }
    }
    
    magma_queue_destroy( stream );
    magma_free_cpu( f );
    
    return info;
} /* magma_slabrd_gpu */
Example #10
0
/**
    Purpose
    -------
    SLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by SSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       REAL array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       REAL array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).
    
    @param
    dA
    
    @param
    ldda
    
    @param
    dW
    
    @param
    lddw

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

        (  a   a   a   v4  v5 )              (  d                  )
        (      a   a   v4  v5 )              (  1   d              )
        (          a   1   v5 )              (  v1  1   a          )
        (              d   1  )              (  v1  v2  a   a      )
        (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slatrd(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
    float *A,  magma_int_t lda,
    float *e, float *tau,
    float *W,  magma_int_t ldw,
    magmaFloat_ptr dA, magma_int_t ldda,
    magmaFloat_ptr dW, magma_int_t lddw)
{
    #define A(i_, j_) (A + (i_) + (j_)*lda)
    #define W(i_, j_) (W + (i_) + (j_)*ldw)
    
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #define dW(i_, j_) (dW + (i_) + (j_)*lddw)

    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t ione = 1;

    float alpha, value;
    magma_int_t i, i_n, i_1, iw;

    /* Check arguments */
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper ) {
        info = -1;
    } else if ( n < 0 ) {
        info = -2;
    } else if ( nb < 1 ) {
        info = -3;
    } else if ( lda < max(1,n) ) {
        info = -5;
    } else if ( ldw < max(1,n) ) {
        info = -9;
    } else if ( ldda < max(1,n) ) {
        info = -11;
    } else if ( lddw < max(1,n) ) {
        info = -13;
    }
    
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }
    
    /* Quick return if possible */
    if (n == 0) {
        return info;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    
    float *f;
    magma_smalloc_cpu( &f, n );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }
    
    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i_n, W(i, iw+1), &ldw );
                #endif
                blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                               W(i, iw+1), &ldw, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i_n, W(i, iw+1), &ldw );
                lapackf77_slacgv( &i_n, A(i, i+1),  &lda );
                #endif
                blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                               A(i, i+1), &lda, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i_n, A(i, i+1), &lda );
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                
                lapackf77_slarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] );
                
                e[i-1] = MAGMA_S_REAL( alpha );
                *A(i-1,i) = MAGMA_S_ONE;
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                magma_ssymv( MagmaUpper, i, c_one, dA(0, 0), ldda,
                             dA(0, i), ione, c_zero, dW(0, iw), ione );
                
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i, 1,
                                        dW(0, iw), lddw,
                                        W(0, iw),  ldw, stream );
                
                if (i < n-1) {
                    blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                    
                    blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                    
                    blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                }
                
                blasf77_sscal( &i, &tau[i - 1], W(0, iw), &ione );
                
                value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione );
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_saxpy( &i, &alpha, A(0, i), &ione,
                               W(0, iw), &ione );
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i, W(i, 0), &ldw );
            #endif
            blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                           W(i, 0), &ldw, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i, W(i, 0), &ldw );
            lapackf77_slacgv( &i, A(i, 0), &lda );
            #endif
            blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                           A(i, 0), &lda, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i, A(i, 0), &lda );
            #endif
            
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_slarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = MAGMA_S_ONE;
                
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
                
                magma_ssymv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda,
                             dA(i+1, i), ione, c_zero, dW(i+1, i), ione );
                
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
                
                blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                               W(0, i), &ione, &c_zero, f, &ione );
                
                blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                // 3. Here is where we need it
                magma_queue_sync( stream );
                
                if (i != 0)
                    blasf77_saxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione );
                
                blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                               W(0, i), &ione, &c_one, W(i+1, i), &ione );
                blasf77_sscal( &i_n, &tau[i], W(i+1,i), &ione );
                
                value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                alpha = tau[i] * -0.5f * value;
                blasf77_saxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione );
            }
        }
    }

    magma_free_cpu( f );
    magma_queue_destroy( stream );

    return info;
} /* magma_slatrd */
Example #11
0
/***************************************************************************//**
    Purpose
    -------
    SLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by SGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors; and
              elements above the diagonal in the first NB rows, with the
              array TAUP, represent the orthogonal matrix P as a product
              of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors, and
              elements on and above the diagonal in the first NB rows,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[in,out]
    dA      REAL array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    d       REAL array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix.  D(i) = A(i,i).

    @param[out]
    e       REAL array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    REAL array dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    REAL array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       REAL array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      REAL array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      REAL array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    @param
    work    REAL array, dimension (LWORK)
            Workspace.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK. LWORK >= max( M, N ).

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_labrd
*******************************************************************************/
extern "C" magma_int_t
magma_slabrd_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nb,
    float     *A, magma_int_t lda,
    magmaFloat_ptr dA, magma_int_t ldda,
    float *d, float *e, float *tauq, float *taup,
    float     *X, magma_int_t ldx,
    magmaFloat_ptr dX, magma_int_t lddx,
    float     *Y, magma_int_t ldy,
    magmaFloat_ptr dY, magma_int_t lddy,
    float  *work, magma_int_t lwork,
    magma_queue_t queue )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  X(i_,j_) ( X + (i_) + (j_)*ldx)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)
    
    /* Constants */
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t ione = 1;
    
    /* Local variables */
    magma_int_t i, i1, m_i, m_i1, n_i, n_i1;
    float alpha;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i=0; i < nb; ++i) {
            /* Update A(i:m,i) */
            i1   = i + 1;
            m_i  = m - i;
            m_i1 = m - (i+1);
            n_i1 = n - (i+1);
            #ifdef COMPLEX
            lapackf77_slacgv( &i, Y(i,0), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &m_i, &i, &c_neg_one,
                           A(i,0), &lda,
                           Y(i,0), &ldy, &c_one,
                           A(i,i), &ione );
            #ifdef COMPLEX
            lapackf77_slacgv( &i, Y(i,0), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &m_i, &i, &c_neg_one,
                           X(i,0), &ldx,
                           A(0,i), &ione, &c_one,
                           A(i,i), &ione );
            
            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            lapackf77_slarfg( &m_i, &alpha, A(min(i+1,m-1),i), &ione, &tauq[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i+1 < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( m_i,
                                  A(i,i), 1,
                                  dA(i,i), 1, queue );
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, m_i, n_i1, c_one,
                             dA(i,i+1),   ldda,
                             dA(i,i), ione, c_zero,
                             dY(i+1,i),   ione, queue );
                
                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( n_i1, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, queue );
                blasf77_sgemv( MagmaConjTransStr, &m_i, &i, &c_one,
                               A(i,0), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(0,i), &ione );

                blasf77_sgemv( "N", &n_i1, &i, &c_neg_one,
                               Y(i+1,0), &ldy,
                               Y(0,i),   &ione, &c_zero,
                               work,     &ione );
                blasf77_sgemv( MagmaConjTransStr, &m_i, &i, &c_one,
                               X(i,0), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(0,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );

                if (i != 0) {
                    blasf77_saxpy( &n_i1, &c_one, work, &ione, Y(i+1,i), &ione );
                }

                blasf77_sgemv( MagmaConjTransStr, &i, &n_i1, &c_neg_one,
                               A(0,i+1), &lda,
                               Y(0,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                blasf77_sscal( &n_i1, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                #ifdef COMPLEX
                lapackf77_slacgv( &n_i1, A(i,i+1), &lda );
                lapackf77_slacgv( &i1,  A(i,0), &lda );
                #endif
                blasf77_sgemv( "No transpose", &n_i1, &i1, &c_neg_one,
                               Y(i+1,0), &ldy,
                               A(i,0),   &lda, &c_one,
                               A(i,i+1), &lda );
                #ifdef COMPLEX
                lapackf77_slacgv( &i1,  A(i,0), &lda );
                lapackf77_slacgv( &i, X(i,0), &ldx );
                #endif
                blasf77_sgemv( MagmaConjTransStr, &i, &n_i1, &c_neg_one,
                               A(0,i+1), &lda,
                               X(i,0),   &ldx, &c_one,
                               A(i,i+1), &lda );
                #ifdef COMPLEX
                lapackf77_slacgv( &i, X(i,0), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                alpha = *A(i,i+1);
                lapackf77_slarfg( &n_i1, &alpha, A(i,min(i+2,n-1)), &lda, &taup[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( n_i1,
                                  A(i,i+1), lda,
                                  dA(i,i+1), ldda, queue );
                
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaNoTrans, m_i1, n_i1, c_one,
                             dA(i+1,i+1), ldda,
                             dA(i,i+1), ldda,
                             //dY(0,0), 1,
                             c_zero,
                             dX(i+1,i), ione, queue );

                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( m_i1, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, queue );

                blasf77_sgemv( MagmaConjTransStr, &n_i1, &i1, &c_one,
                               Y(i+1,0), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(0,i),   &ione );

                blasf77_sgemv( "N", &m_i1, &i1, &c_neg_one,
                               A(i+1,0), &lda,
                               X(0,i),   &ione, &c_zero,
                               work,     &ione );
                blasf77_sgemv( "N", &i, &n_i1, &c_one,
                               A(0,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(0,i),   &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );
                if ((i+1) != 0) {
                    blasf77_saxpy( &m_i1, &c_one, work, &ione, X(i+1,i), &ione );
                }

                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               X(i+1,0), &ldx,
                               X(0,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                blasf77_sscal( &m_i1, &taup[i], X(i+1,i), &ione );

                #ifdef COMPLEX
                lapackf77_slacgv( &n_i1,  A(i,i+1), &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after SLACGV()
                magma_ssetvector( n_i1,
                                  A(i,i+1),  lda,
                                  dA(i,i+1), ldda, queue );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i=0; i < nb; ++i) {
            /* Update A(i,i:n) */
            i1   = i + 1;
            m_i1 = m - (i+1);
            n_i  = n - i;
            n_i1 = n - (i+1);
            #ifdef COMPLEX
            lapackf77_slacgv( &n_i, A(i,i), &lda );
            lapackf77_slacgv( &i, A(i,0), &lda );
            #endif
            blasf77_sgemv( "No transpose", &n_i, &i, &c_neg_one,
                           Y(i,0), &ldy,
                           A(i,0), &lda, &c_one,
                           A(i,i), &lda );
            #ifdef COMPLEX
            lapackf77_slacgv( &i, A(i,0), &lda );
            lapackf77_slacgv( &i, X(i,0), &ldx );
            #endif
            blasf77_sgemv( MagmaConjTransStr, &i, &n_i, &c_neg_one,
                           A(0,i), &lda,
                           X(i,0), &ldx, &c_one,
                           A(i,i), &lda );
            #ifdef COMPLEX
            lapackf77_slacgv( &i, X(i,0), &ldx );
            #endif
            
            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            alpha = *A(i,i);
            lapackf77_slarfg( &n_i, &alpha, A(i,min(i+1,n-1)), &lda, &taup[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i+1 < m) {
                *A(i,i) = c_one;
                
                /* Compute X(i+1:m,i) */
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_ssetvector( n_i,
                                  A(i,i), lda,
                                  dA(i,i), ldda, queue );
                
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaNoTrans, m_i1, n_i, c_one,
                             dA(i+1,i), ldda,
                             dA(i,i), ldda,
                             //dY(0,0), 1,
                             c_zero,
                             dX(i+1,i), ione, queue );
                
                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( m_i1, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, queue );
                
                blasf77_sgemv( MagmaConjTransStr, &n_i, &i, &c_one,
                               Y(i,0), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(0,i), &ione );
                
                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               A(i+1,0), &lda,
                               X(0,i),   &ione, &c_zero,
                               work,     &ione );
                
                blasf77_sgemv( "No transpose", &i, &n_i, &c_one,
                               A(0,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(0,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );
                if (i != 0) {
                    blasf77_saxpy( &m_i1, &c_one, work, &ione, X(i+1,i), &ione );
                }
                
                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               X(i+1,0), &ldx,
                               X(0,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                blasf77_sscal( &m_i1, &taup[i], X(i+1,i), &ione );
                #ifdef COMPLEX
                lapackf77_slacgv( &n_i, A(i,i), &lda );
                magma_ssetvector( n_i,
                                  A(i,i), lda,
                                  dA(i,i), ldda, queue );
                #endif
                
                /* Update A(i+1:m,i) */
                #ifdef COMPLEX
                lapackf77_slacgv( &i, Y(i,0), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               A(i+1,0), &lda,
                               Y(i,0),   &ldy, &c_one,
                               A(i+1,i), &ione );
                #ifdef COMPLEX
                lapackf77_slacgv( &i, Y(i,0), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &m_i1, &i1, &c_neg_one,
                               X(i+1,0), &ldx,
                               A(0,i),   &ione, &c_one,
                               A(i+1,i), &ione );
                
                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                alpha = *A(i+1,i);
                lapackf77_slarfg( &m_i1, &alpha, A(min(i+2,m-1),i), &ione, &tauq[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = c_one;
                
                /* Compute Y(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( m_i1,
                                  A(i+1,i), 1,
                                  dA(i+1,i), 1, queue );
                
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, m_i1, n_i1, c_one,
                             dA(i+1,i+1), ldda,
                             dA(i+1,i), ione, c_zero,
                             dY(i+1,i), ione, queue );
                
                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( n_i1, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, queue );
                
                blasf77_sgemv( MagmaConjTransStr, &m_i1, &i, &c_one,
                               A(i+1,0), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(0,i),   &ione );
                blasf77_sgemv( "No transpose", &n_i1, &i, &c_neg_one,
                               Y(i+1,0), &ldy,
                               Y(0,i),   &ione, &c_zero,
                               work,     &ione );
                
                blasf77_sgemv( MagmaConjTransStr, &m_i1, &i1, &c_one,
                               X(i+1,0), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(0,i),   &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );
                if (i != 0) {
                    blasf77_saxpy( &n_i1, &c_one, work, &ione, Y(i+1,i), &ione );
                }
                
                blasf77_sgemv( MagmaConjTransStr, &i1, &n_i1, &c_neg_one,
                               A(0,i+1), &lda,
                               Y(0,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                blasf77_sscal( &n_i1, &tauq[i], Y(i+1,i), &ione );
            }
            #ifdef COMPLEX
            else {
                lapackf77_slacgv( &n_i, A(i,i), &lda );
                magma_ssetvector( n_i,
                                  A(i,i), lda,
                                  dA(i,i), ldda, queue );
            }
            #endif
        }
    }
    
    return info;
} /* magma_slabrd_gpu */
Example #12
0
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. N >= 0

    OFFSET  (input) INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    NB      (input) INTEGER
            The number of columns to factorize.

    KB      (output) INTEGER
            The number of columns actually factorized.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but no factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    LDA     (input) INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    JPVT    (input/output) INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    TAU     (output) REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    VN1     (input/output) DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    VN2     (input/output) DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    AUXV    (input/output) REAL array, dimension (NB)
            Auxiliar vector.

    F       (input/output) REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    LDF     (input) INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    =====================================================================    */
    
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    float d__1;
    float z__1;
    
    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k.
        // C:       pvt, k, isamax are all 0-based; don't subtract 1.
        pvt = k + cblas_isamax( n-k, &vn1[k], ione );
        
        if (pvt != k) {

            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb){
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU                          */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j){
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }
        
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;
        
            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );
        
            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_sgemv( MagmaTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );
            
            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );
            
            blasf77_sgemv( MagmaTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );
            
            magma_queue_sync( stream );
            blasf77_sgemv( MagmaTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );
            
            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }
        
        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );
        
                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);
        
                    if (temp2 <= tol3z) {
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }
        
        *A(rk, k) = Akk;
        
        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }
    
    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = cblas_snrm2(i__1, A(rk + 1, lsticc), ione);
        else {
            /* Where is the data, CPU or GPU ? */
            float r1, r2;
            
            r1 = cblas_snrm2(nb-k, A(rk + 1, lsticc), ione);
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);
            
            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2);
        }
        
        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }
    
    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */
Example #13
0
/**
    Purpose
    -------
    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by SGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     REAL array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       REAL array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_sgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slahr2_m(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t ldt,
    float *Y, magma_int_t ldy,
    struct sgehrd_data *data )
{
    #define  A(  i, j ) ( A + (i) + (j)*lda)
    #define  Y(  i, j ) ( Y + (i) + (j)*ldy)
    #define  T(  i, j ) ( T + (i) + (j)*ldt)
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)

    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float tmp;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t ione = 1;
    
    magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
    magma_int_t n_k_i_1, n_k;
    float scale;

    magma_int_t i;
    float ei = MAGMA_S_ZERO;

    magma_int_t info_data = 0;
    magma_int_t *info = &info_data;
    if (n < 0) {
        *info = -1;
    } else if (k < 0 || k >= n) {
        *info = -2;
    } else if (nb < 1 || nb > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldt < nb) {
        *info = -8;
    } else if (ldy < max(1,n)) {
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return *info;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    // zero out current top block of V on all GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv, data->queues[d] );
    }
    
    // set all Y=0
    lapackf77_slaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Finish applying I - V * T * V' on right
            tmp = MAGMA_S_NEGATE( tau[i-1] );
            blasf77_saxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_scopy( &i,
                           A(k+1,i), &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_strmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_sgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1), &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_strmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_sgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_strmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_saxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i), &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_slarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // compute yi = A vi = sum_g A{d} vi{d}
        nblocks = (n-1) / nb / ngpu + 1;
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            
            // dV(k+i+1:n-1, i) = VA(k+i:n, i)
            magma_ssetvector_async( n_k_i_1,
                                    A(k+i+1,i), 1,
                                    dV(d, k+i+1, i), 1, data->queues[d] );
            
            // copy column of dV -> dVd, using block cyclic distribution.
            // This assumes V and Vd have been padded so that
            // a 2D matrix copy doesn't access them out-of-bounds
            gblock = k / nb;
            lblock = gblock / ngpu;
            lgid   = gblock % ngpu;
            if ( d < lgid ) {
                lblock += 1;
            }
            // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
            magmablas_slacpy( MagmaFull, nb, nblocks-lblock,
                              dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
                              dVd(d, 0    + lblock*nb,      i), nb, data->queues[d] );
            
            // convert global indices (k) to local indices (dk)
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            
            // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
            // skip if matrix is empty
            // each GPU copies to different temporary vector in Y,
            // which are summed in separate loop below
            if ( dn-dki1 > 0 ) {
                magma_sgemv( MagmaNoTrans, n-k, dn-dki1,
                             c_one,  dA (d, k,    dki1), ldda,
                                     dVd(d, dki1,    i), 1,
                             c_zero, dY (d, k,       i), 1, data->queues[d] );
                
                // copy vector to host, storing in column nb+d of Y
                // as temporary space (Y has >= nb+ngpu columns)
                magma_sgetvector_async( n-k,
                                        dY(d, k, i), 1,
                                        Y(k, nb+d),  1, data->queues[d] );
            }
        }
        
        // while GPU is doing above Ag*v...
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_S_NEGATE( tau[i] );
        blasf77_sgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i), &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_strmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];
        
        // apply reflectors to next column, A(i+1), on right only.
        // one axpy will be required to finish this, in the next iteration above
        if ( i > 0 && i+1 < nb ) {
            // Update next column, A(k:n,i+1), applying Q on right.
            // One axpy will be required to finish this, in the next iteration
            // above, after yi is computed.
            // This updates one more row than LAPACK does (row k),
            // making block above panel an even multiple of nb.
            // Use last column of T as workspace, w.
            magma_int_t i1 = i+1;
            
            // If real, conjugate row of V, and undo afterwards
            #ifdef COMPLEX
            lapackf77_slacgv( &i1,  A(k+i1,0), &lda );
            #endif
            // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
            // T is now rectangular, so we use gemv instead of trmv as in lapack.
            blasf77_sgemv( "No trans", &i, &i1,
                           &c_one,  T(0,0), &ldt,
                                    A(k+i1,0), &lda,
                           &c_zero, T(0,nb-1), &ione );
            #ifdef COMPLEX
            lapackf77_slacgv( &i1,  A(k+i1,0), &lda );
            #endif
            
            // A(k:n, i+1) -= Y(k:n, 0:i) * w
            blasf77_sgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0), &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i1), &ione );
        }
        
        // yi = sum_g yi{d}
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magma_queue_sync( data->queues[d] );
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            if ( dn-dki1 > 0 ) {
                // yi = yi + yi{d}
                blasf77_saxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
            }
        }
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;
    
    // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        
        // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
        // skip if matrix is empty
        // each GPU copies to different temporary block in Y,
        // which are summed in separate loop below
        if ( dn-dki1 > 0 ) {
            magma_sgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1,
                         c_one,  dA (d, 0,    dki1), ldda,
                                 dVd(d, dki1,    0), ldvd,
                         c_zero, dY (d, 0,       0), ldda, data->queues[d] );
            
            // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
            // as temporary space (Y has nb + nb*ngpu columns)
            magma_sgetmatrix_async( k, nb,
                                    dY(d, 0, 0),  ldda,
                                    Y(0,nb+nb*d), ldy, data->queues[d] );
        }
    }
    
    // Y = sum_g Y{d}
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_queue_sync( 0 );
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        if ( dn-dki1 > 0 ) {
            // Y = Y + Am V
            for( i = 0; i < nb; ++i ) {
                blasf77_saxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
            }
        }
    }
    
    // copy Y and T matrices to GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_ssetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->queues[d] );
        magma_ssetmatrix_async( nb, nb, T, nb, dTi(d),      nb,   data->queues[d] );
    }

    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_slahr2 */
Example #14
0
/**
    Purpose
    -------
    SLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by SSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       REAL array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       REAL array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slatrd(magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
             float *A,  magma_int_t lda,
             float *e, float *tau,
             float *W,  magma_int_t ldw,
             float *dA, magma_int_t ldda,
             float *dW, magma_int_t lddw)
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(i, j) (dA + (j)*ldda + (i))
#define dW(i, j) (dW + (j)*lddw + (i))

    magma_int_t i;
    
    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float c_zero    = MAGMA_S_ZERO;

    float value = MAGMA_S_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
    
    float alpha;
    float *f;

    if (n <= 0) {
        return 0;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_smalloc_cpu( &f, n );
    assert( f != NULL );  // TODO return error, or allocate outside slatrd

    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
                #endif
                blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                              W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
                lapackf77_slacgv(&i_n, A(i, i+1), &lda);
                #endif
                blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                              A(i, i+1), &lda, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv(&i_n, A(i, i+1), &lda);
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                
                lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
                
                e[i-1] = MAGMA_S_REAL( alpha );
                *A(i-1,i) = MAGMA_S_ONE;
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                            dA(0, i), ione, c_zero, dW(0, iw), ione);
                
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i, 1,
                                        dW(0, iw),         lddw,
                                        W(0, iw) /*test*/, ldw, stream );
                
                if (i < n-1) {
                    blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                    
                    blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                    
                    blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }
                
                blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_sdot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_sdot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_saxpy(&i, &alpha, A(0, i), &ione,
                              W(0, iw), &ione);
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv(&i, W(i, 0), &ldw);
            #endif
            blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                          W(i, 0), &ldw, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv(&i, W(i, 0), &ldw);
            lapackf77_slacgv(&i, A(i, 0), &lda);
            #endif
            blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                          A(i, 0), &lda, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv(&i, A(i, 0), &lda);
            #endif
        
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = MAGMA_S_ONE;
        
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
            
                magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                            dW(i+1, i), ione);
            
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
        
                blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                
                blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                // 3. Here is where we need it
                magma_queue_sync( stream );
        
                if (i != 0)
                    blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
        
                blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_sdot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                value = cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i] * -0.5f * value;
                blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu( f );
    magma_queue_destroy( stream );

    return 0;
} /* magma_slatrd */
Example #15
0
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float *h_x, *h_x1, *h_x2, *h_tau;
    float *d_x, *d_tau;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float      error, work[1];
    magma_int_t N, size, nb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    // does larfg on nb columns, one after another
    nb = (opts.nb > 0 ? opts.nb : 64);
    
    magma_queue_t queue = 0;

    printf("    N    nb    CPU GFLop/s (ms)    GPU GFlop/s (ms)    error  \n");
    printf("==============================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[i];
            gflops = FLOPS_SLARFG( N ) / 1e9 * nb;
    
            TESTING_MALLOC( h_x,   float, N*nb );
            TESTING_MALLOC( h_x1,  float, N*nb );
            TESTING_MALLOC( h_x2,  float, N*nb );
            TESTING_MALLOC( h_tau, float, nb   );
        
            TESTING_DEVALLOC( d_x,   float, N*nb );
            TESTING_DEVALLOC( d_tau, float,   nb );
            
            /* Initialize the vector */
            size = N*nb;
            lapackf77_slarnv( &ione, ISEED, &size, h_x );
            blasf77_scopy( &size, h_x, &ione, h_x1, &ione );
            
            /* =====================================================================
               Performs operation using MAGMA-BLAS
               =================================================================== */
            magma_ssetvector( size, h_x, ione, d_x, ione );
    
            gpu_time = magma_sync_wtime( queue );
            for( int j = 0; j < nb; ++j ) {
                magma_slarfg( N, &d_x[0+j*N], &d_x[1+j*N], ione, &d_tau[j] );
            }
            gpu_time = magma_sync_wtime( queue ) - gpu_time;
            gpu_perf = gflops / gpu_time;
            
            magma_sgetvector( size, d_x, ione, h_x2, ione );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            for( int j = 0; j < nb; ++j ) {
                lapackf77_slarfg( &N, &h_x1[0+j*N], &h_x1[1+j*N], &ione, &h_tau[j] );
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Error Computation and Performance Compariosn
               =================================================================== */
            blasf77_saxpy( &size, &c_neg_one, h_x1, &ione, h_x2, &ione);
            error = lapackf77_slange( "F", &N, &nb, h_x2, &N, work );
            printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2g\n",
                   (int) N, (int) nb, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error );
            
            TESTING_FREE( h_x   );
            TESTING_FREE( h_x1  );
            TESTING_FREE( h_x2  );
            TESTING_FREE( h_tau );
        
            TESTING_DEVFREE( d_x   );
            TESTING_DEVFREE( d_tau );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return 0;
}