C++ (Cpp) blasf77_sgemv Exemples

Exemple #1

0

Afficher le fichier

Fichier : testing_sgetrf_gpu.cpp Projet : xulunfan/magma

// On input, A and ipiv is LU factorization of A. On output, A is overwritten.
// Requires m == n.
// Uses init_matrix() to re-generate original A as needed.
// Generates random RHS b and solves Ax=b.
// Returns residual, |Ax - b| / (n |A| |x|).
float get_residual(
    magma_opts &opts,
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda,
    magma_int_t *ipiv )
{
    if ( m != n ) {
        printf( "\nERROR: residual check defined only for square matrices\n" );
        return -1;
    }
    
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const magma_int_t ione = 1;
    
    // this seed should be DIFFERENT than used in init_matrix
    // (else x is column of A, so residual can be exactly zero)
    magma_int_t ISEED[4] = {0,0,0,2};
    magma_int_t info = 0;
    float *x, *b;
    
    // initialize RHS
    TESTING_MALLOC_CPU( x, float, n );
    TESTING_MALLOC_CPU( b, float, n );
    lapackf77_slarnv( &ione, ISEED, &n, b );
    blasf77_scopy( &n, b, &ione, x, &ione );
    
    // solve Ax = b
    lapackf77_sgetrs( "Notrans", &n, &ione, A, &lda, ipiv, x, &n, &info );
    if (info != 0) {
        printf("lapackf77_sgetrs returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    }
    
    // reset to original A
    init_matrix( opts, m, n, A, lda );
    
    // compute r = Ax - b, saved in b
    blasf77_sgemv( "Notrans", &m, &n, &c_one, A, &lda, x, &ione, &c_neg_one, b, &ione );
    
    // compute residual |Ax - b| / (n*|A|*|x|)
    float norm_x, norm_A, norm_r, work[1];
    norm_A = lapackf77_slange( "F", &m, &n, A, &lda, work );
    norm_r = lapackf77_slange( "F", &n, &ione, b, &n, work );
    norm_x = lapackf77_slange( "F", &n, &ione, x, &n, work );
    
    //printf( "r=\n" ); magma_sprint( 1, n, b, 1 );
    
    TESTING_FREE_CPU( x );
    TESTING_FREE_CPU( b );
    
    //printf( "r=%.2e, A=%.2e, x=%.2e, n=%d\n", norm_r, norm_A, norm_x, n );
    return norm_r / (n * norm_A * norm_x);
}

Exemple #2

0

Afficher le fichier

Fichier : testing_sgeqrf_gpu.cpp Projet : xulunfan/magma

/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgeqrf
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    const float             d_neg_one = MAGMA_D_NEG_ONE;
    const float             d_one     = MAGMA_D_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t        ione      = 1;
    
    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float           Anorm, error=0, error2=0;
    float *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloat_ptr d_A, dT;
    magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;
    magma_int_t ISEED[4] = {0,0,0,1};
    
    magma_opts opts;
    opts.parse_opts( argc, argv );
    
    magma_int_t status = 0;
    float tol = opts.tolerance * lapackf77_slamch("E");
    
    // version 3 can do either check
    if (opts.check == 1 && opts.version == 1) {
        opts.check = 2;
        printf( "%% version 1 requires check 2 (solve A*x=b)\n" );
    }
    if (opts.check == 2 && opts.version == 2) {
        opts.check = 1;
        printf( "%% version 2 requires check 1 (R - Q^H*A)\n" );
    }
    
    printf( "%% version %d\n", (int) opts.version );
    if ( opts.check == 1 ) {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");
        printf("%%==============================================================================\n");
    }
    else {
        printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)    |b - A*x|\n");
        printf("%%===============================================================\n");
    }
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min( M, N );
            lda    = M;
            n2     = lda*N;
            ldda   = magma_roundup( M, opts.align );  // multiple of 32 by default
            nb     = magma_get_sgeqrf_nb( M, N );
            gflops = FLOPS_SGEQRF( M, N ) / 1e9;
            
            // query for workspace size
            lwork = -1;
            lapackf77_sgeqrf( &M, &N, NULL, &M, NULL, tmp, &lwork, &info );
            lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );
            
            TESTING_MALLOC_CPU( tau,    float, min_mn );
            TESTING_MALLOC_CPU( h_A,    float, n2     );
            TESTING_MALLOC_CPU( h_work, float, lwork  );
            
            TESTING_MALLOC_PIN( h_R,    float, n2     );
            
            TESTING_MALLOC_DEV( d_A,    float, ldda*N );
            
            if ( opts.version == 1 || opts.version == 3 ) {
                size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb;
                TESTING_MALLOC_DEV( dT, float, size );
                magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size );
            }
            
            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &n2, h_A );
            lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );
            magma_ssetmatrix( M, N, h_R, lda, d_A, ldda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            nb = magma_get_sgeqrf_nb( M, N );
            
            gpu_time = magma_wtime();
            if ( opts.version == 1 ) {
                // stores dT, V blocks have zeros, R blocks inverted & stored in dT
                magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );
            }
            else if ( opts.version == 2 ) {
                // LAPACK complaint arguments
                magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );
            }
            #ifdef HAVE_CUBLAS
            else if ( opts.version == 3 ) {
                // stores dT, V blocks have zeros, R blocks stored in dT
                magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );
            }
            #endif
            else {
                printf( "Unknown version %d\n", (int) opts.version );
                return -1;
            }
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0) {
                printf("magma_sgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }
            
            if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) {
                if ( opts.version == 3 ) {
                    // copy diagonal blocks of R back to A
                    for( int i=0; i < min_mn-nb; i += nb ) {
                        magma_int_t ib = min( min_mn-i, nb );
                        magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda );
                    }
                }
                
                /* =====================================================================
                   Check the result, following zqrt01 except using the reduced Q.
                   This works for any M,N (square, tall, wide).
                   Only for version 2, which has LAPACK complaint output.
                   Or   for version 3, after restoring diagonal blocks of A above.
                   =================================================================== */
                magma_sgetmatrix( M, N, d_A, ldda, h_R, lda );
                
                magma_int_t ldq = M;
                magma_int_t ldr = min_mn;
                float *Q, *R;
                float *work;
                TESTING_MALLOC_CPU( Q,    float, ldq*min_mn );  // M by K
                TESTING_MALLOC_CPU( R,    float, ldr*N );       // K by N
                TESTING_MALLOC_CPU( work, float,             min_mn );
                
                // generate M by K matrix Q, where K = min(M,N)
                lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq );
                lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );
                assert( info == 0 );
                
                // copy K by N matrix R
                lapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr );
                lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda,        R, &ldr );
                
                // error = || R - Q^H*A || / (N * ||A||)
                blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M,
                               &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr );
                Anorm = lapackf77_slange( "1", &M,      &N, h_A, &lda, work );
                error = lapackf77_slange( "1", &min_mn, &N, R,   &ldr, work );
                if ( N > 0 && Anorm > 0 )
                    error /= (N*Anorm);
                
                // set R = I (K by K identity), then R = I - Q^H*Q
                // error = || I - Q^H*Q || / N
                lapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr );
                blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr );
                error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work );
                if ( N > 0 )
                    error2 /= N;
                
                TESTING_FREE_CPU( Q    );  Q    = NULL;
                TESTING_FREE_CPU( R    );  R    = NULL;
                TESTING_FREE_CPU( work );  work = NULL;
            }
            else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3) ) {
                /* =====================================================================
                   Check the result by solving consistent linear system, A*x = b.
                   Only for versions 1 & 3 with M >= N.
                   =================================================================== */
                magma_int_t lwork2;
                float *x, *b, *hwork;
                magmaFloat_ptr d_B;

                // initialize RHS, b = A*random
                TESTING_MALLOC_CPU( x, float, N );
                TESTING_MALLOC_CPU( b, float, M );
                lapackf77_slarnv( &ione, ISEED, &N, x );
                blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );
                // copy to GPU
                TESTING_MALLOC_DEV( d_B, float, M );
                magma_ssetvector( M, b, 1, d_B, 1 );

                if ( opts.version == 1 ) {
                    // allocate hwork
                    magma_sgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, tmp, -1, &info );
                    lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
                    TESTING_MALLOC_CPU( hwork, float, lwork2 );

                    // solve linear system
                    magma_sgeqrs_gpu( M, N, 1,
                                      d_A, ldda, tau, dT,
                                      d_B, M, hwork, lwork2, &info );
                    if (info != 0) {
                        printf("magma_sgeqrs returned error %d: %s.\n",
                               (int) info, magma_strerror( info ));
                    }
                    TESTING_FREE_CPU( hwork );
                }
                #ifdef HAVE_CUBLAS
                else if ( opts.version == 3 ) {
                    // allocate hwork
                    magma_sgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, tmp, -1, &info );
                    lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
                    TESTING_MALLOC_CPU( hwork, float, lwork2 );

                    // solve linear system
                    magma_sgeqrs3_gpu( M, N, 1,
                                       d_A, ldda, tau, dT,
                                       d_B, M, hwork, lwork2, &info );
                    if (info != 0) {
                        printf("magma_sgeqrs3 returned error %d: %s.\n",
                               (int) info, magma_strerror( info ));
                    }
                    TESTING_FREE_CPU( hwork );
                }
                #endif
                else {
                    printf( "Unknown version %d\n", (int) opts.version );
                    return -1;
                }
                magma_sgetvector( N, d_B, 1, x, 1 );

                // compute r = Ax - b, saved in b
                blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );

                // compute residual |Ax - b| / (max(m,n)*|A|*|x|)
                float norm_x, norm_A, norm_r, work[1];
                norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work );
                norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work );
                norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work );

                TESTING_FREE_CPU( x );
                TESTING_FREE_CPU( b );
                TESTING_FREE_DEV( d_B );

                error = norm_r / (max(M,N) * norm_A * norm_x);
            }
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0) {
                    printf("lapackf77_sgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                }
            }
            
            /* =====================================================================
               Print performance and error.
               =================================================================== */
            printf("%5d %5d   ", (int) M, (int) N );
            if ( opts.lapack ) {
                printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );
            }
            else {
                printf("  ---   (  ---  )" );
            }
            printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );
            if ( opts.check == 1 ) {
                bool okay = (error < tol && error2 < tol);
                status += ! okay;
                printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );
            }
            else if ( opts.check == 2 ) {
                if ( M >= N ) {
                    bool okay = (error < tol);
                    status += ! okay;
                    printf( "%10.2e   %s\n", error, (okay ? "ok" : "failed") );
                }
                else {
                    printf( "(error check only for M >= N)\n" );
                }
            }
            else {
                printf( "    ---\n" );
            }
            
            TESTING_FREE_CPU( tau    );
            TESTING_FREE_CPU( h_A    );
            TESTING_FREE_CPU( h_work );
            
            TESTING_FREE_PIN( h_R );
            
            TESTING_FREE_DEV( d_A );
            
            if ( opts.version == 1 || opts.version == 3 ) {
                TESTING_FREE_DEV( dT );
            }
            
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}

Exemple #3

0

Afficher le fichier

Fichier : slahr2.cpp Projet : cjy7117/FT-MAGMA

/**
    Purpose
    -------
    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by SGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    dA      REAL array on the GPU, dimension (LDDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements in rows K:N of the first NB columns are
            overwritten with the matrix Y.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    @param[out]
    dV      REAL array on the GPU, dimension (LDDV, NB)
            On exit this n-by-nb array contains the Householder vectors of the transformation.

    @param[in]
    lddv    INTEGER
            The leading dimension of the array dV.  LDDV >= max(1,N).

    @param[in,out]
    A       REAL array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     REAL array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       REAL array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_sgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaFloat_ptr dA, magma_int_t ldda,
    magmaFloat_ptr dV, magma_int_t lddv,
    float *A,     magma_int_t lda,
    float *tau,
    float *T,     magma_int_t ldt,
    float *Y,     magma_int_t ldy )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define  T(i_,j_) ( T + (i_) + (j_)*ldt)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dV(i_,j_) (dV + (i_) + (j_)*lddv)
    
    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t ione = 1;
    
    magma_int_t n_k_i_1, n_k;
    float scale;

    magma_int_t i;
    float ei = MAGMA_S_ZERO;

    magma_int_t info = 0;
    if (n < 0) {
        info = -1;
    } else if (k < 0 || k > n) {
        info = -2;
    } else if (nb < 1 || nb > n) {
        info = -3;
    } else if (ldda < max(1,n)) {
        info = -5;
    } else if (lddv < max(1,n)) {
        info = -7;
    } else if (lda < max(1,n)) {
        info = -9;
    } else if (ldt < max(1,nb)) {
        info = -12;
    } else if (ldy < max(1,n)) {
        info = -13;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // adjust from 1-based indexing
    k -= 1;

    if (n <= 1)
        return info;
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Update A(k:n-1,i); Update i-th column of A - Y * T * V'
            // This updates one more row than LAPACK does (row k),
            // making the block above the panel an even multiple of nb.
            // Use last column of T as workspace, w.
            // w(0:i-1, nb-1) = VA(k+i, 0:i-1)'
            blasf77_scopy( &i,
                           A(k+i,0),  &lda,
                           T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            // If real, conjugate row of V.
            lapackf77_slacgv(&i, T(0,nb-1), &ione);
            #endif
            
            // w = T(0:i-1, 0:i-1) * w
            blasf77_strmv( "Upper", "No trans", "No trans", &i,
                           T(0,0),    &ldt,
                           T(0,nb-1), &ione );
            
            // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w
            blasf77_sgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0),    &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i),    &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_scopy( &i,
                           A(k+1,i),  &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_strmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_sgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1),  &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_strmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_sgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1),  &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_strmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_saxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i),  &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_slarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i)
        magma_ssetvector( n_k_i_1,
                          A(k+i+1,i), 1,
                          dV(i+1,i),  1 );
        
        // Compute Y(k+1:n,i) = A vi
        // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i)
        magma_sgemv( MagmaNoTrans, n_k, n_k_i_1,
                     c_one,  dA(k,i+1), ldda,
                             dV(i+1,i), ione,
                     c_zero, dA(k,i),   ione );
        
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_S_NEGATE( tau[i]);
        blasf77_sgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i),     &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_strmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];

        // Y(k:n-1, i) = dA(k:n-1, i)
        magma_sgetvector( n-k,
                          dA(k,i), 1,
                          Y(k,i),  1 );
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;

    return info;
} /* magma_slahr2 */

Exemple #4

0

Afficher le fichier

Fichier : slahr2.cpp Projet : kjbartel/clmagma

extern "C" magma_int_t
magma_slahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaFloat_ptr da, size_t da_offset, magma_int_t ldda,
    magmaFloat_ptr dv, size_t dv_offset, magma_int_t lddv,
    float *a, magma_int_t lda,
    float *tau,
    float *t, magma_int_t ldt,
    float *y, magma_int_t ldy,
    magma_queue_t queue)
{
/*  -- clMAGMA auxiliary routine (version 0.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======

    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.

    This is an auxiliary routine called by SGEHRD.

    Arguments
    =========

    N       (input) INTEGER
            The order of the matrix A.

    K       (input) INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    NB      (input) INTEGER
            The number of columns to be reduced.

    DA      (input/output) REAL array on the GPU, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    DV      (output) REAL array on the GPU, dimension (N, NB)
            On exit this contains the Householder vectors of the transformation.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    TAU     (output) REAL array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    T       (output) REAL array, dimension (LDT,NB)
            The upper triangular matrix T.

    LDT     (input) INTEGER
            The leading dimension of the array T.  LDT >= NB.

    Y       (output) REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    LDY     (input) INTEGER
            The leading dimension of the array Y. LDY >= N.

    Further Details
    ===============
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    =====================================================================    */


    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t c__1 = 1;
    
    magma_int_t a_dim1, a_offset, t_dim1, t_offset, y_dim1, y_offset, i__2, i__3;
    float d__1;

    magma_int_t i__;
    float ei;

    --tau;
    a_dim1 = lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    t_dim1 = ldt;
    t_offset = 1 + t_dim1;
    t -= t_offset;
    y_dim1 = ldy;
    y_offset = 1 + y_dim1;
    y -= y_offset;

    if (n <= 1)
      return MAGMA_SUCCESS;
    
    for (i__ = 1; i__ <= nb; ++i__) {
        if (i__ > 1) {

          /* Update A(K+1:N,I); Update I-th column of A - Y * V' */
          i__2 = n - k + 1;
          i__3 = i__ - 1;
          #if defined(PRECISION_z) || defined(PRECISION_c)
             lapackf77_slacgv(&i__3, &a[k+i__-1+a_dim1], &lda);
          #endif
          blasf77_scopy(&i__3, &a[k+i__-1+a_dim1], &lda, &t[nb*t_dim1+1], &c__1);
          blasf77_strmv("u","n","n",&i__3,&t[t_offset], &ldt, &t[nb*t_dim1+1], &c__1);

          blasf77_sgemv("NO TRANSPOSE", &i__2, &i__3, &c_neg_one, &y[k + y_dim1],
                        &ldy, &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__*a_dim1],&c__1);

          #if defined(PRECISION_z) || defined(PRECISION_c)
             lapackf77_slacgv(&i__3, &a[k+i__-1+a_dim1], &lda);
          #endif

          /* Apply I - V * T' * V' to this column (call it b) from the
             left, using the last column of T as workspace

             Let  V = ( V1 )   and   b = ( b1 )   (first I-1 rows)
                      ( V2 )             ( b2 )
             where V1 is unit lower triangular
             w := V1' * b1                                                 */
          
          i__2 = i__ - 1;
          blasf77_scopy(&i__2, &a[k+1+i__*a_dim1], &c__1, &t[nb*t_dim1+1], &c__1);
          blasf77_strmv("Lower", MagmaConjTransStr, "UNIT", &i__2,
                        &a[k + 1 + a_dim1], &lda, &t[nb * t_dim1 + 1], &c__1);

          /* w := w + V2'*b2 */
          i__2 = n - k - i__ + 1;
          i__3 = i__ - 1;
          blasf77_sgemv(MagmaConjTransStr, &i__2, &i__3, &c_one,
                        &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1,
                        &c_one, &t[nb*t_dim1+1], &c__1);

          /* w := T'*w */
          i__2 = i__ - 1;
          blasf77_strmv("U", MagmaConjTransStr, "N", &i__2, &t[t_offset], &ldt,
                        &t[nb*t_dim1+1], &c__1);
          
          /* b2 := b2 - V2*w */
          i__2 = n - k - i__ + 1;
          i__3 = i__ - 1;
          blasf77_sgemv("N", &i__2, &i__3, &c_neg_one, &a[k + i__ + a_dim1], &lda,
                 &t[nb*t_dim1+1], &c__1, &c_one, &a[k+i__+i__*a_dim1], &c__1);

          /* b1 := b1 - V1*w */
          i__2 = i__ - 1;
          blasf77_strmv("L","N","U",&i__2,&a[k+1+a_dim1],&lda,&t[nb*t_dim1+1],&c__1);
          blasf77_saxpy(&i__2, &c_neg_one, &t[nb * t_dim1 + 1], &c__1,
                 &a[k + 1 + i__ * a_dim1], &c__1);
          
          a[k + i__ - 1 + (i__ - 1) * a_dim1] = ei;
        }
        
        /* Generate the elementary reflector H(I) to annihilate A(K+I+1:N,I) */
        i__2 = n - k - i__ + 1;
        i__3 = k + i__ + 1;
        lapackf77_slarfg(&i__2, &a[k + i__ + i__ * a_dim1],
                         &a[min(i__3,n) + i__ * a_dim1], &c__1, &tau[i__]);
        ei = a[k + i__ + i__ * a_dim1];
        a[k + i__ + i__ * a_dim1] = c_one;

        /* Compute  Y(K+1:N,I) */
        i__2 = n - k;
        i__3 = n - k - i__ + 1;
        magma_ssetvector( i__3, &a[k + i__ + i__*a_dim1], 1, dv, dv_offset+(i__-1)*(lddv+1),      1, queue );

        magma_sgemv(MagmaNoTrans, i__2+1, i__3, c_one,
                    da, da_offset + (-1 + k + i__ * ldda), ldda,
                    dv, dv_offset + (i__-1)*(lddv+1), c__1, c_zero,
                    da, da_offset + (-1 + k + (i__-1)*ldda), c__1, queue);
        
        i__2 = n - k - i__ + 1;
        i__3 = i__ - 1;
        blasf77_sgemv(MagmaConjTransStr, &i__2, &i__3, &c_one,
                      &a[k + i__ + a_dim1], &lda, &a[k+i__+i__*a_dim1], &c__1,
                      &c_zero, &t[i__*t_dim1+1], &c__1);

        /* Compute T(1:I,I) */
        i__2 = i__ - 1;
        d__1 = MAGMA_S_NEGATE( tau[i__] );
        blasf77_sscal(&i__2, &d__1, &t[i__ * t_dim1 + 1], &c__1);
        blasf77_strmv("U","N","N", &i__2, &t[t_offset], &ldt, &t[i__*t_dim1+1], &c__1);
        t[i__ + i__ * t_dim1] = tau[i__];

        magma_sgetvector( n - k + 1, da, da_offset+(-1+ k+(i__-1)*ldda), 1, y+ k + i__*y_dim1, 1, queue );
    }
    a[k + nb + nb * a_dim1] = ei;

    return MAGMA_SUCCESS;
} /* magma_slahr2 */

Exemple #5

0

Afficher le fichier

Fichier : slaqps.cpp Projet : cjy7117/DVFS-MAGMA

/**
    Purpose
    -------
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but no factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    REAL array, dimension (NB)
            Auxiliar vector.

    @param[in,out]
    F       REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @ingroup magma_sgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    float d__1;
    float z__1;
    
    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran isamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1;
        
        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU                          */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }
        
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;
        
            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );
        
            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_sgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );
            
            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );
            
            blasf77_sgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );
            
            magma_queue_sync( stream );
            blasf77_sgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );
            
            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }
        
        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );
        
                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);
        
                    if (temp2 <= tol3z) {
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }
        
        *A(rk, k) = Akk;
        
        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }
    
    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = magma_cblas_snrm2( i__1, A(rk+1,lsticc), ione );
        else {
            /* Where is the data, CPU or GPU ? */
            float r1, r2;
            
            r1 = magma_cblas_snrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);
            
            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2);
        }
        
        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }
    
    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */

Exemple #6

0

Afficher le fichier

Fichier : slatrd.cpp Projet : cjy7117/FT-MAGMA

/**
    Purpose
    -------
    SLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by SSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       REAL array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       REAL array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).
    
    @param
    dA
    
    @param
    ldda
    
    @param
    dW
    
    @param
    lddw

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

        Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

        (  a   a   a   v4  v5 )              (  d                  )
        (      a   a   v4  v5 )              (  1   d              )
        (          a   1   v5 )              (  v1  1   a          )
        (              d   1  )              (  v1  v2  a   a      )
        (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slatrd(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
    float *A,  magma_int_t lda,
    float *e, float *tau,
    float *W,  magma_int_t ldw,
    magmaFloat_ptr dA, magma_int_t ldda,
    magmaFloat_ptr dW, magma_int_t lddw)
{
    #define A(i_, j_) (A + (i_) + (j_)*lda)
    #define W(i_, j_) (W + (i_) + (j_)*ldw)
    
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #define dW(i_, j_) (dW + (i_) + (j_)*lddw)

    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t ione = 1;

    float alpha, value;
    magma_int_t i, i_n, i_1, iw;

    /* Check arguments */
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper ) {
        info = -1;
    } else if ( n < 0 ) {
        info = -2;
    } else if ( nb < 1 ) {
        info = -3;
    } else if ( lda < max(1,n) ) {
        info = -5;
    } else if ( ldw < max(1,n) ) {
        info = -9;
    } else if ( ldda < max(1,n) ) {
        info = -11;
    } else if ( lddw < max(1,n) ) {
        info = -13;
    }
    
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }
    
    /* Quick return if possible */
    if (n == 0) {
        return info;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    
    float *f;
    magma_smalloc_cpu( &f, n );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }
    
    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i_n, W(i, iw+1), &ldw );
                #endif
                blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                               W(i, iw+1), &ldw, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i_n, W(i, iw+1), &ldw );
                lapackf77_slacgv( &i_n, A(i, i+1),  &lda );
                #endif
                blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                               A(i, i+1), &lda, &c_one, A(0, i), &ione );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i_n, A(i, i+1), &lda );
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                
                lapackf77_slarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] );
                
                e[i-1] = MAGMA_S_REAL( alpha );
                *A(i-1,i) = MAGMA_S_ONE;
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                magma_ssymv( MagmaUpper, i, c_one, dA(0, 0), ldda,
                             dA(0, i), ione, c_zero, dW(0, iw), ione );
                
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i, 1,
                                        dW(0, iw), lddw,
                                        W(0, iw),  ldw, stream );
                
                if (i < n-1) {
                    blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                    
                    blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                   A(0, i), &ione, &c_zero, W(i+1, iw), &ione );
                    
                    blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                   W(i+1, iw), &ione, &c_one, W(0, iw), &ione );
                }
                
                blasf77_sscal( &i, &tau[i - 1], W(0, iw), &ione );
                
                value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione );
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_saxpy( &i, &alpha, A(0, i), &ione,
                               W(0, iw), &ione );
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i, W(i, 0), &ldw );
            #endif
            blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                           W(i, 0), &ldw, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i, W(i, 0), &ldw );
            lapackf77_slacgv( &i, A(i, 0), &lda );
            #endif
            blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                           A(i, 0), &lda, &c_one, A(i, i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i, A(i, 0), &lda );
            #endif
            
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_slarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = MAGMA_S_ONE;
                
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
                
                magma_ssymv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda,
                             dA(i+1, i), ione, c_zero, dW(i+1, i), ione );
                
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
                
                blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                               W(0, i), &ione, &c_zero, f, &ione );
                
                blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                               A(i+1, i), &ione, &c_zero, W(0, i), &ione );
                
                // 3. Here is where we need it
                magma_queue_sync( stream );
                
                if (i != 0)
                    blasf77_saxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione );
                
                blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                               W(0, i), &ione, &c_one, W(i+1, i), &ione );
                blasf77_sscal( &i_n, &tau[i], W(i+1,i), &ione );
                
                value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                alpha = tau[i] * -0.5f * value;
                blasf77_saxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione );
            }
        }
    }

    magma_free_cpu( f );
    magma_queue_destroy( stream );

    return info;
} /* magma_slatrd */

Exemple #7

0

Afficher le fichier

Fichier : slatrd.cpp Projet : EmergentOrder/clmagma

extern "C" magma_err_t
magma_slatrd(char uplo, magma_int_t n, magma_int_t nb,
             float *a,  magma_int_t lda,
             float *e, float *tau,
             float *w,  magma_int_t ldw,
             magmaFloat_ptr da, size_t da_offset, magma_int_t ldda,
             magmaFloat_ptr dw, size_t dw_offset, magma_int_t lddw, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2014

    Purpose
    =======
    SLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = 'U', SLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = 'L', SLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by SSYTRD.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
            = 'U': Upper triangular
            = 'L': Lower triangular

    N       (input) INTEGER
            The order of the matrix A.

    NB      (input) INTEGER
            The number of rows and columns to be reduced.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
            if UPLO = 'U', the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
            if UPLO = 'L', the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    E       (output) REAL array, dimension (N-1)
            If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = 'L', E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    TAU     (output) REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'.
            See Further Details.

    W       (output) REAL array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    LDW     (input) INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ===============
    If UPLO = 'U', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = 'L', the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = 'U':                       if UPLO = 'L':

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).
    =====================================================================    */
  
    char uplo_[2]  = {uplo, 0};

    magma_int_t i;
  
    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float c_zero    = MAGMA_S_ZERO;

    float value = MAGMA_S_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
  
    float alpha;

    float *f;
    magma_smalloc_cpu( &f, n );

    if (n <= 0) {
      return 0;
    }

    magma_event_t event = NULL;

    if (lapackf77_lsame(uplo_, "U")) {

      /* Reduce last NB columns of upper triangle */
      for (i = n-1; i >= n - nb ; --i) {
        i_1 = i + 1;
        i_n = n - i - 1;
        
        iw = i - n + nb;
        if (i < n-1) {
          /* Update A(1:i,i) */
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
          #endif
          blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                        W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
              lapackf77_slacgv(&i_n, A(i, i+1), &lda);
          #endif
          blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                        A(i, i+1), &lda, &c_one, A(0, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i_n, A(i, i+1), &lda);
          #endif
        }
        if (i > 0) {
          /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
          
          alpha = *A(i-1, i);
          
          lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
          
          e[i-1] = MAGMA_S_REAL( alpha );
          MAGMA_S_SET2REAL(*A(i-1, i), 1.);
          
          /* Compute W(1:i-1,i) */
          // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
          magma_ssetvector( i, A(0, i), 0, 1, dA(0, i), 1, queue );
          
          magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                      dA(0, i), ione, c_zero, dW(0, iw), ione, queue);
          
          // 2. Start putting the result back (asynchronously)
          magma_sgetmatrix_async( i, 1,
                                  dW(0, iw),         lddw,
                                  W(0, iw)/*test*/, 0, ldw, queue, &event );
          
          if (i < n-1) {

            blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                          A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
          }
          
            // 3. Here is where we need it // TODO find the right place
            magma_event_sync(event);

          if (i < n-1) {
          
            blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                          W(i+1, iw), &ione, &c_one, W(0, iw), &ione);

            blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                          A(0, i), &ione, &c_zero, W(i+1, iw), &ione);

            blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                          W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
          }

          blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione);

          #if defined(PRECISION_z) || defined(PRECISION_c)
          cblas_sdot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
          #else
          value = cblas_sdot( i, W(0,iw), ione, A(0,i), ione );
          #endif
          alpha = tau[i - 1] * -0.5f * value;
          blasf77_saxpy(&i, &alpha, A(0, i), &ione,
                        W(0, iw), &ione);
        }
      }

    } else {

      /*  Reduce first NB columns of lower triangle */
      for (i = 0; i < nb; ++i)
        {

          /* Update A(i:n,i) */
          i_n = n - i;
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i, W(i, 0), &ldw);
          #endif
          blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                        W(i, 0), &ldw, &c_one, A(i, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i, W(i, 0), &ldw);
              lapackf77_slacgv(&i, A(i ,0), &lda);
          #endif
          blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                        A(i, 0), &lda, &c_one, A(i, i), &ione);
          #if defined(PRECISION_z) || defined(PRECISION_c)
              lapackf77_slacgv(&i, A(i, 0), &lda);
          #endif

          if (i < n-1)
            {
              /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
              i_n = n - i - 1;
              alpha = *A(i+1, i);
              lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
              e[i] = MAGMA_S_REAL( alpha );
              MAGMA_S_SET2REAL(*A(i+1, i), 1.);

              /* Compute W(i+1:n,i) */
              // 1. Send the block reflector  A(i+1:n,i) to the GPU
              magma_ssetvector( i_n, A(i+1, i), 0, 1, dA(i+1, i), 1, queue );
          
              magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                          dW(i+1, i), ione, queue);
          
              // 2. Start putting the result back (asynchronously)
              magma_sgetmatrix_async( i_n, 1,
                                      dW(i+1, i), lddw,
                                      W(i+1, i), 0, ldw, queue, &event );

              blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                            A(i+1, i), &ione, &c_zero, W(0, i), &ione);

              blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                            W(0, i), &ione, &c_zero, f, &ione);
              
              blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                            A(i+1, i), &ione, &c_zero, W(0, i), &ione);

              // 3. Here is where we need it
              magma_event_sync(event);

              if (i!=0)
                blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
     
              blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                            W(0, i), &ione, &c_one, W(i+1, i), &ione);
              blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione);
              
              #if defined(PRECISION_z) || defined(PRECISION_c)
              cblas_sdot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
              #else
              value = cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione );
              #endif
              alpha = tau[i] * -0.5f * value;
              blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu(f);

    return 0;
} /* slatrd_ */

Exemple #8

0

Afficher le fichier

Fichier : slabrd_gpu.cpp Projet : maxhutch/magma

/***************************************************************************//**
    Purpose
    -------
    SLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by SGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors; and
              elements above the diagonal in the first NB rows, with the
              array TAUP, represent the orthogonal matrix P as a product
              of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors, and
              elements on and above the diagonal in the first NB rows,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[in,out]
    dA      REAL array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    d       REAL array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix.  D(i) = A(i,i).

    @param[out]
    e       REAL array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    REAL array dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    REAL array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       REAL array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      REAL array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      REAL array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    @param
    work    REAL array, dimension (LWORK)
            Workspace.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK. LWORK >= max( M, N ).

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_labrd
*******************************************************************************/
extern "C" magma_int_t
magma_slabrd_gpu(
    magma_int_t m, magma_int_t n, magma_int_t nb,
    float     *A, magma_int_t lda,
    magmaFloat_ptr dA, magma_int_t ldda,
    float *d, float *e, float *tauq, float *taup,
    float     *X, magma_int_t ldx,
    magmaFloat_ptr dX, magma_int_t lddx,
    float     *Y, magma_int_t ldy,
    magmaFloat_ptr dY, magma_int_t lddy,
    float  *work, magma_int_t lwork,
    magma_queue_t queue )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  X(i_,j_) ( X + (i_) + (j_)*ldx)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)
    
    /* Constants */
    const float c_neg_one = MAGMA_S_NEG_ONE;
    const float c_one     = MAGMA_S_ONE;
    const float c_zero    = MAGMA_S_ZERO;
    const magma_int_t ione = 1;
    
    /* Local variables */
    magma_int_t i, i1, m_i, m_i1, n_i, n_i1;
    float alpha;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i=0; i < nb; ++i) {
            /* Update A(i:m,i) */
            i1   = i + 1;
            m_i  = m - i;
            m_i1 = m - (i+1);
            n_i1 = n - (i+1);
            #ifdef COMPLEX
            lapackf77_slacgv( &i, Y(i,0), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &m_i, &i, &c_neg_one,
                           A(i,0), &lda,
                           Y(i,0), &ldy, &c_one,
                           A(i,i), &ione );
            #ifdef COMPLEX
            lapackf77_slacgv( &i, Y(i,0), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &m_i, &i, &c_neg_one,
                           X(i,0), &ldx,
                           A(0,i), &ione, &c_one,
                           A(i,i), &ione );
            
            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            lapackf77_slarfg( &m_i, &alpha, A(min(i+1,m-1),i), &ione, &tauq[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i+1 < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( m_i,
                                  A(i,i), 1,
                                  dA(i,i), 1, queue );
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, m_i, n_i1, c_one,
                             dA(i,i+1),   ldda,
                             dA(i,i), ione, c_zero,
                             dY(i+1,i),   ione, queue );
                
                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( n_i1, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, queue );
                blasf77_sgemv( MagmaConjTransStr, &m_i, &i, &c_one,
                               A(i,0), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(0,i), &ione );

                blasf77_sgemv( "N", &n_i1, &i, &c_neg_one,
                               Y(i+1,0), &ldy,
                               Y(0,i),   &ione, &c_zero,
                               work,     &ione );
                blasf77_sgemv( MagmaConjTransStr, &m_i, &i, &c_one,
                               X(i,0), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(0,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );

                if (i != 0) {
                    blasf77_saxpy( &n_i1, &c_one, work, &ione, Y(i+1,i), &ione );
                }

                blasf77_sgemv( MagmaConjTransStr, &i, &n_i1, &c_neg_one,
                               A(0,i+1), &lda,
                               Y(0,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                blasf77_sscal( &n_i1, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                #ifdef COMPLEX
                lapackf77_slacgv( &n_i1, A(i,i+1), &lda );
                lapackf77_slacgv( &i1,  A(i,0), &lda );
                #endif
                blasf77_sgemv( "No transpose", &n_i1, &i1, &c_neg_one,
                               Y(i+1,0), &ldy,
                               A(i,0),   &lda, &c_one,
                               A(i,i+1), &lda );
                #ifdef COMPLEX
                lapackf77_slacgv( &i1,  A(i,0), &lda );
                lapackf77_slacgv( &i, X(i,0), &ldx );
                #endif
                blasf77_sgemv( MagmaConjTransStr, &i, &n_i1, &c_neg_one,
                               A(0,i+1), &lda,
                               X(i,0),   &ldx, &c_one,
                               A(i,i+1), &lda );
                #ifdef COMPLEX
                lapackf77_slacgv( &i, X(i,0), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                alpha = *A(i,i+1);
                lapackf77_slarfg( &n_i1, &alpha, A(i,min(i+2,n-1)), &lda, &taup[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( n_i1,
                                  A(i,i+1), lda,
                                  dA(i,i+1), ldda, queue );
                
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaNoTrans, m_i1, n_i1, c_one,
                             dA(i+1,i+1), ldda,
                             dA(i,i+1), ldda,
                             //dY(0,0), 1,
                             c_zero,
                             dX(i+1,i), ione, queue );

                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( m_i1, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, queue );

                blasf77_sgemv( MagmaConjTransStr, &n_i1, &i1, &c_one,
                               Y(i+1,0), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(0,i),   &ione );

                blasf77_sgemv( "N", &m_i1, &i1, &c_neg_one,
                               A(i+1,0), &lda,
                               X(0,i),   &ione, &c_zero,
                               work,     &ione );
                blasf77_sgemv( "N", &i, &n_i1, &c_one,
                               A(0,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(0,i),   &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );
                if ((i+1) != 0) {
                    blasf77_saxpy( &m_i1, &c_one, work, &ione, X(i+1,i), &ione );
                }

                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               X(i+1,0), &ldx,
                               X(0,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                blasf77_sscal( &m_i1, &taup[i], X(i+1,i), &ione );

                #ifdef COMPLEX
                lapackf77_slacgv( &n_i1,  A(i,i+1), &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after SLACGV()
                magma_ssetvector( n_i1,
                                  A(i,i+1),  lda,
                                  dA(i,i+1), ldda, queue );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i=0; i < nb; ++i) {
            /* Update A(i,i:n) */
            i1   = i + 1;
            m_i1 = m - (i+1);
            n_i  = n - i;
            n_i1 = n - (i+1);
            #ifdef COMPLEX
            lapackf77_slacgv( &n_i, A(i,i), &lda );
            lapackf77_slacgv( &i, A(i,0), &lda );
            #endif
            blasf77_sgemv( "No transpose", &n_i, &i, &c_neg_one,
                           Y(i,0), &ldy,
                           A(i,0), &lda, &c_one,
                           A(i,i), &lda );
            #ifdef COMPLEX
            lapackf77_slacgv( &i, A(i,0), &lda );
            lapackf77_slacgv( &i, X(i,0), &ldx );
            #endif
            blasf77_sgemv( MagmaConjTransStr, &i, &n_i, &c_neg_one,
                           A(0,i), &lda,
                           X(i,0), &ldx, &c_one,
                           A(i,i), &lda );
            #ifdef COMPLEX
            lapackf77_slacgv( &i, X(i,0), &ldx );
            #endif
            
            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            alpha = *A(i,i);
            lapackf77_slarfg( &n_i, &alpha, A(i,min(i+1,n-1)), &lda, &taup[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i+1 < m) {
                *A(i,i) = c_one;
                
                /* Compute X(i+1:m,i) */
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_ssetvector( n_i,
                                  A(i,i), lda,
                                  dA(i,i), ldda, queue );
                
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaNoTrans, m_i1, n_i, c_one,
                             dA(i+1,i), ldda,
                             dA(i,i), ldda,
                             //dY(0,0), 1,
                             c_zero,
                             dX(i+1,i), ione, queue );
                
                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( m_i1, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, queue );
                
                blasf77_sgemv( MagmaConjTransStr, &n_i, &i, &c_one,
                               Y(i,0), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(0,i), &ione );
                
                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               A(i+1,0), &lda,
                               X(0,i),   &ione, &c_zero,
                               work,     &ione );
                
                blasf77_sgemv( "No transpose", &i, &n_i, &c_one,
                               A(0,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(0,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );
                if (i != 0) {
                    blasf77_saxpy( &m_i1, &c_one, work, &ione, X(i+1,i), &ione );
                }
                
                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               X(i+1,0), &ldx,
                               X(0,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                blasf77_sscal( &m_i1, &taup[i], X(i+1,i), &ione );
                #ifdef COMPLEX
                lapackf77_slacgv( &n_i, A(i,i), &lda );
                magma_ssetvector( n_i,
                                  A(i,i), lda,
                                  dA(i,i), ldda, queue );
                #endif
                
                /* Update A(i+1:m,i) */
                #ifdef COMPLEX
                lapackf77_slacgv( &i, Y(i,0), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &m_i1, &i, &c_neg_one,
                               A(i+1,0), &lda,
                               Y(i,0),   &ldy, &c_one,
                               A(i+1,i), &ione );
                #ifdef COMPLEX
                lapackf77_slacgv( &i, Y(i,0), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &m_i1, &i1, &c_neg_one,
                               X(i+1,0), &ldx,
                               A(0,i),   &ione, &c_one,
                               A(i+1,i), &ione );
                
                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                alpha = *A(i+1,i);
                lapackf77_slarfg( &m_i1, &alpha, A(min(i+2,m-1),i), &ione, &tauq[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = c_one;
                
                /* Compute Y(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( m_i1,
                                  A(i+1,i), 1,
                                  dA(i+1,i), 1, queue );
                
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, m_i1, n_i1, c_one,
                             dA(i+1,i+1), ldda,
                             dA(i+1,i), ione, c_zero,
                             dY(i+1,i), ione, queue );
                
                // 3. Get the result back ----------------------------------
                magma_sgetmatrix_async( n_i1, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, queue );
                
                blasf77_sgemv( MagmaConjTransStr, &m_i1, &i, &c_one,
                               A(i+1,0), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(0,i),   &ione );
                blasf77_sgemv( "No transpose", &n_i1, &i, &c_neg_one,
                               Y(i+1,0), &ldy,
                               Y(0,i),   &ione, &c_zero,
                               work,     &ione );
                
                blasf77_sgemv( MagmaConjTransStr, &m_i1, &i1, &c_one,
                               X(i+1,0), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(0,i),   &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( queue );
                if (i != 0) {
                    blasf77_saxpy( &n_i1, &c_one, work, &ione, Y(i+1,i), &ione );
                }
                
                blasf77_sgemv( MagmaConjTransStr, &i1, &n_i1, &c_neg_one,
                               A(0,i+1), &lda,
                               Y(0,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                blasf77_sscal( &n_i1, &tauq[i], Y(i+1,i), &ione );
            }
            #ifdef COMPLEX
            else {
                lapackf77_slacgv( &n_i, A(i,i), &lda );
                magma_ssetvector( n_i,
                                  A(i,i), lda,
                                  dA(i,i), ldda, queue );
            }
            #endif
        }
    }
    
    return info;
} /* magma_slabrd_gpu */

Exemple #9

0

Afficher le fichier

Fichier : slabrd_gpu.cpp Projet : cjy7117/DVFS-MAGMA

/**
    Purpose
    -------
    SLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by SGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors; and
              elements above the diagonal in the first NB rows, with the
              array TAUP, represent the orthogonal matrix P as a product
              of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
              columns, with the array TAUQ, represent the orthogonal
              matrix Q as a product of elementary reflectors, and
              elements on and above the diagonal in the first NB rows,
              with the array TAUP, represent the orthogonal matrix P as
              a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[in,out]
    dA      REAL array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    d       REAL array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix.  D(i) = A(i,i).

    @param[out]
    e       REAL array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    REAL array dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    REAL array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       REAL array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      REAL array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      REAL array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )           (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )           (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )           (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )           (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_sgesvd_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                  float *A,  magma_int_t lda,
                  float *dA, magma_int_t ldda,
                  float *d, float *e, float *tauq, float *taup,
                  float *X,  magma_int_t ldx,
                  float *dX, magma_int_t lddx,
                  float *Y,  magma_int_t ldy,
                  float *dY, magma_int_t lddy)
{
    #define A(i_,j_) (A + (i_) + (j_)*lda)
    #define X(i_,j_) (X + (i_) + (j_)*ldx)
    #define Y(i_,j_) (Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)
    
    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float c_zero    = MAGMA_S_ZERO;
    magma_int_t ione = 1;
    
    magma_int_t i__2, i__3;
    magma_int_t i;
    float alpha;

    A  -= 1 + lda;
    X  -= 1 + ldx;
    dX -= 1 + lddx;
    Y  -= 1 + ldy;
    dY -= 1 + lddy;
    --d;
    --e;
    --tauq;
    --taup;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    float *f;
    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_smalloc_cpu( &f, max(n,m) );
    if ( f == NULL ) {
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }
    
    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /*  Update A(i:m,i) */
            i__2 = m - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           A(i,1), &lda,
                           Y(i,1), &ldy, &c_one,
                           A(i,i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           X(i,1), &ldx,
                           A(1,i), &ione, &c_one,
                           A(i,i), &ione );
            
            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            i__2 = m - i + 1;
            i__3 = i + 1;
            lapackf77_slarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i + 1;
                i__3 = n - i;

                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( i__2,
                                  A(i,i), 1,
                                  dA(i-1,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i-1,i),   ldda,
                             dA(i-1,i-1), ione, c_zero,
                             dY(i+1,i),   ione );
                
                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, stream );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i,1), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                i__2 = n - i;
                i__3 = i - 1;
                blasf77_sgemv( "N", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               X(i,1), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_saxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = i - 1;
                i__3 = n - i;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_sscal( &i__2, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                i__2 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__2, A(i,i+1), &lda );
                lapackf77_slacgv( &i,  A(i,1), &lda );
                #endif
                blasf77_sgemv( "No transpose", &i__2, &i, &c_neg_one,
                               Y(i+1,1), &ldy,
                               A(i,1),   &lda, &c_one,
                               A(i,i+1), &lda );
                i__2 = i - 1;
                i__3 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i,  A(i,1), &lda );
                lapackf77_slacgv( &i__2, X(i,1), &ldx );
                #endif
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               X(i,1),   &ldx, &c_one,
                               A(i,i+1), &lda );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__2, X(i,1), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                i__2 = n - i;
                i__3 = i + 2;
                alpha = *A(i,i+1);
                lapackf77_slarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( i__3,
                                  A(i,i+1), lda,
                                  dA(i-1,i), ldda );
                // 2. Multiply ---------------------------------------------
                //magma_scopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 );
                magma_sgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i), ldda,
                             dA(i-1,i), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );

                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, stream );

                i__2 = n - i;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               Y(i+1,1), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                i__2 = m - i;
                blasf77_sgemv( "N", &i__2, &i, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = i - 1;
                i__3 = n - i;
                blasf77_sgemv( "N", &i__2, &i__3, &c_one,
                               A(1,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i != 0) {
                    i__2 = m - i;
                    blasf77_saxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione );
                }


                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_sscal( &i__2, &taup[i], X(i+1,i), &ione );

                #if defined(PRECISION_z) || defined(PRECISION_c)
                i__2 = n - i;
                lapackf77_slacgv( &i__2,  A(i,i+1), &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after SLACGV()
                magma_ssetvector( i__2,
                                  A(i,i+1),  lda,
                                  dA(i-1,i), ldda );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i = 1; i <= nb; ++i) {
        
            /* Update A(i,i:n) */
            i__2 = n - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__2, A(i,i), &lda );
            lapackf77_slacgv( &i__3, A(i,1), &lda );
            #endif
            blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           Y(i,1), &ldy,
                           A(i,1), &lda, &c_one,
                           A(i,i), &lda );
            i__2 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__3, A(i,1), &lda );
            lapackf77_slacgv( &i__3, X(i,1), &ldx );
            #endif
            i__3 = n - i + 1;
            blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                           A(1,i), &lda,
                           X(i,1), &ldx, &c_one,
                           A(i,i), &lda );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv( &i__2, X(i,1), &ldx );
            #endif
            
            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            i__2 = n - i + 1;
            i__3 = i + 1;
            alpha = *A(i,i);
            lapackf77_slarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
            d[i] = MAGMA_S_REAL( alpha );
            if (i < m) {
                *A(i,i) = c_one;
                
                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i + 1;
                
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_ssetvector( i__3,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                
                // 2. Multiply ---------------------------------------------
                //magma_scopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 );
                magma_sgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i-1), ldda,
                             dA(i-1,i-1), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );
                
                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__2, 1,
                                        dX(i+1,i), lddx,
                                        X(i+1,i),  ldx, stream );
                
                i__2 = n - i + 1;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               Y(i,1), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                
                i__2 = i - 1;
                i__3 = n - i + 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_one,
                               A(1,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__2 != 0) {
                    i__3 = m - i;
                    blasf77_saxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione );
                }
                
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_sscal( &i__2, &taup[i], X(i+1,i), &ione );
                i__2 = n - i + 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__2, A(i,i), &lda );
                magma_ssetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
                #endif
                
                /* Update A(i+1:m,i) */
                i__2 = m - i;
                i__3 = i - 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               Y(i,1),   &ldy, &c_one,
                               A(i+1,i), &ione );
                i__2 = m - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_sgemv( "No transpose", &i__2, &i, &c_neg_one,
                               X(i+1,1), &ldx,
                               A(1,i),   &ione, &c_one,
                               A(i+1,i), &ione );
                
                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                i__2 = m - i;
                i__3 = i + 2;
                alpha = *A(i+1,i);
                lapackf77_slarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = c_one;
                
                /* Compute Y(i+1:n,i) */
                i__2 = m - i;
                i__3 = n - i;
                
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_ssetvector( i__2,
                                  A(i+1,i), 1,
                                  dA(i,i-1), 1 );
                // 2. Multiply ---------------------------------------------
                magma_sgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i,i),   ldda,
                             dA(i,i-1), ione, c_zero,
                             dY(i+1,i), ione );
                
                // 3. Put the result back ----------------------------------
                magma_sgetmatrix_async( i__3, 1,
                                        dY(i+1,i), lddy,
                                        Y(i+1,i),  ldy, stream );
                
                i__2 = m - i;
                i__3 = i - 1;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i+1,1), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                i__2 = n - i;
                i__3 = i - 1;
                blasf77_sgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                
                i__2 = m - i;
                blasf77_sgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               X(i+1,1), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                
                // 4. Sync to make sure the result is back ----------------
                magma_queue_sync( stream );
                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_saxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }
                
                i__2 = n - i;
                blasf77_sgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_sscal( &i__2, &tauq[i], Y(i+1,i), &ione );
            }
            #if defined(PRECISION_z) || defined(PRECISION_c)
            else {
                i__2 = n - i + 1;
                lapackf77_slacgv( &i__2, A(i,i), &lda );
                magma_ssetvector( i__2,
                                  A(i,i), lda,
                                  dA(i-1,i-1), ldda );
            }
            #endif
        }
    }
    
    magma_queue_destroy( stream );
    magma_free_cpu( f );
    
    return info;
} /* magma_slabrd_gpu */

Exemple #10

0

Afficher le fichier

Fichier : slaqps.cpp Projet : soulsheng/magma

extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. N >= 0

    OFFSET  (input) INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    NB      (input) INTEGER
            The number of columns to factorize.

    KB      (output) INTEGER
            The number of columns actually factorized.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but no factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    LDA     (input) INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    JPVT    (input/output) INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    TAU     (output) REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    VN1     (input/output) DOUBLE PRECISION array, dimension (N)
            The vector with the partial column norms.

    VN2     (input/output) DOUBLE PRECISION array, dimension (N)
            The vector with the exact column norms.

    AUXV    (input/output) REAL array, dimension (NB)
            Auxiliar vector.

    F       (input/output) REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    LDF     (input) INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    =====================================================================    */
    
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;
    
    magma_int_t i__1, i__2;
    float d__1;
    float z__1;
    
    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;
        
        /* Determine ith pivot column and swap if necessary */
        // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k.
        // C:       pvt, k, isamax are all 0-based; don't subtract 1.
        pvt = k + cblas_isamax( n-k, &vn1[k], ione );
        
        if (pvt != k) {

            if (pvt >= nb) {
                /* 1. Start copy from GPU                           */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU   */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb){
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU                          */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU                          */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU                               */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j){
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }
        
        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }
        
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;
        
            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );
        
            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            magma_sgemv( MagmaTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );
            
            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );
            
            blasf77_sgemv( MagmaTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );
            
            magma_queue_sync( stream );
            blasf77_sgemv( MagmaTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }
        
        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }
        
        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );
            
            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }
        
        /* Optimization: On the last iteration start sending F back to the GPU */
        
        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }
        
        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );
        
                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);
        
                    if (temp2 <= tol3z) {
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }
        
        *A(rk, k) = Akk;
        
        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;
        
        /* Send F to the GPU */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }
    
    /* Recomputation of difficult columns. */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = cblas_snrm2(i__1, A(rk + 1, lsticc), ione);
        else {
            /* Where is the data, CPU or GPU ? */
            float r1, r2;
            
            r1 = cblas_snrm2(nb-k, A(rk + 1, lsticc), ione);
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);
            
            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2);
        }
        
        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }
    
    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */

Exemple #11

0

Afficher le fichier

/**
    Purpose
    -------
    SLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by SGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     REAL array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       REAL array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       REAL array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_sgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slahr2_m(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t ldt,
    float *Y, magma_int_t ldy,
    struct sgehrd_data *data )
{
    #define  A(  i, j ) ( A + (i) + (j)*lda)
    #define  Y(  i, j ) ( Y + (i) + (j)*ldy)
    #define  T(  i, j ) ( T + (i) + (j)*ldt)
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)

    float c_zero    = MAGMA_S_ZERO;
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float tmp;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t ione = 1;
    
    magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
    magma_int_t n_k_i_1, n_k;
    float scale;

    magma_int_t i;
    float ei = MAGMA_S_ZERO;

    magma_int_t info_data = 0;
    magma_int_t *info = &info_data;
    if (n < 0) {
        *info = -1;
    } else if (k < 0 || k >= n) {
        *info = -2;
    } else if (nb < 1 || nb > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldt < nb) {
        *info = -8;
    } else if (ldy < max(1,n)) {
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return *info;
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    // zero out current top block of V on all GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv, data->queues[d] );
    }
    
    // set all Y=0
    lapackf77_slaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Finish applying I - V * T * V' on right
            tmp = MAGMA_S_NEGATE( tau[i-1] );
            blasf77_saxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_scopy( &i,
                           A(k+1,i), &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_strmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_sgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1), &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_strmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_sgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_strmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_saxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i), &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_slarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // compute yi = A vi = sum_g A{d} vi{d}
        nblocks = (n-1) / nb / ngpu + 1;
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            
            // dV(k+i+1:n-1, i) = VA(k+i:n, i)
            magma_ssetvector_async( n_k_i_1,
                                    A(k+i+1,i), 1,
                                    dV(d, k+i+1, i), 1, data->queues[d] );
            
            // copy column of dV -> dVd, using block cyclic distribution.
            // This assumes V and Vd have been padded so that
            // a 2D matrix copy doesn't access them out-of-bounds
            gblock = k / nb;
            lblock = gblock / ngpu;
            lgid   = gblock % ngpu;
            if ( d < lgid ) {
                lblock += 1;
            }
            // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
            magmablas_slacpy( MagmaFull, nb, nblocks-lblock,
                              dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
                              dVd(d, 0    + lblock*nb,      i), nb, data->queues[d] );
            
            // convert global indices (k) to local indices (dk)
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            
            // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
            // skip if matrix is empty
            // each GPU copies to different temporary vector in Y,
            // which are summed in separate loop below
            if ( dn-dki1 > 0 ) {
                magma_sgemv( MagmaNoTrans, n-k, dn-dki1,
                             c_one,  dA (d, k,    dki1), ldda,
                                     dVd(d, dki1,    i), 1,
                             c_zero, dY (d, k,       i), 1, data->queues[d] );
                
                // copy vector to host, storing in column nb+d of Y
                // as temporary space (Y has >= nb+ngpu columns)
                magma_sgetvector_async( n-k,
                                        dY(d, k, i), 1,
                                        Y(k, nb+d),  1, data->queues[d] );
            }
        }
        
        // while GPU is doing above Ag*v...
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_S_NEGATE( tau[i] );
        blasf77_sgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i), &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_strmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];
        
        // apply reflectors to next column, A(i+1), on right only.
        // one axpy will be required to finish this, in the next iteration above
        if ( i > 0 && i+1 < nb ) {
            // Update next column, A(k:n,i+1), applying Q on right.
            // One axpy will be required to finish this, in the next iteration
            // above, after yi is computed.
            // This updates one more row than LAPACK does (row k),
            // making block above panel an even multiple of nb.
            // Use last column of T as workspace, w.
            magma_int_t i1 = i+1;
            
            // If real, conjugate row of V, and undo afterwards
            #ifdef COMPLEX
            lapackf77_slacgv( &i1,  A(k+i1,0), &lda );
            #endif
            // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
            // T is now rectangular, so we use gemv instead of trmv as in lapack.
            blasf77_sgemv( "No trans", &i, &i1,
                           &c_one,  T(0,0), &ldt,
                                    A(k+i1,0), &lda,
                           &c_zero, T(0,nb-1), &ione );
            #ifdef COMPLEX
            lapackf77_slacgv( &i1,  A(k+i1,0), &lda );
            #endif
            
            // A(k:n, i+1) -= Y(k:n, 0:i) * w
            blasf77_sgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0), &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i1), &ione );
        }
        
        // yi = sum_g yi{d}
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magma_queue_sync( data->queues[d] );
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            if ( dn-dki1 > 0 ) {
                // yi = yi + yi{d}
                blasf77_saxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
            }
        }
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;
    
    // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        
        // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
        // skip if matrix is empty
        // each GPU copies to different temporary block in Y,
        // which are summed in separate loop below
        if ( dn-dki1 > 0 ) {
            magma_sgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1,
                         c_one,  dA (d, 0,    dki1), ldda,
                                 dVd(d, dki1,    0), ldvd,
                         c_zero, dY (d, 0,       0), ldda, data->queues[d] );
            
            // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
            // as temporary space (Y has nb + nb*ngpu columns)
            magma_sgetmatrix_async( k, nb,
                                    dY(d, 0, 0),  ldda,
                                    Y(0,nb+nb*d), ldy, data->queues[d] );
        }
    }
    
    // Y = sum_g Y{d}
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_queue_sync( 0 );
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        if ( dn-dki1 > 0 ) {
            // Y = Y + Am V
            for( i = 0; i < nb; ++i ) {
                blasf77_saxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
            }
        }
    }
    
    // copy Y and T matrices to GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_ssetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->queues[d] );
        magma_ssetmatrix_async( nb, nb, T, nb, dTi(d),      nb,   data->queues[d] );
    }

    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_slahr2 */

Exemple #12

0

Afficher le fichier

Fichier : testing_sgemv.cpp Projet : kjbartel/clmagma

int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time;
    float          magma_error, dev_error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float alpha = MAGMA_S_MAKE(  1.5, -2.3 );
    float beta  = MAGMA_S_MAKE( -0.6,  0.8 );
    float *A, *X, *Y, *Ydev, *Ymagma;
    magmaFloat_ptr dA, dX, dY;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = opts.tolerance * lapackf77_slamch("E");

    printf("trans = %s\n", lapack_trans_const(opts.transA) );
    #ifdef HAVE_CUBLAS
        printf("    M     N   MAGMA Gflop/s (ms)  %s Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  %s error\n",
                g_platform_str, g_platform_str );
    #else
        printf("    M     N   %s Gflop/s (ms)   CPU Gflop/s (ms)  %s error\n",
                g_platform_str, g_platform_str );
    #endif
    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = ((M+31)/32)*32;
            gflops = FLOPS_SGEMV( M, N ) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                Xm = N;
                Ym = M;
            } else {
                Xm = M;
                Ym = N;
            }

            sizeA = lda*N;
            sizeX = incx*Xm;
            sizeY = incy*Ym;
            
            TESTING_MALLOC_CPU( A,       float, sizeA );
            TESTING_MALLOC_CPU( X,       float, sizeX );
            TESTING_MALLOC_CPU( Y,       float, sizeY );
            TESTING_MALLOC_CPU( Ydev,    float, sizeY );
            TESTING_MALLOC_CPU( Ymagma,  float, sizeY );
            
            TESTING_MALLOC_DEV( dA, float, sizeA );
            TESTING_MALLOC_DEV( dX, float, sizeX );
            TESTING_MALLOC_DEV( dY, float, sizeY );
            
            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &sizeA, A );
            lapackf77_slarnv( &ione, ISEED, &sizeX, X );
            lapackf77_slarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_ssetmatrix( M, N, A, lda, dA, 0, lda, opts.queue );
            magma_ssetvector( Xm, X, incx, dX, 0, incx, opts.queue );
            magma_ssetvector( Ym, Y, incy, dY, 0, incy, opts.queue );
            
            #ifdef HAVE_CUBLAS
                dev_time = magma_sync_wtime( 0 );
                cublasSgemv( opts.handle, cublas_trans_const(opts.transA),
                             M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy );
                dev_time = magma_sync_wtime( 0 ) - dev_time;
            #else
                dev_time = magma_sync_wtime( opts.queue );
                magma_sgemv( opts.transA, M, N,
                             alpha, dA, 0, lda,
                                    dX, 0, incx,
                             beta,  dY, 0, incy, opts.queue );
                dev_time = magma_sync_wtime( opts.queue ) - dev_time;
            #endif
            dev_perf = gflops / dev_time;
            
            magma_sgetvector( Ym, dY, 0, incy, Ydev, incy, opts.queue );
            
            /* =====================================================================
               Performs operation using MAGMABLAS (currently only with CUDA)
               =================================================================== */
            #ifdef HAVE_CUBLAS
                magma_ssetvector( Ym, Y, incy, dY, incy );
                
                magma_time = magma_sync_wtime( 0 );
                magmablas_sgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
                magma_time = magma_sync_wtime( 0 ) - magma_time;
                magma_perf = gflops / magma_time;
                
                magma_sgetvector( Ym, dY, incy, Ymagma, incy );
            #endif
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_sgemv( lapack_trans_const(opts.transA), &M, &N,
                           &alpha, A, &lda,
                                   X, &incx,
                           &beta,  Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            float Anorm = lapackf77_slange( "F", &M, &N, A, &lda, work );
            float Xnorm = lapackf77_slange( "F", &Xm, &ione, X, &Xm, work );
            
            blasf77_saxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy );
            dev_error = lapackf77_slange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm);
            
            #ifdef HAVE_CUBLAS
                blasf77_saxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy );
                magma_error = lapackf77_slange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm);
                
                printf("%5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e     %8.2e   %s\n",
                       (int) M, (int) N,
                       magma_perf,  1000.*magma_time,
                       dev_perf,    1000.*dev_time,
                       cpu_perf,    1000.*cpu_time,
                       magma_error, dev_error,
                       (magma_error < tol && dev_error < tol ? "ok" : "failed"));
                status += ! (magma_error < tol && dev_error < tol);
            #else
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e   %s\n",
                       (int) M, (int) N,
                       dev_perf,    1000.*dev_time,
                       cpu_perf,    1000.*cpu_time,
                       dev_error,
                       (dev_error < tol ? "ok" : "failed"));
                status += ! (dev_error < tol);
            #endif
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ydev    );
            TESTING_FREE_CPU( Ymagma  );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}

Exemple #13

0

Afficher le fichier

Fichier : slatrd.cpp Projet : EmergentOrder/magma

/**
    Purpose
    -------
    SLATRD reduces NB rows and columns of a real symmetric matrix A to
    symmetric tridiagonal form by an orthogonal similarity
    transformation Q' * A * Q, and returns the matrices V and W which are
    needed to apply the transformation to the unreduced part of A.

    If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a
    matrix, of which the upper triangle is supplied;
    if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a
    matrix, of which the lower triangle is supplied.

    This is an auxiliary routine called by SSYTRD.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Specifies whether the upper or lower triangular part of the
            symmetric matrix A is stored:
      -     = MagmaUpper: Upper triangular
      -     = MagmaLower: Lower triangular

    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    nb      INTEGER
            The number of rows and columns to be reduced.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the leading
            n-by-n upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower, the
            leading n-by-n lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.
            On exit:
      -     if UPLO = MagmaUpper, the last NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements above the diagonal
              with the array TAU, represent the orthogonal matrix Q as a
              product of elementary reflectors;
      -     if UPLO = MagmaLower, the first NB columns have been reduced to
              tridiagonal form, with the diagonal elements overwriting
              the diagonal elements of A; the elements below the diagonal
              with the array TAU, represent the  orthogonal matrix Q as a
              product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= (1,N).

    @param[out]
    e       REAL array, dimension (N-1)
            If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal
            elements of the last NB columns of the reduced matrix;
            if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of
            the first NB columns of the reduced matrix.

    @param[out]
    tau     REAL array, dimension (N-1)
            The scalar factors of the elementary reflectors, stored in
            TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower.
            See Further Details.

    @param[out]
    W       REAL array, dimension (LDW,NB)
            The n-by-nb matrix W required to update the unreduced part
            of A.

    @param[in]
    ldw     INTEGER
            The leading dimension of the array W. LDW >= max(1,N).

    Further Details
    ---------------
    If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(n) H(n-1) . . . H(n-nb+1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i),
    and tau in TAU(i-1).

    If UPLO = MagmaLower, the matrix Q is represented as a product of elementary
    reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i),
    and tau in TAU(i).

    The elements of the vectors v together form the n-by-nb matrix V
    which is needed, with W, to apply the transformation to the unreduced
    part of the matrix, using a symmetric rank-2k update of the form:
    A := A - V*W' - W*V'.

    The contents of A on exit are illustrated by the following examples
    with n = 5 and nb = 2:

    if UPLO = MagmaUpper:                       if UPLO = MagmaLower:

      (  a   a   a   v4  v5 )              (  d                  )
      (      a   a   v4  v5 )              (  1   d              )
      (          a   1   v5 )              (  v1  1   a          )
      (              d   1  )              (  v1  v2  a   a      )
      (                  d  )              (  v1  v2  a   a   a  )

    where d denotes a diagonal element of the reduced matrix, a denotes
    an element of the original matrix that is unchanged, and vi denotes
    an element of the vector defining H(i).

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slatrd(magma_uplo_t uplo, magma_int_t n, magma_int_t nb,
             float *A,  magma_int_t lda,
             float *e, float *tau,
             float *W,  magma_int_t ldw,
             float *dA, magma_int_t ldda,
             float *dW, magma_int_t lddw)
{
#define A(i, j) (A + (j)*lda + (i))
#define W(i, j) (W + (j)*ldw + (i))

#define dA(i, j) (dA + (j)*ldda + (i))
#define dW(i, j) (dW + (j)*lddw + (i))

    magma_int_t i;
    
    float c_neg_one = MAGMA_S_NEG_ONE;
    float c_one     = MAGMA_S_ONE;
    float c_zero    = MAGMA_S_ZERO;

    float value = MAGMA_S_ZERO;
    
    magma_int_t ione = 1;

    magma_int_t i_n, i_1, iw;
    
    float alpha;
    float *f;

    if (n <= 0) {
        return 0;
    }

    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_smalloc_cpu( &f, n );
    assert( f != NULL );  // TODO return error, or allocate outside slatrd

    if (uplo == MagmaUpper) {
        /* Reduce last NB columns of upper triangle */
        for (i = n-1; i >= n - nb; --i) {
            i_1 = i + 1;
            i_n = n - i - 1;
            
            iw = i - n + nb;
            if (i < n-1) {
                /* Update A(1:i,i) */
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
                #endif
                blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda,
                              W(i, iw+1), &ldw, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv(&i_n, W(i, iw+1), &ldw);
                lapackf77_slacgv(&i_n, A(i, i+1), &lda);
                #endif
                blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                              A(i, i+1), &lda, &c_one, A(0, i), &ione);
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_slacgv(&i_n, A(i, i+1), &lda);
                #endif
            }
            if (i > 0) {
                /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */
                alpha = *A(i-1, i);
                
                lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]);
                
                e[i-1] = MAGMA_S_REAL( alpha );
                *A(i-1,i) = MAGMA_S_ONE;
                
                /* Compute W(1:i-1,i) */
                // 1. Send the block reflector  A(0:n-i-1,i) to the GPU
                magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 );
                
                magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda,
                            dA(0, i), ione, c_zero, dW(0, iw), ione);
                
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i, 1,
                                        dW(0, iw),         lddw,
                                        W(0, iw) /*test*/, ldw, stream );
                
                if (i < n-1) {
                    blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                }
                
                // 3. Here is where we need it // TODO find the right place
                magma_queue_sync( stream );
                
                if (i < n-1) {
                    blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                    
                    blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda,
                                  A(0, i), &ione, &c_zero, W(i+1, iw), &ione);
                    
                    blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw,
                                  W(i+1, iw), &ione, &c_one, W(0, iw), &ione);
                }
                
                blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_sdot_sub( i, W(0,iw), ione, A(0,i), ione, &value );
                #else
                value = cblas_sdot( i, W(0,iw), ione, A(0,i), ione );
                #endif
                alpha = tau[i - 1] * -0.5f * value;
                blasf77_saxpy(&i, &alpha, A(0, i), &ione,
                              W(0, iw), &ione);
            }
        }
    }
    else {
        /*  Reduce first NB columns of lower triangle */
        for (i = 0; i < nb; ++i) {
            /* Update A(i:n,i) */
            i_n = n - i;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv(&i, W(i, 0), &ldw);
            #endif
            blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda,
                          W(i, 0), &ldw, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv(&i, W(i, 0), &ldw);
            lapackf77_slacgv(&i, A(i, 0), &lda);
            #endif
            blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw,
                          A(i, 0), &lda, &c_one, A(i, i), &ione);
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_slacgv(&i, A(i, 0), &lda);
            #endif
        
            if (i < n-1) {
                /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */
                i_n = n - i - 1;
                alpha = *A(i+1, i);
                lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]);
                e[i] = MAGMA_S_REAL( alpha );
                *A(i+1,i) = MAGMA_S_ONE;
        
                /* Compute W(i+1:n,i) */
                // 1. Send the block reflector  A(i+1:n,i) to the GPU
                magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 );
            
                magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero,
                            dW(i+1, i), ione);
            
                // 2. Start putting the result back (asynchronously)
                magma_sgetmatrix_async( i_n, 1,
                                        dW(i+1, i), lddw,
                                        W(i+1, i),  ldw, stream );
        
                blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda,
                              W(0, i), &ione, &c_zero, f, &ione);
                
                blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda,
                              A(i+1, i), &ione, &c_zero, W(0, i), &ione);
        
                // 3. Here is where we need it
                magma_queue_sync( stream );
        
                if (i != 0)
                    blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione);
        
                blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw,
                              W(0, i), &ione, &c_one, W(i+1, i), &ione);
                blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione);
                
                #if defined(PRECISION_z) || defined(PRECISION_c)
                cblas_sdot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value );
                #else
                value = cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione );
                #endif
                alpha = tau[i] * -0.5f * value;
                blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione);
            }
        }
    }

    magma_free_cpu( f );
    magma_queue_destroy( stream );

    return 0;
} /* magma_slatrd */

Exemple #14

0

Afficher le fichier

Fichier : strevc3.cpp Projet : cjy7117/FT-MAGMA

magma_int_t magma_strevc3(
    magma_side_t side, magma_vec_t howmany,
    magma_int_t *select,  // logical in fortran
    magma_int_t n,
    float *T,  magma_int_t ldt,
    float *VL, magma_int_t ldvl,
    float *VR, magma_int_t ldvr,
    magma_int_t mm, magma_int_t *mout,
    float *work, magma_int_t lwork,
    #ifdef COMPLEX
    float *rwork,
    #endif
    magma_int_t *info )
{
#define T(i,j)  (T  + (i) + (j)*ldt)
#define VL(i,j) (VL + (i) + (j)*ldvl)
#define VR(i,j) (VR + (i) + (j)*ldvr)
#define X(i,j)  (X  + (i)-1 + ((j)-1)*2)  // still as 1-based indices
#define work(i,j) (work + (i) + (j)*n)

    // constants
    const magma_int_t ione = 1;
    const float c_zero = 0;
    const float c_one  = 1;
    const magma_int_t nbmin = 16, nbmax = 256;

    // .. Local Scalars ..
    magma_int_t allv, bothv, leftv, over, pair, rightv, somev;
    magma_int_t i, ierr, ii, ip, is, j, k, ki, ki2,
                iv, n2, nb, nb2, version;
    float emax, remax;
    
    // .. Local Arrays ..
    // since iv is a 1-based index, allocate one extra here
    magma_int_t iscomplex[ nbmax+1 ];

    // Decode and test the input parameters
    bothv  = (side == MagmaBothSides);
    rightv = (side == MagmaRight) || bothv;
    leftv  = (side == MagmaLeft ) || bothv;

    allv  = (howmany == MagmaAllVec);
    over  = (howmany == MagmaBacktransVec);
    somev = (howmany == MagmaSomeVec);

    *info = 0;
    if ( ! rightv && ! leftv )
        *info = -1;
    else if ( ! allv && ! over && ! somev )
        *info = -2;
    else if ( n < 0 )
        *info = -4;
    else if ( ldt < max( 1, n ) )
        *info = -6;
    else if ( ldvl < 1 || ( leftv && ldvl < n ) )
        *info = -8;
    else if ( ldvr < 1 || ( rightv && ldvr < n ) )
        *info = -10;
    else if ( lwork < max( 1, 3*n ) )
        *info = -14;
    else {
        // Set mout to the number of columns required to store the selected
        // eigenvectors, standardize the array select if necessary, and
        // test mm.
        if ( somev ) {
            *mout = 0;
            pair = false;
            for( j=0; j < n; ++j ) {
                if ( pair ) {
                    pair = false;
                    select[j] = false;
                }
                else {
                    if ( j < n-1 ) {
                        if ( *T(j+1,j) == c_zero ) {
                            if ( select[j] ) {
                                *mout += 1;
                            }
                        }
                        else {
                            pair = true;
                            if ( select[j] || select[j+1] ) {
                                select[j] = true;
                                *mout += 2;
                            }
                        }
                    }
                    else if ( select[n-1] ) {
                        *mout += 1;
                    }
                }
            }
        }
        else {
            *mout = n;
        }
        if ( mm < *mout ) {
            *info = -11;
        }
    }
    
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    // Quick return if possible.
    if ( n == 0 ) {
        return *info;
    }
    
    // Use blocked version (2) if sufficient workspace.
    // Requires 1 vector for 1-norms, and 2*nb vectors for x and Q*x.
    // Zero-out the workspace to avoid potential NaN propagation.
    nb = 2;
    if ( lwork >= n + 2*n*nbmin ) {
        version = 2;
        nb = (lwork - n) / (2*n);
        nb = min( nb, nbmax );
        nb2 = 1 + 2*nb;
        lapackf77_slaset( "F", &n, &nb2, &c_zero, &c_zero, work, &n );
    }
    else {
        version = 1;
    }

    // Compute 1-norm of each column of strictly upper triangular
    // part of T to control overflow in triangular solver.
    *work(0,0) = c_zero;
    for( j=1; j < n; ++j ) {
        *work(j,0) = c_zero;
        for( i=0; i < j; ++i ) {
            *work(j,0) += fabsf( *T(i,j) );
        }
    }

    magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0;
    timer_start( time_total );

    // Index ip is used to specify the real or complex eigenvalue:
    // ip =  0, real eigenvalue (wr),
    //    =  1, first  of conjugate complex pair: (wr,wi)
    //    = -1, second of conjugate complex pair: (wr,wi)
    // iscomplex array stores ip for each column in current block.
    if ( rightv ) {
        // ============================================================
        // Compute right eigenvectors.
        // iv is index of column in current block (1-based).
        // For complex right vector, uses iv-1 for real part and iv for complex part.
        // Non-blocked version always uses iv=2;
        // blocked     version starts with iv=nb, goes down to 1 or 2.
        // (Note the "0-th" column is used for 1-norms computed above.)
        iv = 2;
        if ( version == 2 ) {
            iv = nb;
        }

        timer_start( time_trsv );
        ip = 0;
        is = *mout - 1;
        for( ki=n-1; ki >= 0; --ki ) {
            if ( ip == -1 ) {
                // previous iteration (ki+1) was second of conjugate pair,
                // so this ki is first of conjugate pair; skip to end of loop
                ip = 1;
                continue;
            }
            else if ( ki == 0 ) {
                // last column, so this ki must be real eigenvalue
                ip = 0;
            }
            else if ( *T(ki,ki-1) == c_zero ) {
                // zero on sub-diagonal, so this ki is real eigenvalue
                ip = 0;
            }
            else {
                // non-zero on sub-diagonal, so this ki is second of conjugate pair
                ip = -1;
            }

            if ( somev ) {
                if ( ip == 0 ) {
                    if ( ! select[ki] ) {
                        continue;
                    }
                }
                else {
                    if ( ! select[ki-1] ) {
                        continue;
                    }
                }
            }

            if ( ip == 0 ) {
                // ------------------------------------------------------------
                // Real right eigenvector
                // Solve upper quasi-triangular system:
                // [ T(0:ki-1,0:ki-1) - wr ]*X = -T(0:ki-1,ki)
                magma_slaqtrsd( MagmaNoTrans, ki+1, T(0,0), ldt,
                                work(0,iv), n, work(0,0), &ierr );
                
                // Copy the vector x or Q*x to VR and normalize.
                if ( ! over ) {
                    // ------------------------------
                    // no back-transform: copy x to VR and normalize.
                    n2 = ki+1;
                    blasf77_scopy( &n2, work(0,iv), &ione, VR(0,is), &ione );

                    ii = blasf77_isamax( &n2, VR(0,is), &ione ) - 1;  // subtract 1; ii is 0-based
                    remax = c_one / fabsf( *VR(ii,is) );
                    blasf77_sscal( &n2, &remax, VR(0,is), &ione );

                    for( k=ki + 1; k < n; ++k ) {
                        *VR(k,is) = c_zero;
                    }
                }
                else if ( version == 1 ) {
                    // ------------------------------
                    // version 1: back-transform each vector with GEMV, Q*x.
                    time_trsv_sum += timer_stop( time_trsv );
                    timer_start( time_gemv );
                    if ( ki > 0 ) {
                        n2 = ki;
                        blasf77_sgemv( "n", &n, &n2, &c_one,
                                       VR, &ldvr,
                                       work(0, iv), &ione,
                                       work(ki,iv), VR(0,ki), &ione );
                    }
                    time_gemv_sum += timer_stop( time_gemv );
                    ii = blasf77_isamax( &n, VR(0,ki), &ione ) - 1;  // subtract 1; ii is 0-based
                    remax = c_one / fabsf( *VR(ii,ki) );
                    blasf77_sscal( &n, &remax, VR(0,ki), &ione );
                    timer_start( time_trsv );
                }
                else if ( version == 2 ) {
                    // ------------------------------
                    // version 2: back-transform block of vectors with GEMM
                    // zero out below vector
                    for( k=ki + 1; k < n; ++k ) {
                        *work(k,iv) = c_zero;
                    }
                    iscomplex[ iv ] = ip;
                    // back-transform and normalization is done below
                }
            }  // end real eigenvector
            else {
                // ------------------------------------------------------------
                // Complex right eigenvector
                // Solve upper quasi-triangular system:
                // [ T(0:ki-2,0:ki-2) - (wr+i*wi) ]*x = u
                magma_slaqtrsd( MagmaNoTrans, ki+1, T(0,0), ldt,
                                work(0,iv-1), n, work(0,0), &ierr );

                // Copy the vector x or Q*x to VR and normalize.
                if ( ! over ) {
                    // ------------------------------
                    // no back-transform: copy x to VR and normalize.
                    n2 = ki+1;
                    blasf77_scopy( &n2, work(0,iv-1), &ione, VR(0,is-1), &ione );
                    blasf77_scopy( &n2, work(0,iv  ), &ione, VR(0,is  ), &ione );

                    emax = c_zero;
                    for( k=0; k <= ki; ++k ) {
                        emax = max( emax, fabsf(*VR(k,is-1)) + fabsf(*VR(k,is)) );
                    }
                    remax = c_one / emax;
                    blasf77_sscal( &n2, &remax, VR(0,is-1), &ione );
                    blasf77_sscal( &n2, &remax, VR(0,is  ), &ione );

                    for( k=ki + 1; k < n; ++k ) {
                        *VR(k,is-1) = c_zero;
                        *VR(k,is  ) = c_zero;
                    }
                }
                else if ( version == 1 ) {
                    // ------------------------------
                    // version 1: back-transform each vector with GEMV, Q*x.
                    time_trsv_sum += timer_stop( time_trsv );
                    timer_start( time_gemv );
                    if ( ki > 1 ) {
                        n2 = ki-1;
                        blasf77_sgemv( "n", &n, &n2, &c_one,
                                       VR, &ldvr,
                                       work(0,   iv-1), &ione,
                                       work(ki-1,iv-1), VR(0,ki-1), &ione );
                        blasf77_sgemv( "n", &n, &n2, &c_one,
                                       VR, &ldvr,
                                       work(0, iv), &ione,
                                       work(ki,iv), VR(0,ki), &ione );
                    }
                    else {
                        blasf77_sscal( &n, work(ki-1,iv-1), VR(0,ki-1), &ione );
                        blasf77_sscal( &n, work(ki,  iv  ), VR(0,ki  ), &ione );
                    }
                    time_gemv_sum += timer_stop( time_gemv );

                    emax = c_zero;
                    for( k=0; k < n; ++k ) {
                        emax = max( emax, fabsf(*VR(k,ki-1)) + fabsf(*VR(k,ki)) );
                    }
                    remax = c_one / emax;
                    blasf77_sscal( &n, &remax, VR(0,ki-1), &ione );
                    blasf77_sscal( &n, &remax, VR(0,ki  ), &ione );
                    timer_start( time_trsv );
                }
                else if ( version == 2 ) {
                    // ------------------------------
                    // version 2: back-transform block of vectors with GEMM
                    // zero out below vector
                    for( k=ki + 1; k < n; ++k ) {
                        *work(k,iv-1) = c_zero;
                        *work(k,iv  ) = c_zero;
                    }
                    iscomplex[ iv-1 ] = -ip;
                    iscomplex[ iv   ] =  ip;
                    iv -= 1;
                    // back-transform and normalization is done below
                }
            }  // end real or complex vector

            if ( version == 2 ) {
                // ------------------------------------------------------------
                // Blocked version of back-transform
                // For complex case, ki2 includes both vectors (ki-1 and ki)
                if ( ip == 0 ) {
                    ki2 = ki;
                }
                else {
                    ki2 = ki - 1;
                }

                // Columns iv:nb of work are valid vectors.
                // When the number of vectors stored reaches nb-1 or nb,
                // or if this was last vector, do the GEMM
                if ( (iv <= 2) || (ki2 == 0) ) {
                    time_trsv_sum += timer_stop( time_trsv );
                    timer_start( time_gemm );
                    nb2 = nb-iv+1;
                    n2  = ki2+nb-iv+1;
                    blasf77_sgemm( "n", "n", &n, &nb2, &n2, &c_one,
                                   VR, &ldvr,
                                   work(0,iv), &n,
                                   &c_zero,
                                   work(0,nb+iv), &n );
                    time_gemm_sum += timer_stop( time_gemm );
                    
                    // normalize vectors
                    // TODO if somev, should copy vectors individually to correct location.
                    for( k=iv; k <= nb; ++k ) {
                        if ( iscomplex[k] == 0 ) {
                            // real eigenvector
                            ii = blasf77_isamax( &n, work(0,nb+k), &ione ) - 1;  // subtract 1; ii is 0-based
                            remax = c_one / fabsf( *work(ii,nb+k) );
                        }
                        else if ( iscomplex[k] == 1 ) {
                            // first eigenvector of conjugate pair
                            emax = c_zero;
                            for( ii=0; ii < n; ++ii ) {
                                emax = max( emax, fabsf( *work(ii,nb+k  ) )
                                                + fabsf( *work(ii,nb+k+1) ) );
                            }
                            remax = c_one / emax;
                        // else if iscomplex[k] == -1
                        //     second eigenvector of conjugate pair
                        //     reuse same remax as previous k
                        }
                        blasf77_sscal( &n, &remax, work(0,nb+k), &ione );
                    }
                    nb2 = nb-iv+1;
                    lapackf77_slacpy( "F", &n, &nb2,
                                      work(0,nb+iv), &n,
                                      VR(0,ki2), &ldvr );
                    iv = nb;
                    timer_start( time_trsv );
                }
                else {
                    iv -= 1;
                }
            }  // end blocked back-transform

            is -= 1;
            if ( ip != 0 ) {
                is -= 1;
            }
        }
    }
    timer_stop( time_trsv );

    timer_stop( time_total );
    timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n",
                  time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total );

    if ( leftv ) {
        // ============================================================
        // Compute left eigenvectors.
        // iv is index of column in current block (1-based).
        // For complex left vector, uses iv for real part and iv+1 for complex part.
        // Non-blocked version always uses iv=1;
        // blocked     version starts with iv=1, goes up to nb-1 or nb.
        // (Note the "0-th" column is used for 1-norms computed above.)
        iv = 1;
        ip = 0;
        is = 0;
        for( ki=0; ki < n; ++ki ) {
            if ( ip == 1 ) {
                // previous iteration (ki-1) was first of conjugate pair,
                // so this ki is second of conjugate pair; skip to end of loop
                ip = -1;
                continue;
            }
            else if ( ki == n-1 ) {
                // last column, so this ki must be real eigenvalue
                ip = 0;
            }
            else if ( *T(ki+1,ki) == c_zero ) {
                // zero on sub-diagonal, so this ki is real eigenvalue
                ip = 0;
            }
            else {
                // non-zero on sub-diagonal, so this ki is first of conjugate pair
                ip = 1;
            }

            if ( somev ) {
                if ( ! select[ki] ) {
                    continue;
                }
            }

            if ( ip == 0 ) {
                // ------------------------------------------------------------
                // Real left eigenvector
                // Solve transposed quasi-triangular system:
                // [ T(ki+1:n,ki+1:n) - wr ]**T * X = -T(ki+1:n,ki)
                magma_slaqtrsd( MagmaTrans, n-ki, T(ki,ki), ldt,
                                work(ki,iv), n, work(ki,0), &ierr );

                // Copy the vector x or Q*x to VL and normalize.
                if ( ! over ) {
                    // ------------------------------
                    // no back-transform: copy x to VL and normalize.
                    n2 = n-ki;
                    blasf77_scopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione );

                    ii = blasf77_isamax( &n2, VL(ki,is), &ione ) + ki - 1;  // subtract 1; ii is 0-based
                    remax = c_one / fabsf( *VL(ii,is) );
                    blasf77_sscal( &n2, &remax, VL(ki,is), &ione );

                    for( k=0; k < ki; ++k ) {
                        *VL(k,is) = c_zero;
                    }
                }
                else if ( version == 1 ) {
                    // ------------------------------
                    // version 1: back-transform each vector with GEMV, Q*x.
                    if ( ki < n-1 ) {
                        n2 = n-ki-1;
                        blasf77_sgemv( "n", &n, &n2, &c_one,
                                       VL(0,ki+1), &ldvl,
                                       work(ki+1,iv), &ione,
                                       work(ki,  iv), VL(0,ki), &ione );
                    }
                    ii = blasf77_isamax( &n, VL(0,ki), &ione ) - 1;  // subtract 1; ii is 0-based
                    remax = c_one / fabsf( *VL(ii,ki) );
                    blasf77_sscal( &n, &remax, VL(0,ki), &ione );
                }
                else if ( version == 2 ) {
                    // ------------------------------
                    // version 2: back-transform block of vectors with GEMM
                    // zero out above vector
                    // could go from (ki+1)-NV+1 to ki
                    for( k=0; k < ki; ++k ) {
                        *work(k,iv) = c_zero;
                    }
                    iscomplex[ iv ] = ip;
                    // back-transform and normalization is done below
                }
            }  // end real eigenvector
            else {
                // ------------------------------------------------------------
                // Complex left eigenvector
                // Solve transposed quasi-triangular system:
                // [ T(ki+2:n,ki+2:n)**T - (wr-i*wi) ]*X = V
                magma_slaqtrsd( MagmaTrans, n-ki, T(ki,ki), ldt,
                                work(ki,iv), n, work(ki,0), &ierr );

                // Copy the vector x or Q*x to VL and normalize.
                if ( ! over ) {
                    // ------------------------------
                    // no back-transform: copy x to VL and normalize.
                    n2 = n-ki;
                    blasf77_scopy( &n2, work(ki,iv  ), &ione, VL(ki,is  ), &ione );
                    blasf77_scopy( &n2, work(ki,iv+1), &ione, VL(ki,is+1), &ione );

                    emax = c_zero;
                    for( k=ki; k < n; ++k ) {
                        emax = max( emax, fabsf(*VL(k,is))+ fabsf(*VL(k,is+1)) );
                    }
                    remax = c_one / emax;
                    blasf77_sscal( &n2, &remax, VL(ki,is  ), &ione );
                    blasf77_sscal( &n2, &remax, VL(ki,is+1), &ione );

                    for( k=0; k < ki; ++k ) {
                        *VL(k,is  ) = c_zero;
                        *VL(k,is+1) = c_zero;
                    }
                }
                else if ( version == 1 ) {
                    // ------------------------------
                    // version 1: back-transform each vector with GEMV, Q*x.
                    if ( ki < n-2 ) {
                        n2 = n-ki-2;
                        blasf77_sgemv( "n", &n, &n2, &c_one,
                                       VL(0,ki+2), &ldvl,
                                       work(ki+2,iv), &ione,
                                       work(ki,  iv), VL(0,ki), &ione );
                        blasf77_sgemv( "n", &n, &n2, &c_one,
                                       VL(0,ki+2), &ldvl,
                                       work(ki+2,iv+1), &ione,
                                       work(ki+1,iv+1), VL(0,ki+1), &ione );
                    }
                    else {
                        blasf77_sscal( &n, work(ki,  iv  ), VL(0, ki  ), &ione );
                        blasf77_sscal( &n, work(ki+1,iv+1), VL(0, ki+1), &ione );
                    }

                    emax = c_zero;
                    for( k=0; k < n; ++k ) {
                        emax = max( emax, fabsf(*VL(k,ki))+ fabsf(*VL(k,ki+1)) );
                    }
                    remax = c_one / emax;
                    blasf77_sscal( &n, &remax, VL(0,ki  ), &ione );
                    blasf77_sscal( &n, &remax, VL(0,ki+1), &ione );
                }
                else if ( version == 2 ) {
                    // ------------------------------
                    // version 2: back-transform block of vectors with GEMM
                    // zero out above vector
                    // could go from (ki+1)-NV+1 to ki
                    for( k=0; k < ki; ++k ) {
                        *work(k,iv  ) = c_zero;
                        *work(k,iv+1) = c_zero;
                    }
                    iscomplex[ iv   ] =  ip;
                    iscomplex[ iv+1 ] = -ip;
                    iv += 1;
                    // back-transform and normalization is done below
                }
            }  // end real or complex eigenvector

            if ( version == 2 ) {
                // -------------------------------------------------
                // Blocked version of back-transform
                // For complex case, (ki2+1) includes both vectors (ki+1) and (ki+2)
                if ( ip == 0 ) {
                    ki2 = ki;
                }
                else {
                    ki2 = ki + 1;
                }

                // Columns 1:iv of work are valid vectors.
                // When the number of vectors stored reaches nb-1 or nb,
                // or if this was last vector, do the GEMM
                if ( (iv >= nb-1) || (ki2 == n-1) ) {
                    n2 = n-(ki2+1)+iv;
                    blasf77_sgemm( "n", "n", &n, &iv, &n2, &c_one,
                                   VL(0,ki2-iv+1), &ldvl,
                                   work(ki2-iv+1,1), &n,
                                   &c_zero,
                                   work(0,nb+1), &n );
                    // normalize vectors
                    for( k=1; k <= iv; ++k ) {
                        if ( iscomplex[k] == 0 ) {
                            // real eigenvector
                            ii = blasf77_isamax( &n, work(0,nb+k), &ione ) - 1;  // subtract 1; ii is 0-based
                            remax = c_one / fabsf( *work(ii,nb+k) );
                        }
                        else if ( iscomplex[k] == 1) {
                            // first eigenvector of conjugate pair
                            emax = c_zero;
                            for( ii=0; ii < n; ++ii ) {
                                emax = max( emax, fabsf( *work(ii,nb+k  ) )
                                                + fabsf( *work(ii,nb+k+1) ) );
                            }
                            remax = c_one / emax;
                        // else if iscomplex[k] == -1
                        //     second eigenvector of conjugate pair
                        //     reuse same remax as previous k
                        }
                        blasf77_sscal( &n, &remax, work(0,nb+k), &ione );
                    }
                    lapackf77_slacpy( "F", &n, &iv,
                                      work(0,nb+1), &n,
                                      VL(0,ki2-iv+1), &ldvl );
                    iv = 1;
                }
                else {
                    iv += 1;
                }
            } // blocked back-transform

            is += 1;
            if ( ip != 0 ) {
                is += 1;
            }
        }
    }

    return *info;
}  // end of STREVC3