int main(int argc, char **argv) { TESTING_INIT(); const float c_neg_one = MAGMA_S_NEG_ONE; const magma_int_t ione = 1; real_Double_t atomics_perf=0, atomics_time=0; real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error=0, atomics_error=0, cublas_error, work[1]; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, ldda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Yatomics, *Ycublas, *Ymagma; magmaFloat_ptr dA, dX, dY, dwork; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N MAGMA Gflop/s (ms) Atomics Gflop/s CUBLAS Gflop/s CPU Gflop/s MAGMA error Atomics CUBLAS\n"); printf("%%=====================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_SSYMV( N ) / 1e9; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Yatomics, float, sizeY ); TESTING_MALLOC_CPU( Ycublas, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, ldda*N ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); blocks = magma_ceildiv( N, nb ); ldwork = ldda*blocks; TESTING_MALLOC_DEV( dwork, float, ldwork ); magmablas_slaset( MagmaFull, ldwork, 1, MAGMA_S_NAN, MAGMA_S_NAN, dwork, ldwork, opts.queue ); magmablas_slaset( MagmaFull, ldda, N, MAGMA_S_NAN, MAGMA_S_NAN, dA, ldda, opts.queue ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); magma_smake_symmetric( N, A, lda ); // should not use data from the opposite triangle -- fill with NAN to check magma_int_t N1 = N-1; if ( opts.uplo == MagmaUpper ) { lapackf77_slaset( "Lower", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[1], &lda ); } else { lapackf77_slaset( "Upper", &N1, &N1, &MAGMA_S_NAN, &MAGMA_S_NAN, &A[lda], &lda ); } lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( N, N, A, lda, dA, ldda, opts.queue ); magma_ssetvector( N, X, incx, dX, incx, opts.queue ); magma_ssetvector( N, Y, incy, dY, incy, opts.queue ); cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_ssymv( opts.uplo, N, alpha, dA, 0, ldda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_sgetvector( N, dY, incy, Ycublas, incy, opts.queue ); /* ===================================================================== Performs operation using CUBLAS - using atomics =================================================================== */ #ifdef HAVE_CUBLAS cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_ALLOWED ); magma_ssetvector( N, Y, incy, dY, incy, opts.queue ); // sync on queue doesn't work -- need device sync or use NULL stream -- bug in CUBLAS? atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ); cublasSsymv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); atomics_time = magma_sync_wtime( NULL /*opts.queue*/ ) - atomics_time; atomics_perf = gflops / atomics_time; magma_sgetvector( N, dY, incy, Yatomics, incy, opts.queue ); cublasSetAtomicsMode( opts.handle, CUBLAS_ATOMICS_NOT_ALLOWED ); #endif /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #ifdef HAVE_CUBLAS magma_ssetvector( N, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); if ( opts.version == 1 ) { magmablas_ssymv_work( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, dwork, ldwork, opts.queue ); } else { // non-work interface (has added overhead) magmablas_ssymv( opts.uplo, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( N, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_ssymv( lapack_uplo_const(opts.uplo), &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_slange( "M", &N, &ione, Ycublas, &N, work ) / N; #ifdef HAVE_CUBLAS blasf77_saxpy( &N, &c_neg_one, Y, &incy, Yatomics, &incy ); atomics_error = lapackf77_slange( "M", &N, &ione, Yatomics, &N, work ) / N; blasf77_saxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "M", &N, &ione, Ymagma, &N, work ) / N; #endif bool okay = (magma_error < tol && cublas_error < tol && atomics_error < tol); status += ! okay; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) N, magma_perf, 1000.*magma_time, atomics_perf, 1000.*atomics_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, atomics_error, (okay ? "ok" : "failed")); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Yatomics ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_err_t magma_slatrd(char uplo, magma_int_t n, magma_int_t nb, float *a, magma_int_t lda, float *e, float *tau, float *w, magma_int_t ldw, magmaFloat_ptr da, size_t da_offset, magma_int_t ldda, magmaFloat_ptr dw, size_t dw_offset, magma_int_t lddw, magma_queue_t queue) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= SLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) REAL array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t i; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; float value = MAGMA_S_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; float alpha; float *f; magma_smalloc_cpu( &f, n ); if (n <= 0) { return 0; } magma_event_t event = NULL; if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, W(i, iw+1), &ldw); lapackf77_slacgv(&i_n, A(i, i+1), &lda); #endif blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, A(i, i+1), &lda); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_S_REAL( alpha ); MAGMA_S_SET2REAL(*A(i-1, i), 1.); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector( i, A(0, i), 0, 1, dA(0, i), 1, queue ); magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, queue); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw)/*test*/, 0, ldw, queue, &event ); if (i < n-1) { blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_event_sync(event); if (i < n-1) { blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_sdot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, W(i, 0), &ldw); #endif blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, W(i, 0), &ldw); lapackf77_slacgv(&i, A(i ,0), &lda); #endif blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_S_REAL( alpha ); MAGMA_S_SET2REAL(*A(i+1, i), 1.); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector( i_n, A(i+1, i), 0, 1, dA(i+1, i), 1, queue ); magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, queue); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), 0, ldw, queue, &event ); blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_event_sync(event); if (i!=0) blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_sdot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); return 0; } /* slatrd_ */
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; float *A, *B, *C, *C2, *LU; float *dA, *dB, *dC1, *dC2; float alpha = MAGMA_S_MAKE( 0.5, 0.1 ); float beta = MAGMA_S_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_smalloc_pinned( &A, size ); assert( err == 0 ); err = magma_smalloc_pinned( &B, size ); assert( err == 0 ); err = magma_smalloc_pinned( &C, size ); assert( err == 0 ); err = magma_smalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_smalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_smalloc( &dA, size ); assert( err == 0 ); err = magma_smalloc( &dB, size ); assert( err == 0 ); err = magma_smalloc( &dC1, size ); assert( err == 0 ); err = magma_smalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_slarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, B ); lapackf77_slarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test SSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_ssetmatrix( m, n, A, ld, dA, ld ); magma_ssetmatrix( m, n, A, ld, dB, ld ); magma_sswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_sswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasSaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_sgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_slange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "sswap diff %.2g\n", error ); // ----- test ISAMAX // get argmax of column of A magma_ssetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_isamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIsamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "isamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test SGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_ssetmatrix( m, n, A, ld, dA, ld ); magma_ssetvector( maxn, B, 1, dB, 1 ); magma_ssetvector( maxn, C, 1, dC1, 1 ); magma_ssetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_sgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasSaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_SGEMV( m, n ) / 1e9; printf( "sgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test SSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_ssetmatrix( m, m, A, ld, dA, ld ); magma_ssetvector( m, B, 1, dB, 1 ); magma_ssetvector( m, C, 1, dC1, 1 ); magma_ssetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ssymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYMV( m ) / 1e9; printf( "ssymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test STRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_slacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_sgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_ssetmatrix( m, m, LU, ld, dA, ld ); magma_ssetvector( m, C, 1, dC1, 1 ); magma_ssetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_strsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_slange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_STRSM( MagmaLeft, m, 1 ) / 1e9; printf( "strsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test SGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_ssetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_ssetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_sgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SGEMM( m, n, k ) / 1e9; printf( "sgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_ssetmatrix( m, m, A, ld, dA, ld ); magma_ssetmatrix( m, n, B, ld, dB, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYMM( side[is], m, n ) / 1e9; printf( "ssymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_ssetmatrix( n, k, A, ld, dA, ld ); magma_ssetmatrix( n, n, C, ld, dC1, ld ); magma_ssetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYRK( k, n ) / 1e9; printf( "ssyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test SSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_ssetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_ssetmatrix( n, n, C, ld, dC1, ld ); magma_ssetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ssyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasSsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_SSYR2K( k, n ) / 1e9; printf( "ssyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test STRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_ssetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_strmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_STRMM( side[is], m, n ) / 1e9; printf( "strmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test STRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_ssetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_ssetmatrix( m, n, C, ld, dC1, ld ); magma_ssetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_strsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasStrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasSaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_sgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_slange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_STRSM( side[is], m, n ) / 1e9; printf( "strsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
/** Purpose ------- SLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e REAL array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). @param dA @param ldda @param dW @param lddw Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slatrd( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *A, magma_int_t lda, float *e, float *tau, float *W, magma_int_t ldw, magmaFloat_ptr dA, magma_int_t ldda, magmaFloat_ptr dW, magma_int_t lddw) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define W(i_, j_) (W + (i_) + (j_)*ldw) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dW(i_, j_) (dW + (i_) + (j_)*lddw) const float c_neg_one = MAGMA_S_NEG_ONE; const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; const magma_int_t ione = 1; float alpha, value; magma_int_t i, i_n, i_1, iw; /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower && uplo != MagmaUpper ) { info = -1; } else if ( n < 0 ) { info = -2; } else if ( nb < 1 ) { info = -3; } else if ( lda < max(1,n) ) { info = -5; } else if ( ldw < max(1,n) ) { info = -9; } else if ( ldda < max(1,n) ) { info = -11; } else if ( lddw < max(1,n) ) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0) { return info; } magma_queue_t stream; magma_queue_create( &stream ); float *f; magma_smalloc_cpu( &f, n ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, W(i, iw+1), &ldw ); lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif blasf77_sgemv( "No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i_n, A(i, i+1), &lda ); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg( &i, &alpha, A(0, i), &ione, &tau[i - 1] ); e[i-1] = MAGMA_S_REAL( alpha ); *A(i-1,i) = MAGMA_S_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 ); magma_ssymv( MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione ); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw), ldw, stream ); if (i < n-1) { blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); blasf77_sgemv( MagmaConjTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione ); blasf77_sgemv( "No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione ); } blasf77_sscal( &i, &tau[i - 1], W(0, iw), &ione ); value = magma_cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy( &i, &alpha, A(0, i), &ione, W(0, iw), &ione ); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, W(i, 0), &ldw ); lapackf77_slacgv( &i, A(i, 0), &lda ); #endif blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv( &i, A(i, 0), &lda ); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg( &i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i] ); e[i] = MAGMA_S_REAL( alpha ); *A(i+1,i) = MAGMA_S_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); magma_ssymv( MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione ); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione ); blasf77_sgemv( MagmaConjTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione ); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_saxpy( &i_n, &c_one, f, &ione, W(i+1, i), &ione ); blasf77_sgemv( "No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione ); blasf77_sscal( &i_n, &tau[i], W(i+1,i), &ione ); value = magma_cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); alpha = tau[i] * -0.5f * value; blasf77_saxpy( &i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione ); } } } magma_free_cpu( f ); magma_queue_destroy( stream ); return info; } /* magma_slatrd */
/** Purpose ------- SLATRD reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = MagmaUpper, SLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = MagmaLower, SLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by SSYTRD. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the matrix A. @param[in] nb INTEGER The number of rows and columns to be reduced. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: - if UPLO = MagmaUpper, the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; - if UPLO = MagmaLower, the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= (1,N). @param[out] e REAL array, dimension (N-1) If UPLO = MagmaUpper, E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = MagmaLower, E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = MagmaUpper, and in TAU(1:nb) if UPLO = MagmaLower. See Further Details. @param[out] W REAL array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. @param[in] ldw INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slatrd(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *A, magma_int_t lda, float *e, float *tau, float *W, magma_int_t ldw, float *dA, magma_int_t ldda, float *dW, magma_int_t lddw) { #define A(i, j) (A + (j)*lda + (i)) #define W(i, j) (W + (j)*ldw + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dW(i, j) (dW + (j)*lddw + (i)) magma_int_t i; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float c_zero = MAGMA_S_ZERO; float value = MAGMA_S_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; float alpha; float *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_smalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside slatrd if (uplo == MagmaUpper) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, W(i, iw+1), &ldw); lapackf77_slacgv(&i_n, A(i, i+1), &lda); #endif blasf77_sgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i_n, A(i, i+1), &lda); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_slarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_S_REAL( alpha ); *A(i-1,i) = MAGMA_S_ONE; /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_ssetvector( i, A(0, i), 1, dA(0, i), 1 ); magma_ssymv(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_sgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_sgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_sscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_sdot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_sdot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_saxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, W(i, 0), &ldw); #endif blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, W(i, 0), &ldw); lapackf77_slacgv(&i, A(i, 0), &lda); #endif blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_slacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_slarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_S_REAL( alpha ); *A(i+1,i) = MAGMA_S_ONE; /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_ssetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); magma_ssymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione); // 2. Start putting the result back (asynchronously) magma_sgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_sgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i != 0) blasf77_saxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_sgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_sscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_sdot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_sdot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_saxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu( f ); magma_queue_destroy( stream ); return 0; } /* magma_slatrd */