/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_Bcublas; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_DTRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, double, lda*Ak ); TESTING_MALLOC_CPU( h_B, double, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, double, ldb*N ); TESTING_MALLOC_DEV( d_A, double, ldda*Ak ); TESTING_MALLOC_DEV( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. cublas_time = magma_sync_wtime( NULL ); cublasDtrmm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &M, &N, h_B, &ldb, work ); blasf77_daxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_Bcublas; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_DTRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, double, lda*Ak ); TESTING_MALLOC_CPU( h_B, double, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, double, ldb*N ); TESTING_MALLOC_DEV( d_A, double, ldda*Ak ); TESTING_MALLOC_DEV( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasDtrmm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &M, &N, h_B, &ldb, work ); blasf77_daxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
extern "C" magma_int_t magma_dgegqr_gpu( magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, double *dwork, double *work, magma_int_t *info ) { /* -- MAGMA (version 1.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= ZGEGQR orthogonalizes the N vectors given by a real M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the M-by-N matrix Q with orthogonal columns. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. dwork (GPU workspace) DOUBLE_PRECISION array, dimension (N,N) work (CPU workspace/output) DOUBLE_PRECISION array, dimension 3n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work must be in pinned memory. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== ===================================================================== */ magma_int_t i = 0, j, k, n2 = n*n, ione = 1; double zero = MAGMA_D_ZERO, one = MAGMA_D_ONE; double cn = 200., mins, maxs; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } double *U, *VT, *vt, *R, *G, *hwork, *tau; double *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_dmalloc_cpu( &hwork, 2*n*n + 2*n); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork = n*n; // First part f hwork; used as workspace in svd U = hwork + n*n; // Size n*n S = (double *)(U+n*n);// Size n tau = U + n*n + n ; // Size n do { i++; magma_dgemm(MagmaTrans, MagmaNoTrans, ??, ??, ??, one, dA, ldda, dA, ldda, zero, dwork, n ); magma_dgetmatrix(??, ??, dwork, n, G, n); lapackf77_dgesvd("n", "a", &??, &??, G, &n, S, U, &n, VT, &n, hwork, &lwork, info); mins = 100.f, maxs = 0.f; for(k=0; k<n; k++){ S[k] = magma_dsqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for(k=0; k<n;k++){ vt = VT + k*n; for(j=0; j<n; j++) vt[j]*=S[j]; } lapackf77_dgeqrf(&??, &??, ??, &n, tau, hwork, &lwork, info); if (i==1) blasf77_dcopy(&n2, VT, &ione, R, &ione); else blasf77_dtrmm("l", "u", "n", "n", &n, &n, &one, VT, &n, R, &n); magma_dsetmatrix(n, n, VT, n, G, n); magma_dtrsm('r', 'u', 'n', 'n', ??, ??, one, ??, n, ??, ldda); if (mins > 0.00001f) cn = maxs/mins; //fprintf(stderr, "Iteration %d, cond num = %f \n", i, cn); } while (cn > 10.f); magma_free_cpu( hwork ); return *info; } /* magma_dgegqr_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; double magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *piv; magma_err_t err; double *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_DTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC( h_A, double, lda*Ak ); TESTING_MALLOC( LU, double, lda*Ak ); TESTING_MALLOC( LUT, double, lda*Ak ); TESTING_MALLOC( h_B, double, ldb*N ); TESTING_MALLOC( h_B1, double, ldb*N ); TESTING_MALLOC( h_X1, double, ldb*N ); TESTING_MALLOC( h_X2, double, ldb*N ); TESTING_MALLOC( h_Bcublas, double, ldb*N ); TESTING_MALLOC( h_Bmagma, double, ldb*N ); TESTING_DEVALLOC( d_A, double, ldda*Ak ); TESTING_DEVALLOC( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, LU ); err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) ); assert( err == 0 ); lapackf77_dgetrf( &Ak, &Ak, LU, &lda, piv, &info ); int i, j; for(i=0;i<Ak;i++){ for(j=0;j<Ak;j++){ LUT[j+i*lda] = LU[i+j*lda]; } } lapackf77_dlacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda); if(opts.uplo == MagmaLower){ lapackf77_dlacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda); }else{ lapackf77_dlacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda); } lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(double)); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_dtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasDtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(double)); double alpha2 = MAGMA_D_DIV( c_one, alpha ); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_daxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); double norm1 = lapackf77_dlange( "M", &M, &N, h_X1, &ldb, work ); double normx = lapackf77_dlange( "M", &M, &N, h_Bmagma, &ldb, work ); double normA = lapackf77_dlange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(double)); blasf77_dtrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_daxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_dlange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_dlange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( LU ); TESTING_FREE( LUT ); TESTING_FREE( h_B ); TESTING_FREE( h_Bcublas ); TESTING_FREE( h_Bmagma ); TESTING_FREE( h_B1 ); TESTING_FREE( h_X1 ); TESTING_FREE( h_X2 ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }