/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_B1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X2, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaFloatComplex)); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaFloatComplex)); magmaFloatComplex alpha2 = MAGMA_C_DIV( c_one, alpha ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); float norm1 = lapackf77_clange( "M", &M, &N, h_X1, &ldb, work ); float normx = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); float normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaFloatComplex)); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_clange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_B1 ); TESTING_FREE_CPU( h_X1 ); TESTING_FREE_CPU( h_X2 ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e1, e2, e3, e4, e5, *work; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; magmaFloatComplex *d_A, *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", opts.version ); return -1; } float tol, eps = lapackf77_slamch("E"); tol = 10* opts.tolerance * eps; printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9 + FLOPS_CUNGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn; } TESTING_MALLOC_PIN( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN(h_rwork, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_cgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_cgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_cprint(N, N, h_work, N); blasf77_ctrmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_clange("i", &M, &N, h_R, &M, work) / lapackf77_clange("i", &M, &N, h_A, &lda, work); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_cungqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e1 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e2 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e1 < tol ? "ok" : "failed")); status += ! (e1 < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- CGEGQR orthogonalizes the N vectors given by a complex M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). The routine is designed for tall-and-skinny matrices: M >> N, N <= 128. This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments --------- @param[in] ikind INTEGER Several versions are implemented indiceted by the ikind value: 1: This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. 2: This version uses a standard LAPACK-based orthogonalization through MAGMA's QR panel factorization (magma_cgeqr2x3_gpu) and magma_cungqr 3: MGS 4. Cholesky QR @param[in] m INTEGER The number of rows of the matrix A. m >= n >= 0. @param[in] n INTEGER The number of columns of the matrix A. 128 >= n >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (ldda,n) On entry, the m-by-n matrix A. On exit, the m-by-n matrix Q with orthogonal columns. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,m). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param dwork (GPU workspace) COMPLEX array, dimension: n^2 for ikind = 1 3 n^2 + min(m, n) for ikind = 2 0 (not used) for ikind = 3 n^2 for ikind = 4 @param[out] work (CPU workspace) COMPLEX array, dimension 3 n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work should be in pinned memory. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cgegqr_gpu( magma_int_t ikind, magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dwork, magmaFloatComplex *work, magma_int_t *info ) { #define work(i_,j_) (work + (i_) + (j_)*n) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magma_int_t i = 0, j, k, n2 = n*n; magma_int_t ione = 1; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; float cn = 200., mins, maxs; /* check arguments */ *info = 0; if (ikind < 1 || ikind > 4) { *info = -1; } else if (m < 0 || m < n) { *info = -2; } else if (n < 0 || n > 128) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (ikind == 1) { // === Iterative, based on SVD ============================================================ magmaFloatComplex *U, *VT, *vt, *R, *G, *hwork, *tau; float *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_cmalloc_cpu( &hwork, 32 + 2*n*n + 2*n); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork=n*n+32; // First part f hwork; used as workspace in svd U = hwork + n*n + 32; // Size n*n S = (float *)(U+n*n); // Size n tau = U + n*n + n; // Size n #if defined(PRECISION_c) || defined(PRECISION_z) float *rwork; magma_smalloc_cpu( &rwork, 5*n); if ( rwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif do { i++; magma_cgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_cgetmatrix(n, n, dwork, n, G, n); #if defined(PRECISION_s) || defined(PRECISION_d) lapackf77_cgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, info); #else lapackf77_cgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, rwork, info); #endif mins = 100.f, maxs = 0.f; for (k=0; k < n; k++) { S[k] = magma_ssqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for (k=0; k < n; k++) { vt = VT + k*n; for (j=0; j < n; j++) vt[j] *= S[j]; } lapackf77_cgeqrf(&n, &n, VT, &n, tau, hwork, &lwork, info); if (i == 1) blasf77_ccopy(&n2, VT, &ione, R, &ione); else blasf77_ctrmm("l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n); magma_csetmatrix(n, n, VT, n, dwork, n); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); if (mins > 0.00001f) cn = maxs/mins; //fprintf(stderr, "Iteration %d, cond num = %f \n", i, cn); } while (cn > 10.f); magma_free_cpu( hwork ); #if defined(PRECISION_c) || defined(PRECISION_z) magma_free_cpu( rwork ); #endif // ================== end of ikind == 1 =================================================== } else if (ikind == 2) { // ================== LAPACK based =================================================== magma_int_t min_mn = min(m, n); magma_int_t nb = n; magmaFloatComplex *dtau = dwork + 2*n*n, *d_T = dwork, *ddA = dwork + n*n; magmaFloatComplex *tau = work+n*n; magmablas_claset( MagmaFull, n, n, c_zero, c_zero, d_T, n ); magma_cgeqr2x3_gpu(&m, &n, dA, &ldda, dtau, d_T, ddA, (float *)(dwork+min_mn+2*n*n), info); magma_cgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn); magma_cgetmatrix( n, n, ddA, n, work, n); magma_cungqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info ); // ================== end of ikind == 2 =================================================== } else if (ikind == 3) { // ================== MGS =================================================== for(magma_int_t j = 0; j<n; j++){ for(magma_int_t i = 0; i<j; i++){ *work(i, j) = magma_cdotc(m, dA(0,i), 1, dA(0,j), 1); magma_caxpy(m, -(*work(i,j)), dA(0,i), 1, dA(0,j), 1); } for(magma_int_t i = j; i<n; i++) *work(i, j) = MAGMA_C_ZERO; //*work(j,j) = MAGMA_C_MAKE( magma_scnrm2(m, dA(0,j), 1), 0. ); *work(j,j) = magma_cdotc(m, dA(0,j), 1, dA(0,j), 1); *work(j,j) = MAGMA_C_MAKE( sqrt(MAGMA_C_REAL( *work(j,j) )), 0.); magma_cscal(m, 1./ *work(j,j), dA(0,j), 1); } // ================== end of ikind == 3 =================================================== } else if (ikind == 4) { // ================== Cholesky QR =================================================== magma_cgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_cgetmatrix(n, n, dwork, n, work, n); lapackf77_cpotrf("u", &n, work, &n, info); magma_csetmatrix(n, n, work, n, dwork, n); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); // ================== end of ikind == 4 =================================================== } return *info; } /* magma_cgegqr_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *piv; magma_err_t err; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2, *LU, *LUT; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "side = %c, uplo = %c, transA = %c, diag = %c \n", opts.side, opts.uplo, opts.transA, opts.diag ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( LU, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( LUT, magmaFloatComplex, lda*Ak ); TESTING_MALLOC( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_B1, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_X1, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_X2, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*Ak ); TESTING_DEVALLOC( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, LU ); err = magma_malloc_cpu( (void**) &piv, Ak*sizeof(magma_int_t) ); assert( err == 0 ); lapackf77_cgetrf( &Ak, &Ak, LU, &lda, piv, &info ); int i, j; for(i=0;i<Ak;i++){ for(j=0;j<Ak;j++){ LUT[j+i*lda] = LU[i+j*lda]; } } lapackf77_clacpy(MagmaUpperStr, &Ak, &Ak, LUT, &lda, LU, &lda); if(opts.uplo == MagmaLower){ lapackf77_clacpy(MagmaLowerStr, &Ak, &Ak, LU, &lda, h_A, &lda); }else{ lapackf77_clacpy(MagmaUpperStr, &Ak, &Ak, LU, &lda, h_A, &lda); } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaFloatComplex)); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaFloatComplex)); magmaFloatComplex alpha2 = MAGMA_C_DIV( c_one, alpha ); blasf77_ctrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); float norm1 = lapackf77_clange( "M", &M, &N, h_X1, &ldb, work ); float normx = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); float normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaFloatComplex)); blasf77_ctrmm( &opts.side, &opts.uplo, &opts.transA, &opts.diag, &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_clange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( LU ); TESTING_FREE( LUT ); TESTING_FREE( h_B ); TESTING_FREE( h_Bcublas ); TESTING_FREE( h_Bmagma ); TESTING_FREE( h_B1 ); TESTING_FREE( h_X1 ); TESTING_FREE( h_X2 ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf=0, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error=0, cublas_error, lapack_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_Blapack, *h_X; magmaFloatComplex_ptr d_A, d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% side = %s, uplo = %s, transA = %s, diag = %s, ngpu = %d\n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), int(abs_ngpu) ); printf("%% M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA CUBLAS LAPACK error\n"); printf("%%============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Blapack, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy( h_Blapack, h_B, sizeB*sizeof(magmaFloatComplex) ); magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ #if defined(HAVE_CUBLAS) magma_csetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); if (opts.ngpu == 1) { magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb, opts.queue ); } else { magma_ctrsm_m( abs_ngpu, opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); } magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb, opts.queue ); #endif /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb, opts.queue ); cublas_time = magma_sync_wtime( opts.queue ); #if defined(HAVE_CUBLAS) // opts.handle also uses opts.queue cublasCtrsm( opts.handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); #elif defined(HAVE_clBLAS) clblasCtrsm( clblasColumnMajor, clblas_side_const(opts.side), clblas_uplo_const(opts.uplo), clblas_trans_const(opts.transA), clblas_diag_const(opts.diag), M, N, alpha, d_A, 0, ldda, d_B, 0, lddb, 1, &opts.queue, 0, NULL, NULL ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb, opts.queue ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_Blapack, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - 1/alpha*A*x|| / (||A||*||x||) magmaFloatComplex inv_alpha = MAGMA_C_DIV( c_one, alpha ); float normR, normX, normA; normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); #if defined(HAVE_CUBLAS) // check magma memcpy( h_X, h_Bmagma, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); magma_error = normR/(normX*normA); #endif // check cublas memcpy( h_X, h_Bcublas, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); cublas_error = normR/(normX*normA); if ( opts.lapack ) { // check lapack // this verifies that the matrix wasn't so bad that it couldn't be solved accurately. memcpy( h_X, h_Blapack, sizeB*sizeof(magmaFloatComplex) ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &inv_alpha, h_A, &lda, h_X, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_X, &ione ); normR = lapackf77_clange( "M", &M, &N, h_X, &ldb, work ); normX = lapackf77_clange( "M", &M, &N, h_Blapack, &ldb, work ); lapack_error = normR/(normX*normA); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, lapack_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e --- %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Blapack ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_Bcublas; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. cublas_time = magma_sync_wtime( NULL ); cublasCtrmm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &M, &N, h_B, &ldb, work ); blasf77_caxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }