// Initialize matrix to random. // Having this in separate function ensures the same ISEED is always used, // so we can re-generate the identical matrix. void init_matrix( int m, int n, magmaDoubleComplex *h_A, magma_int_t lda ) { magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t n2 = lda*n; lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); }
// Initialize matrix to random & symmetrize. // Having this in separate function ensures the same ISEED is always used, // so we can re-generate the identical matrix. void init_matrix( int m, int n, magmaDoubleComplex *h_A, magma_int_t lda ) { assert( m == n ); magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t n2 = lda*n; lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( n, h_A, lda ); }
// On input, A and ipiv is LU factorization of A. On output, A is overwritten. // Requires m == n. // Uses init_matrix() to re-generate original A as needed. // Generates random RHS b and solves Ax=b. // Returns residual, |Ax - b| / (n |A| |x|). double get_residual( magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv ) { if ( m != n ) { printf( "\nERROR: residual check defined only for square matrices\n" ); return -1; } const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magma_int_t ione = 1; // this seed should be DIFFERENT than used in init_matrix // (else x is column of A, so residual can be exactly zero) magma_int_t ISEED[4] = {0,0,0,2}; magma_int_t info = 0; magmaDoubleComplex *x, *b; // initialize RHS TESTING_MALLOC_CPU( x, magmaDoubleComplex, n ); TESTING_MALLOC_CPU( b, magmaDoubleComplex, n ); lapackf77_zlarnv( &ione, ISEED, &n, b ); blasf77_zcopy( &n, b, &ione, x, &ione ); // solve Ax = b lapackf77_zgetrs( "Notrans", &n, &ione, A, &lda, ipiv, x, &n, &info ); if (info != 0) { printf("lapackf77_zgetrs returned error %d: %s.\n", (int) info, magma_strerror( info )); } // reset to original A init_matrix( m, n, A, lda ); // compute r = Ax - b, saved in b blasf77_zgemv( "Notrans", &m, &n, &c_one, A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_zlange( "F", &m, &n, A, &lda, work ); norm_r = lapackf77_zlange( "F", &n, &ione, b, &n, work ); norm_x = lapackf77_zlange( "F", &n, &ione, x, &n, work ); //printf( "r=\n" ); magma_zprint( 1, n, b, 1 ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); //printf( "r=%.2e, A=%.2e, x=%.2e, n=%d\n", norm_r, norm_A, norm_x, n ); return norm_r / (n * norm_A * norm_x); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zher2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_B, *h_C, *h_Ccublas; magmaDoubleComplex *d_A, *d_B, *d_C; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_ZHER2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*Bk ); TESTING_MALLOC_CPU( h_C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*Bk ); TESTING_MALLOC_DEV( d_C, magmaDoubleComplex, lddc*N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_zsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_zsetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasZher2k( handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_zher2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &N, h_C, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_zbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_zbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, N ); magma_zmake_hermitian( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_zhegvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); double maxw=0, diff=0; for(int j=0; j<m2; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m2*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf(" | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf(" | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf(" | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( " | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; double magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, ldda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ydev, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf("%% M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf("%% M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ydev, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Xm, X, incx, dX, incx, opts.queue ); magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_zgetvector( Ym, dY, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ double Anorm = lapackf77_zlange( "F", &M, &N, A, &lda, work ); double Xnorm = lapackf77_zlange( "F", &Xm, &ione, X, &Xm, work ); blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_zlange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); bool okay = (magma_error < tol) && (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (okay ? "ok" : "failed")); #else bool okay = (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (okay ? "ok" : "failed")); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); const double d_neg_one = MAGMA_D_NEG_ONE; const double d_one = MAGMA_D_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double Anorm, error=0, error2=0; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |L - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_zgeqlf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result, following zqlt01 except using the reduced Q. This works for any M,N (square, tall, wide). =================================================================== */ if ( opts.check ) { magma_int_t ldq = M; magma_int_t ldl = min_mn; magmaDoubleComplex *Q, *L; double *work; TESTING_MALLOC_CPU( Q, magmaDoubleComplex, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( L, magmaDoubleComplex, ldl*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // copy M by K matrix V to Q (copying diagonal, which isn't needed) and // copy K by N matrix L lapackf77_zlaset( "Full", &min_mn, &N, &c_zero, &c_zero, L, &ldl ); if ( M >= N ) { // for M=5, N=3: A = [ V V V ] <= V full block (M-N by K) // K=N [ V V V ] // [ ----- ] // [ L V V ] <= V triangle (N by K, copying diagonal too) // [ L L V ] <= L triangle (K by N) // [ L L L ] magma_int_t M_N = M - N; lapackf77_zlacpy( "Full", &M_N, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_zlacpy( "Upper", &N, &min_mn, &h_R[M_N], &lda, &Q[M_N], &ldq ); lapackf77_zlacpy( "Lower", &min_mn, &N, &h_R[M_N], &lda, L, &ldl ); } else { // for M=3, N=5: A = [ L L | L V V ] <= V triangle (K by K) // K=M [ L L | L L V ] <= L triangle (K by M) // [ L L | L L L ] // ^^^============= L full block (K by N-M) magma_int_t N_M = N - M; lapackf77_zlacpy( "Upper", &M, &min_mn, &h_R[N_M*lda], &lda, Q, &ldq ); lapackf77_zlacpy( "Full", &min_mn, &N_M, h_R, &lda, L, &ldl ); lapackf77_zlacpy( "Lower", &min_mn, &M, &h_R[N_M*lda], &lda, &L[N_M*ldl], &ldl ); } // generate M by K matrix Q, where K = min(M,N) lapackf77_zungql( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // error = || L - Q^H*A || / (N * ||A||) blasf77_zgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, L, &ldl ); Anorm = lapackf77_zlange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_zlange( "1", &min_mn, &N, L, &ldl, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set L = I (K by K identity), then L = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_zlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, L, &ldl ); blasf77_zherk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, L, &ldl ); error2 = lapackf77_zlanhe( "1", "Upper", &min_mn, L, &ldl, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( L ); L = NULL; TESTING_FREE_CPU( work ); work = NULL; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgeqlf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ztrmv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N; magma_int_t Ak; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_x, *h_xcublas; magmaDoubleComplex *d_A, *d_x; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "uplo = %c, transA = %c, diag = %c \n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_ZTRMM(opts.side, N, 1) / 1e9; lda = N; Ak = N; ldda = ((lda+31)/32)*32; sizeA = lda*Ak; TESTING_MALLOC( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC( h_x, magmaDoubleComplex, N ); TESTING_MALLOC( h_xcublas, magmaDoubleComplex, N ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_DEVALLOC( d_x, magmaDoubleComplex, N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &N, h_x ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_zsetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasZtrmv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ztrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &ione, h_x, &N, work ); blasf77_zaxpy( &N, &c_neg_one, h_x, &ione, h_xcublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &ione, h_xcublas, &N, work ) / Cnorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE( h_A ); TESTING_FREE( h_x ); TESTING_FREE( h_xcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_R, *work; magmaDoubleComplex_ptr d_A, dwork; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_zgetri_nb( N ); gflops = FLOPS_ZGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, opts.queue, &info ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, lda, opts.queue ); if ( info != 0 ) printf("magma_zgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, opts.queues2, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaDoubleComplex *A, *B, *C, *C2, *LU; magmaDoubleComplex *dA, *dB, *dC1, *dC2; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_zmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_zmalloc( &dA, size ); assert( err == 0 ); err = magma_zmalloc( &dB, size ); assert( err == 0 ); err = magma_zmalloc( &dC1, size ); assert( err == 0 ); err = magma_zmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); lapackf77_zlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test ZSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetmatrix( m, n, A, ld, dB, ld ); magma_zswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_zswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_zgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "zswap diff %.2g\n", error ); } else { printf( "zswap skipped for n < 3\n" ); } // ----- test IZAMAX // get argmax of column of A magma_zsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_izamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIzamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "izamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test ZGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetvector( maxn, B, 1, dB, 1 ); magma_zsetvector( maxn, C, 1, dC1, 1 ); magma_zsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMV( m, n ) / 1e9; printf( "zgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetvector( m, B, 1, dB, 1 ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMV( m ) / 1e9; printf( "zhemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_zsetmatrix( m, m, LU, ld, dA, ld ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ztrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test ZGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMM( m, n, k ) / 1e9; printf( "zgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetmatrix( m, n, B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9; printf( "zhemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_zsetmatrix( n, k, A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHERK( k, n ) / 1e9; printf( "zherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHER2K( k, n ) / 1e9; printf( "zher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9; printf( "ztrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test ZTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9; printf( "ztrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgehrd2 */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; //*h_R1 is used for warm-up magmaDoubleComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *h_R1; magmaDoubleComplex_ptr dT; double *rwork; double result[2] = {0., 0.}; double eps; int checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; /* Matrix size */ int N=0, n2, lda, nb, lwork, ltwork, once = 0; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,9900}; #endif int i, info; int ione = 1; int ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if ( N > 0 ){ printf(" testing_zgehrd -N %d\n\n", N); once = 1; } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } eps = lapackf77_dlamch( "E" ); lda = N; n2 = N*lda; nb = magma_get_zgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; TESTING_MALLOC_HOST( h_A , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( tau , magmaDoubleComplex, N ); TESTING_MALLOC_HOST( h_R , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_R1 , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV ( dT , magmaDoubleComplex, nb*N ); /* To avoid uninitialized variable warning */ h_Q = NULL; twork = NULL; rwork = NULL; if ( checkres ) { ltwork = 2*(N*N); TESTING_MALLOC_HOST( h_Q, magmaDoubleComplex, lda*N ); TESTING_MALLOC_HOST( twork, magmaDoubleComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_HOST( rwork, double, N ); #endif } printf("\n\n"); printf(" N CPU GFlop/s GPU GFlop/s |A-QHQ'|/N|A| |I-QQ'|/N \n"); printf("=============================================================\n"); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = lda*N; gflops = FLOPS( (double)N ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R1, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zgehrd ( N, ione, N, h_R1, lda, tau, h_work, lwork, dT, 0, &info, queue); if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); clFinish(queue); gpu_time = get_time(); magma_zgehrd ( N, ione, N, h_R, lda, tau, h_work, lwork, dT, 0, &info, queue); gpu_time = get_time() - gpu_time; if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ if ( checkres ) { lapackf77_zlacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); { int i, j; for(j=0; j<N-1; j++) for(i=j+2; i<lda; i++) h_R[i+j*lda] = MAGMA_Z_ZERO; } nb = magma_get_zgehrd_nb(N); magma_zunghr(N, ione, N, h_Q, lda, tau, dT, 0, nb, &info, queue); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zgehrd(&N, &ione, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = get_time() - cpu_time; if (info < 0) printf("Argument %d of lapack_zgehrd had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ===================================================================== Print performance and error. =================================================================== */ if ( checkres ) { printf("%5d %6.2f %6.2f %e %e\n", N, cpu_perf, gpu_perf, result[0]*eps, result[1]*eps ); } else { printf("%5d %6.2f %6.2f\n", N, cpu_perf, gpu_perf ); } if ( once ) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_FREE_HOST( h_work); TESTING_FREE_HOST( h_R ); TESTING_FREE_HOST( h_R1 ); TESTING_FREE_DEV ( dT ); if ( checkres ) { TESTING_FREE_HOST( h_Q ); TESTING_FREE( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE( rwork ); #endif } /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *dtau, *h_work, tmp[1]; magmaDoubleComplex *d_A; double *dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_ZGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, n2 ); TESTING_HOSTALLOC( h_R, magmaDoubleComplex, n2 ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( dtau, magmaDoubleComplex, min_mn ); TESTING_DEVALLOC(dwork, double, min_mn ); TESTING_MALLOC( h_work, magmaDoubleComplex, lwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_zsetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup magma_zgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); magma_zsetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_zgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_zgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( dtau ); TESTING_DEVFREE( dwork ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zposv_batched */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double err = 0.0, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex_ptr d_A, d_B; magma_int_t *dinfo_array; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount = 1; magmaDoubleComplex **dA_array = NULL; magmaDoubleComplex **dB_array = NULL; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; batchCount = opts.batchcount ; printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("BatchCount N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_ZPOTRF( N) + FLOPS_ZPOTRS( N, nrhs ) ) / 1e9 * batchCount; sizeA = lda*N*batchCount; sizeB = ldb*nrhs*batchCount; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( work, double, N); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs*batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); magma_malloc((void**)&dA_array, batchCount * sizeof(*dA_array)); magma_malloc((void**)&dB_array, batchCount * sizeof(*dB_array)); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); for(int i=0; i<batchCount; i++) { magma_zmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_zsetmatrix( N, N*batchCount, h_A, lda, d_A, ldda ); magma_zsetmatrix( N, nrhs*batchCount, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ zset_pointer(dA_array, d_A, ldda, 0, 0, ldda*N, batchCount, queue); zset_pointer(dB_array, d_B, lddb, 0, 0, lddb*nrhs, batchCount, queue); gpu_time = magma_wtime(); info = magma_zposv_batched(opts.uplo, N, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_zposv_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_zposv_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_zgetmatrix( N, nrhs*batchCount, d_B, lddb, h_X, ldb ); for(magma_int_t s=0; s<batchCount; s++) { Anorm = lapackf77_zlange("I", &N, &N, h_A + s * lda * N, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X + s * ldb * nrhs, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A + s * lda * N, &lda, h_X + s * ldb * nrhs, &ldb, &c_neg_one, h_B + s * ldb * nrhs, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B + s * ldb * nrhs, &ldb, work); double error = Rnorm/(N*Anorm*Xnorm); if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(err, error); } status += ! (err < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(magma_int_t s=0; s<batchCount; s++) { lapackf77_zposv( lapack_uplo_const(opts.uplo), &N, &nrhs, h_A + s * lda * N, &lda, h_B + s * ldb * nrhs, &ldb, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zposv returned err %d: %s.\n", (int) info, magma_strerror( info )); printf( "%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } else { printf( "%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( dinfo_array ); magma_free(dA_array); magma_free(dB_array); free(cpu_info); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
// -------------------- int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_time=0, cpu_perf=0, gpu_time, gpu_perf, mgpu_time, mgpu_perf, cuda_time, cuda_perf; double Ynorm, error=0, error2=0, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t n_local[MagmaMaxGPUs]; magma_int_t N, Noffset, lda, ldda, blocks, lhwork, ldwork, matsize, vecsize; magma_int_t incx = 1; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ylapack, *Ycublas, *Ymagma, *Ymagma1, *hwork; magmaDoubleComplex_ptr dA, dX, dY; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs], dwork[MagmaMaxGPUs]; magma_device_t dev; magma_queue_t queues[MagmaMaxGPUs]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code double tol = opts.tolerance * lapackf77_dlamch("E"); magma_int_t nb = 64; // required by magmablas_zhemv_mgpu implementation for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_create( dev, &queues[dev] ); } // currently, tests all offsets in the offsets array; // comment out loop below to test a specific offset. magma_int_t offset = opts.offset; magma_int_t offsets[] = { 0, 1, 31, 32, 33, 63, 64, 65, 100, 200 }; magma_int_t noffsets = sizeof(offsets) / sizeof(*offsets); printf("%% uplo = %s, ngpu %d, block size = %d, offset %d\n", lapack_uplo_const(opts.uplo), (int) opts.ngpu, (int) nb, (int) offset ); printf( "%% BLAS CUBLAS MAGMA 1 GPU MAGMA MGPU Error rel Error rel\n" "%% N offset Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) Gflop/s (msec) to CUBLAS to LAPACK\n" "%%==================================================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { // comment out these two lines & end of loop to test a specific offset for( int ioffset=0; ioffset < noffsets; ioffset += 1 ) { offset = offsets[ioffset]; for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; Noffset = N + offset; lda = Noffset; ldda = magma_roundup( Noffset, opts.align ); // multiple of 32 by default matsize = Noffset*ldda; vecsize = (Noffset-1)*incx + 1; gflops = FLOPS_ZHEMV( N ) / 1e9; blocks = magma_ceildiv( N + (offset % nb), nb ); lhwork = N*opts.ngpu; ldwork = ldda*(blocks + 1); TESTING_MALLOC_CPU( A, magmaDoubleComplex, matsize ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma1, magmaDoubleComplex, vecsize ); TESTING_MALLOC_CPU( Ylapack, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( X, magmaDoubleComplex, vecsize ); TESTING_MALLOC_PIN( hwork, magmaDoubleComplex, lhwork ); magma_setdevice( opts.device ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, matsize ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, vecsize ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, vecsize ); // TODO make magma_zmalloc_bcyclic helper function? for( dev=0; dev < opts.ngpu; dev++ ) { n_local[dev] = ((Noffset/nb)/opts.ngpu)*nb; if (dev < (Noffset/nb) % opts.ngpu) n_local[dev] += nb; else if (dev == (Noffset/nb) % opts.ngpu) n_local[dev] += Noffset % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*n_local[dev] ); TESTING_MALLOC_DEV( dwork[dev], magmaDoubleComplex, ldwork ); } ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &matsize, A ); magma_zmake_hermitian( Noffset, A, lda ); lapackf77_zlarnv( &ione, ISEED, &vecsize, X ); lapackf77_zlarnv( &ione, ISEED, &vecsize, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_setdevice( opts.device ); magma_zsetmatrix( Noffset, Noffset, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Noffset, X, incx, dX, incx, opts.queue ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); cuda_time = magma_sync_wtime(0); cublasZhemv( opts.handle, cublas_uplo_const(opts.uplo), N, &alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, &beta, dY + offset, incx ); cuda_time = magma_sync_wtime(0) - cuda_time; cuda_perf = gflops / cuda_time; magma_zgetvector( Noffset, dY, incx, Ycublas, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (1 GPU) =================================================================== */ magma_setdevice( opts.device ); magma_zsetvector( Noffset, Y, incx, dY, incx, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_zhemv_work( opts.uplo, N, alpha, dA + offset + offset*ldda, ldda, dX + offset, incx, beta, dY + offset, incx, dwork[ opts.device ], ldwork, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; magma_zgetvector( Noffset, dY, incx, Ymagma1, incx, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (multi-GPU) =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( Noffset, Noffset, A, lda, d_lA, ldda, opts.ngpu, nb, queues ); blasf77_zcopy( &Noffset, Y, &incx, Ymagma, &incx ); // workspaces do NOT need to be zero -- set to NAN to prove for( dev=0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); magmablas_zlaset( MagmaFull, ldwork, 1, MAGMA_Z_NAN, MAGMA_Z_NAN, dwork[dev], ldwork, opts.queue ); } lapackf77_zlaset( "Full", &lhwork, &ione, &MAGMA_Z_NAN, &MAGMA_Z_NAN, hwork, &lhwork ); mgpu_time = magma_sync_wtime(0); magma_int_t info; info = magmablas_zhemv_mgpu( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } info = magmablas_zhemv_mgpu_sync( opts.uplo, N, alpha, d_lA, ldda, offset, X + offset, incx, beta, Ymagma + offset, incx, hwork, lhwork, dwork, ldwork, opts.ngpu, nb, queues ); if (info != 0) { printf("magmablas_zhemv_sync returned error %d: %s.\n", (int) info, magma_strerror( info )); } mgpu_time = magma_sync_wtime(0) - mgpu_time; mgpu_perf = gflops / mgpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { blasf77_zcopy( &Noffset, Y, &incx, Ylapack, &incx ); cpu_time = magma_wtime(); blasf77_zhemv( lapack_uplo_const(opts.uplo), &N, &alpha, A + offset + offset*lda, &lda, X + offset, &incx, &beta, Ylapack + offset, &incx ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Compute the Difference LAPACK vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ylapack, &incx ); error2 = lapackf77_zlange( "F", &Noffset, &ione, Ylapack, &Noffset, work ) / Ynorm; } /* ===================================================================== Compute the Difference Cublas vs. Magma =================================================================== */ Ynorm = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ); blasf77_zaxpy( &Noffset, &c_neg_one, Ymagma, &incx, Ycublas, &incx ); error = lapackf77_zlange( "F", &Noffset, &ione, Ycublas, &Noffset, work ) / Ynorm; bool okay = (error < tol && error2 < tol); status += ! okay; if ( opts.lapack ) { printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, (int) offset, cpu_perf, cpu_time*1000., cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, error2, (okay ? "ok" : "failed") ); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e --- %s\n", (int) N, (int) offset, cuda_perf, cuda_time*1000., gpu_perf, gpu_time*1000., mgpu_perf, mgpu_time*1000., error, (okay ? "ok" : "failed") ); } /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( Ymagma1 ); TESTING_FREE_CPU( Ylapack ); TESTING_FREE_PIN( X ); TESTING_FREE_PIN( hwork ); magma_setdevice( opts.device ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); for( dev=0; dev < opts.ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); TESTING_FREE_DEV( dwork[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } // comment out these two lines line & top of loop test a specific offset } // end for ioffset printf( "\n" ); } for( dev=0; dev < opts.ngpu; ++dev ) { magma_queue_destroy( queues[dev] ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zunmqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaDoubleComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_zgeqrf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_ZUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqrf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaDoubleComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaDoubleComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaDoubleComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, k ); // C is full, m x n size = ldc*n; lapackf77_zlarnv( &ione, ISEED, &size, C ); lapackf77_zlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_zlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in A, tau magma_zgeqrf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_zunmqr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_Z_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_zunmqr( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_zlange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeadd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex *h_A, *h_B, *d_A, *d_B; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 3.1415, 2.718 ); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); /* Uncomment these lines to check parameters. * magma_xerbla calls lapack's xerbla to print out error. */ //magmablas_zgeadd( -1, N, alpha, d_A, ldda, d_B, ldda ); //magmablas_zgeadd( M, -1, alpha, d_A, ldda, d_B, ldda ); //magmablas_zgeadd( M, N, alpha, d_A, M-1, d_B, ldda ); //magmablas_zgeadd( M, N, alpha, d_A, ldda, d_B, N-1 ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Bl-Bm|/|Bl|\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; gflops = 2.*M*N / 1e9; TESTING_MALLOC( h_A, magmaDoubleComplex, lda *N ); TESTING_MALLOC( h_B, magmaDoubleComplex, lda *N ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, ldda*N ); lapackf77_zlarnv( &ione, ISEED, &size, h_A ); lapackf77_zlarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, N, h_B, lda, d_B, ldda ); gpu_time = magma_sync_wtime( NULL ); magmablas_zgeadd( M, N, alpha, d_A, ldda, d_B, ldda ); gpu_time = magma_sync_wtime( NULL ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int j = 0; j < N; ++j ) { blasf77_zaxpy( &M, &alpha, &h_A[j*lda], &ione, &h_B[j*lda], &ione ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check result =================================================================== */ magma_zgetmatrix( M, N, d_B, ldda, h_A, lda ); error = lapackf77_zlange( "F", &M, &N, h_B, &lda, work ); blasf77_zaxpy( &size, &c_neg_one, h_A, &ione, h_B, &ione ); error = lapackf77_zlange( "F", &M, &N, h_B, &lda, work ) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE( h_A ); TESTING_FREE( h_B ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double error, work[1]; // test all combinations of input parameters const char side[] = { MagmaLeft, MagmaRight }; const char trans[] = { MagmaConjTrans, MagmaNoTrans }; const char direct[] = { MagmaForward, MagmaBackward }; const char storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { M = opts.msize[i]; N = opts.nsize[i]; K = opts.ksize[i]; if ( M < K || N < K || K <= 0 ) { printf( "skipping M %d, N %d, K %d; requires M >= K, N >= K, K >= 0.\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { ldc = ((M+31)/32)*32; ldt = ((K+31)/32)*32; ldw = (side[iside] == MagmaLeft ? N : M); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices magmaDoubleComplex *C, *R, *V, *T, *W; TESTING_MALLOC( C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC( R, magmaDoubleComplex, ldc*N ); TESTING_MALLOC( V, magmaDoubleComplex, ldv*K ); TESTING_MALLOC( T, magmaDoubleComplex, ldt*K ); TESTING_MALLOC( W, magmaDoubleComplex, ldw*K ); magmaDoubleComplex *dC, *dV, *dT, *dW; TESTING_DEVALLOC( dC, magmaDoubleComplex, ldc*N ); TESTING_DEVALLOC( dV, magmaDoubleComplex, ldv*K ); TESTING_DEVALLOC( dT, magmaDoubleComplex, ldt*K ); TESTING_DEVALLOC( dW, magmaDoubleComplex, ldw*K ); // C is M x N. size = ldc*N; lapackf77_zlarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_zprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. // if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_zlarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_zlaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_zlaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_zlaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_zlaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_zprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_zlarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_zlaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_zlaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_zprint( K, K, T, ldt ); magma_zsetmatrix( M, N, C, ldc, dC, ldc ); magma_zsetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_zsetmatrix( K, K, T, ldt, dT, ldt ); lapackf77_zlarfb( &side[iside], &trans[itran], &direct[idir], &storev[istor], &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_zprint( M, N, C, ldc ); magma_zlarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_zgetmatrix( M, N, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_zprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_zlange( "Fro", &M, &N, C, &ldc, work ); size = ldc*N; blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &M, &N, R, &ldc, work ) / error; printf( "%5d %5d %5d %c %c %c %c %8.2e\n", (int) M, (int) N, (int) K, storev[istor], side[iside], direct[idir], trans[itran], error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( V ); TESTING_FREE( T ); TESTING_FREE( W ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dV ); TESTING_DEVFREE( dT ); TESTING_DEVFREE( dW ); }}}} printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zcgeqrsv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs; double error, gpu_error, cpu_error, Anorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R; magmaDoubleComplex_ptr d_A, d_B, d_X, d_T; magmaFloatComplex *d_SA, *d_SB; magmaDoubleComplex *h_workd, *tau, tmp[1]; magmaFloatComplex *h_works; magma_int_t lda, ldb, lhwork, lworkgpu; magma_int_t ldda, lddb, lddx; magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" CPU Gflop/s GPU Gflop/s |b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS double double single mixed Iter CPU GPU \n"); printf("=============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = ((M+31)/32) * 32; lddb = ((max_mn+31)/32)*32; lddx = ((N+31)/32) * 32; nb = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) ); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork ); h_works = (magmaFloatComplex*)h_workd; TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs ); TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb ); /* Initialize the matrices */ size = lda*N; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcgeqrsv_gpu( M, N, nrhs, d_A, ldda, d_B, lddb, d_X, lddx, &qrsv_iters, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zcgeqrsv returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute the residual magma_zgetmatrix( N, nrhs, d_X, lddx, h_X, ldb ); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_workd, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfd = gflops / gpu_time; //===================================================================== // Single Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); /* The allocation of d_SA and d_SB is done here to avoid * to double the memory used on GPU with zcgeqrsv */ TESTING_MALLOC_DEV( d_SA, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_SB, magmaFloatComplex, lddb*nrhs ); magmablas_zlag2c( M, N, d_A, ldda, d_SA, ldda, &info ); magmablas_zlag2c( N, nrhs, d_B, lddb, d_SB, lddb, &info ); gpu_time = magma_wtime(); magma_cgels_gpu( MagmaNoTrans, M, N, nrhs, d_SA, ldda, d_SB, lddb, h_works, lhwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfs = gflops / gpu_time; TESTING_FREE_DEV( d_SA ); TESTING_FREE_DEV( d_SB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_workd, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_zaxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f %7.2f %7.2f %7.2f %4d %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, (int) nrhs, cpu_perf, gpu_perfd, gpu_perfs, gpu_perf, (int) qrsv_iters, cpu_error, gpu_error, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_T ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaDoubleComplex *A, *B; double diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; magmaDoubleComplex x2_m, x2_c; // complex x for magma, cblas/fortran blas respectively double x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); double total_diff = 0.; double total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_zmalloc_pinned( &A, size ); assert( A != NULL ); magma_zmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test DZASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dzasum( m, A(0,j), incx ); x_c = cblas_dzasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dzasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dzasum", diff, error ); total_diff += diff; total_error += error; } // ----- test DZNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_dznrm2( m, A(0,j), incx ); x_c = cblas_dznrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_dznrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "dznrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test ZDOTC // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_zdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_zdotc( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "zdotc", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test ZDOTU // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_zdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_zdotu( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "zdotu", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_zdotc and cblas_zdotu disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_zdotc and blasf77_zdotu disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A, dwork; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 }; magma_int_t ntest = 10; magma_int_t i, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0, 0, 0, 1}; magmaDoubleComplex *work; magmaDoubleComplex tmp; double rwork[1]; magma_int_t *ipiv; magma_int_t lwork, ldwork; double A_norm, R_norm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[ntest-1] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgetri_gpu -N %d\n\n", 1024); } /* query for Lapack workspace size */ N = size[ntest-1]; lda = N; work = &tmp; lwork = -1; lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); lwork = int( MAGMA_Z_REAL( *work )); /* query for Magma workspace size */ ldwork = N * magma_get_zgetri_nb( N ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory */ n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for( i=0; i < ntest; i++ ){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZGETRI( (double)N ) / 1e9; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue ); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //warm-up magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); gpu_time = magma_wtime()-gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d\n", (int) info); gpu_perf = gflops / gpu_time; magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, 0, lda, queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); R_norm = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ); printf( "%5d %6.2f %6.2f %e\n", (int) N, cpu_perf, gpu_perf, R_norm / A_norm ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaDoubleComplex *dA, *dX, *dY, *dC_work; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_ZHEMV( N ) / 1e9; TESTING_MALLOC( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ycublas, magmaDoubleComplex, sizeY ); TESTING_MALLOC( Ymagma, magmaDoubleComplex, sizeY ); TESTING_DEVALLOC( dA, magmaDoubleComplex, sizeA ); TESTING_DEVALLOC( dX, magmaDoubleComplex, sizeX ); TESTING_DEVALLOC( dY, magmaDoubleComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_DEVALLOC( dC_work, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); magma_zmake_hermitian( N, A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( N, N, A, lda, dA, lda ); magma_zsetvector( N, X, incx, dX, incx ); magma_zsetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasZhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_zsetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); #if (GPUSHMEM >= 200) magmablas_zhemv2( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dC_work, ldwork ); #else magmablas_zhemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); #endif magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zhemv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_zaxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_zlange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); TESTING_DEVFREE( dC_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaDoubleComplex *A, *B; double error_cblas, error_fblas, error_inline; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t i, j, k, m, n, size, maxn, ld; // complex x for magma, cblas, fortran, inline blas respectively magmaDoubleComplex x2_m, x2_c, x2_f, x2_i; // real x for magma, cblas, fortran, inline blas respectively double x_m, x_c, x_f, x_i; MAGMA_UNUSED( x_c ); MAGMA_UNUSED( x_f ); MAGMA_UNUSED( x2_c ); MAGMA_UNUSED( x2_f ); MAGMA_UNUSED( x2_m ); magma_opts opts; opts.parse_opts( argc, argv ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; magma_int_t inc[] = { -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; magma_int_t ninc = sizeof(inc)/sizeof(*inc); magma_int_t maxinc = 0; for( i=0; i < ninc; ++i ) { maxinc = max( maxinc, abs(inc[i]) ); } printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfaults), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "!! If MAGMA_WITH_MKL or __APPLE__ are defined, known failures are skipped. !!\n" "\n" ); // tell user about disabled functions #ifndef HAVE_CBLAS printf( "n/a: HAVE_CBLAS not defined, so no cblas functions tested.\n\n" ); #endif #if defined(MAGMA_WITH_MKL) printf( "n/a: cblas_zdotc, cblas_zdotu, blasf77_zdotc, and blasf77_zdotu are disabled with MKL, due to segfaults.\n\n" ); #endif #if defined(__APPLE__) printf( "n/a: blasf77_zdotc and blasf77_zdotu are disabled on MacOS, due to segfaults.\n\n" ); #endif printf( "%% Error w.r.t. Error w.r.t. Error w.r.t.\n" "%% M N K incx incy Function CBLAS Fortran BLAS inline\n" "%%====================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { if ( itest > 0 ) { printf( "%%----------------------------------------------------------------------\n" ); } m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * maxinc; ld = max( 1, maxn ); size = ld*maxn; TESTING_MALLOC_CPU( A, magmaDoubleComplex, size ); TESTING_MALLOC_CPU( B, magmaDoubleComplex, size ); // initialize matrices lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); // ----- test DZASUM for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { x_m = magma_cblas_dzasum( m, A(0,j), incx ); #ifdef HAVE_CBLAS x_c = cblas_dzasum( m, A(0,j), incx ); error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) ); #else x_c = 0; error_cblas = SKIPPED_FLAG; #endif x_f = blasf77_dzasum( &m, A(0,j), &incx ); error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) ); // inline implementation x_i = 0; for( i=0; i < m; ++i ) { x_i += MAGMA_Z_ABS1( *A(i*incx,j) ); // |real(Aij)| + |imag(Aij)| } error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) ); //printf( "dzasum xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i ); } output( "dzasum", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } } printf( "\n" ); // ----- test DZNRM2 // get two-norm of column j of A for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; if ( incx > 0 && incx == incy ) { // positive, no incy error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { x_m = magma_cblas_dznrm2( m, A(0,j), incx ); #ifdef HAVE_CBLAS x_c = cblas_dznrm2( m, A(0,j), incx ); error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) ); #else x_c = 0; error_cblas = SKIPPED_FLAG; #endif x_f = blasf77_dznrm2( &m, A(0,j), &incx ); error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) ); // inline implementation (poor -- doesn't scale) x_i = 0; for( i=0; i < m; ++i ) { x_i += real( *A(i*incx,j) ) * real( *A(i*incx,j) ) + imag( *A(i*incx,j) ) * imag( *A(i*incx,j) ); // same: real( conj( *A(i*incx,j) ) * *A(i*incx,j) ); } x_i = sqrt( x_i ); error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) ); //printf( "dznrm2 xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i ); } output( "dznrm2", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } } printf( "\n" ); // ----- test ZDOTC // dot columns, Aj^H Bj for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); // crashes with MKL 11.1.2, ILP64 #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL) #ifdef COMPLEX cblas_zdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); #endif error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) ); #else x2_c = MAGMA_Z_ZERO; error_cblas = SKIPPED_FLAG; #endif // crashes with MKL 11.2.3 and MacOS 10.9 #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__) x2_f = blasf77_zdotc( &m, A(0,j), &incx, B(0,j), &incy ); error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) ); #else x2_f = MAGMA_Z_ZERO; error_fblas = SKIPPED_FLAG; #endif // inline implementation x2_i = MAGMA_Z_ZERO; magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx); magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy); for( i=0; i < m; ++i ) { x2_i += conj( *A(A_offset + i*incx,j) ) * *B(B_offset + i*incy,j); } error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) ); //printf( "zdotc xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n", // real(x2_m), imag(x2_m), // real(x2_c), imag(x2_c), // real(x2_f), imag(x2_f), // real(x2_i), imag(x2_i) ); } output( "zdotc", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } printf( "\n" ); // ----- test ZDOTU // dot columns, Aj^T * Bj for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); // crashes with MKL 11.1.2, ILP64 #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL) #ifdef COMPLEX cblas_zdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); #endif error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) ); #else x2_c = MAGMA_Z_ZERO; error_cblas = SKIPPED_FLAG; #endif // crashes with MKL 11.2.3 and MacOS 10.9 #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__) x2_f = blasf77_zdotu( &m, A(0,j), &incx, B(0,j), &incy ); error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) ); #else x2_f = MAGMA_Z_ZERO; error_fblas = SKIPPED_FLAG; #endif // inline implementation x2_i = MAGMA_Z_ZERO; magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx); magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy); for( i=0; i < m; ++i ) { x2_i += *A(A_offset + i*incx,j) * *B(B_offset + i*incy,j); } error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) ); //printf( "zdotu xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n", // real(x2_m), imag(x2_m), // real(x2_c), imag(x2_c), // real(x2_f), imag(x2_f), // real(x2_i), imag(x2_i) ); } output( "zdotu", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } // cleanup TESTING_FREE_CPU( A ); TESTING_FREE_CPU( B ); fflush( stdout ); } // itest, incx, incy opts.cleanup(); TESTING_FINALIZE(); return gStatus; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhesv */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; double error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_ZPOTRF( N ) + FLOPS_ZPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_PIN( h_X, magmaDoubleComplex, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_Z_REAL(temp); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_zhesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetri_batched */ int main( int argc, char** argv) { TESTING_INIT(); // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work; magmaDoubleComplex_ptr d_A, d_invA; magmaDoubleComplex_ptr *dA_array; magmaDoubleComplex_ptr *dinvA_array; magma_int_t **dipiv_array; magma_int_t *dinfo_array; magma_int_t *ipiv, *cpu_info; magma_int_t *d_ipiv, *d_info; magma_int_t N, n2, lda, ldda, info, info1, info2, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t columns; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); magma_int_t batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% batchCount N CPU Gflop/s (ms) GPU Gflop/s (ms) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N * batchCount; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default // This is the correct flops but since this getri_batched is based on // 2 trsm = getrs and to know the real flops I am using the getrs one //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount; gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork*batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Ainv, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_invA, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_DEV( d_info, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); columns = N * batchCount; lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda ); magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue); info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue ); for (magma_int_t i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] ); } } if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 )); if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (int i=0; i < batchCount; i++) { magma_int_t locinfo; lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo); if (locinfo != 0) { printf("lapackf77_zgetrf returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo ); if (locinfo != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; printf("%10d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%10d %5d --- ( --- ) %7.2f (%7.2f)", (int) batchCount, (int) N, gpu_perf, gpu_time*1000. ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue ); magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue ); error = 0; for (magma_int_t i=0; i < batchCount; i++) { for (magma_int_t k=0; k < N; k++) { if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N ) { printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]); error = -1; } } if (error == -1) { break; } // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond, err; normA = lapackf77_zlange( "1", &N, &N, h_A + i*lda*N, &lda, rwork ); normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; err = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda ); blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A + i*lda*N, &lda, h_Ainv + i*lda*N, &lda, &c_one, h_R + i*lda*N, &lda ); err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork ); err = err * rcond / N; } if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf("\n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_invA ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_info ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dinvA_array ); TESTING_FREE_DEV( dinfo_array ); TESTING_FREE_DEV( dipiv_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 2. * opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqlf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrs */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, matnorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *d_B; magma_int_t M, N, n2, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===============================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; if ( M < N ) { printf( "skipping M=%d, N=%d because M < N is not yet supported.\n", (int) M, (int) N ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; n2 = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; // query for workspace size lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; lhwork = -1; lapackf77_zgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = -1; lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC( h_work, magmaDoubleComplex, lhwork ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaDoubleComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS n2 = M*nrhs; lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //n2 = N*nrhs; //lapackf77_zlarnv( &ione, ISEED, &n2, h_X ); //blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_zgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*matnorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*matnorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error ); printf("%s\n", (gpu_error < tol ? "" : " failed")); status |= ! (gpu_error < tol); TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { magma_int_t nquarkthreads=2; magma_int_t nthreads=2; magma_int_t num_gpus = 1; TRACE = 0; //magma_qr_params mp; cuDoubleComplex *h_A, *h_R, *h_work, *tau; double gpu_perf, cpu_perf, flops; magma_timestr_t start, end; magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params)); /* Matrix size */ magma_int_t M=0, N=0, n2; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; cublasStatus status; magma_int_t i, j, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; mp->nb=-1; mp->ob=-1; mp->fb=-1; mp->ib=32; magma_int_t loop = argc; magma_int_t accuracyflag = 1; char precision; magma_int_t nc = -1; magma_int_t ncps = -1; if (argc != 1) { for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-F", argv[i])==0) mp->fb = atoi(argv[++i]); else if (strcmp("-O", argv[i])==0) mp->ob = atoi(argv[++i]); else if (strcmp("-B", argv[i])==0) mp->nb = atoi(argv[++i]); else if (strcmp("-b", argv[i])==0) mp->ib = atoi(argv[++i]); else if (strcmp("-A", argv[i])==0) accuracyflag = atoi(argv[++i]); else if (strcmp("-P", argv[i])==0) nthreads = atoi(argv[++i]); else if (strcmp("-Q", argv[i])==0) nquarkthreads = atoi(argv[++i]); else if (strcmp("-nc", argv[i])==0) nc = atoi(argv[++i]); else if (strcmp("-ncps", argv[i])==0) ncps = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" Set number of cores per socket and number of cores.\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024); printf(" Alternatively, set:\n"); printf(" Q: Number of threads for panel factorization.\n"); printf(" P: Number of threads for trailing matrix update (CPU).\n"); printf(" B: Block size.\n"); printf(" b: Inner block size.\n"); printf(" O: Block size for trailing matrix update (CPU).\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112); M = N = size[9]; } /* Auto tune based on number of cores and number of cores per socket if provided */ if ((nc > 0) && (ncps > 0)) { precision = 's'; #if (defined(PRECISION_d)) precision = 'd'; #endif #if (defined(PRECISION_c)) precision = 'c'; #endif #if (defined(PRECISION_z)) precision = 'z'; #endif auto_tune('q', precision, nc, ncps, M, N, &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads); fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ mp->sync0 = 0; magma_context *context; context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv); context->params = (void *)(mp); mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads); for (i = 0; i < nthreads; i++) mp->sync1[i] = 0; n2 = M * N; magma_int_t min_mn = min(M, N); magma_int_t nb = magma_get_zgeqrf_nb(min_mn); magma_int_t lwork = N*nb; /* Allocate host memory for the matrix */ TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, min_mn); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork ); printf("\n\n"); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (loop==1){ M = N = min_mn = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M ); //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info); for(j=0; j<n2; j++) h_R[j] = h_A[j]; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_qr_init(mp, M, N, h_R, nthreads); start = get_current_time(); magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info); end = get_current_time(); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); if (accuracyflag == 1) lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of zgeqrf had an illegal value.\n", -info); cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; if (accuracyflag == 1){ matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one); } if (accuracyflag == 1){ printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu_perf, gpu_perf, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); } else { printf("%5d %5d %6.2f \n", M, N, gpu_perf); } if (loop != 1) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R ); /* Shut down the MAGMA context */ magma_finalize(context); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_R; magma_int_t N, n2, lda, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZPOTRF( N ) / 1e9; TESTING_MALLOC( h_A, magmaDoubleComplex, n2 ); TESTING_HOSTALLOC( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hpd( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_zhemm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex calpha = MAGMA_Z_MAKE( 3.456, 5.678 ); magmaDoubleComplex cbeta = MAGMA_Z_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double error=0., errorbis=0., work[1]; magmaDoubleComplex *hA, *hX, *hB, *hR; magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1]; magmaDoubleComplex_ptr dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M ); TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork[d], magmaDoubleComplex, dworksiz ); TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M ); } lapackf77_zlarnv( &ione, iseed, &size, hA ); magma_zmake_hermitian( M, hA, lda ); size = lda*N; lapackf77_zlarnv( &ione, iseed, &size, hX ); lapackf77_zlarnv( &ione, iseed, &size, hB ); lapackf77_zlacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_zsetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(magmaDoubleComplex)); trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_zhemm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "zhemm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_zsetmatrix( M, M, hA, lda, dA2, ldda ); magma_zsetmatrix( M, N, hX, lda, dX[0], ldda ); magma_zsetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_zhemm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_zlange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_zlange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_zprint( M, M, hA, lda ); //printf( "X =" ); magma_zprint( M, N, hX, lda ); //printf( "B =" ); magma_zprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_zhemm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_zgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_zaxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_zlange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_zprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }