// On input, A and ipiv is LU factorization of A. On output, A is overwritten. // Requires m == n. // Uses init_matrix() to re-generate original A as needed. // Generates random RHS b and solves Ax=b. // Returns residual, |Ax - b| / (n |A| |x|). float get_residual( magma_opts &opts, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv ) { if ( m != n ) { printf( "\nERROR: residual check defined only for square matrices\n" ); return -1; } const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; // this seed should be DIFFERENT than used in init_matrix // (else x is column of A, so residual can be exactly zero) magma_int_t ISEED[4] = {0,0,0,2}; magma_int_t info = 0; magmaFloatComplex *x, *b; // initialize RHS TESTING_MALLOC_CPU( x, magmaFloatComplex, n ); TESTING_MALLOC_CPU( b, magmaFloatComplex, n ); lapackf77_clarnv( &ione, ISEED, &n, b ); blasf77_ccopy( &n, b, &ione, x, &ione ); // solve Ax = b lapackf77_cgetrs( "Notrans", &n, &ione, A, &lda, ipiv, x, &n, &info ); if (info != 0) { printf("lapackf77_cgetrs returned error %d: %s.\n", (int) info, magma_strerror( info )); } // reset to original A init_matrix( opts, m, n, A, lda ); // compute r = Ax - b, saved in b blasf77_cgemv( "Notrans", &m, &n, &c_one, A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) float norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_clange( "F", &m, &n, A, &lda, work ); norm_r = lapackf77_clange( "F", &n, &ione, b, &n, work ); norm_x = lapackf77_clange( "F", &n, &ione, x, &n, work ); //printf( "r=\n" ); magma_cprint( 1, n, b, 1 ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); //printf( "r=%.2e, A=%.2e, x=%.2e, n=%d\n", norm_r, norm_A, norm_x, n ); return norm_r / (n * norm_A * norm_x); }
// On input, LU and ipiv is LU factorization of A. On output, LU is overwritten. // Works for any m, n. // Uses init_matrix() to re-generate original A as needed. // Returns error in factorization, |PA - LU| / (n |A|) // This allocates 3 more matrices to store A, L, and U. float get_LU_error( magma_opts &opts, magma_int_t M, magma_int_t N, magmaFloatComplex *LU, magma_int_t lda, magma_int_t *ipiv) { magma_int_t min_mn = min(M,N); magma_int_t ione = 1; magma_int_t i, j; magmaFloatComplex alpha = MAGMA_C_ONE; magmaFloatComplex beta = MAGMA_C_ZERO; magmaFloatComplex *A, *L, *U; float work[1], matnorm, residual; TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( L, magmaFloatComplex, M*min_mn ); TESTING_MALLOC_CPU( U, magmaFloatComplex, min_mn*N ); memset( L, 0, M*min_mn*sizeof(magmaFloatComplex) ); memset( U, 0, min_mn*N*sizeof(magmaFloatComplex) ); // set to original A init_matrix( opts, M, N, A, lda ); lapackf77_claswp( &N, A, &lda, &ione, &min_mn, ipiv, &ione); // copy LU to L and U, and set diagonal to 1 lapackf77_clacpy( MagmaLowerStr, &M, &min_mn, LU, &lda, L, &M ); lapackf77_clacpy( MagmaUpperStr, &min_mn, &N, LU, &lda, U, &min_mn ); for (j=0; j < min_mn; j++) L[j+j*M] = MAGMA_C_MAKE( 1., 0. ); matnorm = lapackf77_clange("f", &M, &N, A, &lda, work); blasf77_cgemm("N", "N", &M, &N, &min_mn, &alpha, L, &M, U, &min_mn, &beta, LU, &lda); for( j = 0; j < N; j++ ) { for( i = 0; i < M; i++ ) { LU[i+j*lda] = MAGMA_C_SUB( LU[i+j*lda], A[i+j*lda] ); } } residual = lapackf77_clange("f", &M, &N, LU, &lda, work); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( L ); TESTING_FREE_CPU( U ); return residual / (matnorm * N); }
double get_LU_error(magma_int_t M, magma_int_t N, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *LU, magma_int_t *IPIV) { magma_int_t min_mn = min(M,N); magma_int_t ione = 1; magma_int_t i, j; magmaDoubleComplex alpha = MAGMA_Z_ONE; magmaDoubleComplex beta = MAGMA_Z_ZERO; magmaDoubleComplex *L, *U; double work[1], matnorm, residual; TESTING_MALLOC_CPU( L, magmaDoubleComplex, M*min_mn ); TESTING_MALLOC_CPU( U, magmaDoubleComplex, min_mn*N ); memset( L, 0, M*min_mn*sizeof(magmaDoubleComplex) ); memset( U, 0, min_mn*N*sizeof(magmaDoubleComplex) ); lapackf77_zlaswp( &N, A, &lda, &ione, &min_mn, IPIV, &ione); lapackf77_zlacpy( MagmaLowerStr, &M, &min_mn, LU, &lda, L, &M ); lapackf77_zlacpy( MagmaUpperStr, &min_mn, &N, LU, &lda, U, &min_mn ); for(j=0; j<min_mn; j++) L[j+j*M] = MAGMA_Z_MAKE( 1., 0. ); matnorm = lapackf77_zlange("f", &M, &N, A, &lda, work); blasf77_zgemm("N", "N", &M, &N, &min_mn, &alpha, L, &M, U, &min_mn, &beta, LU, &lda); for( j = 0; j < N; j++ ) { for( i = 0; i < M; i++ ) { LU[i+j*lda] = MAGMA_Z_SUB( LU[i+j*lda], A[i+j*lda] ); } } residual = lapackf77_zlange("f", &M, &N, LU, &lda, work); TESTING_FREE_CPU( L ); TESTING_FREE_CPU( U ); return residual / (matnorm * N); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cprint */ int main( int argc, char** argv) { TESTING_INIT(); magmaFloatComplex *hA, *dA; //magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, lda, ldda; //size magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M + 31)/32)*32; //size = lda*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( hA, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); //lapackf77_clarnv( &ione, ISEED, &size, hA ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { hA[i + j*lda] = MAGMA_C_MAKE( i + j*0.01, 0. ); } } magma_csetmatrix( M, N, hA, lda, dA, ldda ); printf( "A=" ); magma_cprint( M, N, hA, lda ); printf( "dA=" ); magma_cprint_gpu( M, N, dA, ldda ); TESTING_FREE_CPU( hA ); TESTING_FREE_DEV( dA ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B, *h_X; magmaFloatComplex_ptr d_A, d_B; magma_int_t N, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = ldb = N; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*opts.nrhs ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); magma_csetmatrix( N, N, h_A, N, d_A, ldda ); magma_csetmatrix( N, opts.nrhs, h_B, N, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv_gpu( opts.uplo, N, opts.nrhs, d_A, ldda, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ magma_cgetmatrix( N, opts.nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgemm_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; float magma_error, magma_err, Ynorm, work[1]; magma_int_t M, N, Xm, Ym, lda, ldda; magma_int_t sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magmaFloatComplex *h_A, *h_X, *h_Y, *h_Ymagma; magmaFloatComplex *d_A, *d_X, *d_Y; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); magmaFloatComplex **A_array = NULL; magmaFloatComplex **X_array = NULL; magmaFloatComplex **Y_array = NULL; magma_opts opts; parse_opts( argc, argv, &opts ); batchCount = opts.batchcount; opts.lapack |= opts.check; //float tol = opts.tolerance * lapackf77_slamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf("BatchCount M N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_CGEMV( M, N ) / 1e9 * batchCount; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N*batchCount; sizeX = incx*Xm*batchCount; sizeY = incy*Ym*batchCount; ldda = ((lda+31)/32)*32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, sizeX ); TESTING_MALLOC_CPU( h_Y, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( h_Ymagma, magmaFloatComplex, sizeY ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_X, magmaFloatComplex, sizeX ); TESTING_MALLOC_DEV( d_Y, magmaFloatComplex, sizeY ); magma_malloc((void**)&A_array, batchCount * sizeof(*A_array)); magma_malloc((void**)&X_array, batchCount * sizeof(*X_array)); magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array)); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeX, h_X ); lapackf77_clarnv( &ione, ISEED, &sizeY, h_Y ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( M, N*batchCount, h_A, lda, d_A, ldda ); magma_csetvector( Xm*batchCount, h_X, incx, d_X, incx ); magma_csetvector( Ym*batchCount, h_Y, incy, d_Y, incy ); cset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream); cset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream); cset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ); magmablas_cgemv_batched(opts.transA, M, N, alpha, A_array, ldda, X_array, incx, beta, Y_array, incy, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { blasf77_cgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, h_A + i*lda*N, &lda, h_X + i*Xm, &incx, &beta, h_Y + i*Ym, &incy ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma relative to lapack, // |C_magma - C_lapack| / |C_lapack| magma_error = 0.0; for(int s=0; s<batchCount; s++) { Ynorm = lapackf77_clange( "M", &M, &ione, h_Y + s*Ym, &incy, work ); blasf77_caxpy( &Ym, &c_neg_one, h_Y + s*Ym, &ione, h_Ymagma + s*Ym, &ione ); magma_err = lapackf77_clange( "M", &M, &ione, h_Ymagma + s*Ym, &incy, work ) / Ynorm; if ( isnan(magma_err) || isinf(magma_err) ) { magma_error = magma_err; break; } magma_error = max(fabs(magma_err), magma_error); } printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e \n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error); } else { printf("%10d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Y ); TESTING_FREE_CPU( h_Ymagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_Y ); TESTING_FREE_DEV( A_array ); TESTING_FREE_DEV( X_array ); TESTING_FREE_DEV( Y_array ); fflush( stdout); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetri_batched */ int main( int argc, char** argv) { TESTING_INIT(); // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work; magmaDoubleComplex_ptr d_A, d_invA; magmaDoubleComplex_ptr *dA_array; magmaDoubleComplex_ptr *dinvA_array; magma_int_t **dipiv_array; magma_int_t *dinfo_array; magma_int_t *ipiv, *cpu_info; magma_int_t *d_ipiv, *d_info; magma_int_t N, n2, lda, ldda, info, info1, info2, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t columns; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); magma_int_t batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% batchCount N CPU Gflop/s (ms) GPU Gflop/s (ms) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N * batchCount; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default // This is the correct flops but since this getri_batched is based on // 2 trsm = getrs and to know the real flops I am using the getrs one //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount; gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork*batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Ainv, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_invA, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_DEV( d_info, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); columns = N * batchCount; lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda ); magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue); info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue ); for (magma_int_t i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] ); } } if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 )); if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (int i=0; i < batchCount; i++) { magma_int_t locinfo; lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo); if (locinfo != 0) { printf("lapackf77_zgetrf returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo ); if (locinfo != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; printf("%10d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%10d %5d --- ( --- ) %7.2f (%7.2f)", (int) batchCount, (int) N, gpu_perf, gpu_time*1000. ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue ); magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue ); error = 0; for (magma_int_t i=0; i < batchCount; i++) { for (magma_int_t k=0; k < N; k++) { if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N ) { printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]); error = -1; } } if (error == -1) { break; } // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond, err; normA = lapackf77_zlange( "1", &N, &N, h_A + i*lda*N, &lda, rwork ); normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; err = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda ); blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A + i*lda*N, &lda, h_Ainv + i*lda*N, &lda, &c_one, h_R + i*lda*N, &lda ); err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork ); err = err * rcond / N; } if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf("\n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_invA ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_info ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dinvA_array ); TESTING_FREE_DEV( dinfo_array ); TESTING_FREE_DEV( dipiv_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; if ( N > 512 ) { printf( "%5d %5d skipping because cgetf2 does not support N > 512\n", (int) M, (int) N ); continue; } TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time*1000. ); } if ( opts.check ) { magma_cgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgehrd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork; magmaFloatComplex_ptr dT; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; #endif float eps, result[2]; magma_int_t N, n2, lda, nb, lwork, ltwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; eps = lapackf77_slamch( "E" ); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) |A-QHQ'|/N|A| |I-QQ'|/N\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_cgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; gflops = FLOPS_CGEHRD( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, nb*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgehrd( N, ione, N, h_R, lda, tau, h_work, lwork, dT, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgehrd returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.check ) { ltwork = 2*(N*N); TESTING_MALLOC_PIN( h_Q, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( twork, magmaFloatComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_CPU( rwork, float, N ); #endif lapackf77_clacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); for( int j = 0; j < N-1; ++j ) for( int i = j+2; i < N; ++i ) h_R[i+j*lda] = MAGMA_C_ZERO; magma_cunghr(N, ione, N, h_Q, lda, tau, dT, nb, &info); if (info != 0) { printf("magma_cunghr returned error %d: %s.\n", (int) info, magma_strerror( info )); exit(1); } #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_chst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_chst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif TESTING_FREE_PIN( h_Q ); TESTING_FREE_CPU( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_CPU( rwork ); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgehrd(&N, &ione, &N, h_R, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgehrd returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } if ( opts.check ) { printf(" %8.2e %8.2e %s\n", result[0]*eps, result[1]*eps, ( ( (result[0]*eps < tol) && (result[1]*eps < tol) ) ? "ok" : "failed") ); status += ! (result[0]*eps < tol); status += ! (result[1]*eps < tol); } else { printf(" --- ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_b, *h_x, *h_xcublas; magmaFloatComplex *d_A, *d_x; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %c, transA = %c, diag = %c\n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_CTRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_b, magmaFloatComplex, N ); TESTING_MALLOC_CPU( h_x, magmaFloatComplex, N ); TESTING_MALLOC_CPU( h_xcublas, magmaFloatComplex, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_x, magmaFloatComplex, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &N, h_b ); blasf77_ccopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_csetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_clange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_clange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_ctrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_xcublas, &ione ); blasf77_caxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_clange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_b ); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_xcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing clarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float error, work[1]; magma_int_t status = 0; // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans [] = { MagmaConjTrans, MagmaNoTrans }; magma_direct_t direct[] = { MagmaForward, MagmaBackward }; magma_storev_t storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; if ( M < K || N < K || K <= 0 ) { printf( "%5d %5d %5d skipping because clarfb requires M >= K, N >= K, K >= 0\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { ldc = ((M+31)/32)*32; ldt = ((K+31)/32)*32; ldw = (side[iside] == MagmaLeft ? N : M); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices magmaFloatComplex *C, *R, *V, *T, *W; TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( V, magmaFloatComplex, ldv*K ); TESTING_MALLOC_CPU( T, magmaFloatComplex, ldt*K ); TESTING_MALLOC_CPU( W, magmaFloatComplex, ldw*K ); magmaFloatComplex_ptr dC, dV, dT, dW; TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( dV, magmaFloatComplex, ldv*K ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, ldt*K ); TESTING_MALLOC_DEV( dW, magmaFloatComplex, ldw*K ); // C is M x N. size = ldc*N; lapackf77_clarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_cprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. // if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_clarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_claset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_claset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_claset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_claset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_cprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_clarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_claset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_claset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_cprint( K, K, T, ldt ); magma_csetmatrix( M, N, C, ldc, dC, ldc ); magma_csetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_csetmatrix( K, K, T, ldt, dT, ldt ); lapackf77_clarfb( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), lapack_direct_const( direct[idir] ), lapack_storev_const( storev[istor] ), &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_cprint( M, N, C, ldc ); magma_clarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_cgetmatrix( M, N, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_cprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_clange( "Fro", &M, &N, C, &ldc, work ); size = ldc*N; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &M, &N, R, &ldc, work ) / error; printf( "%5d %5d %5d %c %c %c %c %8.2e %s\n", (int) M, (int) N, (int) K, lapacke_storev_const(storev[istor]), lapacke_side_const(side[iside]), lapacke_direct_const(direct[idir]), lapacke_trans_const(trans[itran]), error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( V ); TESTING_FREE_CPU( T ); TESTING_FREE_CPU( W ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dV ); TESTING_FREE_DEV( dT ); TESTING_FREE_DEV( dW ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}}} printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhesv */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; double error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_ZPOTRF( N ) + FLOPS_ZPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_PIN( h_X, magmaDoubleComplex, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_Z_REAL(temp); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_zhesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygvdx */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t mgpu_time; float *h_A, *h_Ainit, *h_B, *h_Binit, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif float *w1, result=0; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: ngpu = %d, itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.ngpu, (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M ngpu MGPU Time (sec)\n"); printf("====================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_sbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_sbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; //magma_int_t NB = 96;//magma_bulge_get_nb(N); //magma_int_t sizvblg = magma_sbulge_get_lq2(N, threads); //magma_int_t siz = max(sizvblg,n2)+2*(N*NB+N)+24*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_B, float, n2 ); TESTING_MALLOC_PIN( h_work, float, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, float, lrwork); #endif TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); magma_smake_hpd( N, h_B, N ); magma_smake_symmetric( N, h_A, N ); if ( opts.warmup || opts.check ) { TESTING_MALLOC_CPU( h_Ainit, float, n2 ); TESTING_MALLOC_CPU( h_Binit, float, n2 ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N ); } magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } if ( opts.warmup ) { // ================================================================== // Warmup using MAGMA. I prefer to use smalltest to warmup A- // ================================================================== magma_ssygvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N ); } // =================================================================== // Performs operation using MAGMA // =================================================================== mgpu_time = magma_wtime(); magma_ssygvdx_2stage_m(opts.ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_A, N, h_B, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); mgpu_time = magma_wtime() - mgpu_time; if ( opts.check ) { // =================================================================== // Check the results following the LAPACK's [zc]hegvdx routine. // A x = lambda B x is solved // and the following 3 tests computed: // (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) // | A B Z - Z D | / ( |A||Z| N ) (itype = 2) // | B A Z - Z D | / ( |A||Z| N ) (itype = 3) // =================================================================== #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif result = 1.; result /= lapackf77_slansy("1", lapack_uplo_const(opts.uplo), &N, h_Ainit, &N, rwork); result /= lapackf77_slange("1", &N , &m1, h_A, &N, rwork); if (opts.itype == 1) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N); result *= lapackf77_slange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_slange("1", &N, &m1, h_A, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione); blasf77_ssymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N); result *= lapackf77_slange("1", &N, &m1, h_A, &N, rwork)/N; } } // =================================================================== // Print execution time // =================================================================== printf("%5d %5d %4d %7.2f\n", (int) N, (int) m1, (int) opts.ngpu, mgpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } else if (opts.itype==3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result, (result < tol ? "ok" : "failed") ); } printf("\n"); status += ! (result < tol); } TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( iwork ); if ( opts.warmup || opts.check ) { TESTING_FREE_CPU( h_Ainit ); TESTING_FREE_CPU( h_Binit ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE_MGPU(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zposv_batched */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double err = 0.0, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex_ptr d_A, d_B; magma_int_t *dinfo_array; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount = 1; magmaDoubleComplex **dA_array = NULL; magmaDoubleComplex **dB_array = NULL; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; batchCount = opts.batchcount ; printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("BatchCount N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_ZPOTRF( N) + FLOPS_ZPOTRS( N, nrhs ) ) / 1e9 * batchCount; sizeA = lda*N*batchCount; sizeB = ldb*nrhs*batchCount; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( work, double, N); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs*batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); magma_malloc((void**)&dA_array, batchCount * sizeof(*dA_array)); magma_malloc((void**)&dB_array, batchCount * sizeof(*dB_array)); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); for(int i=0; i<batchCount; i++) { magma_zmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_zsetmatrix( N, N*batchCount, h_A, lda, d_A, ldda ); magma_zsetmatrix( N, nrhs*batchCount, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ zset_pointer(dA_array, d_A, ldda, 0, 0, ldda*N, batchCount, queue); zset_pointer(dB_array, d_B, lddb, 0, 0, lddb*nrhs, batchCount, queue); gpu_time = magma_wtime(); info = magma_zposv_batched(opts.uplo, N, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_zposv_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_zposv_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_zgetmatrix( N, nrhs*batchCount, d_B, lddb, h_X, ldb ); for(magma_int_t s=0; s<batchCount; s++) { Anorm = lapackf77_zlange("I", &N, &N, h_A + s * lda * N, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X + s * ldb * nrhs, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A + s * lda * N, &lda, h_X + s * ldb * nrhs, &ldb, &c_neg_one, h_B + s * ldb * nrhs, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B + s * ldb * nrhs, &ldb, work); double error = Rnorm/(N*Anorm*Xnorm); if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(err, error); } status += ! (err < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(magma_int_t s=0; s<batchCount; s++) { lapackf77_zposv( lapack_uplo_const(opts.uplo), &N, &nrhs, h_A + s * lda * N, &lda, h_B + s * ldb * nrhs, &ldb, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zposv returned err %d: %s.\n", (int) info, magma_strerror( info )); printf( "%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } else { printf( "%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( dinfo_array ); magma_free(dA_array); magma_free(dB_array); free(cpu_info); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chegvdx */ int main( int argc, char** argv) { TESTING_INIT(); /* Constants */ const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ real_Double_t gpu_time; magmaFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work; #ifdef COMPLEX float *rwork; magma_int_t lrwork; #endif float *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lda, lwork, liwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; // pass ngpu = -1 to test multi-GPU code using 1 gpu magma_int_t abs_ngpu = abs( opts.ngpu ); printf("%% itype = %d, jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n", int(opts.itype), lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), opts.fraction, int(abs_ngpu) ); if (opts.itype == 1) { printf("%% N M GPU Time (sec) |AZ-BZD| |D - D_magma|\n"); } else if (opts.itype == 2) { printf("%% N M GPU Time (sec) |ABZ-ZD| |D - D_magma|\n"); } else if (opts.itype == 3) { printf("%% N M GPU Time (sec) |BAZ-ZD| |D - D_magma|\n"); } printf("%%======================================================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; // TODO: test vl-vu range magma_int_t m1 = 0; float vl = 0; float vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magma_cheevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, max( lwork, N*N )); // check needs N*N #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clarnv( &ione, ISEED, &n2, h_B ); magma_cmake_hpd( N, h_B, lda ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); // =================================================================== // Performs operation using MAGMA // =================================================================== gpu_time = magma_wtime(); if (opts.ngpu == 1) { magma_chegvdx_2stage( opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } else { magma_chegvdx_2stage_m( abs_ngpu, opts.itype, opts.jobz, range, opts.uplo, N, h_R, lda, h_S, lda, vl, vu, il, iu, &m1, w1, h_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); } gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_chegvdx_2stage returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A| |Z| N ) (itype = 1) | A B Z - Z D | / ( |A| |Z| N ) (itype = 2) | B A Z - Z D | / ( |A| |Z| N ) (itype = 3) (2) | D(with V, magma) - D(w/o V, lapack) | / | D | =================================================================== */ #ifdef REAL float *rwork = h_work + N*N; #endif if ( opts.jobz != MagmaNoVec ) { result[0] = 1.; result[0] /= safe_lapackf77_clanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork); if (opts.itype == 1) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_clange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } else if (opts.itype == 3) { blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for (int i=0; i < m1; ++i) blasf77_csscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_chemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_clange("1", &N, &m1, h_R, &lda, rwork)/N; } } lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_B, &lda, h_S, &lda ); lapackf77_chegvd( &opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &lda, h_S, &lda, w2, h_work, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info ); if (info != 0) { printf("lapackf77_chegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); } float maxw=0, diff=0; for (int j=0; j < m1; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m1*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %9.4f ", (int) N, (int) m1, gpu_time); if ( opts.check ) { bool okay = (result[1] < tolulp); if ( opts.jobz != MagmaNoVec ) { okay = okay && (result[0] < tol); printf(" %8.2e", result[0] ); } else { printf(" --- "); } printf(" %8.2e %s\n", result[1], (okay ? "ok" : "failed")); status += ! okay; } else { printf(" ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #ifdef COMPLEX TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zcgeqrsv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs; double error, gpu_error, cpu_error, Anorm, work[1]; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R; magmaDoubleComplex_ptr d_A, d_B, d_X, d_T; magmaFloatComplex *d_SA, *d_SB; magmaDoubleComplex *h_workd, *tau, tmp[1]; magmaFloatComplex *h_works; magma_int_t lda, ldb, lhwork, lworkgpu; magma_int_t ldda, lddb, lddx; magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; printf("Epsilon(double): %8.6e\n" "Epsilon(single): %8.6e\n\n", lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" CPU Gflop/s GPU Gflop/s |b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS double double single mixed Iter CPU GPU \n"); printf("=============================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = ((M+31)/32) * 32; lddb = ((max_mn+31)/32)*32; lddx = ((N+31)/32) * 32; nb = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) ); gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork ); h_works = (magmaFloatComplex*)h_workd; TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs ); TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs ); TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb ); /* Initialize the matrices */ size = lda*N; lapackf77_zlarnv( &ione, ISEED, &size, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &size, h_B ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); //===================================================================== // Mixed Precision Iterative Refinement - GPU //===================================================================== gpu_time = magma_wtime(); magma_zcgeqrsv_gpu( M, N, nrhs, d_A, ldda, d_B, lddb, d_X, lddx, &qrsv_iters, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zcgeqrsv returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute the residual magma_zgetmatrix( N, nrhs, d_X, lddx, h_X, ldb ); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_zlange("f", &M, &N, h_A, &lda, work); //===================================================================== // Double Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_zgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_workd, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfd = gflops / gpu_time; //===================================================================== // Single Precision Solve //===================================================================== magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); /* The allocation of d_SA and d_SB is done here to avoid * to double the memory used on GPU with zcgeqrsv */ TESTING_MALLOC_DEV( d_SA, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_SB, magmaFloatComplex, lddb*nrhs ); magmablas_zlag2c( M, N, d_A, ldda, d_SA, ldda, &info ); magmablas_zlag2c( N, nrhs, d_B, lddb, d_SB, lddb, &info ); gpu_time = magma_wtime(); magma_cgels_gpu( MagmaNoTrans, M, N, nrhs, d_SA, ldda, d_SB, lddb, h_works, lhwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perfs = gflops / gpu_time; TESTING_FREE_DEV( d_SA ); TESTING_FREE_DEV( d_SB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_workd, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_zlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_zaxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_zlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f %7.2f %7.2f %7.2f %4d %8.2e %8.2e %8.2e %s\n", (int) M, (int) N, (int) nrhs, cpu_perf, gpu_perfd, gpu_perfs, gpu_perf, (int) qrsv_iters, cpu_error, gpu_error, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_workd ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_T ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, dwork[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *work, *tau, *tauq, *taup; float *d, *e; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf(" M N K vect side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgebrd_nb( m ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_CUNMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_CUNMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*nn ); TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( d, float, min(mm,nn) ); TESTING_MALLOC_CPU( e, float, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, magmaFloatComplex, min(mm,nn) ); TESTING_MALLOC_CPU( taup, magmaFloatComplex, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_cgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_cgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) printf("magma_cgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) printf("magma_cunmbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, dwork ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, dwork ) / error; printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqlf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 2. * opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgeqrf_nb(M); gflops = FLOPS_ZGEQLF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max( lwork, N*nb ); lwork = max( lwork, 2*nb*nb); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqlf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_zgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_zlange("f", &M, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A, dwork; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 }; magma_int_t ntest = 10; magma_int_t i, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0, 0, 0, 1}; magmaDoubleComplex *work; magmaDoubleComplex tmp; double rwork[1]; magma_int_t *ipiv; magma_int_t lwork, ldwork; double A_norm, R_norm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[ntest-1] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgetri_gpu -N %d\n\n", 1024); } /* query for Lapack workspace size */ N = size[ntest-1]; lda = N; work = &tmp; lwork = -1; lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); lwork = int( MAGMA_Z_REAL( *work )); /* query for Magma workspace size */ ldwork = N * magma_get_zgetri_nb( N ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory */ n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for( i=0; i < ntest; i++ ){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZGETRI( (double)N ) / 1e9; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue ); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //warm-up magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); gpu_time = magma_wtime()-gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d\n", (int) info); gpu_perf = gflops / gpu_time; magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, 0, lda, queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); R_norm = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ); printf( "%5d %6.2f %6.2f %e\n", (int) N, cpu_perf, gpu_perf, R_norm / A_norm ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_zhemm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex calpha = MAGMA_Z_MAKE( 3.456, 5.678 ); magmaDoubleComplex cbeta = MAGMA_Z_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; double error=0., errorbis=0., work[1]; magmaDoubleComplex *hA, *hX, *hB, *hR; magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1]; magmaDoubleComplex_ptr dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M ); TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB[d], magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork[d], magmaDoubleComplex, dworksiz ); TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M ); } lapackf77_zlarnv( &ione, iseed, &size, hA ); magma_zmake_hermitian( M, hA, lda ); size = lda*N; lapackf77_zlarnv( &ione, iseed, &size, hX ); lapackf77_zlarnv( &ione, iseed, &size, hB ); lapackf77_zlacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_zsetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_zsetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(magmaDoubleComplex)); trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_zhemm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "zhemm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) iter ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_zsetmatrix( M, M, hA, lda, dA2, ldda ); magma_zsetmatrix( M, N, hX, lda, dX[0], ldda ); magma_zsetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_zhemm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_zlange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_zlange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_zprint( M, M, hA, lda ); //printf( "X =" ); magma_zprint( M, N, hX, lda ); //printf( "B =" ); magma_zprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_zhemm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_zgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_zaxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_zlange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_zprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zher2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex *h_A, *h_B, *h_C, *h_Ccublas; magmaDoubleComplex *d_A, *d_B, *d_C; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_ZHER2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*Bk ); TESTING_MALLOC_CPU( h_C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaDoubleComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*Bk ); TESTING_MALLOC_DEV( d_C, magmaDoubleComplex, lddc*N ); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_zsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_zsetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasZher2k( handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_zgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_zher2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_zlange( "M", &N, &N, h_C, &ldc, work ); blasf77_zaxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_zlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaDoubleComplex *A, *B; double error_cblas, error_fblas, error_inline; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t i, j, k, m, n, size, maxn, ld; // complex x for magma, cblas, fortran, inline blas respectively magmaDoubleComplex x2_m, x2_c, x2_f, x2_i; // real x for magma, cblas, fortran, inline blas respectively double x_m, x_c, x_f, x_i; MAGMA_UNUSED( x_c ); MAGMA_UNUSED( x_f ); MAGMA_UNUSED( x2_c ); MAGMA_UNUSED( x2_f ); MAGMA_UNUSED( x2_m ); magma_opts opts; opts.parse_opts( argc, argv ); opts.tolerance = max( 100., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); gTol = tol; magma_int_t inc[] = { -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; magma_int_t ninc = sizeof(inc)/sizeof(*inc); magma_int_t maxinc = 0; for( i=0; i < ninc; ++i ) { maxinc = max( maxinc, abs(inc[i]) ); } printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfaults), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "!! If MAGMA_WITH_MKL or __APPLE__ are defined, known failures are skipped. !!\n" "\n" ); // tell user about disabled functions #ifndef HAVE_CBLAS printf( "n/a: HAVE_CBLAS not defined, so no cblas functions tested.\n\n" ); #endif #if defined(MAGMA_WITH_MKL) printf( "n/a: cblas_zdotc, cblas_zdotu, blasf77_zdotc, and blasf77_zdotu are disabled with MKL, due to segfaults.\n\n" ); #endif #if defined(__APPLE__) printf( "n/a: blasf77_zdotc and blasf77_zdotu are disabled on MacOS, due to segfaults.\n\n" ); #endif printf( "%% Error w.r.t. Error w.r.t. Error w.r.t.\n" "%% M N K incx incy Function CBLAS Fortran BLAS inline\n" "%%====================================================================================\n" ); for( int itest = 0; itest < opts.ntest; ++itest ) { if ( itest > 0 ) { printf( "%%----------------------------------------------------------------------\n" ); } m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * maxinc; ld = max( 1, maxn ); size = ld*maxn; TESTING_MALLOC_CPU( A, magmaDoubleComplex, size ); TESTING_MALLOC_CPU( B, magmaDoubleComplex, size ); // initialize matrices lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); // ----- test DZASUM for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { x_m = magma_cblas_dzasum( m, A(0,j), incx ); #ifdef HAVE_CBLAS x_c = cblas_dzasum( m, A(0,j), incx ); error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) ); #else x_c = 0; error_cblas = SKIPPED_FLAG; #endif x_f = blasf77_dzasum( &m, A(0,j), &incx ); error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) ); // inline implementation x_i = 0; for( i=0; i < m; ++i ) { x_i += MAGMA_Z_ABS1( *A(i*incx,j) ); // |real(Aij)| + |imag(Aij)| } error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) ); //printf( "dzasum xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i ); } output( "dzasum", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } } printf( "\n" ); // ----- test DZNRM2 // get two-norm of column j of A for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; if ( incx > 0 && incx == incy ) { // positive, no incy error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { x_m = magma_cblas_dznrm2( m, A(0,j), incx ); #ifdef HAVE_CBLAS x_c = cblas_dznrm2( m, A(0,j), incx ); error_cblas = max( error_cblas, fabs(x_m - x_c) / fabs(m*x_c) ); #else x_c = 0; error_cblas = SKIPPED_FLAG; #endif x_f = blasf77_dznrm2( &m, A(0,j), &incx ); error_fblas = max( error_fblas, fabs(x_m - x_f) / fabs(m*x_f) ); // inline implementation (poor -- doesn't scale) x_i = 0; for( i=0; i < m; ++i ) { x_i += real( *A(i*incx,j) ) * real( *A(i*incx,j) ) + imag( *A(i*incx,j) ) * imag( *A(i*incx,j) ); // same: real( conj( *A(i*incx,j) ) * *A(i*incx,j) ); } x_i = sqrt( x_i ); error_inline = max( error_inline, fabs(x_m - x_i) / fabs(m*x_i) ); //printf( "dznrm2 xm %.8e, xc %.8e, xf %.8e, xi %.8e\n", x_m, x_c, x_f, x_i ); } output( "dznrm2", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } } printf( "\n" ); // ----- test ZDOTC // dot columns, Aj^H Bj for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); // crashes with MKL 11.1.2, ILP64 #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL) #ifdef COMPLEX cblas_zdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotc( m, A(0,j), incx, B(0,j), incy ); #endif error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) ); #else x2_c = MAGMA_Z_ZERO; error_cblas = SKIPPED_FLAG; #endif // crashes with MKL 11.2.3 and MacOS 10.9 #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__) x2_f = blasf77_zdotc( &m, A(0,j), &incx, B(0,j), &incy ); error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) ); #else x2_f = MAGMA_Z_ZERO; error_fblas = SKIPPED_FLAG; #endif // inline implementation x2_i = MAGMA_Z_ZERO; magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx); magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy); for( i=0; i < m; ++i ) { x2_i += conj( *A(A_offset + i*incx,j) ) * *B(B_offset + i*incy,j); } error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) ); //printf( "zdotc xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n", // real(x2_m), imag(x2_m), // real(x2_c), imag(x2_c), // real(x2_f), imag(x2_f), // real(x2_i), imag(x2_i) ); } output( "zdotc", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } printf( "\n" ); // ----- test ZDOTU // dot columns, Aj^T * Bj for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; error_cblas = 0; error_fblas = 0; error_inline = 0; for( j=0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); // crashes with MKL 11.1.2, ILP64 #if defined(HAVE_CBLAS) && ! defined(MAGMA_WITH_MKL) #ifdef COMPLEX cblas_zdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_zdotu( m, A(0,j), incx, B(0,j), incy ); #endif error_cblas = max( error_cblas, fabs(x2_m - x2_c) / fabs(m*x2_c) ); #else x2_c = MAGMA_Z_ZERO; error_cblas = SKIPPED_FLAG; #endif // crashes with MKL 11.2.3 and MacOS 10.9 #if (! defined(COMPLEX) || ! defined(MAGMA_WITH_MKL)) && ! defined(__APPLE__) x2_f = blasf77_zdotu( &m, A(0,j), &incx, B(0,j), &incy ); error_fblas = max( error_fblas, fabs(x2_m - x2_f) / fabs(m*x2_f) ); #else x2_f = MAGMA_Z_ZERO; error_fblas = SKIPPED_FLAG; #endif // inline implementation x2_i = MAGMA_Z_ZERO; magma_int_t A_offset = (incx > 0 ? 0 : (-n + 1)*incx); magma_int_t B_offset = (incy > 0 ? 0 : (-n + 1)*incy); for( i=0; i < m; ++i ) { x2_i += *A(A_offset + i*incx,j) * *B(B_offset + i*incy,j); } error_inline = max( error_inline, fabs(x2_m - x2_i) / fabs(m*x2_i) ); //printf( "zdotu xm %.8e + %.8ei, xc %.8e + %.8ei, xf %.8e + %.8ei, xi %.8e + %.8ei\n", // real(x2_m), imag(x2_m), // real(x2_c), imag(x2_c), // real(x2_f), imag(x2_f), // real(x2_i), imag(x2_i) ); } output( "zdotu", m, n, k, incx, incy, error_cblas, error_fblas, error_inline ); } } // cleanup TESTING_FREE_CPU( A ); TESTING_FREE_CPU( B ); fflush( stdout ); } // itest, incx, incy opts.cleanup(); TESTING_FINALIZE(); return gStatus; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; magmaDouble_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); magma_dgetrf_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_R, *work; magmaDoubleComplex_ptr d_A, dwork; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_zgetri_nb( N ); gflops = FLOPS_ZGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, opts.queue, &info ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, lda, opts.queue ); if ( info != 0 ) printf("magma_zgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, opts.queues2, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif /* Matrix size */ double *w1, *w2, result[2]={0,0}; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); magma_range_t range = MagmaRangeAll; if (opts.fraction != 1) range = MagmaRangeI; if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, range = %s, uplo = %s, opts.check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); magma_int_t threads = magma_get_parallel_numthreads(); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; #if defined(PRECISION_z) || defined(PRECISION_c) lwork = magma_zbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = magma_zbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, N ); magma_zmake_hermitian( N, h_A, N ); magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (range == MagmaRangeI) { il = 1; iu = (int) (opts.fraction*N); } // ================================================================== // Warmup using MAGMA // ================================================================== if (opts.warmup) { lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } // =================================================================== // Performs operation using MAGMA // =================================================================== lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_zhegvdx_2stage(opts.itype, opts.jobz, range, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); gpu_time = magma_wtime() - gpu_time; if ( opts.check && opts.jobz != MagmaNoVec ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_int_t m2 = m1; lapackf77_zhegvd(&opts.itype, "N", lapack_uplo_const(opts.uplo), &N, h_R, &N, h_S, &N, w2, h_work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, &info); double maxw=0, diff=0; for(int j=0; j<m2; j++) { maxw = max(maxw, fabs(w1[j])); maxw = max(maxw, fabs(w2[j])); diff = max(diff, fabs(w1[j] - w2[j])); } result[1] = diff / (m2*maxw); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check && opts.jobz != MagmaNoVec ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) { printf(" | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==2) { printf(" | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype==3) { printf(" | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( " | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Shutdown */ TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; double magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, ldda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 1.5, -2.3 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( -0.6, 0.8 ); magmaDoubleComplex *A, *X, *Y, *Ydev, *Ymagma; magmaDoubleComplex_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf("%% M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf("%% M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_ZGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( X, magmaDoubleComplex, sizeX ); TESTING_MALLOC_CPU( Y, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ydev, magmaDoubleComplex, sizeY ); TESTING_MALLOC_CPU( Ymagma, magmaDoubleComplex, sizeY ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX ); TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &sizeA, A ); lapackf77_zlarnv( &ione, ISEED, &sizeX, X ); lapackf77_zlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, ldda, opts.queue ); magma_zsetvector( Xm, X, incx, dX, incx, opts.queue ); magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasZgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, ldda, dX, incx, &beta, dY, incy ); #else magma_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy ); #endif dev_time = magma_sync_wtime( opts.queue ) - dev_time; dev_perf = gflops / dev_time; magma_zgetvector( Ym, dY, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_zsetvector( Ym, Y, incy, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); magmablas_zgemv( opts.transA, M, N, alpha, dA, ldda, dX, incx, beta, dY, incy, opts.queue ); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetvector( Ym, dY, incy, Ymagma, incy, opts.queue ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ double Anorm = lapackf77_zlange( "F", &M, &N, A, &lda, work ); double Xnorm = lapackf77_zlange( "F", &Xm, &ione, X, &Xm, work ); blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_zlange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_zlange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); bool okay = (magma_error < tol) && (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (okay ? "ok" : "failed")); #else bool okay = (dev_error < tol); status += ! okay; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (okay ? "ok" : "failed")); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgesv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_B, *h_X; magmaDouble_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default lddb = ldda; gflops = ( FLOPS_DGETRF( N, N ) + FLOPS_DGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); magma_dsetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_dsetmatrix( N, nrhs, h_B, ldb, d_B, lddb, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgesv_gpu( N, nrhs, d_A, ldda, ipiv, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } //===================================================================== // Residual //===================================================================== magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb, opts.queue ); Anorm = lapackf77_dlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_dlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_dlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmlq */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgelqf_nb( m, n ); ldc = m; // A is k x m (left) or k x n (right) mm = (side[iside] == MagmaLeft ? m : n); lda = k; gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for gelqf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); // this rounds it up slightly if needed to agree with lwork query lwork_max = int( real( magma_cmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*mm ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*mm; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute LQ factorization to get Householder vectors in A, tau magma_cgelqf( k, mm, A, lda, tau, W, lwork_max, &info ); if (info != 0) { printf("magma_cgelqf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmlq( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cunmlq returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) { printf("magma_cunmlq (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cunmlq returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsyevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *h_work; double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork, lda, aux_iwork[1]; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double result[3], eps, aux_work[1]; eps = lapackf77_dlamch( "E" ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: jobz = %s, uplo = %s\n", lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; lda = N; // query for workspace sizes magma_dsyevd( opts.jobz, opts.uplo, N, NULL, lda, NULL, aux_work, -1, aux_iwork, -1, opts.queue, &info ); lwork = (magma_int_t) aux_work[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, N*lda ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, N*lda ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* warm up run */ if ( opts.warmup ) { magma_dsyevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, iwork, liwork, opts.queue, &info ); if (info != 0) printf("magma_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsyevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, iwork, liwork, opts.queue, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ double temp1, temp2; // tau=NULL is unused since itype=1 lapackf77_dsyt21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, h_work, h_R, &lda, h_R, &lda, NULL, h_work, &result[0] ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dsyevd( MagmaNoVec, opts.uplo, N, h_R, lda, w2, h_work, lwork, iwork, liwork, opts.queue, &info ); if (info != 0) printf("magma_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, fabs(w1[j])); temp1 = max(temp1, fabs(w2[j])); temp2 = max(temp2, fabs(w1[j]-w2[j])); } result[2] = temp2 / (((double)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dsyevd( lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e %s\n", result[0]*eps, (result[0]*eps < tol ? "ok" : "failed") ); printf("(2) | I - U'U | / N = %8.2e %s\n", result[1]*eps, (result[1]*eps < tol ? "ok" : "failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e %s\n\n", result[2] , (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0]*eps < tol && result[1]*eps < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }