int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *hA, *hR; magmaFloat_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; if (argc != 1) { for(i = 1; i<argc; i++) { if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_spotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( hA, float, n2 ); TESTING_MALLOC_PIN( hR, float, n2 ); TESTING_MALLOC_DEV( dA, float, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++) { N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_S_SET2REAL( hA(i,i), MAGMA_S_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_S_CNJG( hA(j,i) ); } } lapackf77_slacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_ssetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_spotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = magma_wtime(); magma_spotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_spotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_sgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_slange("f", &N, &N, hA, &lda, work); blasf77_saxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_slange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( hA ); TESTING_FREE_PIN( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing magma_ssymm_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); float c_neg_one = MAGMA_S_NEG_ONE; float calpha = MAGMA_S_MAKE( 3.456, 5.678 ); float cbeta = MAGMA_S_MAKE( 1.234, 2.456 ); real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.; real_Double_t gpu_perf2=0., gpu_time2=0.; float error=0., errorbis=0., work[1]; float *hA, *hX, *hB, *hR; float *dA[MagmaMaxGPUs], *dX[MagmaMaxGPUs], *dB[MagmaMaxGPUs], *dwork[MagmaMaxGPUs], *hwork[MagmaMaxGPUs+1]; float *dA2; magma_int_t M, N, size, lda, ldda, msize, nb, nstream; magma_int_t ione = 1; magma_int_t iseed[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); // default values nb = (opts.nb > 0 ? opts.nb : 64); nstream = (opts.nstream > 0 ? opts.nstream : 2); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx = 0; magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu); printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx); for (int i=0; i < nbcmplx; ++i) { int myngpu = gnode[i][MagmaMaxGPUs]; printf("cmplx %d has %d gpu ", i, myngpu); for(int j=0; j < myngpu; ++j) printf(" %d", (int) gnode[i][j]); printf("\n"); } magma_int_t nbevents = 2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t redevents[MagmaMaxGPUs][20]; magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; for( int d = 0; d < opts.ngpu; ++d ) { for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming); cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming); } } printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version ); printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { M = opts.msize[itest]; N = opts.nsize[itest]; for( int offset = 0; offset < N; offset += min(N,nb) ) { for( int iter = 0; iter < opts.niter; ++iter ) { msize = M - offset; lda = M; ldda = ((M + 31)/32)*32; size = lda*M; gflops = FLOPS_SSYMM( MagmaLeft, (float)msize, (float)N ) / 1e9; magma_int_t dworksiz = ldda*N*3; magma_int_t hworksiz = lda*N; TESTING_MALLOC_CPU( hA, float, lda*M ); TESTING_MALLOC_CPU( hX, float, lda*N ); TESTING_MALLOC_CPU( hB, float, lda*N ); TESTING_MALLOC_PIN( hR, float, lda*N ); for( int d = 0; d < opts.ngpu; ++d ) { magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb; magma_setdevice( d ); TESTING_MALLOC_DEV( dA[d], float, ldda*mlocal ); TESTING_MALLOC_DEV( dX[d], float, ldda*N ); TESTING_MALLOC_DEV( dB[d], float, ldda*N ); TESTING_MALLOC_DEV( dwork[d], float, dworksiz ); TESTING_MALLOC_PIN( hwork[d], float, hworksiz ); } TESTING_MALLOC_PIN( hwork[opts.ngpu], float, lda*N ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_MALLOC_DEV( dA2, float, ldda*M ); } lapackf77_slarnv( &ione, iseed, &size, hA ); magma_smake_symmetric( M, hA, lda ); size = lda*N; lapackf77_slarnv( &ione, iseed, &size, hX ); lapackf77_slarnv( &ione, iseed, &size, hB ); lapackf77_slacpy( "Full", &M, &N, hB, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix_1D_col_bcyclic( M, M, hA, lda, dA, ldda, opts.ngpu, nb ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); //magmablasSetKernelStream( streams[ d ][ 0 ] ); magma_ssetmatrix( M, N, hX, lda, dX[d], ldda ); //if (d == 0) magma_ssetmatrix( M, N, hB, lda, dB[d], ldda ); // this is wrong coz when offset != 0 the gpu who do the beta*C may be not 0 so this should be related to stdev(starting device who own i=0 first col) magma_ssetmatrix( M, N, hB, lda, dB[d], ldda ); } //memset(hR, 0, lda*N*sizeof(float)); //trace_init( 1, opts.ngpu, nstream, (magma_queue_t*) streams ); //magma_int_t offset = 0; //nb; gpu_time = magma_sync_wtime(0); magmablas_ssymm_mgpu_com( MagmaLeft, MagmaLower, msize, N, calpha, dA, ldda, offset, dX, ldda, cbeta, dB, ldda, dwork, dworksiz, hR, lda, hwork, hworksiz, opts.ngpu, nb, streams, nstream, redevents2, nbevents, gnode, nbcmplx); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; #ifdef TRACING char buf[80]; snprintf( buf, sizeof(buf), "ssymm-m%d-n%d-nb%d-stream%d-ngpu%d-run%d.svg", (int) M, (int) N, (int) nb, (int) nstream, (int) opts.ngpu, (int) j ); trace_finalize( buf, "trace.css" ); #endif /* ==================================================================== Performs operation using CUBLAS =================================================================== */ if ( opts.check && iter == 0 ) { magma_setdevice( 0 ); magmablasSetKernelStream( 0 ); magma_ssetmatrix( M, M, hA, lda, dA2, ldda ); magma_ssetmatrix( M, N, hX, lda, dX[0], ldda ); magma_ssetmatrix( M, N, hB, lda, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0); magma_ssymm( MagmaLeft, MagmaLower, msize, N, calpha, dA2+offset*ldda+offset, ldda, dX[0], ldda, cbeta, dwork[0], ldda ); gpu_time2 = magma_sync_wtime(0) - gpu_time2; gpu_perf2 = gflops / gpu_time2; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { // store ||A||*||X|| errorbis = lapackf77_slange("fro", &msize, &msize, hA+offset*lda+offset, &lda, work ); errorbis *= lapackf77_slange("fro", &msize, &N, hX, &lda, work ); //printf( "A =" ); magma_sprint( M, M, hA, lda ); //printf( "X =" ); magma_sprint( M, N, hX, lda ); //printf( "B =" ); magma_sprint( M, N, hB, lda ); cpu_time = magma_wtime(); blasf77_ssymm( "Left", "Lower", &msize, &N, &calpha, hA+offset*lda+offset, &lda, hX, &lda, &cbeta, hB, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* trace_file = fopen("AJETE/C", "w"); for (int j = 0; j < N; j++) for (int i = 0; i < siz; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, hB[j*lda+i]); fclose(trace_file); */ magma_int_t firstprint=0; for(magma_int_t dev=0; dev < opts.ngpu; ++dev) { magma_setdevice( dev ); magma_sgetmatrix( M, N, dB[dev], ldda, hR, lda ); // compute relative error ||R||/||A||*||X||, where R := B_magma - B_lapack = R - B size = lda*N; blasf77_saxpy( &size, &c_neg_one, hB, &ione, hR, &ione ); error = lapackf77_slange("fro", &msize, &N, hR, &lda, work) / errorbis; //printf( "R =" ); magma_sprint( M, N, hR, lda ); if (firstprint == 0) { printf( "%5d %5d %5d %5d %7.1f (%7.4f) %7.1f (%7.4f) %7.1f (%7.4f) %8.2e %s\n", (int) M, (int) N, (int) nb, (int) offset, cpu_perf, cpu_time, gpu_perf, gpu_time, gpu_perf2, gpu_time2, error, (error < tol ? "ok" : "failed") ); } else { printf( "%89s %8.2e %s\n", " ", error, (error < tol ? "ok" : "failed") ); } status += ! (error < tol); firstprint =1; } } else { printf( "%5d %5d %5d %5d --- ( --- ) %7.1f (%7.4f) --- ( --- ) ---\n", (int) M, (int) N, (int) nb, (int) offset, gpu_perf, gpu_time ); } TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( hX ); TESTING_FREE_CPU( hB ); TESTING_FREE_PIN( hR ); for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); TESTING_FREE_DEV( dA[d] ); TESTING_FREE_DEV( dX[d] ); TESTING_FREE_DEV( dB[d] ); TESTING_FREE_DEV( dwork[d] ); TESTING_FREE_PIN( hwork[d] ); } TESTING_FREE_PIN( hwork[opts.ngpu] ); if ( opts.check ) { magma_setdevice( 0 ); TESTING_FREE_DEV( dA2 ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } // offset printf( "\n" ); } for( int d = 0; d < opts.ngpu; ++d ) { magma_setdevice( d ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[d][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( redevents[d][i] ); magma_event_destroy( redevents2[d][i] ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slacpy */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_B, *h_R; float *d_A, *d_B; magma_int_t M, N, size, lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("=================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldb = lda; ldda = ((M+31)/32)*32; lddb = ldda; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // load and save triangle (with diagonal) gbytes = sizeof(float) * 1.*N*(N+1) / 1e9; } else { // load entire matrix, save entire matrix gbytes = sizeof(float) * 2.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_B, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); h_B[i + j*ldb] = MAGMA_S_MAKE( i - j/10000. + 10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, N, h_B, ldb, d_B, lddb ); gpu_time = magma_sync_wtime( 0 ); //magmablas_slacpy( uplo[iuplo], M-2, N-2, d_A+1+ldda, ldda, d_B+1+lddb, lddb ); // inset by 1 row & col magmablas_slacpy( uplo[iuplo], M, N, d_A, ldda, d_B, lddb ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_slacpy( lapack_uplo_const(uplo[iuplo]), &M2, &N2, h_A+1+lda, &lda, h_B+1+ldb, &ldb ); lapackf77_slacpy( lapack_uplo_const(uplo[iuplo]), &M, &N, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_sprint( M, N, h_A, lda ); printf( "B= " ); magma_sprint( M, N, h_B, lda ); printf( "dA=" ); magma_sprint_gpu( M, N, d_A, ldda ); printf( "dB=" ); magma_sprint_gpu( M, N, d_B, ldda ); } /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_B, ldda, h_R, lda ); blasf77_saxpy(&size, &c_neg_one, h_B, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work); printf("%5s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const(uplo[iuplo]), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }