extern "C" magma_int_t magma_slarfb2_gpu( magma_int_t m, magma_int_t n, magma_int_t k, const float *dV, magma_int_t ldv, const float *dT, magma_int_t ldt, float *dC, magma_int_t ldc, float *dwork, magma_int_t ldwork ) { float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; if (m <= 0 || n <= 0) return MAGMA_SUCCESS; // W = C^H V // magma_sgemm( MagmaTrans, MagmaNoTrans, magmablas_sgemm_reduce( n, k, m, c_one, dC, ldc, dV, ldv, c_zero, dwork, ldwork); // W = W T^H = C^H V T^H magma_strmm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, k, c_one, dT, ldt, dwork, ldwork); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C magma_sgemm( MagmaNoTrans, MagmaTrans, m, n, k, c_neg_one, dV, ldv, dwork, ldwork, c_one, dC, ldc); return MAGMA_SUCCESS; }
int main( int argc, char** argv) { real_Double_t gflops, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time; float magma_error, clblas_error, work[1]; magma_trans_t transA = MagmaNoTrans; magma_trans_t transB = MagmaNoTrans; magma_int_t istart = 1024; magma_int_t iend = 6240; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float *h_A, *h_B, *h_C, *h_C2, *h_C3; magmaFloat_ptr d_A, d_B, d_C; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); float beta = MAGMA_S_MAKE( -0.48, 0.38 ); int lapack = getenv("MAGMA_RUN_LAPACK") != NULL; int count = 1; printf("\nUsage: testing_sgemm [-NN|NT|TN|TT|NC|CN|TC|CT|CC] -M m -N n -K k -count c -l\n" " -l or setting $MAGMA_RUN_LAPACK runs CPU BLAS,\n" " and computes both MAGMA and CLBLAS error using CPU BLAS result.\n" " Else, MAGMA error is computed using CLBLAS result.\n\n"); for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 && i+1 < argc ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 && i+1 < argc ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaConjTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaConjTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaConjTrans; } else if (strcmp("-l", argv[i])==0) { lapack = true; } else if ( strcmp("-count", argv[i]) == 0 && i+1 < argc ){ count = atoi(argv[++i]); } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ldda; K += 32; M += 32; N += 32; TESTING_MALLOC_CPU( h_A, float, lda*K ); TESTING_MALLOC_CPU( h_B, float, ldb*Bn ); TESTING_MALLOC_CPU( h_C, float, ldc*N ); TESTING_MALLOC_CPU( h_C2, float, ldc*N ); TESTING_MALLOC_CPU( h_C3, float, ldc*N ); TESTING_MALLOC_DEV( d_A, float, ldda*K ); TESTING_MALLOC_DEV( d_B, float, lddb*Bn ); TESTING_MALLOC_DEV( d_C, float, lddc*N ); printf("Testing transA = %c transB = %c\n", *lapack_const(transA), *lapack_const(transB)); printf(" M N K MAGMA Gflop/s (sec) CLBLAS Gflop/s (sec) CPU Gflop/s (sec) MAGMA error CLBLAS error\n"); printf("===========================================================================================================\n"); for( i=istart; i<iend; i = (int)(i*1.25) ) { for( int cnt = 0; cnt < count; ++cnt ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS_SGEMM( M, N, K ) / 1e9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &szeA, h_A ); lapackf77_slarnv( &ione, ISEED, &szeB, h_B ); lapackf77_slarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_ssetmatrix( Am, An, h_A, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( Bm, Bn, h_B, ldb, d_B, 0, lddb, queue ); magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magmablas_sgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime(); magmablas_sgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime() - magma_time; magma_perf = gflops / magma_time; magma_sgetmatrix( M, N, d_C, 0, lddc, h_C2, ldc, queue ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magma_sgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime(); magma_sgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime() - clblas_time; clblas_perf = gflops / clblas_time; magma_sgetmatrix( M, N, d_C, 0, lddc, h_C3, ldc, queue ); /* ===================================================================== Performs operation using BLAS =================================================================== */ if ( lapack ) { cpu_time = magma_wtime(); blasf77_sgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ if ( lapack ) { // compare both magma & clblas to lapack blasf77_saxpy(&szeC, &c_neg_one, h_C, &ione, h_C2, &ione); magma_error = lapackf77_slange("M", &M, &N, h_C2, &ldc, work); blasf77_saxpy(&szeC, &c_neg_one, h_C, &ione, h_C3, &ione); clblas_error = lapackf77_slange("M", &M, &N, h_C3, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) %7.2f (%7.4f) %8.2e %8.2e\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time, magma_error, clblas_error ); } else { // compare magma to clblas blasf77_saxpy(&szeC, &c_neg_one, h_C3, &ione, h_C2, &ione); magma_error = lapackf77_slange("M", &M, &N, h_C2, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) --- ( --- ) %8.2e ---\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, magma_error ); } } if ( count > 1 ) { printf( "\n" ); } } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_CPU( h_C3 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }