int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 1.5, -2.3 ); float beta = MAGMA_S_MAKE( -0.6, 0.8 ); float *A, *X, *Y, *Ydev, *Ymagma; magmaFloat_ptr dA, dX, dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); #ifdef HAVE_CUBLAS printf(" M N MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else printf(" M N %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_SGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, float, sizeA ); TESTING_MALLOC_CPU( X, float, sizeX ); TESTING_MALLOC_CPU( Y, float, sizeY ); TESTING_MALLOC_CPU( Ydev, float, sizeY ); TESTING_MALLOC_CPU( Ymagma, float, sizeY ); TESTING_MALLOC_DEV( dA, float, sizeA ); TESTING_MALLOC_DEV( dX, float, sizeX ); TESTING_MALLOC_DEV( dY, float, sizeY ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &sizeA, A ); lapackf77_slarnv( &ione, ISEED, &sizeX, X ); lapackf77_slarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_ssetmatrix( M, N, A, lda, dA, 0, lda, opts.queue ); magma_ssetvector( Xm, X, incx, dX, 0, incx, opts.queue ); magma_ssetvector( Ym, Y, incy, dY, 0, incy, opts.queue ); #ifdef HAVE_CUBLAS dev_time = magma_sync_wtime( 0 ); cublasSgemv( opts.handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); dev_time = magma_sync_wtime( 0 ) - dev_time; #else dev_time = magma_sync_wtime( opts.queue ); magma_sgemv( opts.transA, M, N, alpha, dA, 0, lda, dX, 0, incx, beta, dY, 0, incy, opts.queue ); dev_time = magma_sync_wtime( opts.queue ) - dev_time; #endif dev_perf = gflops / dev_time; magma_sgetvector( Ym, dY, 0, incy, Ydev, incy, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_ssetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_sgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_sgetvector( Ym, dY, incy, Ymagma, incy ); #endif /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_sgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ float Anorm = lapackf77_slange( "F", &M, &N, A, &lda, work ); float Xnorm = lapackf77_slange( "F", &Xm, &ione, X, &Xm, work ); blasf77_saxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy ); dev_error = lapackf77_slange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm); #ifdef HAVE_CUBLAS blasf77_saxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_slange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ydev ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
void ckm( struct svm_problem *prob, struct svm_problem *pecm, float *gamma ) { cublasStatus_t status; double g_val = *gamma; long int nfa; int len_tv; int ntv; int i_v; int i_el; int i_r, i_c; int trvei; double *tv_sq; double *v_f_g; float *tr_ar; float *tva, *vtm, *DP; float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0; cudaError_t cudaStat; cublasHandle_t handle; status = cublasCreate(&handle); len_tv = prob-> x[0].dim; ntv = prob-> l; nfa = len_tv * ntv; tva = (float*) malloc ( len_tv * ntv* sizeof(float) ); vtm = (float*) malloc ( len_tv * sizeof(float) ); DP = (float*) malloc ( ntv * sizeof(float) ); tr_ar = (float*) malloc ( len_tv * ntv* sizeof(float) ); tv_sq = (double*) malloc ( ntv * sizeof(double) ); v_f_g = (double*) malloc ( ntv * sizeof(double) ); for ( i_r = 0; i_r < ntv ; i_r++ ) { for ( i_c = 0; i_c < len_tv; i_c++ ) tva[i_r * len_tv + i_c] = (float)prob-> x[i_r].values[i_c]; } cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float)); if (cudaStat != cudaSuccess) { free( tva ); free( vtm ); free( DP ); free( v_f_g ); free( tv_sq ); cudaFree( g_tva ); cublasDestroy( handle ); fprintf (stderr, "!!!! Device memory allocation error (A)\n"); getchar(); return; } cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float)); cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float)); for( i_r = 0; i_r < ntv; i_r++ ) for( i_c = 0; i_c < len_tv; i_c++ ) tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c]; // Copy cpu vector to gpu vector status = cublasSetVector( len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1 ); free( tr_ar ); for( i_v = 0; i_v < ntv; i_v++ ) { tv_sq[ i_v ] = 0; for( i_el = 0; i_el < len_tv; i_el++ ) tv_sq[i_v] += pow( tva[i_v*len_tv + i_el], (float)2.0 ); } for ( trvei = 0; trvei < ntv; trvei++ ) { status = cublasSetVector( len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1 ); status = cublasSgemv( handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv , g_vtm, 1, &beta, g_DotProd, 1 ); status = cublasGetVector( ntv, sizeof(float), g_DotProd, 1, DP, 1 ); for ( i_c = 0; i_c < ntv; i_c++ ) v_f_g[i_c] = exp( -g_val * (tv_sq[trvei] + tv_sq[i_c]-((double)2.0)* (double)DP[i_c] )); pecm-> x[trvei].values[0] = trvei + 1; for ( i_c = 0; i_c < ntv; i_c++ ) pecm-> x[trvei].values[i_c + 1] = v_f_g[i_c]; } free( tva ); free( vtm ); free( DP ); free( v_f_g ); free( tv_sq ); cudaFree( g_tva ); cudaFree( g_vtm ); cudaFree( g_DotProd ); cublasDestroy( handle ); }