int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_cgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgelqf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; float matnorm, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; cuFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, lwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (N>0 && M>0) printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; min_mn = min(M, N); nb = magma_get_cgeqrf_nb(M); TESTING_MALLOC( tau, cuFloatComplex, min_mn ); TESTING_MALLOC( h_A, cuFloatComplex, n2 ); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2 ); lwork = -1; lapackf77_cgelqf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max( lwork, M*nb ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; flops = FLOPS( (float)M, (float)N ) / 1000000; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cgelqf( M, N, h_R, lda, tau, h_work, lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_cgelqf had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cgelqf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of lapack_cgelqf had an illegal value.\n", (int) -info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ matnorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f %6.2f %e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_clange("f", &M, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_HOSTFREE( h_work ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n = 10; magma_int_t lda = n; magma_int_t ldda = ((n+31)/32)*32; magma_int_t size = lda*n; magma_int_t info; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ); printf( "n %d, error %8.2e\n", (int) n, error ); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); cublasDestroy( handle ); magma_finalize(); return 0; }
int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t N, n2, lda, ldda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magmaFloatComplex **d_A_array = NULL; magma_int_t *dinfo_magma; magma_int_t batchCount; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) batchCount = opts.batchcount; float tol = opts.tolerance * lapackf77_slamch("E"); printf("BatchCount N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; ldda = lda = ((N+31)/32)*32; n2 = lda* N * batchCount; gflops = batchCount * FLOPS_CPOTRF( N ) / 1e9 ; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda * N * batchCount); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount); magma_malloc((void**)&d_A_array, batchCount * sizeof(*d_A_array)); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for(int i=0; i<batchCount; i++) { magma_cmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_int_t columns = N * batchCount; lapackf77_clacpy( MagmaUpperLowerStr, &N, &(columns), h_A, &lda, h_R, &lda ); magma_csetmatrix( N, columns, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ cset_pointer(d_A_array, d_A, ldda, 0, 0, ldda * N, batchCount, queue); gpu_time = magma_sync_wtime(NULL); info = magma_cpotrf_batched( opts.uplo, N, d_A_array, ldda, dinfo_magma, batchCount, queue); gpu_time = magma_sync_wtime(NULL) - gpu_time; gpu_perf = gflops / gpu_time; magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_cpotrf_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_cpotrf_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A + i * lda * N, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, columns, d_A, ldda, h_R, lda ); magma_int_t NN = lda*N; char const uplo = 'l'; // lapack_uplo_const(opts.uplo) float err = 0.0; for(int i=0; i<batchCount; i++) { error = lapackf77_clanhe("f", &uplo, &N, h_A + i * lda*N, &lda, work); blasf77_caxpy(&NN, &c_neg_one, h_A + i * lda*N, &ione, h_R + i * lda*N, &ione); error = lapackf77_clanhe("f", &uplo, &N, h_R + i * lda*N, &lda, work) / error; if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(fabs(error),err); } printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., err, (error < tol ? "ok" : "failed")); status += ! (err < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int)batchCount, (int) N, gpu_perf, gpu_time*1000. ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_A_array ); TESTING_FREE_DEV( dinfo_magma ); free(cpu_info); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmql */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgeqlf_nb( m ); ldc = m; // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); lda = mm; gflops = FLOPS_CUNMQL( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for geqlf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*k ); TESTING_MALLOC_CPU( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QL factorization to get Householder vectors in A, tau magma_cgeqlf( mm, k, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_cgeqlf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmql( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmql returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_cunmql (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmql( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmql returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cherk */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An; magma_int_t sizeA, sizeC; magma_int_t lda, ldc, ldda, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_C, *h_Ccublas; magmaFloatComplex_ptr d_A, d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float alpha = MAGMA_D_MAKE( 0.29, -0.86 ); float beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_CHERK(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; } else { lda = An = K; Ak = N; } ldc = N; ldda = ((lda+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasCherk( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cherk( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { #ifdef MAGMA_WITH_MKL // MKL (11.1.2) has bug in multi-threaded clanhe; use single thread to work around int threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); #endif // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clanhe("fro", lapack_uplo_const(opts.uplo), &N, h_C, &ldc, work); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_clanhe( "fro", lapack_uplo_const(opts.uplo), &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); #ifdef MAGMA_WITH_MKL // end single thread to work around MKL bug magma_set_lapack_numthreads( threads ); #endif } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgehrd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *dT; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; #endif float eps, result[2]; magma_int_t N, n2, lda, nb, lwork, ltwork, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; eps = lapackf77_slamch( "E" ); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) |A-QHQ'|/N|A| |I-QQ'|/N\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; nb = magma_get_cgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; gflops = FLOPS_CGEHRD( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, N ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, nb*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgehrd( N, ione, N, h_R, lda, tau, h_work, lwork, dT, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgehrd returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.check ) { ltwork = 2*(N*N); TESTING_MALLOC_PIN( h_Q, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( twork, magmaFloatComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_CPU( rwork, float, N ); #endif lapackf77_clacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); for( int j = 0; j < N-1; ++j ) for( int i = j+2; i < N; ++i ) h_R[i+j*lda] = MAGMA_C_ZERO; magma_cunghr(N, ione, N, h_Q, lda, tau, dT, nb, &info); if (info != 0) { printf("magma_cunghr returned error %d: %s.\n", (int) info, magma_strerror( info )); exit(1); } #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_chst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_chst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif TESTING_FREE_PIN( h_Q ); TESTING_FREE_CPU( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_CPU( rwork ); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgehrd(&N, &ione, &N, h_R, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgehrd returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } if ( opts.check ) { printf(" %8.2e %8.2e %s\n", result[0]*eps, result[1]*eps, ( ( (result[0]*eps < tol) && (result[1]*eps < tol) ) ? "ok" : "failed") ); status += ! (result[0]*eps < tol); status += ! (result[1]*eps < tol); } else { printf(" --- ---\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float error, work[1]; int transA = MagmaNoTrans; int transB = MagmaNoTrans; float Cnorm; magma_int_t istart = 1024; magma_int_t iend = 8194; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_C, *h_C2; magmaFloatComplex_ptr d_A, d_B, d_C; magmaFloatComplex mzone = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-N", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaConjTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaConjTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = lddc = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; K+=32; M+=32; N +=32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*K ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_C2, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*K ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); printf("\nUsage: \n"); printf(" testing_cgemm [-NN|NT|TN|TT] [-N %d] \n\n", 1024); printf("\n"); printf("Testing transA = %c transB = %c\n", transA, transB); printf(" M N K clAmdBlas GFLop/s (sec) CPU GFlop/s (sec) error\n"); printf("===========================================================================\n"); for(i=istart; i<iend; i = (int)(i*1.25) ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS( (float)M, (float)N, (float)K ) * 1e-9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &szeA, h_A ); lapackf77_clarnv( &ione, ISEED, &szeB, h_B ); lapackf77_clarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_csetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue ); magma_csetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync( queue ); gpu_time = magma_wtime(); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync( queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; magma_cgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue ); /* ===================================================================== Performs operation using CPU-BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_cgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &M, &N, h_C, &ldc, work ); /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ blasf77_caxpy(&szeC, &mzone, h_C, &ione, h_C2, &ione); error = lapackf77_clange("M", &M, &N, h_C2, &ldc, work)/Cnorm; printf("%5d %5d %5d %8.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, K, gpu_perf, gpu_time, cpu_perf, cpu_time, error); } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float Cnorm, error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; magmaFloatComplex *C, *R, *A, *hwork, *tau; magmaFloatComplex_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; ldc = magma_roundup( m, opts.align ); // multiple of 32 by default // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); nb = magma_get_cgeqrf_nb( mm, k ); lda = magma_roundup( mm, opts.align ); // multiple of 32 by default gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + magma_roundup( max(m,n), 32) )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + magma_roundup( max(m,n), 32 ) )*nb; } // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_cmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*k ); TESTING_MALLOC_CPU( hwork, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, k ); TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*n ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, lda*k ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, dt_size ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_csetmatrix( mm, k, A, lda, dA, lda ); magma_cgeqrf_gpu( mm, k, dA, lda, tau, dT, &info ); magma_cgetmatrix( mm, k, dA, lda, A, lda ); if (info != 0) { printf("magma_cgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); if (info != 0) { printf("magma_cunmqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_C_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } // cunmqr2 takes a copy of dA in CPU memory if ( opts.version == 2 ) { magma_cgetmatrix( mm, k, dA, lda, A, lda ); } magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); // sync needed for L,N and R,T cases if ( opts.version == 1 ) { magma_cunmqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); } else if ( opts.version == 2 ) { magma_cunmqr2_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, A, lda, &info ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cunmqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / (magma_ssqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cher2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_C, *h_Ccublas; magmaFloatComplex *d_A, *d_B, *d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); float beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "uplo = %c, transA = %c\n", opts.uplo, opts.transA ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[i]; K = opts.ksize[i]; gflops = FLOPS_CHER2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bk ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bk ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_csetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasCher2k( opts.uplo, opts.transA, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cher2k( &opts.uplo, &opts.transA, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &N, &N, h_C, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_clange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaFloatComplex *A, *B, *C, *C2, *LU; magmaFloatComplex *dA, *dB, *dC1, *dC2; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.5, 0.1 ); magmaFloatComplex beta = MAGMA_C_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_cmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_cmalloc( &dA, size ); assert( err == 0 ); err = magma_cmalloc( &dB, size ); assert( err == 0 ); err = magma_cmalloc( &dC1, size ); assert( err == 0 ); err = magma_cmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test CSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetmatrix( m, n, A, ld, dB, ld ); magma_cswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_cswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_cgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_clange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "cswap diff %.2g\n", error ); } else { printf( "cswap skipped for n < 3\n" ); } // ----- test ICAMAX // get argmax of column of A magma_csetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_icamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIcamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "icamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test CGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetvector( maxn, B, 1, dB, 1 ); magma_csetvector( maxn, C, 1, dC1, 1 ); magma_csetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_cgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasCaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMV( m, n ) / 1e9; printf( "cgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetvector( m, B, 1, dB, 1 ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_chemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMV( m ) / 1e9; printf( "chemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_clacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_cgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_csetmatrix( m, m, LU, ld, dA, ld ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ctrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ctrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test CGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_csetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_csetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMM( m, n, k ) / 1e9; printf( "cgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetmatrix( m, n, B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_chemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMM( side[is], m, n ) / 1e9; printf( "chemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_csetmatrix( n, k, A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHERK( k, n ) / 1e9; printf( "cherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_csetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHER2K( k, n ) / 1e9; printf( "cher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasCtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRMM( side[is], m, n ) / 1e9; printf( "ctrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test CTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( side[is], m, n ) / 1e9; printf( "ctrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgeadd_batched Code is very similar to testing_clacpy_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B; magmaFloatComplex *d_A, *d_B; magmaFloatComplex **hAarray, **hBarray, **dAarray, **dBarray; magmaFloatComplex alpha = MAGMA_C_MAKE( 3.1415, 2.718 ); magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf(" M N ntile CPU GFlop/s (sec) GPU GFlop/s (sec) error \n"); printf("====================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gflops = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC( h_A, magmaFloatComplex, lda *N ); TESTING_MALLOC( h_B, magmaFloatComplex, lda *N ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*N ); TESTING_DEVALLOC( d_B, magmaFloatComplex, ldda*N ); TESTING_MALLOC( hAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC( hBarray, magmaFloatComplex*, ntile ); TESTING_DEVALLOC( dAarray, magmaFloatComplex*, ntile ); TESTING_DEVALLOC( dBarray, magmaFloatComplex*, ntile ); lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, lda, d_B, ldda ); // setup pointers for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(magmaFloatComplex*), hAarray, 1, dAarray, 1 ); magma_setvector( ntile, sizeof(magmaFloatComplex*), hBarray, 1, dBarray, 1 ); gpu_time = magma_sync_wtime( 0 ); magmablas_cgeadd_batched( mb, nb, alpha, dAarray, ldda, dBarray, ldda, ntile ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; for( int j = 0; j < nb; ++j ) { blasf77_caxpy( &mb, &alpha, &h_A[offset + j*lda], &ione, &h_B[offset + j*lda], &ione ); } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_B, ldda, h_A, lda ); error = lapackf77_clange( "F", &M, &N, h_B, &lda, work ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_clange("f", &M, &N, h_B, &lda, work) / error; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE( h_A ); TESTING_FREE( h_B ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); TESTING_FREE( hAarray ); TESTING_FREE( hBarray ); TESTING_DEVFREE( dAarray ); TESTING_DEVFREE( dBarray ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R, *work; magmaFloatComplex *d_A, *dwork; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex tmp; float error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_cgetri_nb( N ); gflops = FLOPS_CGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_cgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_C_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_clange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_cgetrf_gpu( N, N, d_A, ldda, ipiv, &info ); magma_cgetmatrix( N, N, d_A, ldda, h_A, lda ); if ( info != 0 ) printf("magma_cgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 ); //magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgetri_gpu( N, d_A, ldda, ipiv, dwork, ldwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( N, N, d_A, ldda, h_R, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_clange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing znan_inf */ int main( int argc, char** argv) { TESTING_INIT(); #define hA(i,j) (hA + (i) + (j)*lda) magmaFloatComplex *hA, *dA; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, lda, ldda, size; magma_int_t *ii, *jj; magma_int_t i, j, cnt, tmp; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU nan + inf GPU nan + inf actual nan + inf \n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M + 31)/32)*32; size = lda*N; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( hA, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &size, hA ); // up to half of matrix is NAN, and // up to half of matrix is INF. magma_int_t cnt_nan = (magma_int_t)( (rand() / ((float)RAND_MAX)) * 0.5 * M*N ); magma_int_t cnt_inf = (magma_int_t)( (rand() / ((float)RAND_MAX)) * 0.5 * M*N ); magma_int_t total = cnt_nan + cnt_inf; assert( cnt_nan >= 0 ); assert( cnt_inf >= 0 ); assert( total <= M*N ); // fill in indices TESTING_MALLOC_CPU( ii, magma_int_t, size ); TESTING_MALLOC_CPU( jj, magma_int_t, size ); for( cnt=0; cnt < size; ++cnt ) { ii[cnt] = cnt % M; jj[cnt] = cnt / M; } // shuffle indices for( cnt=0; cnt < total; ++cnt ) { i = int( rand() / ((float)RAND_MAX) * size ); tmp=ii[cnt]; ii[cnt]=ii[i]; ii[i]=tmp; tmp=jj[cnt]; jj[cnt]=jj[i]; jj[i]=tmp; } // fill in NAN and INF // for uplo, count NAN and INF in triangular portion of A int c_nan=0; int c_inf=0; for( cnt=0; cnt < cnt_nan; ++cnt ) { i = ii[cnt]; j = jj[cnt]; *hA(i,j) = MAGMA_C_NAN; if ( uplo[iuplo] == MagmaLower && i >= j ) { c_nan++; } if ( uplo[iuplo] == MagmaUpper && i <= j ) { c_nan++; } } for( cnt=cnt_nan; cnt < cnt_nan + cnt_inf; ++cnt ) { i = ii[cnt]; j = jj[cnt]; *hA(i,j) = MAGMA_C_INF; if ( uplo[iuplo] == MagmaLower && i >= j ) { c_inf++; } if ( uplo[iuplo] == MagmaUpper && i <= j ) { c_inf++; } } if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { cnt_nan = c_nan; cnt_inf = c_inf; total = cnt_nan + cnt_inf; } //printf( "nan %g + %gi\n", MAGMA_C_REAL( magma_cnan ), MAGMA_C_REAL( magma_cnan ) ); //printf( "inf %g + %gi\n", MAGMA_C_REAL( magma_cinf ), MAGMA_C_REAL( magma_cinf ) ); //magma_cprint( M, N, hA, lda ); magma_csetmatrix( M, N, hA, lda, dA, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_int_t c_cpu_nan=-1, c_cpu_inf=-1; magma_int_t c_gpu_nan=-1, c_gpu_inf=-1; magma_int_t c_cpu = magma_cnan_inf ( uplo[iuplo], M, N, hA, lda, &c_cpu_nan, &c_cpu_inf ); magma_int_t c_gpu = magma_cnan_inf_gpu( uplo[iuplo], M, N, dA, ldda, &c_gpu_nan, &c_gpu_inf ); magma_int_t c_cpu2 = magma_cnan_inf ( uplo[iuplo], M, N, hA, lda, NULL, NULL ); magma_int_t c_gpu2 = magma_cnan_inf_gpu( uplo[iuplo], M, N, dA, ldda, NULL, NULL ); /* ===================================================================== Check the result =================================================================== */ bool ok = ( c_cpu == c_gpu ) && ( c_cpu == c_cpu2 ) && ( c_gpu == c_gpu2 ) && ( c_cpu == c_cpu_nan + c_cpu_inf ) && ( c_gpu == c_gpu_nan + c_gpu_inf ) && ( c_cpu_nan == cnt_nan ) && ( c_cpu_inf == cnt_inf ) && ( c_gpu_nan == cnt_nan ) && ( c_gpu_inf == cnt_inf ); printf( "%4c %5d %5d %10d + %-10d %10d + %-10d %10d + %-10d %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, (int) c_cpu_nan, (int) c_cpu_inf, (int) c_gpu_nan, (int) c_gpu_inf, (int) cnt_nan, (int) cnt_inf, (ok ? "ok" : "failed")); status += ! ok; TESTING_FREE_CPU( hA ); TESTING_FREE_DEV( dA ); TESTING_FREE_CPU( ii ); TESTING_FREE_CPU( jj ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgesdd (SVD with Divide & Conquer) Please keep code in testing_cgesdd.cpp and testing_cgesvd.cpp similar. */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaFloatComplex *h_A, *h_R, *U, *VT, *h_work; magmaFloatComplex dummy[1]; float *S1, *S2; #ifdef COMPLEX magma_int_t lrwork=0; float *rwork; #endif magma_int_t *iwork; magma_int_t M, N, N_U, M_VT, lda, ldu, ldv, n2, min_mn, max_mn, info, nb, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_vec_t jobz; magma_int_t status = 0; MAGMA_UNUSED( max_mn ); // used only in complex magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); jobz = opts.jobu; magma_vec_t jobs[] = { MagmaNoVec, MagmaSomeVec, MagmaOverwriteVec, MagmaAllVec }; if ( opts.check && ! opts.all && (jobz == MagmaNoVec)) { printf( "%% NOTE: some checks require that singular vectors are computed;\n" "%% set jobz (option -U[NASO]) to be S, O, or A.\n\n" ); } printf("%% jobz M N CPU time (sec) GPU time (sec) |S1-S2| |A-USV^H| |I-UU^H|/M |I-VV^H|/N S sorted\n"); printf("%%==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ijobz = 0; ijobz < 4; ++ijobz ) { for( int iter = 0; iter < opts.niter; ++iter ) { if ( opts.all ) { jobz = jobs[ ijobz ]; } else if ( ijobz > 0 ) { // if not testing all, run only once, when ijobz = 0, // but jobz come from opts (above loops). continue; } M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); max_mn = max(M, N); N_U = (jobz == MagmaAllVec ? M : min_mn); M_VT = (jobz == MagmaAllVec ? N : min_mn); lda = M; ldu = M; ldv = M_VT; n2 = lda*N; nb = magma_get_cgesvd_nb( M, N ); // x and y abbreviations used in cgesdd and dgesdd documentation magma_int_t x = max(M,N); magma_int_t y = min(M,N); #ifdef COMPLEX bool tall = (x >= int(y*17/9.)); // true if tall (m >> n) or wide (n >> m) #else bool tall = (x >= int(y*11/6.)); // true if tall (m >> n) or wide (n >> m) #endif // query or use formula for workspace size switch( opts.svd_work ) { case 0: { // query for workspace size lwork = -1; magma_cgesdd( jobz, M, N, NULL, lda, NULL, NULL, ldu, NULL, ldv, dummy, lwork, #ifdef COMPLEX NULL, #endif NULL, &info ); lwork = (int) MAGMA_C_REAL( dummy[0] ); break; } case 1: // minimum case 2: // optimal case 3: { // optimal (for gesdd, 2 & 3 are same; for gesvd, they differ) // formulas from cgesdd and dgesdd documentation bool sml = (opts.svd_work == 1); // 1 is small workspace, 2,3 are large workspace #ifdef COMPLEX // ---------------------------------------- if (jobz == MagmaNoVec) { if (tall) { lwork = 2*y + (2*y)*nb; } else { lwork = 2*y + (x+y)*nb; } } if (jobz == MagmaOverwriteVec) { if (tall) { if (sml) { lwork = 2*y*y + 2*y + (2*y)*nb; } else { lwork = y*y + x*y + 2*y + (2*y)*nb; } // not big deal } else { //if (sml) { lwork = 2*y + max( (x+y)*nb, y*y + y ); } //else { lwork = 2*y + max( (x+y)*nb, x*y + y*nb ); } // LAPACK 3.4.2 over-estimates workspaces. For compatability, use these: if (sml) { lwork = 2*y + max( (x+y)*nb, y*y + x ); } else { lwork = 2*y + (x+y)*nb + x*y; } } } if (jobz == MagmaSomeVec) { if (tall) { lwork = y*y + 2*y + (2*y)*nb; } else { lwork = 2*y + (x+y)*nb; } } if (jobz == MagmaAllVec) { if (tall) { if (sml) { lwork = y*y + 2*y + max( (2*y)*nb, x ); } else { lwork = y*y + 2*y + max( (2*y)*nb, x*nb ); } } else { lwork = 2*y + (x+y)*nb; } } #else // REAL ---------------------------------------- if (jobz == MagmaNoVec) { if (tall) { lwork = 3*y + max( (2*y)*nb, 7*y ); } else { lwork = 3*y + max( (x+y)*nb, 7*y ); } } if (jobz == MagmaOverwriteVec) { if (tall) { if (sml) { lwork = y*y + 3*y + max( (2*y)*nb, 4*y*y + 4*y ); } else { lwork = y*y + 3*y + max( max( (2*y)*nb, 4*y*y + 4*y ), y*y + y*nb ); } } else { if (sml) { lwork = 3*y + max( (x+y)*nb, 4*y*y + 4*y ); } else { lwork = 3*y + max( (x+y)*nb, 3*y*y + 4*y + x*y ); } // extra space not too important? } } if (jobz == MagmaSomeVec) { if (tall) { lwork = y*y + 3*y + max( (2*y)*nb, 3*y*y + 4*y ); } else { lwork = 3*y + max( (x+y)*nb, 3*y*y + 4*y ); } } if (jobz == MagmaAllVec) { if (tall) { if (sml) { lwork = y*y + max( 3*y + max( (2*y)*nb, 3*y*y + 4*y ), y + x ); } else { lwork = y*y + max( 3*y + max( (2*y)*nb, 3*y*y + 4*y ), y + x*nb ); } // LAPACK 3.4.2 over-estimates workspaces. For compatability, use these: //if (sml) { lwork = y*y + 3*y + max( (2*y)*nb, 3*y*y + 3*y + x ); } //else { lwork = y*y + max( 3*y + max( (2*y)*nb, max( 3*y*y + 3*y + x, 3*y*y + 4*y )), y + x*nb ); } } else { lwork = 3*y + max( (x+y)*nb, 3*y*y + 4*y ); } } #endif break; } default: { fprintf( stderr, "Error: unknown option svd_work %d\n", (int) opts.svd_work ); return -1; break; } } TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( VT, magmaFloatComplex, ldv*N ); // N x N (jobz=A) or min(M,N) x N TESTING_MALLOC_CPU( U, magmaFloatComplex, ldu*N_U ); // M x M (jobz=A) or M x min(M,N) TESTING_MALLOC_CPU( S1, float, min_mn ); TESTING_MALLOC_CPU( S2, float, min_mn ); TESTING_MALLOC_CPU( iwork, magma_int_t, 8*min_mn ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); #ifdef COMPLEX if (jobz == MagmaNoVec) { // requires 5*min_mn, but MKL (11.1) seems to have a bug // requiring 7*min_mn in some cases (e.g., jobz=N, m=100, n=170) lrwork = 7*min_mn; } else if (tall) { lrwork = 5*min_mn*min_mn + 5*min_mn; } else { lrwork = max( 5*min_mn*min_mn + 5*min_mn, 2*max_mn*min_mn + 2*min_mn*min_mn + min_mn ); } TESTING_MALLOC_CPU( rwork, float, lrwork ); #endif /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgesdd( jobz, M, N, h_R, lda, S1, U, ldu, VT, ldv, h_work, lwork, #ifdef COMPLEX rwork, #endif iwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) { printf("magma_cgesdd returned error %d: %s.\n", (int) info, magma_strerror( info )); } float eps = lapackf77_slamch( "E" ); float result[5] = { -1/eps, -1/eps, -1/eps, -1/eps, -1/eps }; if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvbd routine. A is factored as A = U diag(S) VT and the following 4 tests computed: (1) | A - U diag(S) VT | / ( |A| max(M,N) ) (2) | I - U^H U | / ( M ) (3) | I - VT VT^H | / ( N ) (4) S contains MNMIN nonnegative values in decreasing order. (Return 0 if true, 1/ULP if false.) =================================================================== */ magma_int_t izero = 0; // get size and location of U and V^T depending on jobz // U2=NULL and VT2=NULL if they were not computed (e.g., jobz=N) magmaFloatComplex *U2 = NULL; magmaFloatComplex *VT2 = NULL; if ( jobz == MagmaSomeVec || jobz == MagmaAllVec ) { U2 = U; VT2 = VT; } else if ( jobz == MagmaOverwriteVec ) { if ( M >= N ) { U2 = h_R; ldu = lda; VT2 = VT; } else { U2 = U; VT2 = h_R; ldv = lda; } } // cbdt01 needs M+N // cunt01 prefers N*(N+1) to check U; M*(M+1) to check V magma_int_t lwork_err = M+N; if ( U2 != NULL ) { lwork_err = max( lwork_err, N_U*(N_U+1) ); } if ( VT2 != NULL ) { lwork_err = max( lwork_err, M_VT*(M_VT+1) ); } magmaFloatComplex *h_work_err; TESTING_MALLOC_CPU( h_work_err, magmaFloatComplex, lwork_err ); // cbdt01 and cunt01 need max(M,N), depending float *rwork_err; TESTING_MALLOC_CPU( rwork_err, float, max(M,N) ); if ( U2 != NULL && VT2 != NULL ) { // since KD=0 (3rd arg), E is not referenced so pass NULL (9th arg) lapackf77_cbdt01(&M, &N, &izero, h_A, &lda, U2, &ldu, S1, NULL, VT2, &ldv, h_work_err, #ifdef COMPLEX rwork_err, #endif &result[0]); } if ( U2 != NULL ) { lapackf77_cunt01("Columns", &M, &N_U, U2, &ldu, h_work_err, &lwork_err, #ifdef COMPLEX rwork_err, #endif &result[1]); } if ( VT2 != NULL ) { lapackf77_cunt01("Rows", &M_VT, &N, VT2, &ldv, h_work_err, &lwork_err, #ifdef COMPLEX rwork_err, #endif &result[2]); } result[3] = 0.; for (int j=0; j < min_mn-1; j++) { if ( S1[j] < S1[j+1] ) result[3] = 1.; if ( S1[j] < 0. ) result[3] = 1.; } if (min_mn > 1 && S1[min_mn-1] < 0.) result[3] = 1.; result[0] *= eps; result[1] *= eps; result[2] *= eps; TESTING_FREE_CPU( h_work_err ); TESTING_FREE_CPU( rwork_err ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgesdd( lapack_vec_const(jobz), &M, &N, h_A, &lda, S2, U, &ldu, VT, &ldv, h_work, &lwork, #ifdef COMPLEX rwork, #endif iwork, &info); cpu_time = magma_wtime() - cpu_time; if (info != 0) { printf("lapackf77_cgesdd returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ float work[1], c_neg_one = -1; blasf77_saxpy(&min_mn, &c_neg_one, S1, &ione, S2, &ione); result[4] = lapackf77_slange("f", &min_mn, &ione, S2, &min_mn, work); result[4] /= lapackf77_slange("f", &min_mn, &ione, S1, &min_mn, work); printf(" %c %5d %5d %7.2f %7.2f %8.2e", lapack_vec_const(jobz)[0], (int) M, (int) N, cpu_time, gpu_time, result[4] ); } else { printf(" %c %5d %5d --- %7.2f --- ", lapack_vec_const(jobz)[0], (int) M, (int) N, gpu_time ); } if ( opts.check ) { if ( result[0] < 0. ) { printf(" --- "); } else { printf(" %#9.3g", result[0]); } if ( result[1] < 0. ) { printf(" --- "); } else { printf(" %#9.3g", result[1]); } if ( result[2] < 0. ) { printf(" --- "); } else { printf(" %#9.3g", result[2]); } bool okay = (result[0] < tol) && (result[1] < tol) && (result[2] < tol) && (result[3] == 0.) && (result[4] < tol); printf(" %3s %s\n", (result[3] == 0. ? "yes" : "no"), (okay ? "ok" : "failed")); status += ! okay; } else { printf("\n"); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( VT ); TESTING_FREE_CPU( U ); TESTING_FREE_CPU( S1 ); TESTING_FREE_CPU( S2 ); TESTING_FREE_CPU( iwork ); #ifdef COMPLEX TESTING_FREE_CPU( rwork ); #endif TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); fflush( stdout ); }} if ( opts.all || opts.niter > 1 ) { printf("\n"); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n, lda, ldda, size, info; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N |dC - C|/|C|\n"); printf("====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // for this simple case, all matrices are N-by-N n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices size = lda*n; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference, |dC - C| / |C| magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ) / lapackf77_clange( "F", &n, &n, C, &lda, work ); printf( "%5d %8.2e %s\n", (int) n, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); fflush( stdout ); } } cublasDestroy( handle ); magma_finalize(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing clanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A; float *h_work; magmaFloatComplex_ptr d_A; magmaFloat_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; float error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); float tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_clanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_clanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded clanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL clanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaFloatComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_work, float, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, float, N ); /* Initialize the matrix */ lapackf77_clarnv( &idist, ISEED, &n2, h_A ); magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_clanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded clanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_clanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_clanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabsf. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_C_NAN; magma_csetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_clanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_C_INF; magma_csetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_clanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_clanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in clanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgegqr */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float e, e1, e2, e3, e4, e5, *work; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex *h_A, *h_R, *tau, *dtau, *h_work, *h_rwork, tmp[1]; magmaFloatComplex_ptr d_A, dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1, ldwork; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // versions 1...4 are valid if (opts.version < 1 || opts.version > 4) { printf("Unknown version %d; exiting\n", (int) opts.version ); return -1; } float tol = 10. * opts.tolerance * lapackf77_slamch("E"); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||I-Q'Q||_F / M ||I-Q'Q||_I / M ||A-Q R||_I\n"); printf(" MAGMA / LAPACK MAGMA / LAPACK\n"); printf("==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because cgegqr requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because cgegqr requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGEQRF( M, N ) / 1e9 + FLOPS_CUNGQR( M, N, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max(lwork, 3*N*N); ldwork = N*N; if (opts.version == 2) { ldwork = 3*N*N + min_mn + 2; } TESTING_MALLOC_PIN( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN(h_rwork, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( work, float, M ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( dtau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dwork, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup if ( opts.warmup ) { magma_cgegqr_gpu( 1, M, N, d_A, ldda, dwork, h_work, &info ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_cgegqr_gpu( opts.version, M, N, d_A, ldda, dwork, h_rwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgegqr returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); // Regenerate R // blasf77_cgemm("t", "n", &N, &N, &M, &c_one, h_R, &M, h_A, &M, &c_zero, h_rwork, &N); // magma_cprint(N, N, h_work, N); blasf77_ctrmm("r", "u", "n", "n", &M, &N, &c_one, h_rwork, &N, h_R, &M); blasf77_caxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); e5 = lapackf77_clange("i", &M, &N, h_R, &M, work) / lapackf77_clange("i", &M, &N, h_A, &lda, work); magma_cgetmatrix( M, N, d_A, ldda, h_R, M ); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); /* Orthogonalize on the CPU */ lapackf77_cgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_cungqr(&M, &N, &N, h_A, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_cgemm("c", "n", &N, &N, &M, &c_one, h_R, &M, h_R, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e1 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e3 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; blasf77_cgemm("c", "n", &N, &N, &M, &c_one, h_A, &M, h_A, &M, &c_zero, h_work, &N); for(int ii = 0; ii < N*N; ii += N+1 ) { h_work[ii] = MAGMA_C_SUB(h_work[ii], c_one); } e2 = lapackf77_clange("f", &N, &N, h_work, &N, work) / N; e4 = lapackf77_clange("i", &N, &N, h_work, &N, work) / N; if (opts.version != 4) e = e1; else e = e1 / (10.*max(M,N)); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e / %8.2e %8.2e / %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, e1, e2, e3, e4, e5, (e < tol ? "ok" : "failed")); status += ! (e < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_rwork ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magmaFloatComplex *h_A, *h_B, *h_C, *h_Cmagma, *h_Cdev; magmaFloatComplex_ptr d_A, d_B, d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); #ifdef HAVE_CUBLAS // for CUDA, we can check MAGMA vs. CUBLAS, without running LAPACK printf("If running lapack (option --lapack), MAGMA and %s error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to %s result.\n\n", g_platform_str, g_platform_str ); printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else // for others, we need LAPACK for check opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_CGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*An ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Cmagma, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Cdev, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*An ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); magma_csetmatrix( Am, An, h_A, lda, d_A, ldda ); magma_csetmatrix( Bm, Bn, h_B, ldb, d_B, lddb ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_csetmatrix( M, N, h_C, ldc, d_C, lddc ); magma_time = magma_sync_wtime( NULL ); magmablas_cgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc ); #endif /* ===================================================================== Performs operation using CUBLAS / clBLAS / Xeon Phi MKL =================================================================== */ magma_csetmatrix( M, N, h_C, ldc, d_C, lddc ); #ifdef HAVE_CUBLAS dev_time = magma_sync_wtime( NULL ); cublasCgemm( opts.handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); dev_time = magma_sync_wtime( NULL ) - dev_time; #else dev_time = magma_sync_wtime( opts.queue ); magma_cgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); dev_time = magma_sync_wtime( opts.queue ) - dev_time; #endif dev_perf = gflops / dev_time; magma_cgetmatrix( M, N, d_C, lddc, h_Cdev, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & dev, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "F", &M, &N, h_C, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cdev, &ione ); dev_error = lapackf77_clange( "F", &M, &N, h_Cdev, &ldc, work ) / Cnorm; #ifdef HAVE_CUBLAS blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_clange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += ! (dev_error < tol); #endif } else { #ifdef HAVE_CUBLAS // compute relative error for magma, relative to dev (currently only with CUDA) Cnorm = lapackf77_clange( "F", &M, &N, h_Cdev, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_Cdev, &ione, h_Cmagma, &ione ); magma_error = lapackf77_clange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e --- %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time ); #endif } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Cdev ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *h_B, *h_X; magma_int_t N, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = ldb = N; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( work, float, N ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); // copy A to R and B to X; save A and B for residual lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv( opts.uplo, N, opts.nrhs, h_R, lda, h_X, ldb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgemm_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, cublas_error, magma_err, cublas_err, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t NN; magma_int_t batchCount; magmaFloatComplex *h_A, *h_B, *h_C, *h_Cmagma, *h_Ccublas; magmaFloatComplex *d_A, *d_B, *d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); magmaFloatComplex **A_array = NULL; magmaFloatComplex **B_array = NULL; magmaFloatComplex **C_array = NULL; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); batchCount = opts.batchcount; cublasHandle_t handle = opts.handle; //float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), MAGMA and CUBLAS error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to CUBLAS result.\n\n" "transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB)); printf("BatchCount M N K MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_CGEMM( M, N, K ) / 1e9 * batchCount; if ( opts.transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; NN = N * batchCount; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*An*batchCount; sizeB = ldb*Bn*batchCount; sizeC = ldc*N*batchCount; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, sizeB ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, sizeC ); TESTING_MALLOC_CPU( h_Cmagma, magmaFloatComplex, sizeC ); TESTING_MALLOC_CPU( h_Ccublas, magmaFloatComplex, sizeC ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*An*batchCount ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn*batchCount ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N*batchCount ); magma_malloc((void**)&A_array, batchCount * sizeof(*A_array)); magma_malloc((void**)&B_array, batchCount * sizeof(*B_array)); magma_malloc((void**)&C_array, batchCount * sizeof(*C_array)); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( Am, An*batchCount, h_A, lda, d_A, ldda ); magma_csetmatrix( Bm, Bn*batchCount, h_B, ldb, d_B, lddb ); magma_csetmatrix( M, N*batchCount, h_C, ldc, d_C, lddc ); cset_pointer(A_array, d_A, ldda, 0, 0, ldda*An, batchCount, queue); cset_pointer(B_array, d_B, lddb, 0, 0, lddb*Bn, batchCount, queue); cset_pointer(C_array, d_C, lddc, 0, 0, lddc*N, batchCount, queue); magma_time = magma_sync_wtime( NULL ); magmablas_cgemm_batched(opts.transA, opts.transB, M, N, K, alpha, A_array, ldda, B_array, lddb, beta, C_array, lddc, batchCount, queue); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N*batchCount, d_C, lddc, h_Cmagma, ldc ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N*batchCount, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasCgemmBatched(handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, (const magmaFloatComplex**) A_array, ldda, (const magmaFloatComplex**) B_array, lddb, &beta, C_array, lddc, batchCount ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N*batchCount, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { blasf77_cgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A + i*lda*An, &lda, h_B + i*ldb*Bn, &ldb, &beta, h_C + i*ldc*N, &ldc ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| magma_error = 0.0; cublas_error = 0.0; for(int s=0; s<batchCount; s++) { magma_int_t C_batchSize = ldc * N; Cnorm = lapackf77_clange( "M", &M, &N, h_C + s*C_batchSize, &ldc, work ); blasf77_caxpy( &C_batchSize, &c_neg_one, h_C + s*C_batchSize, &ione, h_Cmagma + s*C_batchSize, &ione ); magma_err = lapackf77_clange( "M", &M, &N, h_Cmagma + s*C_batchSize, &ldc, work ) / Cnorm; if ( isnan(magma_err) || isinf(magma_err) ) { magma_error = magma_err; break; } magma_error = max(fabs(magma_err), magma_error); blasf77_caxpy( &C_batchSize, &c_neg_one, h_C + s*C_batchSize, &ione, h_Ccublas + s*C_batchSize, &ione ); cublas_err = lapackf77_clange( "M", &M, &N, h_Ccublas + s*C_batchSize, &ldc, work ) / Cnorm; if ( isnan(cublas_err) || isinf(cublas_err) ) { cublas_error = cublas_err; break; } cublas_error = max(fabs(cublas_err), cublas_error); } printf("%10d %5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e \n", (int) batchCount, (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error); } else { // compute relative error for magma, relative to cublas Cnorm = lapackf77_clange( "M", &M, &NN, h_Ccublas, &ldc, work ); blasf77_caxpy( &sizeC, &c_neg_one, h_Ccublas, &ione, h_Cmagma, &ione ); magma_error = lapackf77_clange( "M", &M, &NN, h_Cmagma, &ldc, work ) / Cnorm; printf("%10d %5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e ---\n", (int) batchCount, (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); TESTING_FREE_DEV( A_array ); TESTING_FREE_DEV( B_array ); TESTING_FREE_DEV( C_array ); fflush( stdout); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaFloatComplex *h_A, *h_R, *h_work, aux_work[1]; float *rwork, *w1, *w2, result[3], eps, aux_rwork[1]; magma_int_t *iwork, aux_iwork[1]; magma_int_t N, n2, info, lwork, lrwork, liwork, lda; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; eps = lapackf77_slamch( "E" ); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; lda = N; // query for workspace sizes magma_cheevd( opts.jobz, opts.uplo, N, NULL, lda, NULL, aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info ); lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); lrwork = (magma_int_t) aux_rwork[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, N*lda ); TESTING_MALLOC_CPU( w1, float, N ); TESTING_MALLOC_CPU( w2, float, N ); TESTING_MALLOC_CPU( rwork, float, lrwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, N*lda ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hermitian( N, h_A, N ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* warm up run */ if ( opts.warmup ) { magma_cheevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cheevd( opts.jobz, opts.uplo, N, h_R, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; // tau=NULL is unused since itype=1 lapackf77_chet21( &ione, &opts.uplo, &N, &izero, h_A, &lda, w1, w1, h_R, &lda, h_R, &lda, NULL, h_work, rwork, &result[0] ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_cheevd( MagmaNoVec, opts.uplo, N, h_R, lda, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / (((float)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cheevd( &opts.jobz, &opts.uplo, &N, h_A, &lda, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_cheevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e%s\n", result[0]*eps, (result[0]*eps < tol ? "" : " failed") ); printf("(2) | I - U'U | / N = %8.2e%s\n", result[1]*eps, (result[1]*eps < tol ? "" : " failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e%s\n\n", result[2] , (result[2] < tolulp ? "" : " failed") ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( rwork ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, dwork[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; magmaFloatComplex *C, *R, *A, *work, *tau, *tauq, *taup; float *d, *e; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans }; printf(" M N K vect side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_cgebrd_nb( m ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_CUNMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_CUNMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*nn ); TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork_max ); TESTING_MALLOC_CPU( d, float, min(mm,nn) ); TESTING_MALLOC_CPU( e, float, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, magmaFloatComplex, min(mm,nn) ); TESTING_MALLOC_CPU( taup, magmaFloatComplex, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_cgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_cgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) printf("magma_cgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_cunmbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) printf("magma_cunmbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_cunmbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmbr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, dwork ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, dwork ) / error; printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotri */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t N, n2, lda, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float Anorm, error, work[1]; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||A||_F\n"); printf("%%================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRI( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hpd( N, h_A, lda ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ if ( opts.warmup ) { magma_cpotrf( opts.uplo, N, h_R, lda, &info ); magma_cpotri( opts.uplo, N, h_R, lda, &info ); lapackf77_clacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); } /* factorize matrix */ magma_cpotrf( opts.uplo, N, h_R, lda, &info ); // check for exact singularity //h_R[ 10 + 10*lda ] = MAGMA_C_MAKE( 0.0, 0.0 ); gpu_time = magma_wtime(); magma_cpotri( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_cpotri returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime(); lapackf77_cpotri( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cpotri returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); Anorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); error = lapackf77_clange("f", &N, &N, h_R, &lda, work) / Anorm; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cungqr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *hA, *hR, *tau, *h_work; magmaFloatComplex *dA, *dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("Running version %d; available are (specified through --version num):\n", (int) opts.version); printf("1 - uses precomputed clarft matrices (default)\n"); printf("2 - recomputes the clarft matrices on the fly\n\n"); printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; if ( m < n || n < k ) { printf( "%5d %5d %5d skipping because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_cgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_CUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork ); TESTING_MALLOC_PIN( hR, magmaFloatComplex, lda*n ); TESTING_MALLOC_CPU( hA, magmaFloatComplex, lda*n ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaFloatComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_clarnv( &ione, ISEED, &n2, hA ); lapackf77_clacpy( MagmaUpperLowerStr, &m, &n, hA, &lda, hR, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // first, get QR factors magma_csetmatrix( m, n, hA, lda, dA, ldda ); magma_cgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if (info != 0) printf("magma_cgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( m, n, dA, ldda, hR, lda ); gpu_time = magma_wtime(); if (opts.version == 1) magma_cungqr( m, n, k, hR, lda, tau, dT, nb, &info ); else magma_cungqr2(m, n, k, hR, lda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { error = lapackf77_clange("f", &m, &n, hA, &lda, work ); lapackf77_cgeqrf( &m, &n, hA, &lda, tau, h_work, &lwork, &info ); if (info != 0) printf("lapackf77_cgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); cpu_time = magma_wtime(); lapackf77_cungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_caxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_clange("f", &m, &n, hR, &lda, work) / error; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( hR ); TESTING_FREE_CPU( hA ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf=0., cublas_time=0., cpu_perf=0, cpu_time=0; float error; magma_int_t cublas_enable = 0; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *dA_magma; magmaFloatComplex **dA_array = NULL; magma_int_t **dipiv_array = NULL; magma_int_t *ipiv, *cpu_info; magma_int_t *dipiv_magma, *dinfo_magma; magma_int_t M, N, n2, lda, ldda, min_mn, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t batchCount; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); //opts.lapack |= opts.check; batchCount = opts.batchcount; magma_int_t columns; float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% BatchCount M N CPU Gflop/s (ms) MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("%%==========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N * batchCount; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_CGETRF( M, N ) / 1e9 * batchCount; TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn * batchCount ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( dA_magma, magmaFloatComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( dipiv_magma, magma_int_t, min_mn * batchCount ); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaFloatComplex*, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); // make A diagonally dominant, to not need pivoting for( int s=0; s < batchCount; ++s ) { for( int i=0; i < min_mn; ++i ) { h_A[ i + i*lda + s*lda*N ] = MAGMA_C_MAKE( MAGMA_C_REAL( h_A[ i + i*lda + s*lda*N ] ) + N, MAGMA_C_IMAG( h_A[ i + i*lda + s*lda*N ] )); } } columns = N * batchCount; lapackf77_clacpy( MagmaFullStr, &M, &columns, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, columns, h_R, lda, dA_magma, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_cset_pointer( dA_array, dA_magma, ldda, 0, 0, ldda*N, batchCount, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); info = magma_cgetrf_nopiv_batched( M, N, dA_array, ldda, dinfo_magma, batchCount, opts.queue); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1); for (int i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_cgetrf_batched matrix %d returned internal error %d\n", i, (int)cpu_info[i] ); } } if (info != 0) { printf("magma_cgetrf_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for (int i=0; i < batchCount; i++) { lapackf77_cgetrf(&M, &N, h_A + i*lda*N, &lda, ipiv + i * min_mn, &info); assert( info == 0 ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) M, (int) N, cpu_perf, cpu_time*1000., magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable ); } else { printf("%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) M, (int) N, magma_perf, magma_time*1000., cublas_perf*cublas_enable, cublas_time*1000.*cublas_enable ); } if ( opts.check ) { // initialize ipiv to 1, 2, 3, ... for (int i=0; i < batchCount; i++) { for (int k=0; k < min_mn; k++) { ipiv[i*min_mn+k] = k+1; } } magma_cgetmatrix( M, N*batchCount, dA_magma, ldda, h_A, lda ); error = 0; for (int i=0; i < batchCount; i++) { float err; err = get_LU_error( M, N, h_R + i * lda*N, lda, h_A + i * lda*N, ipiv + i * min_mn); if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf(" --- \n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( dA_magma ); TESTING_FREE_DEV( dinfo_magma ); TESTING_FREE_DEV( dipiv_magma ); TESTING_FREE_DEV( dipiv_array ); TESTING_FREE_DEV( dA_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cunmqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; magmaFloatComplex *C, *R, *A, *W, *tau; magmaFloatComplex *dC, *dA, *dT; magma_opts opts; parse_opts( argc, argv, &opts ); // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; nb = magma_get_cgeqrf_nb( m ); ldc = ((m + 31)/32)*32; lda = ((max(m,n) + 31)/32)*32; gflops = FLOPS_CUNMQR( m, n, k, *side[iside] ) / 1e9; if ( *side[iside] == 'L' && m < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=left and m < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'R' && n < k ) { printf( "%5d %5d %5d %-5s %-9s skipping because side=right and n < k\n", (int) m, (int) n, (int) k, side[iside], trans[itran] ); continue; } if ( *side[iside] == 'L' ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + ((k + 31)/32)*32 )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + ((k + 31)/32)*32 )*nb; } TESTING_MALLOC( C, magmaFloatComplex, ldc*n ); TESTING_MALLOC( R, magmaFloatComplex, ldc*n ); TESTING_MALLOC( A, magmaFloatComplex, lda*k ); TESTING_MALLOC( W, magmaFloatComplex, lwork_max ); TESTING_MALLOC( tau, magmaFloatComplex, k ); TESTING_DEVALLOC( dC, magmaFloatComplex, ldc*n ); TESTING_DEVALLOC( dA, magmaFloatComplex, lda*k ); TESTING_DEVALLOC( dT, magmaFloatComplex, dt_size ); // C is full, m x n size = ldc*n; lapackf77_clarnv( &ione, ISEED, &size, C ); magma_csetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) lda = (*side[iside] == 'L' ? m : n); size = lda*k; lapackf77_clarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_csetmatrix( lda, k, A, lda, dA, lda ); magma_cgeqrf_gpu( lda, k, dA, lda, tau, dT, &info ); magma_cgetmatrix( lda, k, dA, lda, A, lda ); if (info != 0) printf("magma_cgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cunmqr( side[iside], trans[itran], &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cunmqr returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for work size lwork = -1; magma_cunmqr_gpu( *side[iside], *trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); if (info != 0) printf("magma_cunmqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_C_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) printf("invalid lwork %d, lwork_max %d\n", (int) lwork, (int) lwork_max ); gpu_time = magma_sync_wtime( 0 ); // sync needed for L,N and R,T cases magma_cunmqr_gpu( *side[iside], *trans[itran], m, n, k, dA, lda, tau, dC, ldc, W, lwork, dT, nb, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cunmqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_cgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-5s %-9s %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) m, (int) n, (int) k, side[iside], trans[itran], cpu_perf, cpu_time, gpu_perf, gpu_time, error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( A ); TESTING_FREE( W ); TESTING_FREE( tau ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dT ); }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing clarfb_gpu */ int main( int argc, char** argv ) { TESTING_CUDA_INIT(); cuFloatComplex c_zero = MAGMA_C_ZERO; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; printf( "\nUsage: %s -M m -N n -K k\n\n", argv[0] ); magma_int_t m = 500; magma_int_t n = 300; magma_int_t k = 32; for( int i = 1; i < argc; i++ ) { if (strcmp("-M", argv[i]) == 0 && i+1 < argc) { m = atoi( argv[++i] ); } else if (strcmp("-N", argv[i]) == 0 && i+1 < argc) { n = atoi( argv[++i] ); } else if (strcmp("-K", argv[i]) == 0 && i+1 < argc) { k = atoi( argv[++i] ); } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( k <= 0 || k > m || k > n ) { printf( "requires 0 < k <= min(m,n)\n" ); exit(1); } magma_int_t ldc = m; magma_int_t ldv = max(m,n); magma_int_t ldt = k; magma_int_t ldw = max(m,n); magma_int_t nv; ldc = ((ldc+31)/32)*32; ldv = ((ldv+31)/32)*32; ldt = ((ldt+31)/32)*32; ldw = ((ldw+31)/32)*32; // Allocate memory for matrices cuFloatComplex *C, *R, *V, *T, *W; TESTING_MALLOC( C, cuFloatComplex, ldc*n ); TESTING_MALLOC( R, cuFloatComplex, ldc*n ); TESTING_MALLOC( V, cuFloatComplex, ldv*k ); TESTING_MALLOC( T, cuFloatComplex, ldt*k ); TESTING_MALLOC( W, cuFloatComplex, ldw*k ); cuFloatComplex *dC, *dV, *dT, *dW; TESTING_DEVALLOC( dC, cuFloatComplex, ldc*n ); TESTING_DEVALLOC( dV, cuFloatComplex, ldv*k ); TESTING_DEVALLOC( dT, cuFloatComplex, ldt*k ); TESTING_DEVALLOC( dW, cuFloatComplex, ldw*k ); magma_int_t size; magma_int_t iseed[4] = { 1, 2, 3, 4 }; float error, work[1]; // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaConjTransStr, MagmaNoTransStr }; const char* direct[] = { MagmaForwardStr, MagmaBackwardStr }; const char* storev[] = { MagmaColumnwiseStr, MagmaRowwiseStr }; printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("==================================================================================\n"); for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { //printf( "# ----------\n" ); //printf( "# %-10s %-10s %-10s %-10s\n", storev[istor], side[iside], direct[idir], trans[itran] ); // C is full size = ldc*n; lapackf77_clarnv( &ione, iseed, &size, C ); //printf( "C=" ); magma_cprint( m, n, C, ldc ); // V is ldv x nv. See larfb docs for description. ldv = (*side[iside] == 'L' ? m : n); nv = k; size = ldv*nv; lapackf77_clarnv( &ione, iseed, &size, V ); if ( *storev[istor] == MagmaColumnwise ) { if ( *direct[idir] == MagmaForward ) { lapackf77_claset( MagmaUpperStr, &k, &k, &c_zero, &c_one, V, &ldv ); } else { lapackf77_claset( MagmaLowerStr, &k, &k, &c_zero, &c_one, &V[(ldv-k)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( *direct[idir] == MagmaForward ) { lapackf77_claset( MagmaLowerStr, &k, &k, &c_zero, &c_one, V, &ldv ); } else { lapackf77_claset( MagmaUpperStr, &k, &k, &c_zero, &c_one, &V[(nv-k)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_cprint( ldv, nv, V, ldv ); // T is upper triangular for forward, and lower triangular for backward magma_int_t k1 = k-1; size = ldt*k; lapackf77_clarnv( &ione, iseed, &size, T ); if ( *direct[idir] == MagmaForward ) { lapackf77_claset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_claset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_cprint( k, k, T, ldt ); magma_csetmatrix( m, n, C, ldc, dC, ldc ); magma_csetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_csetmatrix( k, k, T, ldt, dT, ldt ); lapackf77_clarfb( side[iside], trans[itran], direct[idir], storev[istor], &m, &n, &k, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_cprint( m, n, C, ldc ); magma_clarfb_gpu( *side[iside], *trans[itran], *direct[idir], *storev[istor], m, n, k, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_cgetmatrix( m, n, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_cprint( m, n, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_clange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_caxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_clange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-10s %-10s %-10s %-10s %8.2e\n", (int) m, (int) n, (int) k, storev[istor], side[iside], direct[idir], trans[itran], error ); }}}} // Memory clean up TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( V ); TESTING_FREE( T ); TESTING_FREE( W ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dV ); TESTING_DEVFREE( dT ); TESTING_DEVFREE( dW ); // Shutdown TESTING_CUDA_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float cublas_error, normA, normx, normr, work[1]; magma_int_t N, info; magma_int_t sizeA; magma_int_t lda, ldda; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_b, *h_x, *h_xcublas; magmaFloatComplex *d_A, *d_x; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %c, transA = %c, diag = %c\n", opts.uplo, opts.transA, opts.diag ); printf(" N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("============================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; gflops = FLOPS_CTRSM(opts.side, N, 1) / 1e9; lda = N; ldda = ((lda+31)/32)*32; sizeA = lda*N; TESTING_MALLOC( ipiv, magma_int_t, N ); TESTING_MALLOC( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC( h_b, magmaFloatComplex, N ); TESTING_MALLOC( h_x, magmaFloatComplex, N ); TESTING_MALLOC( h_xcublas, magmaFloatComplex, N ); TESTING_DEVALLOC( d_A, magmaFloatComplex, ldda*N ); TESTING_DEVALLOC( d_x, magmaFloatComplex, N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &N, &N, h_A, &lda, ipiv, &info ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &N, h_b ); blasf77_ccopy( &N, h_b, &ione, h_x, &ione ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_csetmatrix( N, N, h_A, lda, d_A, ldda ); magma_csetvector( N, h_x, 1, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsv( opts.uplo, opts.transA, opts.diag, N, d_A, ldda, d_x, 1 ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( N, d_x, 1, h_xcublas, 1 ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_x, &ione ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) // error for CUBLAS normA = lapackf77_clange( "F", &N, &N, h_A, &lda, work ); normx = lapackf77_clange( "F", &N, &ione, h_xcublas, &ione, work ); blasf77_ctrmv( &opts.uplo, &opts.transA, &opts.diag, &N, h_A, &lda, h_xcublas, &ione ); blasf77_caxpy( &N, &c_neg_one, h_b, &ione, h_xcublas, &ione ); normr = lapackf77_clange( "F", &N, &ione, h_xcublas, &N, work ); cublas_error = normr / (normA*normx); if ( opts.lapack ) { printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error ); } else { printf("%5d %7.2f (%7.2f) --- ( --- ) %8.2e\n", (int) N, cublas_perf, 1000.*cublas_time, cublas_error ); } TESTING_FREE( h_A ); TESTING_FREE( h_x ); TESTING_FREE( h_xcublas ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_x ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_x, *h_x2, *h_tau, *h_tau2; magmaFloatComplex *d_x, *d_tau; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float error, error2, work[1]; magma_int_t N, nb, lda, ldda, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); // does larfg on nb columns, one after another nb = (opts.nb > 0 ? opts.nb : 64); magma_queue_t queue = 0; printf(" N nb CPU GFLop/s (ms) GPU GFlop/s (ms) error tau error\n"); printf("==========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; gflops = FLOPS_CLARFG( N ) / 1e9 * nb; TESTING_MALLOC_CPU( h_x, magmaFloatComplex, N*nb ); TESTING_MALLOC_CPU( h_x2, magmaFloatComplex, N*nb ); TESTING_MALLOC_CPU( h_tau, magmaFloatComplex, nb ); TESTING_MALLOC_CPU( h_tau2, magmaFloatComplex, nb ); TESTING_MALLOC_DEV( d_x, magmaFloatComplex, ldda*nb ); TESTING_MALLOC_DEV( d_tau, magmaFloatComplex, nb ); /* Initialize the vectors */ size = N*nb; lapackf77_clarnv( &ione, ISEED, &size, h_x ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( N, nb, h_x, N, d_x, ldda ); gpu_time = magma_sync_wtime( queue ); for( int j = 0; j < nb; ++j ) { magmablas_clarfg( N, &d_x[0+j*ldda], &d_x[1+j*ldda], ione, &d_tau[j] ); } gpu_time = magma_sync_wtime( queue ) - gpu_time; gpu_perf = gflops / gpu_time; magma_cgetmatrix( N, nb, d_x, ldda, h_x2, N ); magma_cgetvector( nb, d_tau, 1, h_tau2, 1 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int j = 0; j < nb; ++j ) { lapackf77_clarfg( &N, &h_x[0+j*lda], &h_x[1+j*lda], &ione, &h_tau[j] ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Error Computation and Performance Comparison =================================================================== */ blasf77_caxpy( &size, &c_neg_one, h_x, &ione, h_x2, &ione ); error = lapackf77_clange( "F", &N, &nb, h_x2, &N, work ) / lapackf77_clange( "F", &N, &nb, h_x, &N, work ); // tau can be 0 blasf77_caxpy( &nb, &c_neg_one, h_tau, &ione, h_tau2, &ione ); error2 = lapackf77_clange( "F", &nb, &ione, h_tau, &nb, work ); if ( error2 != 0 ) { error2 = lapackf77_clange( "F", &nb, &ione, h_tau2, &nb, work ) / error2; } printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) N, (int) nb, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, error2, (error < tol && error2 < tol ? "ok" : "failed") ); status += ! (error < tol && error2 < tol); TESTING_FREE_CPU( h_x ); TESTING_FREE_CPU( h_x2 ); TESTING_FREE_CPU( h_tau ); TESTING_FREE_DEV( d_x ); TESTING_FREE_DEV( d_tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }