// init a zero matrix on GPU to store X'*X, add normalization GpuSolver::GpuSolver ( int nn, int outs, magmaDouble_ptr A, magmaDouble_ptr B ) { magma_int_t err; magma_int_t num = 0; magma_device_t device; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != MAGMA_SUCCESS or num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != MAGMA_SUCCESS ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } n = nn; nrhs = outs; magma_dmalloc( &dA, n*n ); magma_dmalloc( &dB, n*nrhs ); if ( dA == NULL || dB == NULL ) { fprintf( stderr, "malloc failed - not enough GPU memory?\n" ); } magma_dsetmatrix( n, n, A, 0, n, dA, 0, n, queue ); magma_dsetmatrix( n, nrhs, B, 0, n, dB, 0, n, queue ); };
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double matnorm, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, flag = 0, n2, check = 0; magma_int_t n_local[MagmaMaxGPUs*MagmaMaxSubs], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, info, min_mn, nb; int num_gpus = 1, num_subs = 1, tot_subs = 1; magma_int_t ione = 1; M = N = size[9]; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); flag = 1; } else if (strcmp("-M", argv[i])==0) { M = atoi(argv[++i]); flag = 1; } else if (strcmp("-NGPU", argv[i])==0) { num_gpus = atoi(argv[++i]); } else if (strcmp("-NSUB", argv[i])==0) { num_subs = atoi(argv[++i]); } else if (strcmp("-check", argv[i])==0) { check = 1; } } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } } if (num_gpus > MagmaMaxGPUs){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = MagmaMaxGPUs; } if (num_subs > MagmaMaxSubs) { printf("More buffers requested than available. Have to change it.\n"); num_subs = MagmaMaxSubs; } tot_subs = num_gpus * num_subs; printf("\nNumber of GPUs to be used = %d\n", (int) num_gpus); printf("Usage: \n"); printf(" testing_zgeqrf_msub -M %d -N %d -NGPU %d -NSUB %d %s\n\n", M, N, num_gpus, num_subs, (check == 1 ? "-check" : " ")); /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for (i=0; i<num_gpus; i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf( "\n" ); printf(" M N CPU GFlop/s (sec.) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("==========================================================================\n"); for(i=0; i<10; i++){ if (flag == 0) { M = N = size[i]; } nb = magma_get_zgeqrf_nb(M); min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_ZGEQRF( (double)M, (double)N ) / 1e9; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); /* Allocate host workspace */ lhwork = -1; lapackf77_zgeqrf(&M, &N, h_R, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lhwork ); /* Allocate device memory for the matrix */ for (int j=0; j<tot_subs; j++) { n_local[j] = ((N/nb)/tot_subs)*nb; if (j < (N/nb)%tot_subs) n_local[j] += nb; else if (j == (N/nb)%tot_subs) n_local[j] += N%nb; TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*n_local[j] ); } /* Initialize the matrix */ init_matrix( M, N, h_R, lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_queue_t *trans_queues = (magma_queue_t*)malloc(tot_subs*sizeof(magma_queue_t)); for (int j=0; j<tot_subs; j++) { trans_queues[j] = queues[2*(j%num_gpus)]; } // warm-up magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues); magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues); gpu_time = magma_wtime(); magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info < 0) printf("Argument %d of magma_zgeqrf_msub had an illegal value.\n", (int) -info); if (check == 1) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_zgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, tot_subs, nb, trans_queues); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); init_matrix( M, N, h_A, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapack_zgeqrf had an illegal value.\n", (int) -info); matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); TESTING_FREE_PIN( h_A ); } else { printf("%5d %5d -- ( -- ) %6.2f (%6.2f) --\n", (int) M, (int) N, gpu_perf, gpu_time ); } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for (int j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } for (i=0; i<num_gpus; i++) { magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *hA, *hR; magmaDoubleComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zpotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( hR, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_Z_SET2REAL( hA(i,i), MAGMA_Z_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_Z_CNJG( hA(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = get_time(); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = get_time() - gpu_time; if (info != 0) printf( "magma_zpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zpotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_zgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work); blasf77_zaxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE( hA ); TESTING_FREE_HOST( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
extern "C" magma_int_t magma_ztrtri_gpu(magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *info) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2012 Purpose ======= ZTRTRI computes the inverse of a real upper or lower triangular matrix dA. This is the Level 3 BLAS version of the algorithm. Arguments ========= UPLO (input) CHARACTER*1 = 'U': A is upper triangular; = 'L': A is lower triangular. DIAG (input) CHARACTER*1 = 'N': A is non-unit triangular; = 'U': A is unit triangular. N (input) INTEGER The order of the matrix A. N >= 0. dA (input/output) DOUBLE PRECISION array ON THE GPU, dimension (LDDA,N) On entry, the triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of the array dA contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of the array dA contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, dA(i,i) is exactly zero. The triangular matrix is singular and its inverse can not be computed. ===================================================================== */ /* Local variables */ magma_uplo_t uplo_ = uplo; magma_diag_t diag_ = diag; magma_int_t nb, nn, j, jb; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *work; int upper = lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaUpper)); int nounit = lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaNonUnit)); *info = 0; if ((! upper) && (! lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaLower)))) *info = -1; else if ((! nounit) && (! lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaUnit)))) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_zpotrf_nb(n); /* Create Queues */ magma_queue_t queues[2]; magma_device_t device; int num = 0; magma_err_t err; err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queues[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create 0 failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queues[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create 1 failed: %d\n", err ); exit(-1); } if (MAGMA_SUCCESS != magma_malloc_host( (void**)&work, nb*nb*sizeof(magmaDoubleComplex) )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (nb <= 1 || nb >= n) { magma_zgetmatrix( n, n, dA, dA_offset, ldda, work, 0, n, queues[0] ); lapackf77_ztrtri(lapack_const(uplo_), lapack_const(diag_), &n, work, &n, info); magma_zsetmatrix( n, n, work, 0, n, dA, dA_offset, ldda, queues[0] ); } else { if (upper){ /* Compute inverse of upper triangular matrix */ for (j=0; j<n; j =j+ nb){ jb = min(nb, (n-j)); /* Compute rows 1:j-1 of current block column */ magma_ztrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j), ldda, queues[0]); magma_ztrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j), ldda, queues[0]); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, 0, jb, queues[1], NULL ); magma_queue_sync( queues[1] ); /* Compute inverse of current diagonal block */ lapackf77_ztrtri(MagmaUpperStr, lapack_const(diag_), &jb, work, &jb, info); /* magma_zsetmatrix_async( jb, jb, work, 0, jb, dA(j, j), ldda, queues[0], NULL ); */ magma_zsetmatrix( jb, jb, work, 0, jb, dA(j, j), ldda, queues[0]); } } else{ /* Compute inverse of lower triangular matrix */ nn=((n-1)/nb)*nb+1; for(j=nn-1; j>=0; j=j-nb){ jb=min(nb,(n-j)); if((j+jb) < n){ /* Compute rows j+jb:n of current block column */ magma_ztrmm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda, queues[0]); magma_ztrsm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda, queues[0]); } magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, 0, jb, queues[1], NULL ); magma_queue_sync( queues[1] ); /* Compute inverse of current diagonal block */ lapackf77_ztrtri(MagmaLowerStr, lapack_const(diag_), &jb, work, &jb, info); /* magma_zsetmatrix_async( jb, jb, work, 0, jb, dA(j, j), ldda, queues[0], NULL ); */ magma_zsetmatrix( jb, jb, work, 0, jb, dA(j, j), ldda, queues[0] ); } } } magma_free_host( work ); magma_queue_destroy(queues[0]); magma_queue_destroy(queues[1]); return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv */ int main(int argc, char **argv) { real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_LU, *h_B, *h_X; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( device[ opts.device ], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[ opts.device ], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; ldb = lda; gflops = ( FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_LU, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); // copy A to LU and B to X; save A and B for residual lapackf77_zlacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_zlacpy( "F", &N, &nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgesv( N, nrhs, h_LU, lda, ipiv, h_X, ldb, &info, queue ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== Anorm = lapackf77_zlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status |= ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_LU ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_cgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A, dwork; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 }; magma_int_t ntest = 10; magma_int_t i, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0, 0, 0, 1}; magmaDoubleComplex *work; magmaDoubleComplex tmp; double rwork[1]; magma_int_t *ipiv; magma_int_t lwork, ldwork; double A_norm, R_norm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[ntest-1] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgetri_gpu -N %d\n\n", 1024); } /* query for Lapack workspace size */ N = size[ntest-1]; lda = N; work = &tmp; lwork = -1; lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); lwork = int( MAGMA_Z_REAL( *work )); /* query for Magma workspace size */ ldwork = N * magma_get_zgetri_nb( N ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory */ n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for( i=0; i < ntest; i++ ){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZGETRI( (double)N ) / 1e9; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue ); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //warm-up magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); gpu_time = magma_wtime()-gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d\n", (int) info); gpu_perf = gflops / gpu_time; magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, 0, lda, queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); R_norm = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ); printf( "%5d %6.2f %6.2f %e\n", (int) N, cpu_perf, gpu_perf, R_norm / A_norm ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaDouble_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, lda*N ); TESTING_MALLOC_PIN( h_A2, double, lda*N ); TESTING_MALLOC_PIN( h_B, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, double, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, double, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol, eps = lapackf77_slamch("E"); tol = opts.tolerance * eps; opts.lapack |= ( opts.check == 2 ); // check (-c2) implies lapack (-l) err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("===============================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_sgeqrf_nb(M); gflops = FLOPS_SGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lwork = max( lwork, max( N*nb, 2*nb*nb )); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ float *tau; TESTING_MALLOC_CPU( tau, float, min_mn ); cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; float *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, float, n2 ); // Q TESTING_MALLOC_CPU( h_W2, float, n2 ); // R TESTING_MALLOC_CPU( h_W3, float, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_slarnv( &ione, ISEED2, &n2, h_A ); lapackf77_sqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_slange("f", &M, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv_gpu */ int main(int argc , char **argv) { real_Double_t gflops, gpu_perf, gpu_time; double Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex *hA, *hB, *hX; magmaDoubleComplex_ptr dA, dB; magma_int_t *ipiv; magma_int_t N = 0, n2, lda, ldb, ldda, lddb; magma_int_t size[7] = { 1024, 2048, 3072, 4032, 5184, 6048, 7000}; magma_int_t i, info, szeB; magmaDoubleComplex z_one = MAGMA_Z_ONE; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t NRHS = 100; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-R", argv[i])==0) NRHS = atoi(argv[++i]); } if (N>0) size[0] = size[6] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgesv_gpu -N <matrix size> -R <right hand sides>\n\n"); } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[6]; n2 = N * N; ldda = ((N+31)/32) * 32; // ldda = N; lddb = ldda; TESTING_MALLOC_PIN( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( hB, magmaDoubleComplex, N*NRHS ); TESTING_MALLOC_PIN( hX, magmaDoubleComplex, N*NRHS ); TESTING_MALLOC_PIN( work, double, N ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB, magmaDoubleComplex, lddb*NRHS ); printf("\n\n"); printf(" N NRHS GPU GFlop/s (sec) ||B - AX|| / ||A||*||X||\n"); printf("===========================================================\n"); for( i = 0; i < 7; i++ ) { N = size[i]; lda = N; ldb = lda; n2 = lda*N; szeB = ldb*NRHS; ldda = ((N+31)/32)*32; //ldda = N; lddb = ldda; gflops = ( FLOPS_GETRF( (double)N, (double)N ) + FLOPS_GETRS( (double)N, (double)NRHS ) ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlarnv( &ione, ISEED, &szeB, hB ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue ); magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue ); //===================================================================== // Solve Ax = b through an LU factorization //===================================================================== magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue ); gpu_time = magma_wtime(); magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_zposv had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Residual =================================================================== */ magma_zgetmatrix( N, NRHS, dB, 0, lddb, hX, 0, ldb, queue ); Anorm = lapackf77_zlange("I", &N, &N, hA, &lda, work); Xnorm = lapackf77_zlange("I", &N, &NRHS, hX, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &NRHS, &N, &z_one, hA, &lda, hX, &ldb, &mz_one, hB, &ldb ); Rnorm = lapackf77_zlange("I", &N, &NRHS, hB, &ldb, work); printf( "%5d %5d %7.2f (%7.2f) %8.2e\n", N, NRHS, gpu_perf, gpu_time, Rnorm/(Anorm*Xnorm) ); if (argc != 1) break; } /* clean up */ TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( hB ); TESTING_FREE_PIN( hX ); TESTING_FREE_PIN( work ); TESTING_FREE_PIN( ipiv ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dB ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A; magma_int_t *ipiv; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7104,8064,9000,10000}; #endif magma_int_t i, info, min_mn; //magma_int_t nb, maxn, ret; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_zgetrf -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_zgetrf -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgetrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } ldda = ((M+31)/32)*32; //maxn = ((N+31)/32)*32; n2 = M * N; min_mn = min(M, N); //nb = magma_get_zgetrf_nb(min_mn); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" M N CPU GFlop/ (sec)s GPU GFlop/s (sec) ||PA-LU||/(||A||*N)\n"); printf("========================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (double)M, (double)N ) *1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of zgetrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of zgetrf had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ magma_zgetmatrix( M, N, d_A, 0, ldda, h_A, 0, lda, queue ); error = get_LU_error(M, N, h_R, lda, h_A, ipiv); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, error); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaFloatComplex *h_R = NULL, *h_P = NULL; magmaFloatComplex_ptr d_lA[MagmaMaxSubs * MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, check = 0, info; magmaFloatComplex mz_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t num_gpus0 = 1, num_gpus, num_subs0 = 1, num_subs, tot_subs, flag = 0; int nb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i]) == 0){ N = atoi(argv[++i]); if (N > 0) { size[0] = size[9] = N; flag = 1; } } if(strcmp("-NGPU", argv[i]) == 0) num_gpus0 = atoi(argv[++i]); if(strcmp("-NSUB", argv[i]) == 0) num_subs0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i]) == 0) uplo = (strcmp("L", argv[++i]) == 0 ? MagmaLower : MagmaUpper); if(strcmp("-check", argv[i]) == 0) check = 1; } } /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus0;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("\nUsing %d GPUs:\n", num_gpus0); printf(" testing_cpotrf_msub -N %d -NGPU %d -NSUB %d -UPLO %c %s\n\n", size[0], num_gpus0,num_subs0, (uplo == MagmaLower ? 'L' : 'U'),(check == 1 ? "-check" : " ")); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9;; nb = magma_get_cpotrf_nb(N); if (num_subs0*num_gpus0 > N/nb) { num_gpus = N/nb; num_subs = 1; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); } else { num_gpus = num_gpus0; num_subs = num_subs0; } tot_subs = num_subs * num_gpus; /* Allocate host memory for the matrix */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer1 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, n2*sizeof(magmaFloatComplex), NULL, NULL); cl_mem buffer2 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, lda*nb*sizeof(magmaFloatComplex), NULL, NULL); for (k=0; k<num_gpus; k++) { h_R = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, n2*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); h_P = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lda*nb*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); } #else TESTING_MALLOC_PIN( h_P, magmaFloatComplex, lda*nb ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); #endif /* Initialize the matrix */ init_matrix( N, h_R, lda ); /* Allocate GPU memory */ if (uplo == MagmaUpper) { ldda = ((N+nb-1)/nb)*nb; n_local = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; } else { ldda = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; n_local = ((N+nb-1)/nb)*nb; } for (j=0; j<tot_subs; j++) { TESTING_MALLOC_DEV( d_lA[j], magmaFloatComplex, n_local*ldda ); } /* Warm up to measure the performance */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(j+nk, nk, &h_R[j*lda], 0, lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { int mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_csetmatrix(mk, nk, h_P, 0, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda, d_lA[k], j/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); }*/ } magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(j+nk, nk, &h_R[j*lda], 0, lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { int mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_csetmatrix(mk, nk, h_P, 0, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda, d_lA[k], (j/(nb*tot_subs)*nb), ldda, queues[2*(k%num_gpus)]); }*/ } gpu_time = magma_wtime(); magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf( "magma_cpotrf had error %d.\n", info ); /* gather matrix from gpus */ if (uplo==MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_cgetmatrix(j+nk, nk, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { k = ((j+kk*nb)/nb)%tot_subs; int mk = 0; mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { mk += min(nb, N-ii); } if (mk > 0 && nk > 0) { magma_cgetmatrix(mk, nk, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, h_P, 0, lda, queues[2*(k%num_gpus)]); } mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_P[mk], &lda, &h_R[ii+j*lda], &lda ); mk += mii; } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_cgetmatrix( nk, j+nk, d_lA[k], (j/(nb*tot_subs)*nb), ldda, &h_R[j], 0, lda, queues[2*(k%num_gpus)] ); }*/ } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if (check == 1) { float work[1], matnorm, diffnorm; magmaFloatComplex *h_A; TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); init_matrix( N, h_A, lda ); cpu_time = magma_wtime(); if (uplo == MagmaLower) { lapackf77_cpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); } else { lapackf77_cpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf( "lapackf77_cpotrf had error %d.\n", info ); /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_clange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); TESTING_FREE_PIN( h_A ); } else { printf( "%5d - - (- -) %6.2f (%6.2f) - -\n", N, gpu_perf, gpu_time ); } // free memory #ifdef USE_PINNED_CLMEMORY for (k=0; k<num_gpus; k++) { clEnqueueUnmapMemObject(queues[2*k], buffer1, h_R, 0, NULL, NULL); clEnqueueUnmapMemObject(queues[2*k], buffer2, h_P, 0, NULL, NULL); } clReleaseMemObject(buffer1); clReleaseMemObject(buffer2); #else TESTING_FREE_PIN( h_P ); TESTING_FREE_PIN( h_R ); #endif for (j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } /* clean up */ for (i=0; i<num_gpus; i++) { magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); return 0; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float error, work[1]; int transA = MagmaNoTrans; int transB = MagmaNoTrans; float Cnorm; magma_int_t istart = 1024; magma_int_t iend = 8194; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_C, *h_C2; magmaFloatComplex_ptr d_A, d_B, d_C; magmaFloatComplex mzone = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-N", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaConjTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaConjTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = lddc = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; K+=32; M+=32; N +=32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*K ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_C2, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*K ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); printf("\nUsage: \n"); printf(" testing_cgemm [-NN|NT|TN|TT] [-N %d] \n\n", 1024); printf("\n"); printf("Testing transA = %c transB = %c\n", transA, transB); printf(" M N K clAmdBlas GFLop/s (sec) CPU GFlop/s (sec) error\n"); printf("===========================================================================\n"); for(i=istart; i<iend; i = (int)(i*1.25) ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS( (float)M, (float)N, (float)K ) * 1e-9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &szeA, h_A ); lapackf77_clarnv( &ione, ISEED, &szeB, h_B ); lapackf77_clarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_csetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue ); magma_csetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync( queue ); gpu_time = magma_wtime(); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync( queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; magma_cgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue ); /* ===================================================================== Performs operation using CPU-BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_cgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &M, &N, h_C, &ldc, work ); /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ blasf77_caxpy(&szeC, &mzone, h_C, &ione, h_C2, &ione); error = lapackf77_clange("M", &M, &N, h_C2, &ldc, work)/Cnorm; printf("%5d %5d %5d %8.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, K, gpu_perf, gpu_time, cpu_perf, cpu_time, error); } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; double *w1i, *w2i; double c_neg_one = MAGMA_D_NEG_ONE; double matnorm, tnrm, result[8]; /* Matrix size */ magma_int_t N=0, n2, lda, nb, lwork; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, j, info, checkres, once = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_vec_t jobl = MagmaVec; magma_vec_t jobr = MagmaVec; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = 1; } else if (strcmp("-LN", argv[i])==0) jobl = MagmaNoVec; else if (strcmp("-LV", argv[i])==0) jobl = MagmaVec; else if (strcmp("-RN", argv[i])==0) jobr = MagmaNoVec; else if (strcmp("-RV", argv[i])==0) jobr = MagmaVec; } if ( N > 0 ) printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; lda = N; n2 = lda * N; nb = magma_get_dgehrd_nb(N); lwork = N*(2+nb); // generous workspace - required by dget22 lwork = max(lwork, N * ( 5 + 2*N)); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( w1i, double, N ); TESTING_MALLOC_CPU( w2i, double, N ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( VL, double, n2 ); TESTING_MALLOC_PIN( VR, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf(" N CPU Time(s) GPU Time(s) ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<8; i++){ if ( argc == 1 ){ N = size[i]; } lda = N; n2 = lda*N; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up magma_dgeev(jobl, jobr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); gpu_time = magma_wtime(); magma_dgeev(jobl, jobr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgeev had an illegal value.\n", (int) -info); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeev(lapack_const(jobl), lapack_const(jobr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of dgeev had an illegal value.\n", (int) -info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ if ( checkres ) { /* =================================================================== Check the result following LAPACK's [zcds]drvev routine. The following 7 tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * ugate-transpose of A, and W is as above. * * (3) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (6) VR(full) = VR(partial) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (7) VL(full) = VL(partial) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. ================================================================= */ int jj; double ulp, ulpinv, vmx, vrmx, vtst, res[2]; double *LRE, DUM; TESTING_MALLOC_PIN( LRE, double, n2 ); ulp = lapackf77_dlamch( "P" ); ulpinv = 1./ulp; // Initialize RESULT for (j = 0; j < 8; j++) result[j] = -1.; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); // Do test 1 lapackf77_dget22("N", "N", "N", &N, h_A, &lda, VR, &lda, w1, w1i, h_work, res); result[0] = res[0]; result[0] *= ulp; // Do test 2 lapackf77_dget22("T", "N", "T", &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[1]); result[1] *= ulp; // Do test 3 result[2] = -1.; for (j = 0; j < N; ++j) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_dnrm2(N, &VR[j * lda], ione); else if (w1i[j] > 0.) tnrm = magma_dlapy2( cblas_dnrm2(N, &VR[j * lda], ione), cblas_dnrm2(N, &VR[(j+1)* lda], ione) ); result[2] = fmax(result[2], fmin(ulpinv, magma_abs(tnrm-1.)/ulp)); if (w1i[j] > 0.) { vmx = vrmx = 0.; for (jj = 0; jj <N; ++jj) { vtst = magma_dlapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && magma_abs( VR[jj+j*lda] ) > vrmx) vrmx = magma_abs( VR[jj+j*lda] ); } if (vrmx / vmx < 1. - ulp * 2.) result[2] = ulpinv; } } result[2] *= ulp; // Do test 4 result[3] = -1.; for (j = 0; j < N; ++j) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_dnrm2(N, &VL[j * lda], ione); else if (w1i[j] > 0.) tnrm = magma_dlapy2( cblas_dnrm2(N, &VL[j * lda], ione), cblas_dnrm2(N, &VL[(j+1)* lda], ione) ); result[3] = fmax(result[3], fmin(ulpinv, magma_abs(tnrm-1.)/ulp)); if (w1i[j] > 0.) { vmx = vrmx = 0.; for (jj = 0; jj <N; ++jj) { vtst = magma_dlapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && magma_abs( VL[jj+j*lda]) > vrmx) vrmx = magma_abs( VL[jj+j*lda] ); } if (vrmx / vmx < 1. - ulp * 2.) result[3] = ulpinv; } } result[3] *= ulp; // Compute eigenvalues only, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaNoVec, MagmaNoVec, N, h_R, lda, w2, w2i, &DUM, 1, &DUM, 1, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case N, N\n", (int) info); } // Do test 5 result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with N N\n"); // Compute eigenvalues and right eigenvectors, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case N, V\n", (int) info); } // Do test 5 again result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with N V\n"); // Do test 6 result[5] = 1; for (j = 0; j < N; ++j) for (jj = 0; jj < N; ++jj) if ( VR[j+jj*lda] != LRE[j+jj*lda] ) result[5] = 0; // Compute eigenvalues and left eigenvectors, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case V, N\n", (int) info); } // Do test 5 again result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with V N\n"); // Do test 7 result[6] = 1; for (j = 0; j < N; ++j) for (jj = 0; jj < N; ++jj) if ( VL[j+jj*lda] != LRE[j+jj*lda] ) result[6] = 0; printf("Test 1: | A * VR - VR * W | / ( n |A| ) = %e\n", result[0]); printf("Test 2: | A'* VL - VL * W'| / ( n |A| ) = %e\n", result[1]); printf("Test 3: | |VR(i)| - 1 | = %e\n", result[2]); printf("Test 4: | |VL(i)| - 1 | = %e\n", result[3]); printf("Test 5: W (full) == W (partial) = %f\n", result[4]); printf("Test 6: VR (full) == VR (partial) = %f\n", result[5]); printf("Test 7: VL (full) == VL (partial) = %f\n", result[6]); //==================================================================== matnorm = lapackf77_dlange("f", &N, &ione, w1, &N, h_work); blasf77_daxpy(&N, &c_neg_one, w1, &ione, w2, &ione); result[7] = lapackf77_dlange("f", &N, &ione, w2, &N, h_work) / matnorm; printf("%5d %6.2f %6.2f %e\n", (int) N, cpu_time, gpu_time, result[7]); TESTING_FREE_PIN( LRE ); } else { printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, float, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_sgetrf( M, N, h_A, lda, ipiv, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float matnorm, work[1]; float mzone = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_sgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_slange("f", &M, &N, h_A, &M, work); blasf77_saxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_slange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *h_A, *h_R; magmaFloat_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_spotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_spotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], float, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_S_SET2REAL( h_A(i,i), MAGMA_S_REAL(h_A(i,i)) + N ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_S_CNJG( h_A(j,i) ); } } lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_spotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix(N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], 0, lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_spotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_spotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgehrd2 */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; //*h_R1 is used for warm-up magmaDoubleComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *h_R1; magmaDoubleComplex_ptr dT; double *rwork; double result[2] = {0., 0.}; double eps; int checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; /* Matrix size */ int N=0, n2, lda, nb, lwork, ltwork, once = 0; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,9900}; #endif int i, info; int ione = 1; int ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if ( N > 0 ){ printf(" testing_zgehrd -N %d\n\n", N); once = 1; } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } eps = lapackf77_dlamch( "E" ); lda = N; n2 = N*lda; nb = magma_get_zgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; TESTING_MALLOC_HOST( h_A , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( tau , magmaDoubleComplex, N ); TESTING_MALLOC_HOST( h_R , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_R1 , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV ( dT , magmaDoubleComplex, nb*N ); /* To avoid uninitialized variable warning */ h_Q = NULL; twork = NULL; rwork = NULL; if ( checkres ) { ltwork = 2*(N*N); TESTING_MALLOC_HOST( h_Q, magmaDoubleComplex, lda*N ); TESTING_MALLOC_HOST( twork, magmaDoubleComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_HOST( rwork, double, N ); #endif } printf("\n\n"); printf(" N CPU GFlop/s GPU GFlop/s |A-QHQ'|/N|A| |I-QQ'|/N \n"); printf("=============================================================\n"); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = lda*N; gflops = FLOPS( (double)N ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R1, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zgehrd ( N, ione, N, h_R1, lda, tau, h_work, lwork, dT, 0, &info, queue); if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); clFinish(queue); gpu_time = get_time(); magma_zgehrd ( N, ione, N, h_R, lda, tau, h_work, lwork, dT, 0, &info, queue); gpu_time = get_time() - gpu_time; if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ if ( checkres ) { lapackf77_zlacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); { int i, j; for(j=0; j<N-1; j++) for(i=j+2; i<lda; i++) h_R[i+j*lda] = MAGMA_Z_ZERO; } nb = magma_get_zgehrd_nb(N); magma_zunghr(N, ione, N, h_Q, lda, tau, dT, 0, nb, &info, queue); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zgehrd(&N, &ione, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = get_time() - cpu_time; if (info < 0) printf("Argument %d of lapack_zgehrd had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ===================================================================== Print performance and error. =================================================================== */ if ( checkres ) { printf("%5d %6.2f %6.2f %e %e\n", N, cpu_perf, gpu_perf, result[0]*eps, result[1]*eps ); } else { printf("%5d %6.2f %6.2f\n", N, cpu_perf, gpu_perf ); } if ( once ) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_FREE_HOST( h_work); TESTING_FREE_HOST( h_R ); TESTING_FREE_HOST( h_R1 ); TESTING_FREE_DEV ( dT ); if ( checkres ) { TESTING_FREE_HOST( h_Q ); TESTING_FREE( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE( rwork ); #endif } /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing claswp */ int main( int argc, char** argv) { /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magmaFloatComplex *h_A1, *h_A2, *h_A3, *h_AT; magmaFloatComplex_ptr d_A1; real_Double_t gpu_time, cpu_time1, cpu_time2; /* Matrix size */ int M=0, N=0, n2, lda, ldat; int size[7] = {1000,2000,3000,4000,5000,6000,7000}; int i, j; int ione = 1; int ISEED[4] = {0,0,0,1}; int *ipiv; int k1, k2, r, c, incx; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_claswp -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); M = N = size[6]; } lda = M; n2 = M*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A1, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A3, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_AT, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A1, magmaFloatComplex, n2 ); ipiv = (int*)malloc(M * sizeof(int)); if (ipiv == 0) { fprintf (stderr, "!!!! host memory allocation error (ipiv)\n"); } printf("\n\n"); printf(" M N CPU_BLAS (sec) CPU_LAPACK (sec) GPU (sec) \n"); printf("=============================================================================\n"); for(i=0; i<7; i++) { if(argc == 1){ M = N = size[i]; } lda = M; ldat = N; n2 = M*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A1 ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A1, &lda, h_A2, &lda ); for(r=0;r<M;r++){ for(c=0;c<N;c++){ h_AT[c+r*ldat] = h_A1[r+c*lda]; } } magma_csetmatrix( N, M, h_AT, 0, ldat, d_A1, 0, ldat, queue); for(j=0; j<M; j++) { ipiv[j] = (int)((rand()*1.*M) / (RAND_MAX * 1.)) + 1; } /* * BLAS swap */ /* Column Major */ cpu_time1 = magma_wtime(); for ( j=0; j<M; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A1+(ipiv[j]-1), &lda); } } cpu_time1 = magma_wtime() - cpu_time1; /* * LAPACK laswp */ cpu_time2 = magma_wtime(); k1 = 1; k2 = M; incx = 1; lapackf77_claswp(&N, h_A2, &lda, &k1, &k2, ipiv, &incx); cpu_time2 = magma_wtime() - cpu_time2; /* * GPU swap */ /* Col swap on transpose matrix*/ gpu_time = magma_wtime(); magma_cpermute_long2(N, d_A1, 0, ldat, ipiv, M, 0, queue); gpu_time = magma_wtime() - gpu_time; /* Check Result */ magma_cgetmatrix( N, M, d_A1, 0, ldat, h_AT, 0, ldat, queue); for(r=0;r<N;r++){ for(c=0;c<M;c++){ h_A3[c+r*lda] = h_AT[r+c*ldat]; } } int check_bl, check_bg, check_lg; check_bl = diffMatrix( h_A1, h_A2, M, N, lda ); check_bg = diffMatrix( h_A1, h_A3, M, N, lda ); check_lg = diffMatrix( h_A2, h_A3, M, N, lda ); printf("%5d %5d %6.2f %6.2f %6.2f %s %s %s\n", M, N, cpu_time1, cpu_time2, gpu_time, (check_bl == 0) ? "SUCCESS" : "FAILED", (check_bg == 0) ? "SUCCESS" : "FAILED", (check_lg == 0) ? "SUCCESS" : "FAILED"); if(check_lg !=0){ printf("lapack swap results:\n"); magma_cprint(M, N, h_A1, lda); printf("gpu swap transpose matrix result:\n"); magma_cprint(M, N, h_A3, lda); } if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A1 ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_A3 ); TESTING_FREE_CPU( h_AT ); TESTING_FREE_DEV( d_A1 ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_T, ddA, dtau; magmaDouble_ptr dwork; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lwork; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t nsize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t i, info, min_mn; magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; // process command line arguments printf( "\nUsage: %s -N <m,n> -c\n", argv[0] ); printf( " -N can be repeated up to %d times. If only m is given, then m=n.\n", MAXTESTS ); printf( " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n" ); int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); int m, n; info = sscanf( argv[++i], "%d,%d", &m, &n ); if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0.\n", argv[i] ); exit(1); } M = max( M, msize[ ntest ] ); N = max( N, nsize[ ntest ] ); ntest++; } else if ( strcmp("-M", argv[i]) == 0 ) { printf( "-M has been replaced in favor of -N m,n to allow -N to be repeated.\n\n" ); exit(1); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; M = msize[ntest-1]; N = nsize[ntest-1]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (32*2+2)*min_mn) ); double *h1 = (double*)malloc(sizeof(double)*N*N); memset(h1, 0, N*N*sizeof(double)); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_PIN( h_work, double, lwork ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( i = 0; i < ntest; ++i ) { M = msize[i]; N = nsize[i]; min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N)) / 1e9; /* Initialize the matrix */ magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up // magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); /* magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); */ gpu_time = magma_wtime(); magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d.\n", (int) info); if ( checkres ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d.\n", (int) info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue ); magma_dgetmatrix( N, N, ddA, 0, N, h_T, 0, N, queue ); // Restore the upper triangular part of A before the check for(int col=0; col<N; col++){ for(int row=0; row<=col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / error; // Check if T is the same double terr = 0.; magma_dgetmatrix( N, N, d_T, 0, N, h_T, 0, N, queue ); for(int col=0; col<N; col++) for(int row=0; row<=col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_T ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); free(h1); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time; double magma_error, clblas_error, work[1]; int transA = MagmaNoTrans; int transB = MagmaNoTrans; magma_int_t istart = 1024; magma_int_t iend = 6240; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_C, *h_C2, *h_C3; magmaDouble_ptr d_A, d_B, d_C; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); int lapack = getenv("MAGMA_RUN_LAPACK") != NULL; int count = 1; printf("\nUsage: testing_dgemm [-NN|NT|TN|TT|NC|CN|TC|CT|CC] -M m -N n -K k -count c -l\n" " -l or setting $MAGMA_RUN_LAPACK runs CPU BLAS,\n" " and computes both MAGMA and CLBLAS error using CPU BLAS result.\n" " Else, MAGMA error is computed using CLBLAS result.\n\n"); for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 && i+1 < argc ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 && i+1 < argc ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-l", argv[i])==0) { lapack = true; } else if ( strcmp("-count", argv[i]) == 0 && i+1 < argc ){ count = atoi(argv[++i]); } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ldda; K += 32; M += 32; N += 32; TESTING_MALLOC_CPU( h_A, double, lda*K ); TESTING_MALLOC_CPU( h_B, double, ldb*Bn ); TESTING_MALLOC_CPU( h_C, double, ldc*N ); TESTING_MALLOC_CPU( h_C2, double, ldc*N ); TESTING_MALLOC_CPU( h_C3, double, ldc*N ); TESTING_MALLOC_DEV( d_A, double, ldda*K ); TESTING_MALLOC_DEV( d_B, double, lddb*Bn ); TESTING_MALLOC_DEV( d_C, double, lddc*N ); printf("Testing transA = %c transB = %c\n", *lapack_const(transA), *lapack_const(transB)); printf(" M N K MAGMA Gflop/s (sec) CLBLAS Gflop/s (sec) CPU Gflop/s (sec) MAGMA error CLBLAS error\n"); printf("===========================================================================================================\n"); for( i=istart; i<iend; i = (int)(i*1.25) ) { for( int cnt = 0; cnt < count; ++cnt ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS_DGEMM( M, N, K ) / 1e9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &szeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &szeB, h_B ); lapackf77_dlarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_dsetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magmablas_dgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime(); magmablas_dgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime() - magma_time; magma_perf = gflops / magma_time; magma_dgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_dgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime(); magma_dgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime() - clblas_time; clblas_perf = gflops / clblas_time; magma_dgetmatrix( M, N, d_C, 0, lddc, h_C3, 0, ldc, queue ); /* ===================================================================== Performs operation using BLAS =================================================================== */ if ( lapack ) { cpu_time = magma_wtime(); blasf77_dgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ if ( lapack ) { // compare both magma & clblas to lapack blasf77_daxpy(&szeC, &c_neg_one, h_C, &ione, h_C2, &ione); magma_error = lapackf77_dlange("M", &M, &N, h_C2, &ldc, work); blasf77_daxpy(&szeC, &c_neg_one, h_C, &ione, h_C3, &ione); clblas_error = lapackf77_dlange("M", &M, &N, h_C3, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) %7.2f (%7.4f) %8.2e %8.2e\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time, magma_error, clblas_error ); } else { // compare magma to clblas blasf77_daxpy(&szeC, &c_neg_one, h_C3, &ione, h_C2, &ione); magma_error = lapackf77_dlange("M", &M, &N, h_C2, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) --- ( --- ) %8.2e ---\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, magma_error ); } } if ( count > 1 ) { printf( "\n" ); } } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_CPU( h_C3 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }