int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double *hA, *hR; magmaDouble_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8160, 8192 }; magma_int_t i, info; double mz_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_dpotrf2_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 2, &num ); if ( err != 0 or num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( hA, double, n2 ); TESTING_MALLOC_PIN( hR, double, n2 ); TESTING_MALLOC_DEV( dA, double, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_D_SET2REAL( hA(i,i), MAGMA_D_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_D_CNJG( hA(j,i) ); } } lapackf77_dlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1); clFinish(queue1); magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1 ); clFinish(queue1); gpu_time = magma_wtime(); magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_dpotrf2 had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( MagmaLowerStr, &N, hA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_dpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_dgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue1 ); matnorm = lapackf77_dlange("f", &N, &N, hA, &lda, work); blasf77_daxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_dlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( hA ); TESTING_FREE_PIN( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_cgetrf( M, N, h_A, lda, ipiv, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_zpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_zpotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], magmaDoubleComplex, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { h_A(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL(h_A(i,i)) + N, 0 ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_Z_CNJG( h_A(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_zpotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_zpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zgetmatrix( N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_zpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_zpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_zlange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol, eps = lapackf77_slamch("E"); tol = opts.tolerance * eps; opts.lapack |= ( opts.check == 2 ); // check (-c2) implies lapack (-l) err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("===============================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_sgeqrf_nb(M); gflops = FLOPS_SGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lwork = max( lwork, max( N*nb, 2*nb*nb )); TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_CPU( h_work, float, lwork ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ float *tau; TESTING_MALLOC_CPU( tau, float, min_mn ); cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; float *h_W1, *h_W2, *h_W3; float *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, float, n2 ); // Q TESTING_MALLOC_CPU( h_W2, float, n2 ); // R TESTING_MALLOC_CPU( h_W3, float, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, float, M ); // RWORK lapackf77_slarnv( &ione, ISEED2, &n2, h_A ); lapackf77_sqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_slange("f", &M, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_zopts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); real_Double_t res; magma_z_matrix A={Magma_CSR}, A2={Magma_CSR}, A3={Magma_CSR}, A4={Magma_CSR}, A5={Magma_CSR}; int i=1; TESTING_CHECK( magma_zparse_opts( argc, argv, &zopts, &i, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_zm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_z_csr_mtx( &A, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); // filename for temporary matrix storage const char *filename = "testmatrix.mtx"; // write to file TESTING_CHECK( magma_zwrite_csrtomtx( A, filename, queue )); // read from file TESTING_CHECK( magma_z_csr_mtx( &A2, filename, queue )); // delete temporary matrix unlink( filename ); //visualize printf("A2:\n"); TESTING_CHECK( magma_zprint_matrix( A2, queue )); //visualize TESTING_CHECK( magma_zmconvert(A2, &A4, Magma_CSR, Magma_CSRL, queue )); printf("A4:\n"); TESTING_CHECK( magma_zprint_matrix( A4, queue )); TESTING_CHECK( magma_zmconvert(A4, &A5, Magma_CSR, Magma_ELL, queue )); printf("A5:\n"); TESTING_CHECK( magma_zprint_matrix( A5, queue )); // pass it to another application and back magma_int_t m, n; magma_index_t *row, *col; magmaDoubleComplex *val=NULL; TESTING_CHECK( magma_zcsrget( A2, &m, &n, &row, &col, &val, queue )); TESTING_CHECK( magma_zcsrset( m, n, row, col, val, &A3, queue )); TESTING_CHECK( magma_zmdiff( A, A2, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester IO: ok\n"); else printf("%% tester IO: failed\n"); TESTING_CHECK( magma_zmdiff( A, A3, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester matrix interface: ok\n"); else printf("%% tester matrix interface: failed\n"); magma_zmfree(&A, queue ); magma_zmfree(&A2, queue ); magma_zmfree(&A4, queue ); magma_zmfree(&A5, queue ); i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
int main( int argc, char** argv ) { magma_init(); magma_print_environment(); magma_int_t err; magma_int_t num = 0; magma_device_t dev; magma_queue_t queues; magma_queue_create( 0, &queues ); const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; double dummy[1]; magma_int_t M, N, MN,lda, ldb, ldc, ldda, info; double *h_A, *h_S, *h_U, *h_VT; double *d_Acu, *d_test; magmaDouble_ptr d_A, d_U, d_S, d_VT; magma_int_t ione = 1; magma_int_t ISEED[4] = {0, 0, 0, 1}; double tmp; double error, rwork[1]; magma_int_t status = 0; M = 3; N = 4; MN = M*N; ldda = magma_roundup(N, 32); lda = N; h_A = (double*)malloc(M*N*sizeof(double)); cudaMalloc((void**)& d_Acu, M*N*sizeof(double)); cudaMemset(d_Acu, 0, M*N*sizeof(double)); cudaPrintMatrix(d_Acu, M, N); magma_malloc((void**)&d_A, M*ldda*sizeof(double)); magma_malloc((void**)&d_test, M*ldda*sizeof(double)); cudaMemset(d_test, 0, M*ldda*sizeof(double)); //magma_malloc((void**)&d_A, M*ldda*sizeof(double)); //magma_malloc((void**)&d_S, N*ldda*sizeof(double)); //magma_malloc((void**)&d_U, lddbm*lddbn*sizeof(double)); //magma_malloc((void**)&d_VT, lddcm*lddcn*sizeof(double)); //printMatrix(d_A, M, K, lddan); //exit(0); //printf("\n\n ldda: %d, M: %d, N: %d \n\n", (int)lddan, (int)M, (int)N); // Initialize the matrix lapackf77_dlarnv(&ione, ISEED, &MN, h_A); //cudaMemcpy(d_Acu, h_A, M*N*sizeof(double), cudaMemcpyHostToDevice); magma_dsetmatrix(N, M, h_A, lda, d_A, ldda, queues); printMatrix(h_A, M, N, lda, queues); printf("========MTOC============\n"); magma_to_cuda(M, N, d_A, ldda, d_Acu); //printMatrix(d_test, M, N, lda, queues); printMatrix(d_A, M, N, ldda, queues); cudaPrintMatrix(d_Acu, M, N); printf("========CTOM============\n"); cuda_to_magma(M, N, d_Acu, d_test, ldda); cudaPrintMatrix(d_Acu, M, N); printMatrix(d_test, M, N, ldda, queues); //exit(0); printf("====================\n"); //cudaMemcpy(d_A, h_A, n2*sizeof(double), cudaMemcpyHostToDevice); //if(M >= N){ // printMatrix(h_U, M, N, ldda, queues); // printMatrix(h_S, 1, N, ldda, queues); // printMatrix(h_VT, N, N, ldda, queues); //}else{ // printMatrix(h_U, M, M, ldda, queues); // printMatrix(h_S, 1, M, ldda, queues); // printMatrix(h_VT, M, N, ldda, queues); //} magma_finalize(); return 0; }
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n = 10; magma_int_t lda = n; magma_int_t ldda = ((n+31)/32)*32; magma_int_t size = lda*n; magma_int_t info; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ); printf( "n %d, error %8.2e\n", (int) n, error ); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); cublasDestroy( handle ); magma_finalize(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_copts zopts; magma_queue_t queue; magma_queue_create( 0, &queue ); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magma_c_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_c_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; TESTING_CHECK( magma_cparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; TESTING_CHECK( magma_csolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } // for the eigensolver case zopts.solver_par.ev_length = A.num_cols; TESTING_CHECK( magma_ceigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix TESTING_CHECK( magma_cmscale( &A, zopts.scaling, queue )); // preconditioner if ( zopts.solver_par.solver != Magma_ITERREF ) { TESTING_CHECK( magma_c_precondsetup( A, b, &zopts.solver_par, &zopts.precond_par, queue ) ); } TESTING_CHECK( magma_cmconvert( A, &B, Magma_CSR, zopts.output_format, queue )); printf( "\n%% matrix info: %lld-by-%lld with %lld nonzeros\n\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); printf("matrixinfo = [\n"); printf("%% size (m x n) || nonzeros (nnz) || nnz/m || stored nnz\n"); printf("%%============================================================================%%\n"); printf(" %8lld %8lld %10lld %4lld %10lld\n", (long long) B.num_rows, (long long) B.num_cols, (long long) B.true_nnz, (long long) (B.true_nnz/B.num_rows), (long long) B.nnz ); printf("%%============================================================================%%\n"); printf("];\n"); TESTING_CHECK( magma_cmtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess TESTING_CHECK( magma_cvinit( &b, Magma_DEV, A.num_rows, 1, one, queue )); //magma_cvinit( &x, Magma_DEV, A.num_cols, 1, one, queue ); //magma_c_spmv( one, B_d, x, zero, b, queue ); // b = A x //magma_cmfree(&x, queue ); TESTING_CHECK( magma_cvinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); info = magma_c_solver( B_d, b, &x, &zopts, queue ); if( info != 0 ) { printf("%%error: solver returned: %s (%lld).\n", magma_strerror( info ), (long long) info ); } printf("convergence = [\n"); magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); printf("];\n\n"); zopts.solver_par.verbose = 0; printf("solverinfo = [\n"); magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); printf("];\n\n"); printf("precondinfo = [\n"); printf("%% setup runtime\n"); printf(" %.6f %.6f\n", zopts.precond_par.setuptime, zopts.precond_par.runtime ); printf("];\n\n"); magma_cmfree(&B_d, queue ); magma_cmfree(&B, queue ); magma_cmfree(&A, queue ); magma_cmfree(&x, queue ); magma_cmfree(&b, queue ); i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_cgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { cuDoubleComplex *h_A, *h_R, *h_A2, *h_A3, *h_work, *h_work2, *tau, *d_work2; cuDoubleComplex *d_A, *d_work; float gpu_perf, cpu_perf, cpu2_perf; double flops; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2, lda, M=0; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, j, info[1]; magma_int_t loop = argc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t num_cores = 4; magma_int_t num_gpus = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-C", argv[i])==0) num_cores = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf_mc -M %d -N %d \n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; magma_int_t min_mn = min(M,N); /* Allocate host memory for the matrix */ h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0])); if (h_A2 == 0) { fprintf (stderr, "!!!! host memory allocation error (A2)\n"); } magma_int_t lwork = n2; h_work2 = (cuDoubleComplex*)malloc(lwork * sizeof(cuDoubleComplex)); if (h_work2 == 0) { fprintf (stderr, "!!!! host memory allocation error (h_work2)\n"); } h_A3 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A3[0])); if (h_A3 == 0) { fprintf (stderr, "!!!! host memory allocation error (A3)\n"); } tau = (cuDoubleComplex*)malloc(min_mn * sizeof(cuDoubleComplex)); if (tau == 0) { fprintf (stderr, "!!!! host memory allocation error (tau)\n"); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ magma_context *context; context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv); printf("\n\n"); printf(" M N LAPACK Gflop/s Multi-core Gflop/s ||R||_F / ||A||_F\n"); printf("===========================================================================\n"); for(i=0; i<10; i++){ if (loop == 1) { M = N = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A2 ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A2, &M, h_A3, &M ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_zgeqrf(&M, &N, h_A3, &M, tau, h_work2, &lwork, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]); cpu2_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using multicore =================================================================== */ start = get_current_time(); magma_zgeqrf_mc(context, &M, &N, h_A2, &M, tau, h_work2, &lwork, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; matnorm = lapackf77_zlange("f", &M, &N, h_A2, &M, work); blasf77_zaxpy(&n2, &mone, h_A2, &one, h_A3, &one); printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu2_perf, cpu_perf, lapackf77_zlange("f", &M, &N, h_A3, &M, work) / matnorm); if (loop != 1) break; } /* Memory clean up */ free(h_A2); free(tau); free(h_A3); free(h_work2); /* Shut down the MAGMA context */ magma_finalize(context); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float matnorm, work[1]; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_sgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_A, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_B, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, float, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = -1; lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, float, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (float)M, (float)N ) + FLOPS_GEQRS( (float)M, (float)N, (float)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_slarnv( &ione, ISEED, &n2, h_B ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_sgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_slange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_sgels had an illegal value.\n", -info); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_slange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_slange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_zopts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); real_Double_t res; magma_z_matrix Z={Magma_CSR}, Z2={Magma_CSR}, A={Magma_CSR}, A2={Magma_CSR}, AT={Magma_CSR}, AT2={Magma_CSR}, B={Magma_CSR}; int i=1; TESTING_CHECK( magma_zparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_zm_5stencil( laplace_size, &Z, queue )); } else { // file-matrix test TESTING_CHECK( magma_z_csr_mtx( &Z, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) Z.num_rows, (long long) Z.num_cols, (long long) Z.nnz ); // convert to be non-symmetric TESTING_CHECK( magma_zmconvert( Z, &A, Magma_CSR, Magma_CSRL, queue )); TESTING_CHECK( magma_zmconvert( Z, &B, Magma_CSR, Magma_CSRU, queue )); // transpose TESTING_CHECK( magma_zmtranspose( A, &AT, queue )); // quite some conversions //ELL TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELL, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELL, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //ELLPACKT TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELLPACKT, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELLPACKT, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //ELLRT AT2.blocksize = 8; AT2.alignment = 8; TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELLRT, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELLRT, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //SELLP AT2.blocksize = 8; AT2.alignment = 8; TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_SELLP, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_SELLP, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //ELLD TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELLD, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELLD, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //CSRCOO TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_CSRCOO, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_CSRCOO, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //CSRLIST TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_CSRLIST, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_CSRLIST, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); //CSRD TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_CSRD, queue )); magma_zmfree(&AT, queue ); TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_CSRD, Magma_CSR, queue )); magma_zmfree(&AT2, queue ); // transpose TESTING_CHECK( magma_zmtranspose( AT, &A2, queue )); TESTING_CHECK( magma_zmdiff( A, A2, &res, queue)); printf("%% ||A-A2||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% conversion tester: ok\n"); else printf("%% conversion tester: failed\n"); TESTING_CHECK( magma_zmlumerge( A2, B, &Z2, queue )); TESTING_CHECK( magma_zmdiff( Z, Z2, &res, queue)); printf("%% ||Z-Z2||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% LUmerge tester: ok\n"); else printf("%% LUmerge tester: failed\n"); magma_zmfree(&A, queue ); magma_zmfree(&A2, queue ); magma_zmfree(&AT, queue ); magma_zmfree(&AT2, queue ); magma_zmfree(&B, queue ); magma_zmfree(&Z2, queue ); magma_zmfree(&Z, queue ); i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { magma_int_t nquarkthreads=2; magma_int_t nthreads=2; magma_int_t num_gpus = 1; TRACE = 0; //magma_qr_params mp; cuDoubleComplex *h_A, *h_R, *h_work, *tau; double gpu_perf, cpu_perf, flops; magma_timestr_t start, end; magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params)); /* Matrix size */ magma_int_t M=0, N=0, n2; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; cublasStatus status; magma_int_t i, j, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; mp->nb=-1; mp->ob=-1; mp->fb=-1; mp->ib=32; magma_int_t loop = argc; magma_int_t accuracyflag = 1; char precision; magma_int_t nc = -1; magma_int_t ncps = -1; if (argc != 1) { for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-F", argv[i])==0) mp->fb = atoi(argv[++i]); else if (strcmp("-O", argv[i])==0) mp->ob = atoi(argv[++i]); else if (strcmp("-B", argv[i])==0) mp->nb = atoi(argv[++i]); else if (strcmp("-b", argv[i])==0) mp->ib = atoi(argv[++i]); else if (strcmp("-A", argv[i])==0) accuracyflag = atoi(argv[++i]); else if (strcmp("-P", argv[i])==0) nthreads = atoi(argv[++i]); else if (strcmp("-Q", argv[i])==0) nquarkthreads = atoi(argv[++i]); else if (strcmp("-nc", argv[i])==0) nc = atoi(argv[++i]); else if (strcmp("-ncps", argv[i])==0) ncps = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" Set number of cores per socket and number of cores.\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024); printf(" Alternatively, set:\n"); printf(" Q: Number of threads for panel factorization.\n"); printf(" P: Number of threads for trailing matrix update (CPU).\n"); printf(" B: Block size.\n"); printf(" b: Inner block size.\n"); printf(" O: Block size for trailing matrix update (CPU).\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112); M = N = size[9]; } /* Auto tune based on number of cores and number of cores per socket if provided */ if ((nc > 0) && (ncps > 0)) { precision = 's'; #if (defined(PRECISION_d)) precision = 'd'; #endif #if (defined(PRECISION_c)) precision = 'c'; #endif #if (defined(PRECISION_z)) precision = 'z'; #endif auto_tune('q', precision, nc, ncps, M, N, &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads); fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ mp->sync0 = 0; magma_context *context; context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv); context->params = (void *)(mp); mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads); for (i = 0; i < nthreads; i++) mp->sync1[i] = 0; n2 = M * N; magma_int_t min_mn = min(M, N); magma_int_t nb = magma_get_zgeqrf_nb(min_mn); magma_int_t lwork = N*nb; /* Allocate host memory for the matrix */ TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, min_mn); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork ); printf("\n\n"); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (loop==1){ M = N = min_mn = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M ); //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info); for(j=0; j<n2; j++) h_R[j] = h_A[j]; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_qr_init(mp, M, N, h_R, nthreads); start = get_current_time(); magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info); end = get_current_time(); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); if (accuracyflag == 1) lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of zgeqrf had an illegal value.\n", -info); cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; if (accuracyflag == 1){ matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one); } if (accuracyflag == 1){ printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu_perf, gpu_perf, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); } else { printf("%5d %5d %6.2f \n", M, N, gpu_perf); } if (loop != 1) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R ); /* Shut down the MAGMA context */ magma_finalize(context); }
int main( int argc, char** argv) { real_Double_t gflops, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time; double magma_error, clblas_error, work[1]; int transA = MagmaNoTrans; int transB = MagmaNoTrans; magma_int_t istart = 1024; magma_int_t iend = 6240; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_C, *h_C2, *h_C3; magmaDouble_ptr d_A, d_B, d_C; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); int lapack = getenv("MAGMA_RUN_LAPACK") != NULL; int count = 1; printf("\nUsage: testing_dgemm [-NN|NT|TN|TT|NC|CN|TC|CT|CC] -M m -N n -K k -count c -l\n" " -l or setting $MAGMA_RUN_LAPACK runs CPU BLAS,\n" " and computes both MAGMA and CLBLAS error using CPU BLAS result.\n" " Else, MAGMA error is computed using CLBLAS result.\n\n"); for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 && i+1 < argc ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 && i+1 < argc ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-l", argv[i])==0) { lapack = true; } else if ( strcmp("-count", argv[i]) == 0 && i+1 < argc ){ count = atoi(argv[++i]); } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ldda; K += 32; M += 32; N += 32; TESTING_MALLOC_CPU( h_A, double, lda*K ); TESTING_MALLOC_CPU( h_B, double, ldb*Bn ); TESTING_MALLOC_CPU( h_C, double, ldc*N ); TESTING_MALLOC_CPU( h_C2, double, ldc*N ); TESTING_MALLOC_CPU( h_C3, double, ldc*N ); TESTING_MALLOC_DEV( d_A, double, ldda*K ); TESTING_MALLOC_DEV( d_B, double, lddb*Bn ); TESTING_MALLOC_DEV( d_C, double, lddc*N ); printf("Testing transA = %c transB = %c\n", *lapack_const(transA), *lapack_const(transB)); printf(" M N K MAGMA Gflop/s (sec) CLBLAS Gflop/s (sec) CPU Gflop/s (sec) MAGMA error CLBLAS error\n"); printf("===========================================================================================================\n"); for( i=istart; i<iend; i = (int)(i*1.25) ) { for( int cnt = 0; cnt < count; ++cnt ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS_DGEMM( M, N, K ) / 1e9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &szeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &szeB, h_B ); lapackf77_dlarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_dsetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magmablas_dgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime(); magmablas_dgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime() - magma_time; magma_perf = gflops / magma_time; magma_dgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_dgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime(); magma_dgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime() - clblas_time; clblas_perf = gflops / clblas_time; magma_dgetmatrix( M, N, d_C, 0, lddc, h_C3, 0, ldc, queue ); /* ===================================================================== Performs operation using BLAS =================================================================== */ if ( lapack ) { cpu_time = magma_wtime(); blasf77_dgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ if ( lapack ) { // compare both magma & clblas to lapack blasf77_daxpy(&szeC, &c_neg_one, h_C, &ione, h_C2, &ione); magma_error = lapackf77_dlange("M", &M, &N, h_C2, &ldc, work); blasf77_daxpy(&szeC, &c_neg_one, h_C, &ione, h_C3, &ione); clblas_error = lapackf77_dlange("M", &M, &N, h_C3, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) %7.2f (%7.4f) %8.2e %8.2e\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time, magma_error, clblas_error ); } else { // compare magma to clblas blasf77_daxpy(&szeC, &c_neg_one, h_C3, &ione, h_C2, &ione); magma_error = lapackf77_dlange("M", &M, &N, h_C2, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) --- ( --- ) %8.2e ---\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, magma_error ); } } if ( count > 1 ) { printf( "\n" ); } } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_CPU( h_C3 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaFloatComplex *h_R = NULL, *h_P = NULL; magmaFloatComplex_ptr d_lA[MagmaMaxSubs * MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, check = 0, info; magmaFloatComplex mz_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t num_gpus0 = 1, num_gpus, num_subs0 = 1, num_subs, tot_subs, flag = 0; int nb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i]) == 0){ N = atoi(argv[++i]); if (N > 0) { size[0] = size[9] = N; flag = 1; } } if(strcmp("-NGPU", argv[i]) == 0) num_gpus0 = atoi(argv[++i]); if(strcmp("-NSUB", argv[i]) == 0) num_subs0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i]) == 0) uplo = (strcmp("L", argv[++i]) == 0 ? MagmaLower : MagmaUpper); if(strcmp("-check", argv[i]) == 0) check = 1; } } /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus0;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("\nUsing %d GPUs:\n", num_gpus0); printf(" testing_cpotrf_msub -N %d -NGPU %d -NSUB %d -UPLO %c %s\n\n", size[0], num_gpus0,num_subs0, (uplo == MagmaLower ? 'L' : 'U'),(check == 1 ? "-check" : " ")); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9;; nb = magma_get_cpotrf_nb(N); if (num_subs0*num_gpus0 > N/nb) { num_gpus = N/nb; num_subs = 1; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); } else { num_gpus = num_gpus0; num_subs = num_subs0; } tot_subs = num_subs * num_gpus; /* Allocate host memory for the matrix */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer1 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, n2*sizeof(magmaFloatComplex), NULL, NULL); cl_mem buffer2 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, lda*nb*sizeof(magmaFloatComplex), NULL, NULL); for (k=0; k<num_gpus; k++) { h_R = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, n2*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); h_P = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lda*nb*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); } #else TESTING_MALLOC_PIN( h_P, magmaFloatComplex, lda*nb ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); #endif /* Initialize the matrix */ init_matrix( N, h_R, lda ); /* Allocate GPU memory */ if (uplo == MagmaUpper) { ldda = ((N+nb-1)/nb)*nb; n_local = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; } else { ldda = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; n_local = ((N+nb-1)/nb)*nb; } for (j=0; j<tot_subs; j++) { TESTING_MALLOC_DEV( d_lA[j], magmaFloatComplex, n_local*ldda ); } /* Warm up to measure the performance */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(j+nk, nk, &h_R[j*lda], 0, lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { int mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_csetmatrix(mk, nk, h_P, 0, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda, d_lA[k], j/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); }*/ } magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(j+nk, nk, &h_R[j*lda], 0, lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { int mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_csetmatrix(mk, nk, h_P, 0, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda, d_lA[k], (j/(nb*tot_subs)*nb), ldda, queues[2*(k%num_gpus)]); }*/ } gpu_time = magma_wtime(); magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf( "magma_cpotrf had error %d.\n", info ); /* gather matrix from gpus */ if (uplo==MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_cgetmatrix(j+nk, nk, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { k = ((j+kk*nb)/nb)%tot_subs; int mk = 0; mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { mk += min(nb, N-ii); } if (mk > 0 && nk > 0) { magma_cgetmatrix(mk, nk, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, h_P, 0, lda, queues[2*(k%num_gpus)]); } mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_P[mk], &lda, &h_R[ii+j*lda], &lda ); mk += mii; } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_cgetmatrix( nk, j+nk, d_lA[k], (j/(nb*tot_subs)*nb), ldda, &h_R[j], 0, lda, queues[2*(k%num_gpus)] ); }*/ } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if (check == 1) { float work[1], matnorm, diffnorm; magmaFloatComplex *h_A; TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); init_matrix( N, h_A, lda ); cpu_time = magma_wtime(); if (uplo == MagmaLower) { lapackf77_cpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); } else { lapackf77_cpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf( "lapackf77_cpotrf had error %d.\n", info ); /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_clange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); TESTING_FREE_PIN( h_A ); } else { printf( "%5d - - (- -) %6.2f (%6.2f) - -\n", N, gpu_perf, gpu_time ); } // free memory #ifdef USE_PINNED_CLMEMORY for (k=0; k<num_gpus; k++) { clEnqueueUnmapMemObject(queues[2*k], buffer1, h_R, 0, NULL, NULL); clEnqueueUnmapMemObject(queues[2*k], buffer2, h_P, 0, NULL, NULL); } clReleaseMemObject(buffer1); clReleaseMemObject(buffer2); #else TESTING_FREE_PIN( h_P ); TESTING_FREE_PIN( h_R ); #endif for (j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } /* clean up */ for (i=0; i<num_gpus; i++) { magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); return 0; }
SEXP smagmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri) { magma_init(); int ndevices; ndevices = INTEGER_VALUE(ngpu); int idevice; for(idevice=0; idevice < ndevices; idevice++) { magma_setdevice(idevice); if(CUBLAS_STATUS_SUCCESS != cublasInit()) { printf("Error: gpu %d: cublasInit failed\n", idevice); magma_finalize(); exit(-1); } } // magma_print_devices(); int In, INB; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); double *PA = NUMERIC_POINTER(A); float *sPA = calloc(In*In, sizeof(float)); int i,j; for(i = 0; i < In; i++) { for(j = 0; j < In; j++) { sPA[i*In + j] = (float) PA[i*In + j]; } } magma_int_t N, status, info, nGPUs; N = In; status = 0; nGPUs = ndevices; //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri) magma_spotrf_m(nGPUs, MagmaLower, N, sPA, N, &info); else magma_spotrf_m(nGPUs, MagmaUpper, N, sPA, N, &info); if(info != 0) { printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} magma_finalize(); cublasShutdown(); //caste sPA back to double and set upper or lower triangle to zero if necessary: int IZeroTri = INTEGER_VALUE(zeroTri); int zeroUTri = IZeroTri & lTri; int zeroLTri = IZeroTri & !lTri; if(!IZeroTri) { for(i = 1; i< In; i++) { for(j=1; j < In; j++) { PA[i*In + j] = (double) sPA[i*In + j]; } } } else if(zeroUTri) { for(i = 1; i< In; i++) { for(j=1; j < In; j++) { if(i > j) PA[i*In + j] = 0; else PA[i*In + j] = (double) sPA[i*In + j]; } } } else { for(i = 1; i< In; i++) { for(j=1; j < In; j++) { if(i < j) PA[i*In + j] = 0; else PA[i*In + j] = (double) sPA[i*In + j]; } } } UNPROTECT(1); free(sPA); return(R_NilValue); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A; magma_int_t *ipiv; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7104,8064,9000,10000}; #endif magma_int_t i, info, min_mn; //magma_int_t nb, maxn, ret; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_zgetrf -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_zgetrf -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgetrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } ldda = ((M+31)/32)*32; //maxn = ((N+31)/32)*32; n2 = M * N; min_mn = min(M, N); //nb = magma_get_zgetrf_nb(min_mn); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" M N CPU GFlop/ (sec)s GPU GFlop/s (sec) ||PA-LU||/(||A||*N)\n"); printf("========================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (double)M, (double)N ) *1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of zgetrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of zgetrf had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ magma_zgetmatrix( M, N, d_A, 0, ldda, h_A, 0, lda, queue ); error = get_LU_error(M, N, h_R, lda, h_A, ipiv); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, error); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float error, work[1]; int transA = MagmaNoTrans; int transB = MagmaNoTrans; float Cnorm; magma_int_t istart = 1024; magma_int_t iend = 8194; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_C, *h_C2; magmaFloatComplex_ptr d_A, d_B, d_C; magmaFloatComplex mzone = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-N", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaConjTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaConjTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = lddc = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; K+=32; M+=32; N +=32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*K ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_C2, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*K ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); printf("\nUsage: \n"); printf(" testing_cgemm [-NN|NT|TN|TT] [-N %d] \n\n", 1024); printf("\n"); printf("Testing transA = %c transB = %c\n", transA, transB); printf(" M N K clAmdBlas GFLop/s (sec) CPU GFlop/s (sec) error\n"); printf("===========================================================================\n"); for(i=istart; i<iend; i = (int)(i*1.25) ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS( (float)M, (float)N, (float)K ) * 1e-9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &szeA, h_A ); lapackf77_clarnv( &ione, ISEED, &szeB, h_B ); lapackf77_clarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_csetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue ); magma_csetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync( queue ); gpu_time = magma_wtime(); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync( queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; magma_cgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue ); /* ===================================================================== Performs operation using CPU-BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_cgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &M, &N, h_C, &ldc, work ); /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ blasf77_caxpy(&szeC, &mzone, h_C, &ione, h_C2, &ione); error = lapackf77_clange("M", &M, &N, h_C2, &ldc, work)/Cnorm; printf("%5d %5d %5d %8.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, K, gpu_perf, gpu_time, cpu_perf, cpu_time, error); } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv_gpu */ int main(int argc , char **argv) { real_Double_t gflops, gpu_perf, gpu_time; double Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex *hA, *hB, *hX; magmaDoubleComplex_ptr dA, dB; magma_int_t *ipiv; magma_int_t N = 0, n2, lda, ldb, ldda, lddb; magma_int_t size[7] = { 1024, 2048, 3072, 4032, 5184, 6048, 7000}; magma_int_t i, info, szeB; magmaDoubleComplex z_one = MAGMA_Z_ONE; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t NRHS = 100; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-R", argv[i])==0) NRHS = atoi(argv[++i]); } if (N>0) size[0] = size[6] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgesv_gpu -N <matrix size> -R <right hand sides>\n\n"); } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[6]; n2 = N * N; ldda = ((N+31)/32) * 32; // ldda = N; lddb = ldda; TESTING_MALLOC_PIN( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( hB, magmaDoubleComplex, N*NRHS ); TESTING_MALLOC_PIN( hX, magmaDoubleComplex, N*NRHS ); TESTING_MALLOC_PIN( work, double, N ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dB, magmaDoubleComplex, lddb*NRHS ); printf("\n\n"); printf(" N NRHS GPU GFlop/s (sec) ||B - AX|| / ||A||*||X||\n"); printf("===========================================================\n"); for( i = 0; i < 7; i++ ) { N = size[i]; lda = N; ldb = lda; n2 = lda*N; szeB = ldb*NRHS; ldda = ((N+31)/32)*32; //ldda = N; lddb = ldda; gflops = ( FLOPS_GETRF( (double)N, (double)N ) + FLOPS_GETRS( (double)N, (double)NRHS ) ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlarnv( &ione, ISEED, &szeB, hB ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue ); magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue ); //===================================================================== // Solve Ax = b through an LU factorization //===================================================================== magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue ); gpu_time = magma_wtime(); magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_zposv had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Residual =================================================================== */ magma_zgetmatrix( N, NRHS, dB, 0, lddb, hX, 0, ldb, queue ); Anorm = lapackf77_zlange("I", &N, &N, hA, &lda, work); Xnorm = lapackf77_zlange("I", &N, &NRHS, hX, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &NRHS, &N, &z_one, hA, &lda, hX, &ldb, &mz_one, hB, &ldb ); Rnorm = lapackf77_zlange("I", &N, &NRHS, hB, &ldb, work); printf( "%5d %5d %7.2f (%7.2f) %8.2e\n", N, NRHS, gpu_perf, gpu_time, Rnorm/(Anorm*Xnorm) ); if (argc != 1) break; } /* clean up */ TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( hB ); TESTING_FREE_PIN( hX ); TESTING_FREE_PIN( work ); TESTING_FREE_PIN( ipiv ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dB ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n, lda, ldda, size, info; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N |dC - C|/|C|\n"); printf("====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // for this simple case, all matrices are N-by-N n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices size = lda*n; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference, |dC - C| / |C| magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ) / lapackf77_clange( "F", &n, &n, C, &lda, work ); printf( "%5d %8.2e %s\n", (int) n, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); fflush( stdout ); } } cublasDestroy( handle ); magma_finalize(); return status; }
int main( int argc, char **argv ) { printf("Starting\n"); int size; cudaError_t cudaStat; magma_err_t magmaStat; cublasStatus_t stat; cublasHandle_t handle; int it,i; cublasOperation_t N = 'N'; cublasOperation_t T = 'T'; char N2 = 'N'; char T2 = 'T'; double one = 1., zero=0.; char uplo = 'L'; int info; int err; double* A; double* B; magmaStat = magma_init(); int use_pinned; if(argc > 1) { use_pinned = atoi(argv[1]); } else use_pinned = 0; printf("Setting use_pinned to %d\n", use_pinned); for( size = 256; size <= 8192; size*=2 ) { if(use_pinned) { // allocate pinned memory on CPU err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); } else { // allocate standard memory on CPU A = (double*) malloc( sizeof(double)*size*size ); B = (double*) malloc( sizeof(double)*size*size ); } cudaDeviceSynchronize(); double tInit = read_timer(); double *dA,*dB; // allocate memory on GPU magma_malloc( (void**) &dA, sizeof(double)*size*size ); magma_malloc( (void**) &dB, sizeof(double)*size*size ); cudaDeviceSynchronize(); double tAlloc = read_timer(); fillMatrix(B, size*size); cudaDeviceSynchronize(); double tInit2 = read_timer(); // transfer data to GPU magma_dsetmatrix( size, size, B, size, dB, size ); cudaDeviceSynchronize(); double tTransferToGPU = read_timer(); // matrix multiply magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size ); // magma_dgemm is apparently synonymous with magmablas_dgemm cudaDeviceSynchronize(); double tMatMult = read_timer(); // Cholesky decomposition on GPU with GPU interface (called with object on GPU) magma_dpotrf_gpu( 'L', size, dA, size, &info ); cudaDeviceSynchronize(); double tChol = read_timer(); // transfer data back to CPU magma_dgetmatrix( size, size, dA, size, A, size ); cudaDeviceSynchronize(); double tTransferFromGPU = read_timer(); // standard BLAS matrix multiply on CPU dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tMatMultBlas = read_timer(); // Cholesky decomposition on GPU with CPU interface (called with object on CPU) magma_dpotrf( 'L', size, A, size, &info ); cudaDeviceSynchronize(); double tCholCpuInterface = read_timer(); // recreate A = B * B (could just do a save and copy instead....) dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tInit3 = read_timer(); // standard Lapack Cholesky decomposition on CPU dpotrf_(&uplo, &size, A, &size, &info); cudaDeviceSynchronize(); double tCholCpu= read_timer(); printf("====================================================\n"); printf("Timing results for n = %d\n", size); printf("GPU memory allocation time: %f\n", tAlloc - tInit); printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); if(use_pinned) { magma_free_pinned(A); magma_free_pinned(B); } else { free(A); free(B); } magma_free(dA); magma_free(dB); } return EXIT_SUCCESS; }
SEXP magmaCholeskyFinal(SEXP A, SEXP n, SEXP NB, SEXP id, SEXP zeroTri, SEXP lowerTri) { magma_init(); // magma_print_devices(); double *h_R; int In, INB, ID; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); ID = INTEGER_VALUE(id); double *PA = NUMERIC_POINTER(A); int i,j; magma_int_t N, n2, lda, status, info, max_size; N=In; lda = N; n2 = lda*N; /* for(i = 0; i < In; i++) { for(j = 0; j < In; j++) { printf("%.8f ", PA[i+j*In]); } printf("\n"); } */ if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) { fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); magma_finalize(); exit(-1); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda ); N = In; status = 0; magma_setdevice(ID); //printf("Modified by Vinay in one GPU\n"); //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri) magma_dpotrf(MagmaLower, N, h_R, N, &info); else magma_dpotrf(MagmaUpper, N, h_R, N, &info); if(info != 0) { printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda ); //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} magma_free_pinned(h_R); magma_finalize(); cublasShutdown(); /* int IZeroTri; IZeroTri = INTEGER_VALUE(zeroTri); if(IZeroTri & lTri) { for(i = 1; i < In; i++) { for(j=0; j< i; j++) { PA[i*In+j] = 0.0; } } } else if(IZeroTri) for(i = 0; i < In; i++) { for(j=i+1; j < In; j++) { PA[i*In+j] = 0.0; } }*/ return(R_NilValue); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf_mc */ int main( magma_int_t argc, char** argv) { cuDoubleComplex *h_A, *h_R, *h_work, *h_A2; cuDoubleComplex *d_A; float gpu_perf, cpu_perf, cpu_perf2; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2, lda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6048,7200,8064,8928,10080}; magma_int_t i, j, info[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t num_cores = 4; int num_gpus = 0; magma_int_t loop = argc; if (argc != 1) { for(i = 1; i<argc; i++) { if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-C", argv[i])==0) num_cores = atoi(argv[++i]); } if (N==0) { N = size[9]; loop = 1; } else { size[0] = size[9] = N; } } else { printf("\nUsage: \n"); printf(" testing_zpotrf_mc -N %d -B 128 \n\n", 1024); N = size[9]; } lda = N; n2 = size[9] * size[9]; /* Allocate host memory for the matrix */ h_A = (cuDoubleComplex*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); } /* Allocate host memory for the matrix */ h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0])); if (h_A2 == 0) { fprintf (stderr, "!!!! host memory allocation error (A2)\n"); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ magma_context *context; context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv); printf("\n\n"); printf(" N Multicore GFlop/s ||R||_F / ||A||_F\n"); printf("=============================================\n"); for(i=0; i<10; i++) { N = lda = size[i]; n2 = N*N; lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); for(j=0; j<N; j++) MAGMA_Z_SET2REAL( h_A[j*lda+j], ( MAGMA_Z_GET_X(h_A[j*lda+j]) + 2000. ) ); for(j=0; j<n2; j++) h_A2[j] = h_A[j]; /* ===================================================================== Performs operation using LAPACK =================================================================== */ //lapackf77_zpotrf("L", &N, h_A, &lda, info); lapackf77_zpotrf("U", &N, h_A, &lda, info); if (info[0] < 0) printf("Argument %d of zpotrf had an illegal value.\n", -info[0]); /* ===================================================================== Performs operation using multi-core =================================================================== */ start = get_current_time(); //magma_zpotrf_mc(context, "L", &N, h_A2, &lda, info); magma_zpotrf_mc(context, "U", &N, h_A2, &lda, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of magma_zpotrf_mc had an illegal value.\n", -info[0]); cpu_perf2 = FLOPS( (double)N ) / (1000000.*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; int one = 1; matnorm = lapackf77_zlange("f", &N, &N, h_A, &N, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_A2, &one); printf("%5d %6.2f %e\n", size[i], cpu_perf2, lapackf77_zlange("f", &N, &N, h_A2, &N, work) / matnorm); if (loop != 1) break; } /* Memory clean up */ free(h_A); free(h_A2); /* Shut down the MAGMA context */ magma_finalize(context); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A, dwork; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 }; magma_int_t ntest = 10; magma_int_t i, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0, 0, 0, 1}; magmaDoubleComplex *work; magmaDoubleComplex tmp; double rwork[1]; magma_int_t *ipiv; magma_int_t lwork, ldwork; double A_norm, R_norm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[ntest-1] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgetri_gpu -N %d\n\n", 1024); } /* query for Lapack workspace size */ N = size[ntest-1]; lda = N; work = &tmp; lwork = -1; lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); lwork = int( MAGMA_Z_REAL( *work )); /* query for Magma workspace size */ ldwork = N * magma_get_zgetri_nb( N ); /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory */ n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( ipiv, magma_int_t, N ); TESTING_MALLOC( work, magmaDoubleComplex, lwork ); TESTING_MALLOC( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for( i=0; i < ntest; i++ ){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZGETRI( (double)N ) / 1e9; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue ); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //warm-up magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); gpu_time = get_time(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); gpu_time = get_time()-gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d\n", (int) info); gpu_perf = gflops / gpu_time; magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, 0, lda, queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); R_norm = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ); printf( "%5d %6.2f %6.2f %e\n", (int) N, cpu_perf, gpu_perf, R_norm / A_norm ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( ipiv ); TESTING_FREE( work ); TESTING_FREE( h_A ); TESTING_FREE_HOST( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }