extern "C" int finalize_environment() { magma_queue_destroy(queue); magma_finalize(); return 0; }
int main() { //Magma initialization magma_init(); //Declaration of local variables double *a, *b, *dev_a, results=0; const int N=4098; int i,j; magma_int_t info=0, lda=N, ngpu=2; //Memory Allocation Segment magma_malloc_pinned((void**) &a,(N*N)*sizeof(double)); magma_malloc_pinned((void**) &b,(N*N)*sizeof(double)); //Generate two copies of Symmetric Positive Definite Matrix for(i=0;i<N;i++) { for(j=0;j<i;j++) { a[i*N+j] = 1e-9* (double)rand(); a[j*N+i] = a[i*N+j]; b[i*N+j] = a[i*N+j]; b[j*N+i] = b[i*N+j]; } a[i*N+i] = 1e-9*(double)rand() + 1000.0; b[i*N+i] = a[i*N+i]; } //Call custom Magma Cholesky for obtaining results rr_dpotrf_m(ngpu,MagmaUpper,N,a,N,&info); //Call Standard Magma Cholesky for result validation magma_dpotrf(MagmaUpper,N,b,N,&info); if(info != 0) { printf("magma_dpotrf original returned error %d: %s. \n",(int) info, magma_strerror(info)); } //Validate the results; Compute the RMS error value. for(i=0;i<N;i++) for(j=0;j<N;j++) results = results + (a[i*N+j] - b[i*N+j]) * (a[i*N+j] - b[i*N+j]); //Display the results of the test if(results < 1e-5) printf("The two functions have identical results\n"); else printf("The custom function had significant errors. The RMS value was %g\n",results); magma_free_pinned(a); magma_free_pinned(b); magma_finalize(); return 0; }
// ------------------------------------------------------------ int main( int argc, char** argv ) { magma_init(); magma_int_t n = 1000; magma_int_t nrhs = 1; printf( "using MAGMA CPU interface\n" ); cpu_interface( n, nrhs ); printf( "using MAGMA GPU interface\n" ); gpu_interface( n, nrhs ); magma_finalize(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_sopts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magma_s_matrix Z={Magma_CSR}; int i=1; TESTING_CHECK( magma_sparse_opts( argc, argv, &zopts, &i, queue )); printf("matrixinfo = [\n"); printf("%% size (n) || nonzeros (nnz) || nnz/n\n"); printf("%%=============================================================%%\n"); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_sm_5stencil( laplace_size, &Z, queue )); } else { // file-matrix test TESTING_CHECK( magma_s_csr_mtx( &Z, argv[i], queue )); } printf(" %10lld %10lld %10lld\n", (long long) Z.num_rows, (long long) Z.nnz, (long long) (Z.nnz/Z.num_rows) ); magma_smfree(&Z, queue ); i++; } printf("%%=============================================================%%\n"); printf("];\n"); magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
int main(int argc, char** argv) { magma_init(); magma_timestr_t start , end; double gpu_time ; double *c; int dim[] = {20000,30000,40000}; int i,n; n = sizeof(dim) / sizeof(dim[0]); for(i=0; i < n; i++) { magma_int_t m = dim[i]; magma_int_t mm=m*m; magma_err_t err; err = magma_dmalloc_cpu ( &c , mm ); //generate random symmetric, positive matrix double *ml = generate_sym_matrix(m); start = get_current_time(); //find the inverse matrix for MxM symmetric, positive definite matrix using the cholesky decomposition. //Compute GPU cholesky decomposition with CPU interface c = cholesky(ml, m); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1e3; printf("gpu time for %dx%d: %7.5f sec\n", m, m, gpu_time); //copy upper diag copy_upper_diag(c,m); free(c); } magma_finalize (); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_T, ddA, dtau; magmaDouble_ptr dwork; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lwork; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t nsize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t i, info, min_mn; magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; // process command line arguments printf( "\nUsage: %s -N <m,n> -c\n", argv[0] ); printf( " -N can be repeated up to %d times. If only m is given, then m=n.\n", MAXTESTS ); printf( " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n" ); int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); int m, n; info = sscanf( argv[++i], "%d,%d", &m, &n ); if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0.\n", argv[i] ); exit(1); } M = max( M, msize[ ntest ] ); N = max( N, nsize[ ntest ] ); ntest++; } else if ( strcmp("-M", argv[i]) == 0 ) { printf( "-M has been replaced in favor of -N m,n to allow -N to be repeated.\n\n" ); exit(1); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; M = msize[ntest-1]; N = nsize[ntest-1]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (32*2+2)*min_mn) ); double *h1 = (double*)malloc(sizeof(double)*N*N); memset(h1, 0, N*N*sizeof(double)); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_PIN( h_work, double, lwork ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( i = 0; i < ntest; ++i ) { M = msize[i]; N = nsize[i]; min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N)) / 1e9; /* Initialize the matrix */ magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up // magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); /* magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); */ gpu_time = magma_wtime(); magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d.\n", (int) info); if ( checkres ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d.\n", (int) info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue ); magma_dgetmatrix( N, N, ddA, 0, N, h_T, 0, N, queue ); // Restore the upper triangular part of A before the check for(int col=0; col<N; col++){ for(int row=0; row<=col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / error; // Check if T is the same double terr = 0.; magma_dgetmatrix( N, N, d_T, 0, N, h_T, 0, N, queue ); for(int col=0; col<N; col++) for(int row=0; row<=col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_T ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); free(h1); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgehrd2 */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; //*h_R1 is used for warm-up magmaDoubleComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *h_R1; magmaDoubleComplex_ptr dT; double *rwork; double result[2] = {0., 0.}; double eps; int checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; /* Matrix size */ int N=0, n2, lda, nb, lwork, ltwork, once = 0; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,9900}; #endif int i, info; int ione = 1; int ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if ( N > 0 ){ printf(" testing_zgehrd -N %d\n\n", N); once = 1; } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } eps = lapackf77_dlamch( "E" ); lda = N; n2 = N*lda; nb = magma_get_zgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; TESTING_MALLOC_HOST( h_A , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( tau , magmaDoubleComplex, N ); TESTING_MALLOC_HOST( h_R , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_R1 , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV ( dT , magmaDoubleComplex, nb*N ); /* To avoid uninitialized variable warning */ h_Q = NULL; twork = NULL; rwork = NULL; if ( checkres ) { ltwork = 2*(N*N); TESTING_MALLOC_HOST( h_Q, magmaDoubleComplex, lda*N ); TESTING_MALLOC_HOST( twork, magmaDoubleComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_HOST( rwork, double, N ); #endif } printf("\n\n"); printf(" N CPU GFlop/s GPU GFlop/s |A-QHQ'|/N|A| |I-QQ'|/N \n"); printf("=============================================================\n"); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = lda*N; gflops = FLOPS( (double)N ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R1, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zgehrd ( N, ione, N, h_R1, lda, tau, h_work, lwork, dT, 0, &info, queue); if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); clFinish(queue); gpu_time = get_time(); magma_zgehrd ( N, ione, N, h_R, lda, tau, h_work, lwork, dT, 0, &info, queue); gpu_time = get_time() - gpu_time; if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ if ( checkres ) { lapackf77_zlacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); { int i, j; for(j=0; j<N-1; j++) for(i=j+2; i<lda; i++) h_R[i+j*lda] = MAGMA_Z_ZERO; } nb = magma_get_zgehrd_nb(N); magma_zunghr(N, ione, N, h_Q, lda, tau, dT, 0, nb, &info, queue); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zgehrd(&N, &ione, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = get_time() - cpu_time; if (info < 0) printf("Argument %d of lapack_zgehrd had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ===================================================================== Print performance and error. =================================================================== */ if ( checkres ) { printf("%5d %6.2f %6.2f %e %e\n", N, cpu_perf, gpu_perf, result[0]*eps, result[1]*eps ); } else { printf("%5d %6.2f %6.2f\n", N, cpu_perf, gpu_perf ); } if ( once ) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_FREE_HOST( h_work); TESTING_FREE_HOST( h_R ); TESTING_FREE_HOST( h_R1 ); TESTING_FREE_DEV ( dT ); if ( checkres ) { TESTING_FREE_HOST( h_Q ); TESTING_FREE( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE( rwork ); #endif } /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { magma_int_t nquarkthreads=2; magma_int_t nthreads=2; magma_int_t num_gpus = 1; TRACE = 0; //magma_qr_params mp; cuDoubleComplex *h_A, *h_R, *h_work, *tau; double gpu_perf, cpu_perf, flops; magma_timestr_t start, end; magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params)); /* Matrix size */ magma_int_t M=0, N=0, n2; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; cublasStatus status; magma_int_t i, j, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; mp->nb=-1; mp->ob=-1; mp->fb=-1; mp->ib=32; magma_int_t loop = argc; magma_int_t accuracyflag = 1; char precision; magma_int_t nc = -1; magma_int_t ncps = -1; if (argc != 1) { for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-F", argv[i])==0) mp->fb = atoi(argv[++i]); else if (strcmp("-O", argv[i])==0) mp->ob = atoi(argv[++i]); else if (strcmp("-B", argv[i])==0) mp->nb = atoi(argv[++i]); else if (strcmp("-b", argv[i])==0) mp->ib = atoi(argv[++i]); else if (strcmp("-A", argv[i])==0) accuracyflag = atoi(argv[++i]); else if (strcmp("-P", argv[i])==0) nthreads = atoi(argv[++i]); else if (strcmp("-Q", argv[i])==0) nquarkthreads = atoi(argv[++i]); else if (strcmp("-nc", argv[i])==0) nc = atoi(argv[++i]); else if (strcmp("-ncps", argv[i])==0) ncps = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" Set number of cores per socket and number of cores.\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024); printf(" Alternatively, set:\n"); printf(" Q: Number of threads for panel factorization.\n"); printf(" P: Number of threads for trailing matrix update (CPU).\n"); printf(" B: Block size.\n"); printf(" b: Inner block size.\n"); printf(" O: Block size for trailing matrix update (CPU).\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112); M = N = size[9]; } /* Auto tune based on number of cores and number of cores per socket if provided */ if ((nc > 0) && (ncps > 0)) { precision = 's'; #if (defined(PRECISION_d)) precision = 'd'; #endif #if (defined(PRECISION_c)) precision = 'c'; #endif #if (defined(PRECISION_z)) precision = 'z'; #endif auto_tune('q', precision, nc, ncps, M, N, &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads); fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ mp->sync0 = 0; magma_context *context; context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv); context->params = (void *)(mp); mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads); for (i = 0; i < nthreads; i++) mp->sync1[i] = 0; n2 = M * N; magma_int_t min_mn = min(M, N); magma_int_t nb = magma_get_zgeqrf_nb(min_mn); magma_int_t lwork = N*nb; /* Allocate host memory for the matrix */ TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, min_mn); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork ); printf("\n\n"); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (loop==1){ M = N = min_mn = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M ); //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info); for(j=0; j<n2; j++) h_R[j] = h_A[j]; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_qr_init(mp, M, N, h_R, nthreads); start = get_current_time(); magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info); end = get_current_time(); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); if (accuracyflag == 1) lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of zgeqrf had an illegal value.\n", -info); cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; if (accuracyflag == 1){ matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one); } if (accuracyflag == 1){ printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu_perf, gpu_perf, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); } else { printf("%5d %5d %6.2f \n", M, N, gpu_perf); } if (loop != 1) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R ); /* Shut down the MAGMA context */ magma_finalize(context); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float matnorm, work[1]; float mzone = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_sgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_slange("f", &M, &N, h_A, &M, work); blasf77_saxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_slange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; double *w1i, *w2i; double c_neg_one = MAGMA_D_NEG_ONE; double matnorm, tnrm, result[8]; /* Matrix size */ magma_int_t N=0, n2, lda, nb, lwork; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, j, info, checkres, once = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_vec_t jobl = MagmaVec; magma_vec_t jobr = MagmaVec; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = 1; } else if (strcmp("-LN", argv[i])==0) jobl = MagmaNoVec; else if (strcmp("-LV", argv[i])==0) jobl = MagmaVec; else if (strcmp("-RN", argv[i])==0) jobr = MagmaNoVec; else if (strcmp("-RV", argv[i])==0) jobr = MagmaVec; } if ( N > 0 ) printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; lda = N; n2 = lda * N; nb = magma_get_dgehrd_nb(N); lwork = N*(2+nb); // generous workspace - required by dget22 lwork = max(lwork, N * ( 5 + 2*N)); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( w1i, double, N ); TESTING_MALLOC_CPU( w2i, double, N ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( VL, double, n2 ); TESTING_MALLOC_PIN( VR, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf(" N CPU Time(s) GPU Time(s) ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<8; i++){ if ( argc == 1 ){ N = size[i]; } lda = N; n2 = lda*N; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up magma_dgeev(jobl, jobr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); gpu_time = magma_wtime(); magma_dgeev(jobl, jobr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgeev had an illegal value.\n", (int) -info); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeev(lapack_const(jobl), lapack_const(jobr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of dgeev had an illegal value.\n", (int) -info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ if ( checkres ) { /* =================================================================== Check the result following LAPACK's [zcds]drvev routine. The following 7 tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * ugate-transpose of A, and W is as above. * * (3) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (6) VR(full) = VR(partial) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (7) VL(full) = VL(partial) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. ================================================================= */ int jj; double ulp, ulpinv, vmx, vrmx, vtst, res[2]; double *LRE, DUM; TESTING_MALLOC_PIN( LRE, double, n2 ); ulp = lapackf77_dlamch( "P" ); ulpinv = 1./ulp; // Initialize RESULT for (j = 0; j < 8; j++) result[j] = -1.; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); // Do test 1 lapackf77_dget22("N", "N", "N", &N, h_A, &lda, VR, &lda, w1, w1i, h_work, res); result[0] = res[0]; result[0] *= ulp; // Do test 2 lapackf77_dget22("T", "N", "T", &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[1]); result[1] *= ulp; // Do test 3 result[2] = -1.; for (j = 0; j < N; ++j) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_dnrm2(N, &VR[j * lda], ione); else if (w1i[j] > 0.) tnrm = magma_dlapy2( cblas_dnrm2(N, &VR[j * lda], ione), cblas_dnrm2(N, &VR[(j+1)* lda], ione) ); result[2] = fmax(result[2], fmin(ulpinv, magma_abs(tnrm-1.)/ulp)); if (w1i[j] > 0.) { vmx = vrmx = 0.; for (jj = 0; jj <N; ++jj) { vtst = magma_dlapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && magma_abs( VR[jj+j*lda] ) > vrmx) vrmx = magma_abs( VR[jj+j*lda] ); } if (vrmx / vmx < 1. - ulp * 2.) result[2] = ulpinv; } } result[2] *= ulp; // Do test 4 result[3] = -1.; for (j = 0; j < N; ++j) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_dnrm2(N, &VL[j * lda], ione); else if (w1i[j] > 0.) tnrm = magma_dlapy2( cblas_dnrm2(N, &VL[j * lda], ione), cblas_dnrm2(N, &VL[(j+1)* lda], ione) ); result[3] = fmax(result[3], fmin(ulpinv, magma_abs(tnrm-1.)/ulp)); if (w1i[j] > 0.) { vmx = vrmx = 0.; for (jj = 0; jj <N; ++jj) { vtst = magma_dlapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && magma_abs( VL[jj+j*lda]) > vrmx) vrmx = magma_abs( VL[jj+j*lda] ); } if (vrmx / vmx < 1. - ulp * 2.) result[3] = ulpinv; } } result[3] *= ulp; // Compute eigenvalues only, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaNoVec, MagmaNoVec, N, h_R, lda, w2, w2i, &DUM, 1, &DUM, 1, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case N, N\n", (int) info); } // Do test 5 result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with N N\n"); // Compute eigenvalues and right eigenvectors, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case N, V\n", (int) info); } // Do test 5 again result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with N V\n"); // Do test 6 result[5] = 1; for (j = 0; j < N; ++j) for (jj = 0; jj < N; ++jj) if ( VR[j+jj*lda] != LRE[j+jj*lda] ) result[5] = 0; // Compute eigenvalues and left eigenvectors, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case V, N\n", (int) info); } // Do test 5 again result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with V N\n"); // Do test 7 result[6] = 1; for (j = 0; j < N; ++j) for (jj = 0; jj < N; ++jj) if ( VL[j+jj*lda] != LRE[j+jj*lda] ) result[6] = 0; printf("Test 1: | A * VR - VR * W | / ( n |A| ) = %e\n", result[0]); printf("Test 2: | A'* VL - VL * W'| / ( n |A| ) = %e\n", result[1]); printf("Test 3: | |VR(i)| - 1 | = %e\n", result[2]); printf("Test 4: | |VL(i)| - 1 | = %e\n", result[3]); printf("Test 5: W (full) == W (partial) = %f\n", result[4]); printf("Test 6: VR (full) == VR (partial) = %f\n", result[5]); printf("Test 7: VL (full) == VL (partial) = %f\n", result[6]); //==================================================================== matnorm = lapackf77_dlange("f", &N, &ione, w1, &N, h_work); blasf77_daxpy(&N, &c_neg_one, w1, &ione, w2, &ione); result[7] = lapackf77_dlange("f", &N, &ione, w2, &N, h_work) / matnorm; printf("%5d %6.2f %6.2f %e\n", (int) N, cpu_time, gpu_time, result[7]); TESTING_FREE_PIN( LRE ); } else { printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; /* Initialize */ TESTING_INIT(); magma_queue_t queue=NULL; magma_queue_create( &queue ); magmablasSetKernelStream( queue ); magma_int_t j, n=1000000, FLOPS; float one = MAGMA_S_MAKE( 1.0, 0.0 ); float two = MAGMA_S_MAKE( 2.0, 0.0 ); magma_s_matrix a={Magma_CSR}, ad={Magma_CSR}, bd={Magma_CSR}, cd={Magma_CSR}; CHECK( magma_svinit( &a, Magma_CPU, n, 1, one, queue )); CHECK( magma_svinit( &bd, Magma_DEV, n, 1, two, queue )); CHECK( magma_svinit( &cd, Magma_DEV, n, 1, one, queue )); CHECK( magma_smtransfer( a, &ad, Magma_CPU, Magma_DEV, queue )); real_Double_t start, end, res; FLOPS = 2*n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) res = magma_snrm2(n, ad.dval, 1); end = magma_sync_wtime( queue ); printf( " > MAGMA nrm2: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) magma_sscal( n, two, ad.dval, 1 ); end = magma_sync_wtime( queue ); printf( " > MAGMA scal: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = 2*n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) magma_saxpy( n, one, ad.dval, 1, bd.dval, 1 ); end = magma_sync_wtime( queue ); printf( " > MAGMA axpy: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) magma_scopy( n, bd.dval, 1, ad.dval, 1 ); end = magma_sync_wtime( queue ); printf( " > MAGMA copy: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); FLOPS = 2*n; start = magma_sync_wtime( queue ); for (j=0; j<100; j++) res = MAGMA_S_REAL( magma_sdot(n, ad.dval, 1, bd.dval, 1) ); end = magma_sync_wtime( queue ); printf( " > MAGMA dotc: %.2e seconds %.2e GFLOP/s\n", (end-start)/100, FLOPS*100/1e9/(end-start) ); printf("# tester BLAS: ok\n"); magma_smfree( &a, queue); magma_smfree(&ad, queue); magma_smfree(&bd, queue); magma_smfree(&cd, queue); cleanup: magma_smfree( &a, queue); magma_smfree(&ad, queue); magma_smfree(&bd, queue); magma_smfree(&cd, queue); magmablasSetKernelStream( NULL ); magma_queue_destroy( queue ); magma_finalize(); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing claswp */ int main( int argc, char** argv) { /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magmaFloatComplex *h_A1, *h_A2, *h_A3, *h_AT; magmaFloatComplex_ptr d_A1; real_Double_t gpu_time, cpu_time1, cpu_time2; /* Matrix size */ int M=0, N=0, n2, lda, ldat; int size[7] = {1000,2000,3000,4000,5000,6000,7000}; int i, j; int ione = 1; int ISEED[4] = {0,0,0,1}; int *ipiv; int k1, k2, r, c, incx; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_claswp -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); M = N = size[6]; } lda = M; n2 = M*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A1, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A3, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_AT, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A1, magmaFloatComplex, n2 ); ipiv = (int*)malloc(M * sizeof(int)); if (ipiv == 0) { fprintf (stderr, "!!!! host memory allocation error (ipiv)\n"); } printf("\n\n"); printf(" M N CPU_BLAS (sec) CPU_LAPACK (sec) GPU (sec) \n"); printf("=============================================================================\n"); for(i=0; i<7; i++) { if(argc == 1){ M = N = size[i]; } lda = M; ldat = N; n2 = M*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A1 ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A1, &lda, h_A2, &lda ); for(r=0;r<M;r++){ for(c=0;c<N;c++){ h_AT[c+r*ldat] = h_A1[r+c*lda]; } } magma_csetmatrix( N, M, h_AT, 0, ldat, d_A1, 0, ldat, queue); for(j=0; j<M; j++) { ipiv[j] = (int)((rand()*1.*M) / (RAND_MAX * 1.)) + 1; } /* * BLAS swap */ /* Column Major */ cpu_time1 = magma_wtime(); for ( j=0; j<M; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A1+(ipiv[j]-1), &lda); } } cpu_time1 = magma_wtime() - cpu_time1; /* * LAPACK laswp */ cpu_time2 = magma_wtime(); k1 = 1; k2 = M; incx = 1; lapackf77_claswp(&N, h_A2, &lda, &k1, &k2, ipiv, &incx); cpu_time2 = magma_wtime() - cpu_time2; /* * GPU swap */ /* Col swap on transpose matrix*/ gpu_time = magma_wtime(); magma_cpermute_long2(N, d_A1, 0, ldat, ipiv, M, 0, queue); gpu_time = magma_wtime() - gpu_time; /* Check Result */ magma_cgetmatrix( N, M, d_A1, 0, ldat, h_AT, 0, ldat, queue); for(r=0;r<N;r++){ for(c=0;c<M;c++){ h_A3[c+r*lda] = h_AT[r+c*ldat]; } } int check_bl, check_bg, check_lg; check_bl = diffMatrix( h_A1, h_A2, M, N, lda ); check_bg = diffMatrix( h_A1, h_A3, M, N, lda ); check_lg = diffMatrix( h_A2, h_A3, M, N, lda ); printf("%5d %5d %6.2f %6.2f %6.2f %s %s %s\n", M, N, cpu_time1, cpu_time2, gpu_time, (check_bl == 0) ? "SUCCESS" : "FAILED", (check_bg == 0) ? "SUCCESS" : "FAILED", (check_lg == 0) ? "SUCCESS" : "FAILED"); if(check_lg !=0){ printf("lapack swap results:\n"); magma_cprint(M, N, h_A1, lda); printf("gpu swap transpose matrix result:\n"); magma_cprint(M, N, h_A3, lda); } if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A1 ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_A3 ); TESTING_FREE_CPU( h_AT ); TESTING_FREE_DEV( d_A1 ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *h_A, *h_R; magmaFloat_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_spotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_spotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], float, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_S_SET2REAL( h_A(i,i), MAGMA_S_REAL(h_A(i,i)) + N ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_S_CNJG( h_A(j,i) ); } } lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_spotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix(N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], 0, lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_spotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_spotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, float, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_sgetrf( M, N, h_A, lda, ipiv, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
void magmaf_finalize( void ) { magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR}, dA={Magma_CSR}, dA_SELLP={Magma_CSR}; magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR}, dy={Magma_CSR}, hrefvec={Magma_CSR}, hcheck={Magma_CSR}; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; #ifdef MAGMA_WITH_MKL magma_int_t *pntre=NULL; #endif cusparseHandle_t cusparseHandle = NULL; cusparseMatDescr_t descr = NULL; float c_one = MAGMA_S_MAKE(1.0, 0.0); float c_zero = MAGMA_S_MAKE(0.0, 0.0); float accuracy = 1e-10; #define PRECISION_s #if defined(PRECISION_c) accuracy = 1e-4; #endif #if defined(PRECISION_s) accuracy = 1e-4; #endif magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf("\n# usage: ./run_sspmm" " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n", (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue )); } else { // file-matrix test TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; // m - number of rows for the sparse matrix // n - number of vectors to be multiplied in the SpMM product magma_int_t m, n; m = hA.num_rows; n = 48; // init CPU vectors TESTING_CHECK( magma_svinit( &hx, Magma_CPU, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue )); // init DEV vectors TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue )); // calling MKL with CSR #ifdef MAGMA_WITH_MKL TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) ); pntre[0] = 0; for (j=0; j < m; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT num_vecs = n; MKL_INT *col; TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } // === Call MKL with consecutive SpMVs, using mkl_scsrmv === // warmp up mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); // === Call MKL with blocked SpMVs, using mkl_scsrmm === char transa = 'n'; MKL_INT ldb = n, ldc=n; char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'}; // warm up mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); } end = magma_wtime(); printf( "\n > MKL SpMM : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); magma_free_cpu( row ); magma_free_cpu( col ); row = NULL; col = NULL; #endif // MAGMA_WITH_MKL // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue )); // SpMV on GPU (CSR) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue )); magma_smfree(&dA, queue ); // convert to SELLP and copy to GPU TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue )); TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue )); magma_smfree(&hA_SELLP, queue ); magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res=res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm SELL-P: ok\n"); else printf("%% tester spmm SELL-P: failed\n"); magma_smfree( &hcheck, queue ); magma_smfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); //#ifdef PRECISION_d start = magma_sync_wtime( queue ); TESTING_CHECK( cusparseCreate( &cusparseHandle )); TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) )); TESTING_CHECK( cusparseCreateMatDescr( &descr )); TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); float alpha = c_one; float beta = c_zero; // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) ); for (j=0; j < 10; j++) { cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta, dy.dval, dA.num_cols); } end = magma_sync_wtime( queue ); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm cuSPARSE: ok\n"); else printf("%% tester spmm cuSPARSE: failed\n"); magma_smfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); descr = NULL; cusparseHandle = NULL; //#endif printf("\n\n"); // free CPU memory magma_smfree( &hA, queue ); magma_smfree( &hx, queue ); magma_smfree( &hy, queue ); magma_smfree( &hrefvec, queue ); // free GPU memory magma_smfree( &dx, queue ); magma_smfree( &dy, queue ); magma_smfree( &dA, queue); #ifdef MAGMA_WITH_MKL magma_free_cpu( pntre ); #endif i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_copts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); real_Double_t res; magma_c_matrix A={Magma_CSR}, A2={Magma_CSR}, A3={Magma_CSR}, A4={Magma_CSR}, A5={Magma_CSR}; int i=1; TESTING_CHECK( magma_cparse_opts( argc, argv, &zopts, &i, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); // filename for temporary matrix storage const char *filename = "testmatrix.mtx"; // write to file TESTING_CHECK( magma_cwrite_csrtomtx( A, filename, queue )); // read from file TESTING_CHECK( magma_c_csr_mtx( &A2, filename, queue )); // delete temporary matrix unlink( filename ); //visualize printf("A2:\n"); TESTING_CHECK( magma_cprint_matrix( A2, queue )); //visualize TESTING_CHECK( magma_cmconvert(A2, &A4, Magma_CSR, Magma_CSRL, queue )); printf("A4:\n"); TESTING_CHECK( magma_cprint_matrix( A4, queue )); TESTING_CHECK( magma_cmconvert(A4, &A5, Magma_CSR, Magma_ELL, queue )); printf("A5:\n"); TESTING_CHECK( magma_cprint_matrix( A5, queue )); // pass it to another application and back magma_int_t m, n; magma_index_t *row, *col; magmaFloatComplex *val=NULL; TESTING_CHECK( magma_ccsrget( A2, &m, &n, &row, &col, &val, queue )); TESTING_CHECK( magma_ccsrset( m, n, row, col, val, &A3, queue )); TESTING_CHECK( magma_cmdiff( A, A2, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester IO: ok\n"); else printf("%% tester IO: failed\n"); TESTING_CHECK( magma_cmdiff( A, A3, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester matrix interface: ok\n"); else printf("%% tester matrix interface: failed\n"); magma_cmfree(&A, queue ); magma_cmfree(&A2, queue ); magma_cmfree(&A4, queue ); magma_cmfree(&A5, queue ); i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgesv */ int main(int argc, char **argv) { real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_LU, *h_B, *h_X; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( device[ opts.device ], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[ opts.device ], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; ldb = lda; gflops = ( FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_LU, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); // copy A to LU and B to X; save A and B for residual lapackf77_zlacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_zlacpy( "F", &N, &nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zgesv( N, nrhs, h_LU, lda, ipiv, h_X, ldb, &info, queue ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== Anorm = lapackf77_zlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status |= ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_LU ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
SEXP magmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri) { magma_init(); int ndevices; double *h_R; ndevices = INTEGER_VALUE(ngpu); int idevice; for(idevice=0; idevice < ndevices; idevice++) { magma_setdevice(idevice); if(CUBLAS_STATUS_SUCCESS != cublasInit()) { printf("Error: gpu %d: cublasInit failed\n", idevice); magma_finalize(); exit(-1); } } // magma_print_devices(); int In, INB; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); double *PA = NUMERIC_POINTER(A); int i,j; //magma_timestr_t start, end; double gpu_time; printf("Inside magma_dpotrf_m"); /*for(i = 0; i < 5; i++) { for(j = 0; j < 5; j++) { printf("%.8f ", PA[i+j*In]); } printf("\n"); } */ magma_int_t N, status, info, nGPU, n2, lda; clock_t t1, t2; N = In; status = 0; int nGPUs = ndevices; lda = N; n2 = lda*N; if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) { fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); magma_finalize(); exit(-1); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda ); //printf("Modified by Vinay in 2 GPU\n"); //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri){ t1 = clock(); magma_dpotrf_m(nGPUs, MagmaLower, N, h_R, N, &info); t2 = clock (); } else{ t1 = clock(); magma_dpotrf_m(nGPUs, MagmaUpper, N, h_R, N, &info); t2 = clock (); } gpu_time = (double) (t2-t1)/(CLOCKS_PER_SEC) ; // Magma time printf (" magma_dpotrf_m time : %f sec. \n", gpu_time ); if(info != 0) { printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda ); magma_free_pinned(h_R); magma_finalize(); cublasShutdown(); int IZeroTri; IZeroTri = INTEGER_VALUE(zeroTri); if(IZeroTri & lTri) { for(i = 1; i < In; i++) { for(j=0; j< i; j++) { PA[i*In+j] = 0.0; } } } else if(IZeroTri){ for(i = 0; i < In; i++) { for(j=i+1; j < In; j++) { PA[i*In+j] = 0.0; } } } return(R_NilValue); }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magma_c_matrix A={Magma_CSR}, B_d={Magma_CSR}; magma_c_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %lld-by-%lld with %lld nonzeros\n\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); magma_int_t n = A.num_rows; TESTING_CHECK( magma_cmtransfer( A, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess TESTING_CHECK( magma_cvinit( &b, Magma_DEV, A.num_cols, 1, zero, queue )); TESTING_CHECK( magma_cvinit( &x, Magma_DEV, A.num_cols, 1, one, queue )); TESTING_CHECK( magma_cprint_vector( b, 90, 10, queue )); TESTING_CHECK( magma_cprint_matrix( A, queue )); printf("\n\n\n"); TESTING_CHECK( magma_cprint_matrix( B_d, queue )); float res; res = magma_scnrm2( n, b.dval, 1, queue ); printf("norm0: %f\n", res); TESTING_CHECK( magma_c_spmv( one, B_d, x, zero, b, queue )); // b = A x TESTING_CHECK( magma_cprint_vector( b, 0, 100, queue )); TESTING_CHECK( magma_cprint_vector( b, b.num_rows-10, 10, queue )); res = magma_scnrm2( n, b.dval, 1, queue ); printf("norm: %f\n", res); TESTING_CHECK( magma_cresidual( B_d, x, b, &res, queue )); printf("res: %f\n", res); magma_cmfree(&B_d, queue ); magma_cmfree(&A, queue ); magma_cmfree(&x, queue ); magma_cmfree(&b, queue ); i++; } magma_queue_destroy( queue ); magma_finalize(); return info; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *h_R = NULL, *h_P = NULL; magmaFloat_ptr d_lA[MagmaMaxSubs * MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, check = 0, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t num_gpus0 = 1, num_gpus, num_subs0 = 1, num_subs, tot_subs, flag = 0; magma_int_t nb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i]) == 0){ N = atoi(argv[++i]); if (N > 0) { size[0] = size[9] = N; flag = 1; } } if(strcmp("-NGPU", argv[i]) == 0) num_gpus0 = atoi(argv[++i]); if(strcmp("-NSUB", argv[i]) == 0) num_subs0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i]) == 0) uplo = (strcmp("L", argv[++i]) == 0 ? MagmaLower : MagmaUpper); if(strcmp("-check", argv[i]) == 0) check = 1; } } /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus0;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } printf("\nUsing %d GPUs:\n", num_gpus0); printf(" testing_spotrf_msub -N %d -NGPU %d -NSUB %d -UPLO %c %s\n\n", size[0], num_gpus0,num_subs0, (uplo == MagmaLower ? 'L' : 'U'),(check == 1 ? "-check" : " ")); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_SPOTRF( N ) / 1e9;; nb = magma_get_spotrf_nb(N); if (num_subs0*num_gpus0 > N/nb) { num_gpus = N/nb; num_subs = 1; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); } else { num_gpus = num_gpus0; num_subs = num_subs0; } tot_subs = num_subs * num_gpus; /* Allocate host memory for the matrix */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer1 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, n2*sizeof(float), NULL, NULL); cl_mem buffer2 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, lda*nb*sizeof(float), NULL, NULL); for (k=0; k<num_gpus; k++) { h_R = (float*)clEnqueueMapBuffer(queues[2*k], buffer1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, n2*sizeof(float), 0, NULL, NULL, NULL); h_P = (float*)clEnqueueMapBuffer(queues[2*k], buffer2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lda*nb*sizeof(float), 0, NULL, NULL, NULL); } #else TESTING_MALLOC_PIN( h_P, float, lda*nb ); TESTING_MALLOC_PIN( h_R, float, n2 ); #endif /* Initialize the matrix */ init_matrix( N, h_R, lda ); /* Allocate GPU memory */ if (uplo == MagmaUpper) { ldda = ((N+nb-1)/nb)*nb; n_local = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; } else { ldda = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; n_local = ((N+nb-1)/nb)*nb; } for (j=0; j<tot_subs; j++) { TESTING_MALLOC_DEV( d_lA[j], float, n_local*ldda ); } /* Warm up to measure the performance */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_ssetmatrix( j+nk, nk, &h_R[j*lda], lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (magma_int_t kk = 0; kk<tot_subs; kk++) { magma_int_t mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { magma_int_t mii = min(nb, N-ii); lapackf77_slacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_ssetmatrix( mk, nk, h_P, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_ssetmatrix( nk, j+nk, &h_R[j], lda, d_lA[k], j/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); }*/ } magma_spotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_ssetmatrix( j+nk, nk, &h_R[j*lda], lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (magma_int_t kk = 0; kk<tot_subs; kk++) { magma_int_t mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { magma_int_t mii = min(nb, N-ii); lapackf77_slacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_ssetmatrix( mk, nk, h_P, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_ssetmatrix( nk, j+nk, &h_R[j], lda, d_lA[k], (j/(nb*tot_subs)*nb), ldda, queues[2*(k%num_gpus)]); }*/ } gpu_time = magma_wtime(); magma_spotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); /* gather matrix from gpus */ if (uplo==MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_sgetmatrix( j+nk, nk, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, &h_R[j*lda], lda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (magma_int_t kk = 0; kk<tot_subs; kk++) { k = ((j+kk*nb)/nb)%tot_subs; magma_int_t mk = 0; mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { mk += min(nb, N-ii); } if (mk > 0 && nk > 0) { magma_sgetmatrix( mk, nk, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, h_P, lda, queues[2*(k%num_gpus)]); } mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { magma_int_t mii = min(nb, N-ii); lapackf77_slacpy( MagmaFullStr, &mii, &nk, &h_P[mk], &lda, &h_R[ii+j*lda], &lda ); mk += mii; } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_sgetmatrix( nk, j+nk, d_lA[k], (j/(nb*tot_subs)*nb), ldda, &h_R[j], lda, queues[2*(k%num_gpus)] ); }*/ } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if (check == 1) { float work[1], matnorm, diffnorm; float *h_A; TESTING_MALLOC_PIN( h_A, float, n2 ); init_matrix( N, h_A, lda ); cpu_time = magma_wtime(); if (uplo == MagmaLower) { lapackf77_spotrf( MagmaLowerStr, &N, h_A, &lda, &info ); } else { lapackf77_spotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); TESTING_FREE_PIN( h_A ); } else { printf( "%5d - - (- -) %6.2f (%6.2f) - -\n", N, gpu_perf, gpu_time ); } // free memory #ifdef USE_PINNED_CLMEMORY for (k=0; k<num_gpus; k++) { clEnqueueUnmapMemObject(queues[2*k], buffer1, h_R, 0, NULL, NULL); clEnqueueUnmapMemObject(queues[2*k], buffer2, h_P, 0, NULL, NULL); } clReleaseMemObject(buffer1); clReleaseMemObject(buffer2); #else TESTING_FREE_PIN( h_P ); TESTING_FREE_PIN( h_R ); #endif for (j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } /* clean up */ for (i=0; i<num_gpus; i++) { magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); return 0; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix_1D_col_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaDouble_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, lda*N ); TESTING_MALLOC_PIN( h_A2, double, lda*N ); TESTING_MALLOC_PIN( h_B, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, double, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, double, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **S, PyGpuArrayObject **U, // may be NULL PyGpuArrayObject **VT, // may be NULL PARAMS_TYPE* params) { bool compute_uv = (U != NULL); magma_int_t *iwork = NULL, iunused[1]; magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info; magma_vec_t jobz; size_t s_dims[1], u_dims[2], vt_dims[2]; float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL, *work = NULL; float dummy[1]; int res = -1, lwork; if (A->ga.typecode != GA_FLOAT) { PyErr_SetString(PyExc_TypeError, "GpuMagmaMatrixInverse: Unsupported data type"); return -1; } // This is early to match the exit() in the fail label. cuda_enter(params->context->ctx); magma_init(); if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { PyErr_SetString(PyExc_ValueError, "GpuMagmaMatrixInverse: requires data to be C-contiguous"); return 1; } if (PyGpuArray_NDIM(A) != 2) { PyErr_SetString(PyExc_ValueError, "GpuMagmaMatrixInverse: matrix rank error"); goto fail; } // magma matrix svd // reverse dimensions because MAGMA expects column-major matrices: M = PyGpuArray_DIM(A, 1); N = PyGpuArray_DIM(A, 0); K = std::min(M, N); if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) { PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float), cudaMemcpyDeviceToDevice); if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) { PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } if (compute_uv) { if (params->full_matrices) { jobz = MagmaAllVec; } else { jobz = MagmaSomeVec; } M_U = (jobz == MagmaAllVec ? M : K); N_VT = (jobz == MagmaAllVec ? N : K); ldu = M; ldv = N_VT; if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) { PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) { PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } } else { jobz = MagmaNoVec; ldu = M; ldv = N; } // query for workspace size magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv, dummy, -1, iunused, &info); lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]); if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) { PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate working memory"); goto fail; } if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) { PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate working memory"); goto fail; } // compute svd magma_sgesdd(jobz, M, N, a_data, M, s_data, u_data, ldu, vt_data, ldv, work, lwork, iwork, &info); if (info > 0) { PyErr_Format( PyExc_RuntimeError, "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)", info); goto fail; } else if (info < 0) { PyErr_Format( PyExc_RuntimeError, "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info); goto fail; } s_dims[0] = K; if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){ PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float), cudaMemcpyDeviceToDevice); if (compute_uv) { u_dims[0] = N; u_dims[1] = N_VT; if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){ PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U // to match numpy.linalg.svd output cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float), cudaMemcpyDeviceToDevice); vt_dims[0] = M_U; vt_dims[1] = M; if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){ PyErr_SetString(PyExc_RuntimeError, "GpuMagmaSVD: failed to allocate memory"); goto fail; } // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U // to match numpy.linalg.svd output cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float), cudaMemcpyDeviceToDevice); } res = 0; fail: if (a_data != NULL) magma_free_pinned(a_data); if (s_data != NULL) magma_free_pinned(s_data); if (u_data != NULL) magma_free_pinned(u_data); if (vt_data != NULL) magma_free_pinned(vt_data); if (work != NULL) magma_free_pinned(work); if (iwork != NULL) magma_free_cpu(iwork); magma_finalize(); cuda_exit(params->context->ctx); return res; }
int main( int argc, char** argv) { real_Double_t gflops, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time; float magma_error, clblas_error, work[1]; magma_trans_t transA = MagmaNoTrans; magma_trans_t transB = MagmaNoTrans; magma_int_t istart = 1024; magma_int_t iend = 6240; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float *h_A, *h_B, *h_C, *h_C2, *h_C3; magmaFloat_ptr d_A, d_B, d_C; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); float beta = MAGMA_S_MAKE( -0.48, 0.38 ); int lapack = getenv("MAGMA_RUN_LAPACK") != NULL; int count = 1; printf("\nUsage: testing_sgemm [-NN|NT|TN|TT|NC|CN|TC|CT|CC] -M m -N n -K k -count c -l\n" " -l or setting $MAGMA_RUN_LAPACK runs CPU BLAS,\n" " and computes both MAGMA and CLBLAS error using CPU BLAS result.\n" " Else, MAGMA error is computed using CLBLAS result.\n\n"); for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 && i+1 < argc ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 && i+1 < argc ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaConjTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaConjTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaConjTrans; } else if (strcmp("-l", argv[i])==0) { lapack = true; } else if ( strcmp("-count", argv[i]) == 0 && i+1 < argc ){ count = atoi(argv[++i]); } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ldda; K += 32; M += 32; N += 32; TESTING_MALLOC_CPU( h_A, float, lda*K ); TESTING_MALLOC_CPU( h_B, float, ldb*Bn ); TESTING_MALLOC_CPU( h_C, float, ldc*N ); TESTING_MALLOC_CPU( h_C2, float, ldc*N ); TESTING_MALLOC_CPU( h_C3, float, ldc*N ); TESTING_MALLOC_DEV( d_A, float, ldda*K ); TESTING_MALLOC_DEV( d_B, float, lddb*Bn ); TESTING_MALLOC_DEV( d_C, float, lddc*N ); printf("Testing transA = %c transB = %c\n", *lapack_const(transA), *lapack_const(transB)); printf(" M N K MAGMA Gflop/s (sec) CLBLAS Gflop/s (sec) CPU Gflop/s (sec) MAGMA error CLBLAS error\n"); printf("===========================================================================================================\n"); for( i=istart; i<iend; i = (int)(i*1.25) ) { for( int cnt = 0; cnt < count; ++cnt ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS_SGEMM( M, N, K ) / 1e9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &szeA, h_A ); lapackf77_slarnv( &ione, ISEED, &szeB, h_B ); lapackf77_slarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_ssetmatrix( Am, An, h_A, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( Bm, Bn, h_B, ldb, d_B, 0, lddb, queue ); magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magmablas_sgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime(); magmablas_sgemm_reduce( M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); magma_time = magma_wtime() - magma_time; magma_perf = gflops / magma_time; magma_sgetmatrix( M, N, d_C, 0, lddc, h_C2, ldc, queue ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magma_sgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime(); magma_sgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync(queue); clblas_time = magma_wtime() - clblas_time; clblas_perf = gflops / clblas_time; magma_sgetmatrix( M, N, d_C, 0, lddc, h_C3, ldc, queue ); /* ===================================================================== Performs operation using BLAS =================================================================== */ if ( lapack ) { cpu_time = magma_wtime(); blasf77_sgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ if ( lapack ) { // compare both magma & clblas to lapack blasf77_saxpy(&szeC, &c_neg_one, h_C, &ione, h_C2, &ione); magma_error = lapackf77_slange("M", &M, &N, h_C2, &ldc, work); blasf77_saxpy(&szeC, &c_neg_one, h_C, &ione, h_C3, &ione); clblas_error = lapackf77_slange("M", &M, &N, h_C3, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) %7.2f (%7.4f) %8.2e %8.2e\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time, magma_error, clblas_error ); } else { // compare magma to clblas blasf77_saxpy(&szeC, &c_neg_one, h_C3, &ione, h_C2, &ione); magma_error = lapackf77_slange("M", &M, &N, h_C2, &ldc, work); printf("%5d %5d %5d %7.2f (%7.4f) %7.2f (%7.4f) --- ( --- ) %8.2e ---\n", (int) M, (int) N, (int) K, magma_perf, magma_time, clblas_perf, clblas_time, magma_error ); } } if ( count > 1 ) { printf( "\n" ); } } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_CPU( h_C3 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }
~GpuSolver(){ magma_free( dA ); magma_free( dB ); magma_finalize(); };
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A, dwork; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 }; magma_int_t ntest = 10; magma_int_t i, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0, 0, 0, 1}; magmaDoubleComplex *work; magmaDoubleComplex tmp; double rwork[1]; magma_int_t *ipiv; magma_int_t lwork, ldwork; double A_norm, R_norm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[ntest-1] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zgetri_gpu -N %d\n\n", 1024); } /* query for Lapack workspace size */ N = size[ntest-1]; lda = N; work = &tmp; lwork = -1; lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); lwork = int( MAGMA_Z_REAL( *work )); /* query for Magma workspace size */ ldwork = N * magma_get_zgetri_nb( N ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory */ n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork ); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for( i=0; i < ntest; i++ ){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_ZGETRI( (double)N ) / 1e9; ldda = ((N+31)/32)*32; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue ); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 ); //magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ //warm-up magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue ); gpu_time = magma_wtime()-gpu_time; if (info != 0) printf("magma_zgetri_gpu returned error %d\n", (int) info); gpu_perf = gflops / gpu_time; magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, 0, lda, queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zgetri returned error %d\n", (int) info); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); R_norm = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork ); printf( "%5d %6.2f %6.2f %e\n", (int) N, cpu_perf, gpu_perf, R_norm / A_norm ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *hA, *hR; magmaDoubleComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zpotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( hR, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_Z_SET2REAL( hA(i,i), MAGMA_Z_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_Z_CNJG( hA(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = get_time(); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = get_time() - gpu_time; if (info != 0) printf( "magma_zpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zpotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_zgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work); blasf77_zaxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE( hA ); TESTING_FREE_HOST( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); double *A, *B, *C; double *dA, *dB, *dC; double error, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n = 10; magma_int_t lda = n; magma_int_t ldda = ((n+31)/32)*32; magma_int_t size = lda*n; magma_int_t info; magma_dmalloc_cpu( &A, lda*n ); magma_dmalloc_cpu( &B, lda*n ); magma_dmalloc_cpu( &C, lda*n ); magma_dmalloc( &dA, ldda*n ); magma_dmalloc( &dB, ldda*n ); magma_dmalloc( &dC, ldda*n ); // initialize matrices lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_D_ADD( C[i+i*lda], MAGMA_D_MAKE( n*n, 0 )); } magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_dsetmatrix( n, n, B, lda, dB, ldda ); magma_dsetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_dpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_dpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference magma_dgetmatrix( n, n, dC, ldda, A, lda ); blasf77_daxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_dlange( "F", &n, &n, A, &lda, work ); printf( "n %d, error %8.2e\n", n, error ); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); cublasDestroy( handle ); magma_finalize(); return 0; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double matnorm, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, flag = 0, n2, check = 0; magma_int_t n_local[MagmaMaxGPUs*MagmaMaxSubs], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, info, min_mn, nb; int num_gpus = 1, num_subs = 1, tot_subs = 1; magma_int_t ione = 1; M = N = size[9]; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); flag = 1; } else if (strcmp("-M", argv[i])==0) { M = atoi(argv[++i]); flag = 1; } else if (strcmp("-NGPU", argv[i])==0) { num_gpus = atoi(argv[++i]); } else if (strcmp("-NSUB", argv[i])==0) { num_subs = atoi(argv[++i]); } else if (strcmp("-check", argv[i])==0) { check = 1; } } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } } if (num_gpus > MagmaMaxGPUs){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = MagmaMaxGPUs; } if (num_subs > MagmaMaxSubs) { printf("More buffers requested than available. Have to change it.\n"); num_subs = MagmaMaxSubs; } tot_subs = num_gpus * num_subs; printf("\nNumber of GPUs to be used = %d\n", (int) num_gpus); printf("Usage: \n"); printf(" testing_zgeqrf_msub -M %d -N %d -NGPU %d -NSUB %d %s\n\n", M, N, num_gpus, num_subs, (check == 1 ? "-check" : " ")); /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for (i=0; i<num_gpus; i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf( "\n" ); printf(" M N CPU GFlop/s (sec.) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("==========================================================================\n"); for(i=0; i<10; i++){ if (flag == 0) { M = N = size[i]; } nb = magma_get_zgeqrf_nb(M); min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_ZGEQRF( (double)M, (double)N ) / 1e9; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); /* Allocate host workspace */ lhwork = -1; lapackf77_zgeqrf(&M, &N, h_R, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lhwork ); /* Allocate device memory for the matrix */ for (int j=0; j<tot_subs; j++) { n_local[j] = ((N/nb)/tot_subs)*nb; if (j < (N/nb)%tot_subs) n_local[j] += nb; else if (j == (N/nb)%tot_subs) n_local[j] += N%nb; TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*n_local[j] ); } /* Initialize the matrix */ init_matrix( M, N, h_R, lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_queue_t *trans_queues = (magma_queue_t*)malloc(tot_subs*sizeof(magma_queue_t)); for (int j=0; j<tot_subs; j++) { trans_queues[j] = queues[2*(j%num_gpus)]; } // warm-up magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues); magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues); gpu_time = magma_wtime(); magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info < 0) printf("Argument %d of magma_zgeqrf_msub had an illegal value.\n", (int) -info); if (check == 1) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_zgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, tot_subs, nb, trans_queues); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); init_matrix( M, N, h_A, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapack_zgeqrf had an illegal value.\n", (int) -info); matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); TESTING_FREE_PIN( h_A ); } else { printf("%5d %5d -- ( -- ) %6.2f (%6.2f) --\n", (int) M, (int) N, gpu_perf, gpu_time ); } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for (int j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } for (i=0; i<num_gpus; i++) { magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }