/* //////////////////////////////////////////////////////////////////////////// -- Testing cpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magma_int_t N, n2, lda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hpd( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cpotrf( opts.uplo, N, h_R, lda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B, *h_X; magmaFloatComplex_ptr d_A, d_B; magma_int_t N, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = ldb = N; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*opts.nrhs ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); magma_csetmatrix( N, N, h_A, N, d_A, ldda ); magma_csetmatrix( N, opts.nrhs, h_B, N, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv_gpu( opts.uplo, N, opts.nrhs, d_A, ldda, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ magma_cgetmatrix( N, opts.nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chesv */ int main( int argc, char** argv) { TESTING_INIT(); magmaFloatComplex *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; float error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaFloatComplex, sizeB ); TESTING_MALLOC_PIN( h_X, magmaFloatComplex, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_chesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_C_REAL(temp); TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_chesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_chesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_clacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_chesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_chesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaFloatComplex *h_R = NULL, *h_P = NULL; magmaFloatComplex_ptr d_lA[MagmaMaxSubs * MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, check = 0, info; magmaFloatComplex mz_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t num_gpus0 = 1, num_gpus, num_subs0 = 1, num_subs, tot_subs, flag = 0; int nb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i]) == 0){ N = atoi(argv[++i]); if (N > 0) { size[0] = size[9] = N; flag = 1; } } if(strcmp("-NGPU", argv[i]) == 0) num_gpus0 = atoi(argv[++i]); if(strcmp("-NSUB", argv[i]) == 0) num_subs0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i]) == 0) uplo = (strcmp("L", argv[++i]) == 0 ? MagmaLower : MagmaUpper); if(strcmp("-check", argv[i]) == 0) check = 1; } } /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus0;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("\nUsing %d GPUs:\n", num_gpus0); printf(" testing_cpotrf_msub -N %d -NGPU %d -NSUB %d -UPLO %c %s\n\n", size[0], num_gpus0,num_subs0, (uplo == MagmaLower ? 'L' : 'U'),(check == 1 ? "-check" : " ")); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_CPOTRF( N ) / 1e9;; nb = magma_get_cpotrf_nb(N); if (num_subs0*num_gpus0 > N/nb) { num_gpus = N/nb; num_subs = 1; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); } else { num_gpus = num_gpus0; num_subs = num_subs0; } tot_subs = num_subs * num_gpus; /* Allocate host memory for the matrix */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer1 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, n2*sizeof(magmaFloatComplex), NULL, NULL); cl_mem buffer2 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, lda*nb*sizeof(magmaFloatComplex), NULL, NULL); for (k=0; k<num_gpus; k++) { h_R = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, n2*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); h_P = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lda*nb*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); } #else TESTING_MALLOC_PIN( h_P, magmaFloatComplex, lda*nb ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); #endif /* Initialize the matrix */ init_matrix( N, h_R, lda ); /* Allocate GPU memory */ if (uplo == MagmaUpper) { ldda = ((N+nb-1)/nb)*nb; n_local = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; } else { ldda = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; n_local = ((N+nb-1)/nb)*nb; } for (j=0; j<tot_subs; j++) { TESTING_MALLOC_DEV( d_lA[j], magmaFloatComplex, n_local*ldda ); } /* Warm up to measure the performance */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(j+nk, nk, &h_R[j*lda], 0, lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { int mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_csetmatrix(mk, nk, h_P, 0, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda, d_lA[k], j/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); }*/ } magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(j+nk, nk, &h_R[j*lda], 0, lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { int mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_csetmatrix(mk, nk, h_P, 0, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda, d_lA[k], (j/(nb*tot_subs)*nb), ldda, queues[2*(k%num_gpus)]); }*/ } gpu_time = magma_wtime(); magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf( "magma_cpotrf had error %d.\n", info ); /* gather matrix from gpus */ if (uplo==MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_cgetmatrix(j+nk, nk, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (int kk = 0; kk<tot_subs; kk++) { k = ((j+kk*nb)/nb)%tot_subs; int mk = 0; mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { mk += min(nb, N-ii); } if (mk > 0 && nk > 0) { magma_cgetmatrix(mk, nk, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, h_P, 0, lda, queues[2*(k%num_gpus)]); } mk = 0; for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { int mii = min(nb, N-ii); lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_P[mk], &lda, &h_R[ii+j*lda], &lda ); mk += mii; } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_cgetmatrix( nk, j+nk, d_lA[k], (j/(nb*tot_subs)*nb), ldda, &h_R[j], 0, lda, queues[2*(k%num_gpus)] ); }*/ } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if (check == 1) { float work[1], matnorm, diffnorm; magmaFloatComplex *h_A; TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); init_matrix( N, h_A, lda ); cpu_time = magma_wtime(); if (uplo == MagmaLower) { lapackf77_cpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); } else { lapackf77_cpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf( "lapackf77_cpotrf had error %d.\n", info ); /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work); blasf77_caxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_clange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); TESTING_FREE_PIN( h_A ); } else { printf( "%5d - - (- -) %6.2f (%6.2f) - -\n", N, gpu_perf, gpu_time ); } // free memory #ifdef USE_PINNED_CLMEMORY for (k=0; k<num_gpus; k++) { clEnqueueUnmapMemObject(queues[2*k], buffer1, h_R, 0, NULL, NULL); clEnqueueUnmapMemObject(queues[2*k], buffer2, h_P, 0, NULL, NULL); } clReleaseMemObject(buffer1); clReleaseMemObject(buffer2); #else TESTING_FREE_PIN( h_P ); TESTING_FREE_PIN( h_R ); #endif for (j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } /* clean up */ for (i=0; i<num_gpus; i++) { magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); return 0; }
int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t N, n2, lda, ldda, info; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magmaFloatComplex **d_A_array = NULL; magma_int_t *dinfo_magma; magma_int_t batchCount; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) batchCount = opts.batchcount; float tol = opts.tolerance * lapackf77_slamch("E"); printf("BatchCount N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; ldda = lda = ((N+31)/32)*32; n2 = lda* N * batchCount; gflops = batchCount * FLOPS_CPOTRF( N ) / 1e9 ; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda * N * batchCount); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount); magma_malloc((void**)&d_A_array, batchCount * sizeof(*d_A_array)); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for(int i=0; i<batchCount; i++) { magma_cmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_int_t columns = N * batchCount; lapackf77_clacpy( MagmaUpperLowerStr, &N, &(columns), h_A, &lda, h_R, &lda ); magma_csetmatrix( N, columns, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ cset_pointer(d_A_array, d_A, ldda, 0, 0, ldda * N, batchCount, queue); gpu_time = magma_sync_wtime(NULL); info = magma_cpotrf_batched( opts.uplo, N, d_A_array, ldda, dinfo_magma, batchCount, queue); gpu_time = magma_sync_wtime(NULL) - gpu_time; gpu_perf = gflops / gpu_time; magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_magma, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_cpotrf_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_cpotrf_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { lapackf77_cpotrf( lapack_uplo_const(opts.uplo), &N, h_A + i * lda * N, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix( N, columns, d_A, ldda, h_R, lda ); magma_int_t NN = lda*N; char const uplo = 'l'; // lapack_uplo_const(opts.uplo) float err = 0.0; for(int i=0; i<batchCount; i++) { error = lapackf77_clanhe("f", &uplo, &N, h_A + i * lda*N, &lda, work); blasf77_caxpy(&NN, &c_neg_one, h_A + i * lda*N, &ione, h_R + i * lda*N, &ione); error = lapackf77_clanhe("f", &uplo, &N, h_R + i * lda*N, &lda, work) / error; if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(fabs(error),err); } printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., err, (error < tol ? "ok" : "failed")); status += ! (err < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int)batchCount, (int) N, gpu_perf, gpu_time*1000. ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_A_array ); TESTING_FREE_DEV( dinfo_magma ); free(cpu_info); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cpotrf_m(magma_int_t num_gpus0, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may not fit entirely in the GPU memory. The factorization has the form A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ float d_one = 1.0; float d_neg_one = -1.0; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; char uplo_[2] = {uplo, 0}; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus; magma_int_t j, jj, jb, J, JB, NB, MB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; #ifdef ROW_MAJOR_PROFILE magma_timestr_t start, end, start0, end0; float chol_time = 1.0; #endif *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_dpotrf_nb(n); if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if( NB >= n ) { #ifdef CHECK_CPOTRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CPOTRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_CPOTRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); fflush(stdout); #endif for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda]; for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE start0 = get_current_time(); #endif if (nb <= 1 || nb >= n) { lapackf77_cpotrf(uplo_, &n, a, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. * * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-rows */ magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, stream, &iinfo); #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif /* update with the previous big-panels */ for( j=0; j<J; j+=nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj<JB; jj+=nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_cgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_cherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if( n > J+JB ) { for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*num_gpus))*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, // dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal (and diagonal!!!) big panel */ magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, stream, &iinfo); //magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, 0, a, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-columns */ magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif for( j=0; j<J; j+=nb ) { /* upload the diagonal of big panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj<JB; jj+=nb ) { /* diagonal */ d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_cgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_cherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if( n > J+JB ) { /* off-diagonal */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/num_gpus)*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, // dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal big panel */ magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ #ifdef ROW_MAJOR_PROFILE end0 = get_current_time(); #endif if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { if( stream[d][j] != NULL ) magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j<5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE printf("\n n=%d NB=%d nb=%d\n",n,NB,nb); printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, GetTimerValue(start0, end0), FLOPS_CPOTRF(n)/(1000000*GetTimerValue(start0, end0))); printf(" Performance %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, chol_time, FLOPS_CPOTRF(n)/(1000000*chol_time)); #endif return *info; } /* magma_cpotrf_ooc */
/* //////////////////////////////////////////////////////////////////////////// -- Testing cposv */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *h_B, *h_X; magma_int_t N, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d, uplo %c\n", (int) opts.ngpu, opts.uplo ); printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ldb = N; gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC( h_B, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC( h_X, magmaFloatComplex, ldb*opts.nrhs ); TESTING_MALLOC( work, float, N ); /* ==================================================================== Initialize the matrix =================================================================== */ sizeA = lda*N; sizeB = ldb*opts.nrhs; lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); magma_cmake_hpd( N, h_A, lda ); // copy A to R and B to X; save A and B for residual lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cposv( opts.uplo, N, opts.nrhs, h_R, lda, h_X, ldb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Residual =================================================================== */ Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb ); Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status |= ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cposv( &opts.uplo, &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cposv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e%s\n", (int) N, (int) opts.nrhs, gpu_perf, gpu_time, error, (error < tol ? "" : " failed")); } TESTING_FREE( h_A ); TESTING_FREE( h_R ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }