/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; magmaDouble_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); magma_dgetrf_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; double *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgetrf_nb( M ); gflops = FLOPS_DGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhesv */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; double error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_ZPOTRF( N ) + FLOPS_ZPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_PIN( h_X, magmaDoubleComplex, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_Z_REAL(temp); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_zhesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_zhesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* static void Print_start_data (const double *msr_matrix, const int *indexes, const int n, const int nz, const double *rhs, const int my_rank) { // Print matrix as arrays MSR ans INDEX print_msr_NF (msr_matrix, indexes, nz, my_rank); // Print rhs printf ("rhs = "); print_vector (rhs, n, my_rank); } */ void *msr_solver (void* args) { // class matrix arguments matrix_args *m_args = (matrix_args*)args; int my_rank = m_args->get_my_rank (); int total_thread = m_args->get_total_thread (); // iteration definition constexpr int maxit = 50; constexpr int globalmaxit = 300; constexpr double eps = 1e-4; int iter = 0; int glob_iter = 0; double *msr_matrix = m_args->get_matrix (); //< MSR matrix double *x = m_args->get_x (); //< solve double *rhs = m_args->get_rhs (); //< right hand double *workspace = m_args->get_workspace (); //< addition workspace double *r = workspace; //< residual vector int *indexes = m_args->get_indexes (); //< MSR IND vector const int n = m_args->get_size (); //< size matrix const int width = m_args->get_width (); //< widht of band matrix double residual = 0; // residual double full_time = 0; // create msr matrix create_msr_band_matrix (msr_matrix, indexes, width, n, my_rank, total_thread); print_msr_matrix_SF (msr_matrix, indexes, n, my_rank); print_msr_NF (msr_matrix, indexes, m_args->get_nozero (), my_rank); // symmetry matrix /* int sym = 0; sym = is_symmetric_msr_matrix (indexes, n, my_rank); if (sym) { printf ("MSR not symmtetry!!!\n"); } */ // set rhs = (1,0,1,0...) set_rhs_vector (msr_matrix, indexes,rhs, n, my_rank, total_thread); // set x = (1,0,1,0...) set_init_solve_vector (x, n, my_rank, total_thread); // Print_start_data (msr_matrix, indexes, n, nz, rhs, my_rank); full_time = get_full_time (); // Initialize full time m_args->set_thread_time (get_time ()); // Initialize time for (glob_iter = 0; glob_iter < globalmaxit;) { iter = minimal_residual_solver (msr_matrix, indexes, n, rhs, x, workspace, my_rank, total_thread, maxit, eps); if (iter < 0) { glob_iter += maxit; residual = get_residual (msr_matrix, indexes, n, x, rhs, r, my_rank, total_thread); if (my_rank == 0) printf("Failure: iters = %d residual = %4.8e\n", glob_iter, residual); } else break; } // Finish time full_time = get_full_time () - full_time; m_args->set_thread_time(get_time () - m_args->get_thread_time ()); residual = get_residual (msr_matrix, indexes, n, x, rhs, r, my_rank, total_thread); if (my_rank == 0) { double error = 0; printf ("MATRIX: %s\nResidual = %e Error = %e\nFull_time = %f\n", "no-file", residual, error, full_time); } printf ("Thread # %d ------------------- CPU_thread_time = %f\n", my_rank, m_args->get_thread_time ()); synchronize (total_thread); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magmaFloat_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% version %d\n", (int) opts.version ); if ( opts.check == 2 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( opts, M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( opts, M, N, h_A, lda ); if ( opts.version == 2 ) { // no pivoting versions, so set ipiv to identity for (magma_int_t i=0; i < min_mn; ++i ) { ipiv[i] = i+1; } } magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); if ( opts.version == 1 ) { magma_sgetrf_gpu( M, N, d_A, ldda, ipiv, &info); } else if ( opts.version == 2 ) { magma_sgetrf_nopiv_gpu( M, N, d_A, ldda, &info); } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, float, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_sgetrf( M, N, h_A, lda, ipiv, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A, *h_P; magmaFloat_ptr d_lA[ MagmaMaxSubs * MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t dev, j, k, ngpu, nsub, n_local, nb, nk, ldn_local, maxm; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); /* Initialize queues */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[MagmaMaxGPUs]; magma_int_t num = 0; magma_int_t err; err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for( dev=0; dev < opts.ngpu; dev++ ) { err = magma_queue_create( devices[dev], &queues[2*dev] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev ); exit(-1); } err = magma_queue_create( devices[dev], &queues[2*dev+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev ); exit(-1); } } printf("trans %s, ngpu %d, nsub %d\n", lapack_trans_const(opts.transA), (int) opts.ngpu, (int) opts.nsub ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); maxm = 32*((M+31)/32); lda = M; n2 = lda*N; nb = magma_get_sgetrf_nb(M); gflops = FLOPS_SGETRF( M, N ) / 1e9; // nsubs * ngpu must be at least the number of blocks ngpu = opts.ngpu; nsub = opts.nsub; if ( nsub*ngpu > N/nb ) { nsub = 1; ngpu = 1; printf( " * too many GPUs for the matrix size, using %d GPUs and %d submatrices\n", (int) ngpu, (int) nsub ); } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_P, float, lda*nb ); /* Allocate device memory */ if ( opts.transA == MagmaNoTrans ) { ldda = N/nb; /* number of block columns */ ldda = ldda/(ngpu*nsub); /* number of block columns per GPU */ ldda = nb*ldda; /* number of columns per GPU */ if ( ldda * ngpu*nsub < N ) { /* left over */ if ( N-ldda*ngpu*nsub >= nb ) { ldda += nb; } else { ldda += (N-ldda*ngpu*nsub)%nb; } } ldda = ((ldda+31)/32)*32; /* make it a multiple of 32 */ for( j=0; j < nsub * ngpu; j++ ) { TESTING_MALLOC_DEV( d_lA[j], float, ldda*maxm ); } } else { ldda = ((M+31)/32)*32; for( j=0; j < nsub * ngpu; j++ ) { n_local = ((N/nb)/(nsub*ngpu))*nb; if ( j < (N/nb)%(nsub*ngpu) ) { n_local += nb; } else if ( j == (N/nb)%(nsub*ngpu) ) { n_local += N%nb; } TESTING_MALLOC_DEV( d_lA[j], float, ldda*n_local ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if ( info != 0 ) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); if ( opts.transA == MagmaNoTrans ) { for( j=0; j < N; j += nb ) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); /* transpose on CPU, then copy to GPU */ int ii,jj; for( ii=0; ii < M; ii++ ) { for( jj=0; jj < nk; jj++ ) { h_P[jj+ii*nk] = h_A[j*lda + ii+jj*lda]; } } magma_ssetmatrix( nk, M, h_P, nk, d_lA[k], j/(nb*nsub*ngpu)*nb, ldda, queues[2*(k%ngpu)] ); } } else { ldda = ((M+31)/32)*32; for( j=0; j < N; j += nb ) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); magma_ssetmatrix( M, nk, h_A + j*lda, lda, d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda, queues[2*(k%ngpu)] ); } } gpu_time = magma_wtime(); magma_sgetrf_msub( opts.transA, nsub, ngpu, M, N, d_lA, 0, ldda, ipiv, queues, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* get the matrix from GPUs */ if ( opts.transA == MagmaNoTrans ) { for (j=0; j < N; j+=nb) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); /* copy to CPU and then transpose */ magma_sgetmatrix( nk, M, d_lA[k], j/(nb*nsub*ngpu)*nb, ldda, h_P, nk, queues[2*(k%ngpu)] ); int ii, jj; for( ii=0; ii < M; ii++ ) { for( jj=0; jj < nk; jj++ ) { h_A[j*lda + ii+jj*lda] = h_P[jj+ii*nk]; } } } } else { for (j=0; j < N; j+=nb) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); magma_sgetmatrix( M, nk, d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda, h_A + j*lda, lda, queues[2*(k%ngpu)] ); } } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_P ); for( dev=0; dev < ngpu; dev++ ) { for( k=0; k < nsub; k++ ) { TESTING_FREE_DEV( d_lA[dev*nsub + k] ); } } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Free queues */ for( dev=0; dev < opts.ngpu; dev++ ) { magma_queue_destroy( queues[2*dev] ); magma_queue_destroy( queues[2*dev+1] ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; real_Double_t gpu_perf1, gpu_time1, gpu_perf2, gpu_time2, gpu_perf3, gpu_time3, alloc_time, free_time; double error; double *h_A; double *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu, NB; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_int_t P=-1; /*Number of threads in the CPU part*/ double d_cpu=-1; /*pourcentgae of the matrix to allocate in the cpu part*/ magma_int_t Pr=-1; /*Number of threads for the panel*/ magma_int_t async_nb; /*Block size*/ double *WORK; magma_int_t WORK_LD, WORK_n; double **dlpanelT; magma_int_t dlpanelT_m, dlpanelT_n; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); P = opts.nthread; async_nb = opts.nb; Pr = opts.panel_nthread; d_cpu = 0.0; #if defined(CPU_PEAK) && defined(GPU_PEAK) d_cpu = magma_amc_recommanded_dcpu(opts.nthread, CPU_PEAK, opts.ngpu, GPU_PEAK); #endif if(opts.fraction_dcpu!=0){ /*Overwrite the one computed with the model*/ d_cpu = opts.fraction_dcpu; } magma_assert(d_cpu > 0 && d_cpu<=1.0, "error: The cpu fraction is invalid. Ensure you use --fraction_dcpu with fraction_dcpu in [0.0, 1.0] or compile with both -DCPU_PEAK=<cpu peak performance> and -DGPU_PEAK=<gpu peak performance> set.\n"); printf("Asynchronous recursif LU... nb:%d, nbcores:%d, dcpu:%f, panel_nbcores:%d, ngpu: %d\n", async_nb, P, d_cpu, Pr, opts.ngpu); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) GPU_Async_v2 GFlop/s (sec) GPU_Async_work_v2 GFlop/s (sec)"); if ( opts.check == 2 ) { printf(" |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; //nb = magma_get_dgetrf_nb( M ); gflops = FLOPS_DGETRF( M, N ) / 1e9; // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); /*set default number of threads for lapack*/ magma_setlapack_numthreads(P); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_dgetrf_nb( M ); // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time1 = magma_wtime(); magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time1 = magma_wtime() - gpu_time1; gpu_perf1 = gflops / gpu_time1; if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } /* ==================================================================== Performs operation using MAGMA_Async: This interface allocate workspace internally =================================================================== */ /*For the benchmark we have 2 approaches*/ /*1. use directly magma_amc */ /*2. use magma_amc_work and add pinned memory time*/ /*We choose approach 2*/ /* nb = async_nb; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory n_local = numcols2p(0, N, nb, ngpu); ldn_local = n_local; //ldn_local = ((n_local+31)/32)*32; for( int dev=0; dev < ngpu; dev++){ magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); // Switch to the sequential version of BLAS magma_setlapack_numthreads(1); magma_amc_init(P, d_cpu, Pr, nb); gpu_time2 = magma_wtime(); magma_dgetrf_async_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time2 = magma_wtime() - gpu_time2; gpu_perf2 = gflops / gpu_time2; magma_amc_finalize(); if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } */ /* ==================================================================== Performs operation using MAGMA_Async_Work =================================================================== */ nb = async_nb; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory n_local = numcols2p(0, N, nb, ngpu); ldn_local = n_local; //ldn_local = ((n_local+31)/32)*32; for( int dev=0; dev < ngpu; dev++){ magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); // Switch to the sequential version of BLAS magma_setlapack_numthreads(1); //Compute workspace dimension WORK_LD = M; NB = (int) ceil( (double) N / nb); WORK_n = (int) ceil(N*d_cpu)+nb; /*TODO:remove +nb replace with A_N*/ //WORK_n = NSplit(NB, d_cpu)*nb; if(WORK_n<nb) WORK_n = nb;//make sure workspace has at least one block column //Make LD and n multiple of 32 //if(WORK_LD%32!=0) WORK_LD = ((WORK_LD + 31)/32)*32; //if(WORK_n%32!=0) WORK_n = ((WORK_n + 31)/32)*32; //Allocate workspace alloc_time = magma_wtime(); if (MAGMA_SUCCESS != magma_dmalloc_pinned(&WORK, WORK_LD*WORK_n)) { //if (MAGMA_SUCCESS != magma_dmalloc_cpu(&WORK, WORK_LD*WORK_n)) { info = MAGMA_ERR_HOST_ALLOC; printf("magma_dmalloc_pinned returned error %d: %s.\n ", (int) info); } /* Workspace for the panels on the GPU*/ dlpanelT_m = WORK_n; /*assume that the cpu and gpu use the same buffer size*/ dlpanelT_n = M; dlpanelT = (double **) malloc(ngpu*sizeof(double*)); for(int dev=0;dev<ngpu;dev++){ magma_setdevice(dev); if (MAGMA_SUCCESS != magma_dmalloc(&dlpanelT[dev], dlpanelT_m*dlpanelT_n)) { info = MAGMA_ERR_DEVICE_ALLOC; printf("magma_dmalloc returned error %d: %s.\n ", (int) info); } } alloc_time = magma_wtime() - alloc_time; //First touch the workspace with each thread. This may be needed to avoid using numactl --interleave //magma_amc_dmemset(WORK, 0.0, WORK_LD*WORK_n, 256, P); //nb //#pragma omp parallel for private(info) schedule(static,nb) //for(info=0;info<WORK_LD*WORK_n;info++) WORK[info] = 0.0; //alternative first touch by the thread magma_amc_init(P, d_cpu, Pr, nb); gpu_time3 = magma_wtime(); magma_dgetrf_mgpu_work_amc_v3(ngpu, M, N, d_lA, ldda, ipiv, &info, WORK, WORK_LD, WORK_n); gpu_time3 = magma_wtime() - gpu_time3; gpu_perf3 = gflops / gpu_time3; magma_amc_finalize(); if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); //Free workspace free_time = magma_wtime(); magma_free_pinned(WORK); for(int dev=0;dev<ngpu;dev++){ magma_setdevice(dev); magma_free(dlpanelT[dev]); } free(dlpanelT); free_time = magma_wtime() - free_time; /*DEDUCE t2, JUST FOR THE BENCHMARK*/ gpu_time2 = gpu_time3 + alloc_time + free_time; gpu_perf2 = gflops / gpu_time2; for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } */ printf("%5d %5d", (int) M, (int) N); if(cpu_perf!=0.0){ printf(" %7.2f (%7.2f)", cpu_perf, cpu_time); } else{ printf(" --- ( --- )"); } if(gpu_perf1!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf1, gpu_time1); } else{ printf(" --- ( --- )"); } if(gpu_perf2!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf2, gpu_time2); } else{ printf(" --- ( --- )"); } if(gpu_perf3!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf3, gpu_time3); } else{ printf(" --- ( --- )"); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }