/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; double *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgetrf_nb( M ); gflops = FLOPS_DGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; real_Double_t gpu_perf1, gpu_time1, gpu_perf2, gpu_time2, gpu_perf3, gpu_time3, alloc_time, free_time; double error; double *h_A; double *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu, NB; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_int_t P=-1; /*Number of threads in the CPU part*/ double d_cpu=-1; /*pourcentgae of the matrix to allocate in the cpu part*/ magma_int_t Pr=-1; /*Number of threads for the panel*/ magma_int_t async_nb; /*Block size*/ double *WORK; magma_int_t WORK_LD, WORK_n; double **dlpanelT; magma_int_t dlpanelT_m, dlpanelT_n; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); P = opts.nthread; async_nb = opts.nb; Pr = opts.panel_nthread; d_cpu = 0.0; #if defined(CPU_PEAK) && defined(GPU_PEAK) d_cpu = magma_amc_recommanded_dcpu(opts.nthread, CPU_PEAK, opts.ngpu, GPU_PEAK); #endif if(opts.fraction_dcpu!=0){ /*Overwrite the one computed with the model*/ d_cpu = opts.fraction_dcpu; } magma_assert(d_cpu > 0 && d_cpu<=1.0, "error: The cpu fraction is invalid. Ensure you use --fraction_dcpu with fraction_dcpu in [0.0, 1.0] or compile with both -DCPU_PEAK=<cpu peak performance> and -DGPU_PEAK=<gpu peak performance> set.\n"); printf("Asynchronous recursif LU... nb:%d, nbcores:%d, dcpu:%f, panel_nbcores:%d, ngpu: %d\n", async_nb, P, d_cpu, Pr, opts.ngpu); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) GPU_Async_v2 GFlop/s (sec) GPU_Async_work_v2 GFlop/s (sec)"); if ( opts.check == 2 ) { printf(" |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; //nb = magma_get_dgetrf_nb( M ); gflops = FLOPS_DGETRF( M, N ) / 1e9; // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); /*set default number of threads for lapack*/ magma_setlapack_numthreads(P); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_dgetrf_nb( M ); // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time1 = magma_wtime(); magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time1 = magma_wtime() - gpu_time1; gpu_perf1 = gflops / gpu_time1; if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } /* ==================================================================== Performs operation using MAGMA_Async: This interface allocate workspace internally =================================================================== */ /*For the benchmark we have 2 approaches*/ /*1. use directly magma_amc */ /*2. use magma_amc_work and add pinned memory time*/ /*We choose approach 2*/ /* nb = async_nb; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory n_local = numcols2p(0, N, nb, ngpu); ldn_local = n_local; //ldn_local = ((n_local+31)/32)*32; for( int dev=0; dev < ngpu; dev++){ magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); // Switch to the sequential version of BLAS magma_setlapack_numthreads(1); magma_amc_init(P, d_cpu, Pr, nb); gpu_time2 = magma_wtime(); magma_dgetrf_async_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time2 = magma_wtime() - gpu_time2; gpu_perf2 = gflops / gpu_time2; magma_amc_finalize(); if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } */ /* ==================================================================== Performs operation using MAGMA_Async_Work =================================================================== */ nb = async_nb; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate device memory n_local = numcols2p(0, N, nb, ngpu); ldn_local = n_local; //ldn_local = ((n_local+31)/32)*32; for( int dev=0; dev < ngpu; dev++){ magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); // Switch to the sequential version of BLAS magma_setlapack_numthreads(1); //Compute workspace dimension WORK_LD = M; NB = (int) ceil( (double) N / nb); WORK_n = (int) ceil(N*d_cpu)+nb; /*TODO:remove +nb replace with A_N*/ //WORK_n = NSplit(NB, d_cpu)*nb; if(WORK_n<nb) WORK_n = nb;//make sure workspace has at least one block column //Make LD and n multiple of 32 //if(WORK_LD%32!=0) WORK_LD = ((WORK_LD + 31)/32)*32; //if(WORK_n%32!=0) WORK_n = ((WORK_n + 31)/32)*32; //Allocate workspace alloc_time = magma_wtime(); if (MAGMA_SUCCESS != magma_dmalloc_pinned(&WORK, WORK_LD*WORK_n)) { //if (MAGMA_SUCCESS != magma_dmalloc_cpu(&WORK, WORK_LD*WORK_n)) { info = MAGMA_ERR_HOST_ALLOC; printf("magma_dmalloc_pinned returned error %d: %s.\n ", (int) info); } /* Workspace for the panels on the GPU*/ dlpanelT_m = WORK_n; /*assume that the cpu and gpu use the same buffer size*/ dlpanelT_n = M; dlpanelT = (double **) malloc(ngpu*sizeof(double*)); for(int dev=0;dev<ngpu;dev++){ magma_setdevice(dev); if (MAGMA_SUCCESS != magma_dmalloc(&dlpanelT[dev], dlpanelT_m*dlpanelT_n)) { info = MAGMA_ERR_DEVICE_ALLOC; printf("magma_dmalloc returned error %d: %s.\n ", (int) info); } } alloc_time = magma_wtime() - alloc_time; //First touch the workspace with each thread. This may be needed to avoid using numactl --interleave //magma_amc_dmemset(WORK, 0.0, WORK_LD*WORK_n, 256, P); //nb //#pragma omp parallel for private(info) schedule(static,nb) //for(info=0;info<WORK_LD*WORK_n;info++) WORK[info] = 0.0; //alternative first touch by the thread magma_amc_init(P, d_cpu, Pr, nb); gpu_time3 = magma_wtime(); magma_dgetrf_mgpu_work_amc_v3(ngpu, M, N, d_lA, ldda, ipiv, &info, WORK, WORK_LD, WORK_n); gpu_time3 = magma_wtime() - gpu_time3; gpu_perf3 = gflops / gpu_time3; magma_amc_finalize(); if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); //Free workspace free_time = magma_wtime(); magma_free_pinned(WORK); for(int dev=0;dev<ngpu;dev++){ magma_setdevice(dev); magma_free(dlpanelT[dev]); } free(dlpanelT); free_time = magma_wtime() - free_time; /*DEDUCE t2, JUST FOR THE BENCHMARK*/ gpu_time2 = gpu_time3 + alloc_time + free_time; gpu_perf2 = gflops / gpu_time2; for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } */ printf("%5d %5d", (int) M, (int) N); if(cpu_perf!=0.0){ printf(" %7.2f (%7.2f)", cpu_perf, cpu_time); } else{ printf(" --- ( --- )"); } if(gpu_perf1!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf1, gpu_time1); } else{ printf(" --- ( --- )"); } if(gpu_perf2!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf2, gpu_time2); } else{ printf(" --- ( --- )"); } if(gpu_perf3!=0.0){ printf(" %7.2f (%7.2f)", gpu_perf3, gpu_time3); } else{ printf(" --- ( --- )"); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }