/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double eps = lapackf77_dlamch("E"); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("%%===============================================================================================\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("%%=========================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, magma_ceildiv(N,nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for( int j=0; j < 4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &lda, tau2, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( tau2 ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- dqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; double *d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("==========================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgeqrf_nb( M ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, h_A, &M, tau, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); TESTING_MALLOC( h_work, double, lhwork ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_DEVALLOC( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau; TESTING_MALLOC( tau, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &M, tau, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); magma_queue_sync( NULL ); if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, double, n2 ); // Q TESTING_MALLOC( h_W2, double, n2 ); // R TESTING_MALLOC( h_W3, double, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_DEVFREE( d_lA[dev] ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEQRF4 computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { double *da[MagmaMaxGPUs]; double c_one = MAGMA_D_ONE; int i, k, ldda; *info = 0; int nb = magma_get_dgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_D_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus < 0 || num_gpus > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); ldda = ((m+31)/32)*32; magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < num_gpus; i++) { n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_dmalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_dsetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_dgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_dgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, num_gpus, nb); } else { lapackf77_dgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < num_gpus; i++) { magma_setdevice(i); magma_free( da[i] ); } magma_setdevice( orig_dev ); return *info; } /* magma_dgeqrf4 */