/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; opts.lapack |= ( opts.check == 2 ); // check (-c2) implies lapack (-l) printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("===============================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_dgeqrf_nb(M); gflops = FLOPS_DGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max( lwork, max( N*nb, 2*nb*nb )); TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); TESTING_MALLOC( h_work, double, lwork ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ double *tau; TESTING_MALLOC( tau, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, double, n2 ); // Q TESTING_MALLOC( h_W2, double, n2 ); // R TESTING_MALLOC( h_W3, double, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEQRF computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_dgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf(magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) double *dA, *dwork, *dT; double c_one = MAGMA_D_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_dgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_D_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = ((n+31)/32)*32 - nb; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_dgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_dgeqrf_ooc(m, n, A, lda, tau, work, lwork, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = current_stream; } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_dsetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i > 0) { /* download i-th panel */ magma_queue_sync( stream[1] ); magma_dgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT, nb, dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork); magma_dgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[0] ); } magma_int_t rows = m-i; lapackf77_dgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib); dpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib); /* download the i-th V matrix */ magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_dsetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+ib < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); dq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); dq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i != 0) { magma_dgetmatrix_async( m, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[1] ); } magma_int_t rows = m-i; lapackf77_dgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } magma_free( dA ); return *info; } /* magma_dgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double eps = lapackf77_dlamch("E"); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("%%===============================================================================================\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("%%=========================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, magma_ceildiv(N,nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for( int j=0; j < 4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &lda, tau2, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( tau2 ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- dqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dgeqp3( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *jpvt, double *tau, double *work, magma_int_t lwork, #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork, #endif magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. For [cz]geqp3 only: RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) double *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_dgeqp3_nb(min(m, n)); if (*info == 0) { minmn = min(m,n); if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #if defined(PRECISION_d) || defined(PRECISION_s) lwkopt += 2*n; #endif } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_dswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_dgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_dsetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = cblas_dnrm2(sm, A(nfxd, j), ione); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j>nfxd) { // Get panel to the CPU magma_dgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_dgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_dlaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_dgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_dlaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* dgeqp3 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; double *d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("==========================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgeqrf_nb( M ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, h_A, &M, tau, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); TESTING_MALLOC( h_work, double, lhwork ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_DEVALLOC( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau; TESTING_MALLOC( tau, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &M, tau, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); magma_queue_sync( NULL ); if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, double, n2 ); // Q TESTING_MALLOC( h_W2, double, n2 ); // R TESTING_MALLOC( h_W3, double, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_DEVFREE( d_lA[dev] ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb_mgpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magmaDouble_ptr dAmgpu[], magma_int_t ldda, magmaDouble_ptr dTmgpu[], magma_int_t lddt, magma_int_t ngpu, magma_int_t distblk, magma_queue_t queues[][20], magma_int_t nqueue, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt) + (a_1)-1) #define dA(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1) double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, flipV=-1; magma_int_t iblock, idev, di; int i; int lwkopt; int lquery; assert (nqueue >= 3); assert (nqueue >= (ngpu+1)); *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx=0; magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu); #ifdef ENABLE_DEBUG printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n", nbcmplx); #endif double *dspace[MagmaMaxGPUs]; double *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs]; double *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs]; double *workngpu[MagmaMaxGPUs+1]; magma_event_t redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs; magma_int_t lddv = ldda; magma_int_t lddw = lddv; magma_int_t dwrk2siz = ldda*nb*(ngpu+1); magma_int_t worksiz = n*nb; magma_int_t devworksiz = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis // local allocation and stream creation // TODO check malloc for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_dmalloc( &dspace[dev], devworksiz ); magma_dmalloc_pinned ( &workngpu[dev], worksiz); dvall[dev] = dspace[dev]; dw[dev] = dvall[dev] + 2*nb*lddv; dwork[dev] = dw[dev] + nb*lddw; dworkbis[dev] = dwork[dev] + nb*ldda; magmablasSetKernelStream( queues[ dev ][ 0 ] ); for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming); } } magma_dmalloc_pinned ( &workngpu[ngpu], worksiz); double *worktest = NULL; //magma_dmalloc_cpu( &worktest, n*nb ); // not used // ====================== double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); // find the device who own the panel then send it to the CPU. // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((i-1) / distblk) / ngpu; // local block id di = iblock*distblk + (i-1)%distblk; // local index in parent matrix idev = ((i-1) / distblk) % ngpu; // device with this block //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); magma_setdevice( idev ); //magma_device_sync(); magma_dgetmatrix_async( (pm+pn), pn, dA(idev, i, di+1), ldda, A( i, i), lda, queues[ idev ][ nqueue-1 ] ); //magma_setdevice( 0 ); //printf("updating dsyr2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); // compute DSYR2K_MGPU magmablas_dsyr2k_mgpu2( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dv, pm_old, pn_old, dw, pm_old, pn_old, d_one, dAmgpu, ldda, indi_old+pn_old-1, ngpu, distblk, queues, 2 ); //magma_setdevice( 0 ); magma_setdevice( idev ); magma_queue_sync( queues[idev][ nqueue-1 ] ); //magma_setdevice( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); /* Send V and T from the CPU to the GPU */ // To be able to overlap the GET with the DSYR2K // it should be done on last stream. // TO Avoid a BUG that is overwriting the old_V // used atthis moment by dsyr2k with the new_V // send it now, we decide to have a flipflop // vector of Vs. if step%2=0 use V[0] else use V[nb*n] flipV = ((i-1)/nb)%2; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dv[dev] = dvall[dev] + flipV*nb*lddv; } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); // send V magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dv[dev], pm, queues[dev][nqueue-1] ); // Send the triangular factor T to the GPU magma_dsetmatrix_async( pk, pk, hT, nb, dT(dev, 1, i), lddt, queues[dev][nqueue-1] ); } /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // dwork = V T magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ nqueue-1 ] ); magma_queue_sync( queues[dev][nqueue-1] ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dv[dev], pm, dT(dev, 1, i), lddt, c_zero, dwork[dev], pm); } // =============================================== // SYNC TO BE SURE THAT BOTH V AND T WERE // RECEIVED AND VT IS COMPUTED and SYR2K is done // =============================================== for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t s = 0; s < nqueue; ++s ) magma_queue_sync( queues[dev][s] ); } // compute DSYMM_MGPU // The broadcast of the result done inside this function // should be done in stream [0] because i am assuming this // for the GEMMs below otherwise I have to SYNC over the // Broadcasting stream. if (ngpu == 1) { magmablasSetKernelStream( queues[ 0 ][ 0 ] ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda, dwork[0], pm, c_zero, dw[0], pm); } else { magmablas_dsymm_mgpu_com( MagmaLeft, uplo, pm, pk, c_one, dAmgpu, ldda, indi-1, dwork, pm, c_zero, dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz, ngpu, distblk, queues, nqueue-1, redevents, nbevents, gnode, nbcmplx); } /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // Here we have to wait until the broadcast of DSYMM has been done. // Note that the broadcast should be done on stream[0] so in a way // we can continue here on the same stream and avoid a sync magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); // magma_queue_sync( queues[dev][0] ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork[dev], pm, dw[dev], pm, c_zero, dworkbis[dev], nb); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dv[dev], pm, dworkbis[dev], nb, c_one, dw[dev], pm); } /* restore the panel it is put here to overlap with the previous GEMM*/ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); // =============================================== // SYNC TO BE SURE THAT BOTH V AND W ARE DONE // =============================================== // Synchronise to be sure that W has been computed // because next DSYR2K use streaming and may happen // that lunch a gemm on stream 2 while stream 0 // which compute those 2 GEMM above has not been // computed and also used for the same reason in // the panel update below and also for the last HER2K for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ nqueue-1 ] ); //magma_queue_sync( queues[idev][0] ); removed because the sync has been done in the loop above magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dv[idev], pm, dw[idev], pm, c_one, dA(idev, indi, di+1), ldda); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dw[idev], pm, dv[idev], pm, c_one, dA(idev, indi, di+1), ldda); //printf("updating next panel distblk %d idev %d on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); } else { /* no look-ahead as this is last iteration */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ 0 ] ); //printf("LAST DSYR2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dv[idev], pm, dw[idev], pm, d_one, dA(idev, indi, di+1), ldda); /* Send the last block to the CPU */ dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); magma_dgetmatrix( pk, pk, dA(idev, indi, di+1), ldda, A(n-pk+1, n-pk+1), lda ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } indi_old = indi; //indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) }// end of LOWER //magma_setdevice( 0 ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( dspace[dev]); magma_free_pinned(workngpu[dev]); for( magma_int_t e = 0; e < nbevents; ++e ) { magma_event_destroy( redevents[dev][e] ); } } magma_free_pinned(workngpu[ngpu]); magma_free_cpu(worktest); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* magma_dsytrd_sy2sb_mgpu */
extern "C" magma_err_t magma_dgeqrf_gpu( magma_int_t m, magma_int_t n, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, double *tau, magmaDouble_ptr dT, size_t dT_offset, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DGEQRF computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * R. This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). dT (workspace/output) DOUBLE_PRECISION array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define t_ref(a_1) dT, (dT_offset + (a_1)*nb) #define d_ref(a_1) dT, (dT_offset + (minmn + (a_1))*nb) #define dd_ref(a_1) dT, (dT_offset + (2*minmn+(a_1))*nb) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; double *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_dmalloc_cpu( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(double)); magma_event_t event[2] = {NULL, NULL}; ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_dgetmatrix_async( rows, ib, a_ref(i,i), ldda, work_ref(i), 0, ldwork, queue, &event[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, a_ref(old_i, old_i ), ldda, t_ref(old_i), nb, a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork, queue); /* store the diagonal */ magma_dsetmatrix_async( old_ib, old_ib, ut, 0, old_ib, d_ref(old_i), old_ib, queue, &event[0] ); } magma_event_sync(event[1]); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it */ magma_event_sync(event[0]); dsplit_diag_block(ib, work_ref(i), ldwork, ut); magma_dsetmatrix( rows, ib, work_ref(i), 0, ldwork, a_ref(i,i), ldda, queue); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_dsetmatrix( ib, ib, hwork, 0, ib, t_ref(i), nb, queue ); if (i+nb < k-nb){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork, queue); } else { cols = n-i-ib; magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork, queue); /* Fix the diagonal block */ magma_dsetmatrix( ib, ib, ut, 0, ib, d_ref(i), ib , queue); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix( rows, ib, a_ref(i, i), ldda, work, 0, rows, queue ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix( rows, ib, work, 0, rows, a_ref(i, i), ldda, queue ); } magma_free_cpu( work ); return *info; } /* magma_dgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, gpu_perf, cpu_perf; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, (int) M, (int) N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, 1024, 1024); M = N = size[9]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, lda*N ); TESTING_MALLOC( h_A2, double, lda*N ); TESTING_MALLOC( h_B, double, ldb*nrhs ); TESTING_MALLOC( h_X, double, ldb*nrhs ); TESTING_MALLOC( h_R, double, ldb*nrhs ); TESTING_DEVALLOC( d_A, double, ldda*N ); TESTING_DEVALLOC( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC( h_work, double, lhwork ); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; flops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1000000; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); start = get_current_time(); magma_dgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); start = get_current_time(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); end = get_current_time(); cpu_perf = flops / GetTimerValue(start, end); if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", (int) -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
magma_err_t magma_dgeqrf2_2q_gpu( magma_int_t m, magma_int_t n, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, double *tau, magma_err_t *info, magma_queue_t* queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -9, internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDouble_ptr dwork; double *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_dgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(double)*lwork, NULL, NULL); work = (double*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(double), 0, NULL, NULL, NULL); nbmin = 2; nx = 2*nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_dgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queues[0], NULL); clFlush(queues[0]); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]); magma_dsetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork, dA(old_i, old_i), ldda, queues[1], NULL); clFlush(queues[1]); } magma_queue_sync(queues[0]); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); dpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_dsetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[0]); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_dsetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queues[1]); if (i+nb < k-nx){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_queue_sync(queues[1]); }else { magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_dsetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queues[1]); clFlush(queues[1]); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queues[0]); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queues[0]); } clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); // magma_free_cpu(work); return *info; } /* magma_dgeqrf2_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *dtau, *h_work, tmp[1]; magmaDouble_ptr d_A; magmaDouble_ptr dwork; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, min_mn ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); // warmup if ( opts.warmup ) { magma_dgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); magma_dgeqr2_gpu( M, N, d_A, ldda, dtau, dwork, &info ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqr2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEQRF computes a QR factorization of a DOUBLE PRECISION M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau DOUBLE PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_dgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { #define A(i_,j_) (A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dA, ((i_) + (j_)*ldda + dA_offset) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dwork(i_) dwork, ((i_) + dwork_offset) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif /* Constants */ const double c_one = MAGMA_D_ONE; /* Local variables */ magmaDouble_ptr dA, dT, dwork; magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib; /* Function Body */ *info = 0; magma_int_t nb = magma_get_dgeqrf_nb( m, n ); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max( n*nb, 2*nb*nb ); work[0] = magma_dmake_lwork( lwkopt ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min( m, n ); if (min_mn == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = magma_roundup( n, 32 ) - nb; ldda = magma_roundup( m, 32 ); magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multiple-GPU interface */ return magma_dgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info ); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* alloc failed so call non-GPU-resident version */ return magma_dgeqrf_ooc( m, n, A, lda, tau, work, lwork, info ); } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); //used for timing CPU and GPU int iter = 0; float cpu_time = 0.0; float gpu_time = 0.0; int affinity = map_cpu(0); if(affinity != 0) { printf("affinity failed\n"); return -1; } magma_set_lapack_numthreads(1); // for initial setting, better to be automatic in the future // SetGPUFreq(324, 324); // system("echo 1200000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed"); SetGPUFreq(2600, 705); system("echo 2500000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed"); double gpu_iter1_low = 2096.544434; double gpu_iter1_high = 478.825226; double cpu_iter1_low = 1792.011230; double cpu_iter1_high = 1413.732788; double gpu_pred_high = gpu_iter1_high; double gpu_pred_low = gpu_iter1_low; double cpu_pred_high = cpu_iter1_high; double cpu_pred_low = cpu_iter1_low; double ratio_split_freq = 0; double time_until_interrupt = 0; cudaEvent_t start_cpu, stop_cpu; cudaEvent_t start_gpu, stop_gpu; // switches for different modes bool timing = false; //for initial setting only, greatly impact performance bool dvfs = false; //turn on dvfs energy saving bool relax = false; //turn on relax scheme bool r2h = false; // turn on race to halt //these parameters need to be tuned in future works. double dvfs_converage = 0.5; double prediction_offset_gpu = 0.65; double prediction_offset_cpu = 0.65; //for nvprof profiler, brings slight constant performance overhead //cudaProfilerStart(); if ( (nb > 1) && (nb < min_mn) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_dsetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, queues[0] ); old_i = 0; old_ib = nb; for (i = 0; i < min_mn-nb; i += nb) { ib = min( min_mn-i, nb ); if (i > 0) { /* get i-th panel from device */ magma_queue_sync( queues[1] ); magma_dgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, queues[0] ); if (timing) { //start gpu timing cudaEventCreate(&start_gpu); cudaEventCreate(&stop_gpu); cudaEventRecord(start_gpu, 0); } /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT(0,0), nb, dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] ); double ratio_slack_pred = 1.0 - (double)nb/(m-iter*nb); cpu_pred_high = cpu_pred_high * ratio_slack_pred; cpu_pred_low = cpu_pred_low * ratio_slack_pred; gpu_pred_high = gpu_pred_high * ratio_slack_pred * ratio_slack_pred; gpu_pred_low = gpu_pred_low * ratio_slack_pred * ratio_slack_pred; if (timing) { printf("iter:%d GPU time pred:%f\n", iter, gpu_pred_high); printf("iter:%d CPU time pred:%f\n", iter, cpu_pred_high); } if (iter < dvfs_converage*(min_mn-nb)/nb) { if (cpu_pred_high > gpu_pred_high) { //slack on GPU ratio_split_freq = (cpu_pred_high - gpu_pred_high) / (gpu_pred_high * ((gpu_iter1_low / gpu_iter1_high) - 1)); time_until_interrupt = gpu_pred_low * ratio_split_freq; //printf("iter:%d time_until_interrupt:%f\n", iter, time_until_interrupt); // printf("iter:%d ratio_split_freq:%f\n", iter, ratio_split_freq); if (dvfs) { if ((!relax) || (relax && ratio_split_freq > 0.05)) { if (ratio_split_freq < 1) dvfs_adjust(time_until_interrupt*prediction_offset_gpu, 'g'); else dvfs_adjust(cpu_pred_high, 'g'); } } else if (r2h) { r2h_adjust(gpu_pred_high, cpu_pred_high - gpu_pred_high, 'g'); } } else { //slack on CPU ratio_split_freq = (gpu_pred_high - cpu_pred_high) / (cpu_pred_high * ((cpu_iter1_low / cpu_iter1_high) - 1)); time_until_interrupt = cpu_pred_low * ratio_split_freq; if (dvfs) { if ((!relax) || (relax && ratio_split_freq > 0.05)) { if (ratio_split_freq < 1) dvfs_adjust(time_until_interrupt*prediction_offset_cpu, 'c'); else dvfs_adjust(gpu_pred_high, 'c'); } } else if (r2h) { r2h_adjust(cpu_pred_high, gpu_pred_high - cpu_pred_high, 'c'); } } } if (timing) { //end gpu timing cudaEventRecord(stop_gpu, 0); cudaEventSynchronize(stop_gpu); cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu); cudaEventDestroy(start_gpu); cudaEventDestroy(stop_gpu); printf("iter:%d GPU time:%f\n", iter, gpu_time); } magma_dgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); magma_queue_sync( queues[0] ); } magma_int_t rows = m-i; if (timing) { //start cpu timing cudaEventCreate(&start_cpu); cudaEventCreate(&stop_cpu); cudaEventRecord(start_cpu, 0); } lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib ); if (timing) { //end cpu timing cudaEventRecord(stop_cpu, 0); cudaEventSynchronize(stop_cpu); cudaEventElapsedTime(&cpu_time, start_cpu, stop_cpu); cudaEventDestroy(start_cpu); cudaEventDestroy(stop_cpu); printf("iter:%d CPU time:%f\n", iter, cpu_time); if (gpu_time < cpu_time) { printf("slack: +\n"); } else { printf("slack: -\n"); } } magma_dpanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); /* put i-th V matrix onto device */ magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); /* put T matrix onto device */ magma_queue_sync( queues[1] ); magma_dsetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] ); magma_queue_sync( queues[0] ); if (i + ib < n) { if (i+ib < min_mn-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } old_i = i; old_ib = ib; } iter ++; } //for nvprof profiler. //cudaProfilerStop(); } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < min_mn) { ib = n-i; if (i != 0) { magma_dgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); } magma_int_t rows = m-i; lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_dgeqrf */
/** Purpose ------- DGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, double **dlA, magma_int_t ldda, double *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. double *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; double *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_dgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and dlarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think dlarfb work could be smaller, max(n_local[:]). * Oddly, T and dlarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_dmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for dgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_dmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_dgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_dgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); dpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_dsetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if dpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ dq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_dsetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_dsetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_dgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for dgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_dgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_dsetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_destroy( stream[dev][0] ); magma_queue_destroy( stream[dev][1] ); magma_event_destroy( panel_event[dev] ); magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dgeqrf2_mgpu */
/** Purpose ------- DGEQRF3 computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them and the corresponding parts of the upper triangular R are stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) DOUBLE_PRECISION array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal matrices for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf3_gpu( magma_int_t m, magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda, double *tau, magmaDouble_ptr dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; double *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(double)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_dgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_dsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_dgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut. */ magma_queue_sync( stream[0] ); dsplit_diag_block3(ib, work(i), ldwork, ut); magma_dsetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_dsetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_dsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_dgeqrf_gpu */
/** Purpose ------- DGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace, for [cz]geqp3 only) DOUBLE PRECISION array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqp3( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *jpvt, double *tau, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, #endif magma_int_t *info ) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) double *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt=0, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_dgeqp3_nb(min(m, n)); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #ifdef REAL double *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_dswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_dgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_dormqr( MagmaLeftStr, MagmaConjTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_dsetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = magma_cblas_dnrm2( sm, A(nfxd,j), ione ); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j > nfxd) { // Get panel to the CPU magma_dgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_dgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_dlaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_dgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_dlaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* magma_dgeqp3 */
/** Purpose ------- DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Other versions (magma_dgeqrf_gpu and magma_dgeqrf3_gpu) store the intermediate T matrices. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf2_gpu( magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, double *tau, magma_int_t *info ) { #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) double *dwork; double *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; /* download i-th panel */ magma_queue_sync( stream[1] ); magma_dgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[0] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_dsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[1] ); } magma_queue_sync( stream[0] ); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); dpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ magma_dsetmatrix_async( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_dsetmatrix_async( ib, ib, hwork, ib, dwork, lddwork, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); } else { magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_dsetmatrix_async( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[1] ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix_async( rows, ib, dA(i, i), ldda, work, rows, stream[1] ); magma_queue_sync( stream[1] ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix_async( rows, ib, work, rows, dA(i, i), ldda, stream[1] ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dgeqrf2_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, error2; double c_zero = MAGMA_D_ZERO; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_T, ddA, dtau; magmaDouble_ptr d_A2, d_T2, ddA2, dtau2; magmaDouble_ptr dwork, dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 10. * opts.tolerance * lapackf77_dlamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R - Q^H*A|| ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because dgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because dgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); TESTING_MALLOC_DEV( d_T2, double, N*N ); TESTING_MALLOC_DEV( ddA2, double, N*N ); TESTING_MALLOC_DEV( dtau2, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_dlaset magmablas_dlaset( MagmaFull, N, N, c_zero, c_zero, ddA, N ); magmablas_dlaset( MagmaFull, N, N, c_zero, c_zero, d_T, N ); magmablas_dlaset( MagmaFull, N, N, c_zero, c_zero, ddA2, N ); magmablas_dlaset( MagmaFull, N, N, c_zero, c_zero, d_T2, N ); lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_dgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_dgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_dgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_dgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, NULL, &info); //magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); magma_dgetmatrix( N, N, ddA, N, h_T, N ); magma_dgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } magma_int_t ldq = M; magma_int_t ldr = min_mn; double *Q, *R; double *work; TESTING_MALLOC_CPU( Q, double, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, double, ldr*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_dlacpy( "Lower", &M, &min_mn, h_R, &M, Q, &ldq ); lapackf77_dorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_dlaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_dlacpy( "Upper", &min_mn, &N, h_R, &M, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_dgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); double Anorm = lapackf77_dlange( "1", &M, &N, h_A, &lda, work ); error2 = lapackf77_dlange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error2 /= (N*Anorm); TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_R, &M, h_A, &lda ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_dgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_dgetmatrix( N, N, d_T, N, h_T, N ); double terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = sqrt( terr ); // If comparison to LAPACK fail, check || R - Q^H*A || / (N * ||A||) // and print fail if both fails, otherwise print ok (*) printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error2, terr, (error2 < tol ? "ok" : "failed" )); status += ! (error2 < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, double *dT, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define dA(a_1,a_2) (dA + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_1) (dT + ((a_1)-1)*(lddt)) int ldda = ((n+31)/32)*32; int lddt = nb; double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; int i; int lwkopt; int lquery; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); double *dA; if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); /* Use the first panel of dA as work space */ double *dwork = dA + n*ldda; double *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif magma_queue_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); stream[2] = 0; // default stream trace_init( 1, 1, 3, stream ); double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); magmablasSetKernelStream( stream[0] ); magma_event_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //magma_event_create(&Pupdate_event); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Copy the matrix to the GPU */ if (1 <= n-nb) { trace_gpu_start( 0, 0, "set", "set A" ); magma_dsetmatrix_async( (n-nb), (n-nb), A(nb+1, nb+1), lda, dA(nb+1, nb+1), ldda, stream[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( stream[0] ); magma_queue_wait_event(stream[1], Pupdate_event); //, 0); magma_dgetmatrix_async( (pm+pn), pn, dA( i, i), ldda, A ( i, i), lda, stream[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "her2k", "her2k" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dA(indi_old+pn_old, indj_old), ldda, dW + pn_old, pm_old, d_one, dA(indi_old+pn_old, indi_old+pn_old), ldda); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( stream[1] ); trace_cpu_end( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dA(indi, indj), ldda, stream[0] ); /* Send the triangular factor T to the GPU */ magma_dsetmatrix_async( pk, pk, hT, nb, dT(i), lddt, stream[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has been finished // because below we made a restore dq_to_panel and this restore need // to ensure that the copy has been finished. we did it here to allow // overlapp of restore with next gemm and symm. magma_queue_sync( stream[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dA(indi, indj), ldda, dT(i), lddt, c_zero, dwork, pm); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T. dW = A*dwork */ trace_gpu_start( 0, 2, "hemm", "X = A*work" ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dA(indi, indi), ldda, dwork, pm, c_zero, dW, pm); trace_gpu_end( 0, 2 ); /* restore the panel */ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dA(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dA(indi, indj), ldda, dW, pm, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dW, pm, dA(indi, indj), ldda, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); magma_event_record(Pupdate_event, stream[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "her2k", "her2k last iteration" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dA(indi, indj), ldda, dW, pm, d_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb) { dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_dgetmatrix( pk, pk, dA(n-pk+1, n-pk+1), ldda, A(n-pk+1, n-pk+1), lda ); trace_gpu_end( 0, 2 ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "dsytrd_sy2sb.svg", "trace.css" ); magma_event_destroy( Pupdate_event ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); return *info; } /* magma_dsytrd_sy2sb */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); const double d_neg_one = MAGMA_D_NEG_ONE; const double d_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const double c_one = MAGMA_D_ONE; const double c_zero = MAGMA_D_ZERO; const magma_int_t ione = 1; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double Anorm, error=0, error2=0; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, dT; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); printf( "version %d\n", (int) opts.version ); if ( opts.version == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |R - Q^H*A| |I - Q^H*Q|\n"); printf("===============================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |b - A*x|\n"); printf("================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { // LAPACK complaint arguments magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info ); } else { nb = magma_get_dgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, double, size ); if ( opts.version == 1 ) { // stores dT, V blocks have zeros, R blocks inverted & stored in dT magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // stores dT, V blocks have zeros, R blocks stored in dT magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check && opts.version == 2 ) { /* ===================================================================== Check the result, following zqrt01 except using the reduced Q. This works for any M,N (square, tall, wide). Only for version 2, which has LAPACK complaint output. =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, lda ); magma_int_t ldq = M; magma_int_t ldr = min_mn; double *Q, *R; double *work; TESTING_MALLOC_CPU( Q, double, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, double, ldr*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); // generate M by K matrix Q, where K = min(M,N) lapackf77_dlacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq ); lapackf77_dorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info ); assert( info == 0 ); // copy K by N matrix R lapackf77_dlaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr ); lapackf77_dlacpy( "Upper", &min_mn, &N, h_R, &lda, R, &ldr ); // error = || R - Q^H*A || / (N * ||A||) blasf77_dgemm( "Conj", "NoTrans", &min_mn, &N, &M, &c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr ); Anorm = lapackf77_dlange( "1", &M, &N, h_A, &lda, work ); error = lapackf77_dlange( "1", &min_mn, &N, R, &ldr, work ); if ( N > 0 && Anorm > 0 ) error /= (N*Anorm); // set R = I (K by K identity), then R = I - Q^H*Q // error = || I - Q^H*Q || / N lapackf77_dlaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr ); blasf77_dsyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr ); error2 = lapackf77_dlansy( "1", "Upper", &min_mn, R, &ldr, work ); if ( N > 0 ) error2 /= N; TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; } else if ( opts.check && M >= N ) { /* ===================================================================== Check the result by solving consistent linear system, A*x = b. Only for versions 1 & 3 with M >= N. =================================================================== */ magma_int_t lwork; double *x, *b, *hwork; magmaDouble_ptr d_B; const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, double, N ); TESTING_MALLOC_CPU( b, double, M ); lapackf77_dlarnv( &ione, ISEED, &N, x ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, double, M ); magma_dsetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #ifdef HAVE_CUBLAS else if ( opts.version == 3 ) { // allocate hwork magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } #endif else { printf( "Unknown version %d\n", (int) opts.version ); exit(1); } magma_dgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Print performance and error. =================================================================== */ printf("%5d %5d ", (int) M, (int) N ); if ( opts.lapack ) { printf( "%7.2f (%7.2f)", cpu_perf, cpu_time ); } else { printf(" --- ( --- )" ); } printf( " %7.2f (%7.2f) ", gpu_perf, gpu_time ); if ( opts.check ) { if ( opts.version == 2 ) { bool okay = (error < tol && error2 < tol); status += ! okay; printf( "%11.2e %11.2e %s\n", error, error2, (okay ? "ok" : "failed") ); } else if ( M >= N ) { bool okay = (error < tol); status += ! okay; printf( "%10.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf( "(error check only for M >= N)\n" ); } } else { printf( " ---\n" ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgels */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, error, Anorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; size = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_dgeqrf_nb(M); gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, NULL, &lda, NULL, NULL, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_A2, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &size, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &size, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_dlarnv( &ione, ISEED, &size, h_X ); //blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_dgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgels3_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_dlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_daxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); if ( M == N ) { printf( " %s\n", (gpu_error < tol && error < tol ? "ok" : "failed")); status += ! (gpu_error < tol && error < tol); } else { printf( " %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *d_A, *h_R, *tau, *dT, *h_work, tmp[1]; magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol; opts.lapack |= (opts.version == 2 && opts.check == 2); // check (-c2) implies lapack (-l) if ( opts.version != 2 && opts.check == 1 ) { printf( " ===================================================================\n" " NOTE: -c check for this version will be wrong\n" " because tester ignores the special structure of MAGMA dgeqrf resuls.\n" " We reset it to -c2.\n" " ===================================================================\n\n"); opts.check = 2; } if ( opts.version == 2 ) { if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1*eps) ||I-Q'Q||_1 / (M*eps)\n"); printf("=========================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } tol = 1.0; } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||Ax-b||_F/(N*||A||_F*||x||_F)\n"); printf("====================================================================================\n"); tol = opts.tolerance * lapackf77_dlamch("E"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGEQRF( M, N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 2 ) { magma_dgeqrf2_gpu( M, N, d_A, ldda, tau, &info); } else { nb = magma_get_dgeqrf_nb( M ); size = (2*min(M, N) + (N+31)/32*32 )*nb; TESTING_MALLOC_DEV( dT, double, size ); if ( opts.version == 3 ) { magma_dgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info); } else { magma_dgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info); } } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau2, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( tau2 ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { if ( opts.version == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( M >= N ) { magma_int_t lwork; double *x, *b, *d_B, *hwork; const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; const magma_int_t ione = 1; // initialize RHS, b = A*random TESTING_MALLOC_CPU( x, double, N ); TESTING_MALLOC_CPU( b, double, M ); lapackf77_dlarnv( &ione, ISEED, &N, x ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione ); // copy to GPU TESTING_MALLOC_DEV( d_B, double, M ); magma_dsetvector( M, b, 1, d_B, 1 ); if ( opts.version == 1 ) { // allocate hwork magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } else { // allocate hwork magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, tmp, -1, &info ); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lwork ); // solve linear system magma_dgeqrs3_gpu( M, N, 1, d_A, ldda, tau, dT, d_B, M, hwork, lwork, &info ); if (info != 0) printf("magma_dgeqrs3 returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE_CPU( hwork ); } magma_dgetvector( N, d_B, 1, x, 1 ); // compute r = Ax - b, saved in b lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); blasf77_dgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione ); // compute residual |Ax - b| / (n*|A|*|x|) double norm_x, norm_A, norm_r, work[1]; norm_A = lapackf77_dlange( "F", &M, &N, h_A, &lda, work ); norm_r = lapackf77_dlange( "F", &M, &ione, b, &M, work ); norm_x = lapackf77_dlange( "F", &N, &ione, x, &N, work ); TESTING_FREE_CPU( x ); TESTING_FREE_CPU( b ); TESTING_FREE_DEV( d_B ); error = norm_r / (N * norm_A * norm_x); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf(" %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- ", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); if ( opts.version != 2 ) TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaDouble_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, lda*N ); TESTING_MALLOC_PIN( h_A2, double, lda*N ); TESTING_MALLOC_PIN( h_B, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, double, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, double, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
extern "C" magma_int_t magma_dgeqrf(magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEQRF computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_dgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) double *dA, *dwork, *dT; double c_one = MAGMA_D_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_dgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_D_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = ((n+31)/32)*32 - nb; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_dgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_dgeqrf_ooc(m, n, A, lda, tau, work, lwork, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[3], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); magma_queue_create( &stream[2] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_dsetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, stream[2] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i>0) { /* download i-th panel */ magma_queue_sync( stream[1] ); magma_dgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT, nb, dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork); magma_dgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, stream[2] ); magma_queue_sync( stream[0] ); } magma_int_t rows = m-i; lapackf77_dgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib); dpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib); /* download the i-th V matrix */ magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_dsetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+ib < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); dq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); dq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i != 0) { magma_dgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda ); } magma_int_t rows = m-i; lapackf77_dgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[2] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } magma_free( dA ); return *info; } /* magma_dgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_T, ddA, dtau; magmaDouble_ptr dwork; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lwork; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t nsize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t i, info, min_mn; magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; // process command line arguments printf( "\nUsage: %s -N <m,n> -c\n", argv[0] ); printf( " -N can be repeated up to %d times. If only m is given, then m=n.\n", MAXTESTS ); printf( " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n" ); int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); int m, n; info = sscanf( argv[++i], "%d,%d", &m, &n ); if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0.\n", argv[i] ); exit(1); } M = max( M, msize[ ntest ] ); N = max( N, nsize[ ntest ] ); ntest++; } else if ( strcmp("-M", argv[i]) == 0 ) { printf( "-M has been replaced in favor of -N m,n to allow -N to be repeated.\n\n" ); exit(1); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; M = msize[ntest-1]; N = nsize[ntest-1]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (32*2+2)*min_mn) ); double *h1 = (double*)malloc(sizeof(double)*N*N); memset(h1, 0, N*N*sizeof(double)); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_PIN( h_work, double, lwork ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( i = 0; i < ntest; ++i ) { M = msize[i]; N = nsize[i]; min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N)) / 1e9; /* Initialize the matrix */ magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up // magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); /* magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); */ gpu_time = magma_wtime(); magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d.\n", (int) info); if ( checkres ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d.\n", (int) info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue ); magma_dgetmatrix( N, N, ddA, 0, N, h_T, 0, N, queue ); // Restore the upper triangular part of A before the check for(int col=0; col<N; col++){ for(int row=0; row<=col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / error; // Check if T is the same double terr = 0.; magma_dgetmatrix( N, N, d_T, 0, N, h_T, 0, N, queue ); for(int col=0; col<N; col++) for(int row=0; row<=col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_T ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); free(h1); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double matnorm, work[1]; double mzone = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *hwork, tmp[1]; magmaDouble_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_dgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_dgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (double)M, (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_dgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_dgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_dgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_dlange("f", &M, &N, h_A, &M, work); blasf77_daxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_dlange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_T, *ddA, *dtau; double *d_A2, *d_T2, *ddA2, *dtau2; double *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 10. * opts.tolerance * lapackf77_dlamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because dgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because dgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); TESTING_MALLOC_DEV( d_T2, double, N*N ); TESTING_MALLOC_DEV( ddA2, double, N*N ); TESTING_MALLOC_DEV( dtau2, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_dlaset cudaMemset(ddA, 0, N*N*sizeof(double)); cudaMemset(d_T, 0, N*N*sizeof(double)); cudaMemset(ddA2, 0, N*N*sizeof(double)); cudaMemset(d_T2, 0, N*N*sizeof(double)); lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_dgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_dgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_dgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_dgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_dgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); magma_dgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_dgetmatrix( N, N, d_T, N, h_T, N ); double terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEQRF4 computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * R using multiple GPUs. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { double *da[MagmaMaxGPUs]; double c_one = MAGMA_D_ONE; int i, k, ldda; *info = 0; int nb = magma_get_dgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_D_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (num_gpus < 0 || num_gpus > MagmaMaxGPUs) { *info = -1; } else if (m < 0) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); ldda = ((m+31)/32)*32; magma_int_t n_local[MagmaMaxGPUs]; for (i=0; i < num_gpus; i++) { n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; magma_setdevice(i); // TODO on failure, free previously allocated memory if (MAGMA_SUCCESS != magma_dmalloc( &da[i], ldda*n_local[i] )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } if (m > nb && n > nb) { /* Copy the matrix to the GPUs in 1D block cyclic distribution */ magma_dsetmatrix_1D_col_bcyclic(m, n, A, lda, da, ldda, num_gpus, nb); /* Factor using the GPU interface */ magma_dgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info); /* Copy the matrix back from the GPUs to the CPU */ magma_dgetmatrix_1D_col_bcyclic(m, n, da, ldda, A, lda, num_gpus, nb); } else { lapackf77_dgeqrf(&m, &n, A, &lda, tau, work, &lwork, info); } /* Free the allocated GPU memory */ for (i=0; i < num_gpus; i++) { magma_setdevice(i); magma_free( da[i] ); } magma_setdevice( orig_dev ); return *info; } /* magma_dgeqrf4 */