/** Purpose ------- DORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[in,out] C DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dormqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); // TODO fix memory leak (alloc after argument checks) magma_int_t nb = 128; double *T; magma_dmalloc_pinned(&T, nb*nb); //printf("calling dormqr_m with nb=%d\n", (int) nb); double* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_device_t igpu; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l; ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_dlaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_dsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(igpu, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_dgetmatrix_async( m, kb, // dC(igpu, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_dlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, T, ib, dT, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dormqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaDouble_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, lda*N ); TESTING_MALLOC_PIN( h_A2, double, lda*N ); TESTING_MALLOC_PIN( h_B, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, double, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, double, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, double, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_dsetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_dsetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
extern "C" magma_int_t magma_dgeqp3( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *jpvt, double *tau, double *work, magma_int_t lwork, #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork, #endif magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. For [cz]geqp3 only: RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) double *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_dgeqp3_nb(min(m, n)); if (*info == 0) { minmn = min(m,n); if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #if defined(PRECISION_d) || defined(PRECISION_s) lwkopt += 2*n; #endif } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_dswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_dgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_dsetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = cblas_dnrm2(sm, A(nfxd, j), ione); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j>nfxd) { // Get panel to the CPU magma_dgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_dgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_dlaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_dgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_dlaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* dgeqp3 */
extern "C" magma_int_t magma_dormqr_m(magma_int_t nrgpu, char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, double *a, magma_int_t lda, double *tau, double *c, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DORMQR overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. C (input/output) DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ double c_one = MAGMA_D_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t nb = 128; double *t ; magma_dmalloc_pinned (&t, nb*nb); //printf("calling dormqr_m with nb=%d\n", (int) nb); double* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; magma_int_t left = lapackf77_lsame(side_, "L"); magma_int_t notran = lapackf77_lsame(trans_, "N"); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l; nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw)) { magma_xerbla( __func__, -(*info) ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i){ magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // Put 0s in the upper triangular part of dA; magmablas_dsetdiag1subdiag0_stream('L', kb, kb, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0]); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], t, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_dsetmatrix_async(kb, kb, t, kb, dt(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i){ magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(igpu, 0, i/nrgpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_dgetmatrix_async( m, kb, // dC(igpu, 0, i/nrgpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); magma_xerbla( __func__, 1 ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], t, &ib); // 1) copy the panel from A to the GPU, and // 2) Put 0s in the upper triangular part of dA; magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_dsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, t, ib, dt, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dt, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(NULL); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_dormqr */
/** Purpose ------- If VECT = MagmaQ, DORMBR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q*C C*Q TRANS = MagmaTrans: Q**H*C C*Q**H If VECT = MagmaP, DORMBR overwrites the general real M-by-N matrix C with SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: P*C C*P TRANS = MagmaTrans: P**H*C C*P**H Here Q and P**H are the unitary matrices determined by DGEBRD when reducing A real matrix A to bidiagonal form: A = Q*B * P**H. Q and P**H are defined as products of elementary reflectors H(i) and G(i) respectively. Let nq = m if SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Thus nq is the order of the unitary matrix Q or P**H that is applied. If VECT = MagmaQ, A is assumed to have been an NQ-by-K matrix: if nq >= k, Q = H(1) H(2) . . . H(k); if nq < k, Q = H(1) H(2) . . . H(nq-1). If VECT = MagmaP, A is assumed to have been A K-by-NQ matrix: if k < nq, P = G(1) G(2) . . . G(k); if k >= nq, P = G(1) G(2) . . . G(nq-1). Arguments --------- @param[in] vect magma_vect_t - = MagmaQ: apply Q or Q**H; - = MagmaP: apply P or P**H. @param[in] side magma_side_t - = MagmaLeft: apply Q, Q**H, P or P**H from the Left; - = MagmaRight: apply Q, Q**H, P or P**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q or P; - = MagmaTrans: Conjugate transpose, apply Q**H or P**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER If VECT = MagmaQ, the number of columns in the original matrix reduced by DGEBRD. If VECT = MagmaP, the number of rows in the original matrix reduced by DGEBRD. K >= 0. @param[in] A DOUBLE_PRECISION array, dimension (LDA,min(nq,K)) if VECT = MagmaQ (LDA,nq) if VECT = MagmaP The vectors which define the elementary reflectors H(i) and G(i), whose products determine the matrices Q and P, as returned by DGEBRD. @param[in] lda INTEGER The leading dimension of the array A. If VECT = MagmaQ, LDA >= max(1,nq); if VECT = MagmaP, LDA >= max(1,min(nq,K)). @param[in] tau DOUBLE_PRECISION array, dimension (min(nq,K)) TAU(i) must contain the scalar factor of the elementary reflector H(i) or G(i) which determines Q or P, as returned by DGEBRD in the array argument TAUQ or TAUP. @param[in,out] C DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q or P*C or P**H*C or C*P or C*P**H. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M); if N = 0 or M = 0, LWORK >= 1. For optimum performance if SIDE = MagmaLeft, LWORK >= max(1,N*NB); if SIDE = MagmaRight, LWORK >= max(1,M*NB), where NB is the optimal blocksize. (NB = 0 if M = 0 or N = 0.) \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_dormbr( magma_vect_t vect, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i,j) (A + (i) + (j)*lda) #define C(i,j) (C + (i) + (j)*ldc) magma_int_t i1, i2, nb, mi, ni, nq, nq_1, nw, iinfo, lwkopt; magma_int_t left, notran, applyq, lquery; magma_trans_t transt; MAGMA_UNUSED( nq_1 ); // used only in version 1 *info = 0; applyq = (vect == MagmaQ); left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q or P and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (m == 0 || n == 0) { nw = 0; } /* check arguments */ if (! applyq && vect != MagmaP) { *info = -1; } else if (! left && side != MagmaRight) { *info = -2; } else if (! notran && trans != MagmaTrans) { *info = -3; } else if (m < 0) { *info = -4; } else if (n < 0) { *info = -5; } else if (k < 0) { *info = -6; } else if ( ( applyq && lda < max(1,nq) ) || ( ! applyq && lda < max(1,min(nq,k)) ) ) { *info = -8; } else if (ldc < max(1,m)) { *info = -11; } else if (lwork < max(1,nw) && ! lquery) { *info = -13; } if (*info == 0) { if (nw > 0) { // TODO have get_dormqr_nb and get_dormlq_nb routines? see original LAPACK dormbr. // TODO make them dependent on m, n, and k? nb = magma_get_dgebrd_nb( min( m, n )); lwkopt = max(1, nw*nb); } else { lwkopt = 1; } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } if (applyq) { /* Apply Q */ if (nq >= k) { /* Q was determined by a call to DGEBRD with nq >= k */ #if VERSION == 1 lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); #else magma_dormqr( side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo); #endif } else if (nq > 1) { /* Q was determined by a call to DGEBRD with nq < k */ if (left) { mi = m - 1; ni = n; i1 = 1; i2 = 0; } else { mi = m; ni = n - 1; i1 = 0; i2 = 1; } #if VERSION == 1 nq_1 = nq - 1; lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans), &mi, &ni, &nq_1, A(1,0), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo); #else magma_dormqr( side, trans, mi, ni, nq-1, A(1,0), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); #endif } } else { /* Apply P */ if (notran) { transt = MagmaTrans; } else { transt = MagmaNoTrans; } if (nq > k) { /* P was determined by a call to DGEBRD with nq > k */ #if VERSION == 1 lapackf77_dormlq( lapack_side_const(side), lapack_trans_const(transt), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); #else magma_dormlq( side, transt, m, n, k, A, lda, tau, C, ldc, work, lwork, &iinfo); #endif } else if (nq > 1) { /* P was determined by a call to DGEBRD with nq <= k */ if (left) { mi = m - 1; ni = n; i1 = 1; i2 = 0; } else { mi = m; ni = n - 1; i1 = 0; i2 = 1; } #if VERSION == 1 nq_1 = nq - 1; lapackf77_dormlq( lapack_side_const(side), lapack_trans_const(transt), &mi, &ni, &nq_1, A(0,1), &lda, tau, C(i1,i2), &ldc, work, &lwork, &iinfo); #else magma_dormlq( side, transt, mi, ni, nq-1, A(0,1), lda, tau, C(i1,i2), ldc, work, lwork, &iinfo); #endif } } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* magma_dormbr */
extern "C" magma_int_t magma_dormqr(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DORMQR overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. C (input/output) DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define A(a_1,a_2) ( A + (a_1) + (a_2)*lda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) magma_int_t nb = magma_get_dgeqrf_nb( min( m, n )); double c_one = MAGMA_D_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t nq_i, lddwork; magma_int_t i; double *T; magma_int_t i1, i2, step, ib, ic, jc, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = max(1,nw) * nb; work[0] = MAGMA_D_MAKE( lwkopt, 0 ); if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, MagmaTransStr)) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } /* Allocate work space on the GPU */ magma_int_t lddc = m; double *dwork, *dC; magma_dmalloc( &dC, lddc*n ); magma_dmalloc( &dwork, (m + n + nb)*nb ); if ( dC == NULL || dwork == NULL ) { magma_free( dC ); magma_free( dwork ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* work space on CPU */ T = (double*) malloc( 2*nb*nb * sizeof(double) ); if ( T == NULL ) { magma_free( dC ); magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Copy matrix C from the CPU to the GPU */ magma_dsetmatrix( m, n, C, ldc, dC, lddc ); if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, &tau[1], C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( (left && (! notran)) || ((! left) && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for( i=i1; (step<0 ? i>=i2 : i<i2); i += step ) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_dlarft("F", "C", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ dpanel_to_q('U', ib, A(i,i), lda, T+ib*ib); magma_dsetmatrix( nq_i, ib, A(i,i), lda, dwork, nq_i ); dq_to_panel('U', ib, A(i,i), lda, T+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_dsetmatrix( ib, ib, T, ib, dwork+nq_i*ib, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, nq_i, dwork+nq_i*ib, ib, dC(ic,jc), lddc, dwork+nq_i*ib + ib*ib, lddwork); } magma_dgetmatrix( m, n, dC, lddc, C, ldc ); } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); magma_free( dC ); magma_free( dwork ); free( T ); return *info; } /* magma_dormqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgels */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, error, Anorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; size = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_dgeqrf_nb(M); gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, NULL, &lda, NULL, NULL, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_A2, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &size, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &size, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_dlarnv( &ione, ISEED, &size, h_X ); //blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_dgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgels3_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_dlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_daxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); if ( M == N ) { printf( " %s\n", (gpu_error < tol && error < tol ? "ok" : "failed")); status += ! (gpu_error < tol && error < tol); } else { printf( " %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dormqr_gpu(char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, double *dA, magma_int_t ldda, double *tau, double *dC, magma_int_t lddc, double *hwork, magma_int_t lwork, double *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DORMQR_GPU overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) DOUBLE_PRECISION array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. LDDA (input) INTEGER The leading dimension of the array DA. If SIDE = 'L', LDDA >= max(1,M); if SIDE = 'R', LDDA >= max(1,N). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. DC (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDDC (input) INTEGER The leading dimension of the array DC. LDDC >= max(1,M). HWORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) Currently, dgetrs_gpu assumes that on exit, hwork contains the last block of A and C. This will change and *should not be relied on*! LWORK (input) INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+NB) + N*NB if SIDE = 'L', and LWORK >= (N-K+NB)*(M+NB) + M*NB if SIDE = 'R', where NB is the given blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. DT (input) DOUBLE_PRECISION array on the GPU that is the output (the 9th argument) of magma_dgeqrf_gpu. NB (input) INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_dgeqrf_gpu. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define dA(a_1,a_2) (dA + (a_1) + (a_2)*ldda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) #define dT(a_1) (dT + (a_1)*nb) double c_one = MAGMA_D_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; double *dwork; magma_int_t i, lddwork; magma_int_t i1, i2, step, ib, ic, jc, ma, mi, ni, nq, nw; int left, notran, lquery; magma_int_t lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = (nq - k + nb)*(nw + nb) + nw*nb; hwork[0] = MAGMA_D_MAKE( lwkopt, 0 ); if ( (!left) && (!lapackf77_lsame(side_, "R")) ) { *info = -1; } else if ( (!notran) && (!lapackf77_lsame(trans_, MagmaTransStr)) ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < lwkopt && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork = k; dwork = dT(2*lddwork); if ( (left && (! notran)) || ((! left) && notran) ) { // left trans: Q^T C // right notrans: C Q // multiply from first block, i = 0, to next-to-last block, i < k-nb i1 = 0; i2 = k-nb; step = nb; } else { // left notrans: Q C // right trans: C Q^T // multiply from next-to-last block, i = floor((k-1-nb)/nb)*nb, to first block, i = 0 i1 = ((k - 1 - nb) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } /* Use unblocked code to multiply last or only block (cases Q*C or C*Q^T). */ // workspace left: A(mi*nb) + C(mi*ni) + work(ni*nb_la) = (m-k-nb)*nb + (m-k-nb)*n + n*nb // workspace right: A(ni*nb) + C(mi*ni) + work(mi*nb_la) = (n-k-nb)*nb + m*(n-k-nb) + m*nb if ( step < 0 ) { // i is beginning of last block i = i1 - step; if ( i >= k ) { i = i1; } ib = k - i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } double* hA = hwork; double* hC = hwork + ma*ib; double* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_dgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma ); magma_dgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi ); lapackf77_dormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_dsetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc ); } /* Use blocked code to multiply blocks */ if (nb < k) { for( i=i1; (step<0 ? i>=i2 : i<i2); i+=step ) { ib = min(nb, k - i); if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; jc = i; } magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, i ), ldda, dT(i), nb, dC(ic, jc), lddc, dwork, nw ); } } else { i = i1; } /* Use unblocked code to multiply the last or only block (cases Q^T*C or C*Q). */ if ( step > 0 ) { ib = k-i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } double* hA = hwork; double* hC = hwork + ma*ib; double* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_dgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma ); magma_dgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi ); lapackf77_dormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_dsetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc ); } // TODO sync. For cases Q*C and C*Q^T, last call is magma_dlarfb_gpu, // which is async magma_gemm calls, so dormqr can be unfinished. // TODO: dgeqrs_gpu ASSUMES that hwork contains the last block of A and C. // That needs to be fixed, but until then, don't modify hwork[0] here. // In LAPACK: On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. //hwork[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* end of magma_dormqr_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; double *C, *R, *A, *hwork, *tau; magmaDouble_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; ldc = magma_roundup( m, opts.align ); // multiple of 32 by default // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); nb = magma_get_dgeqrf_nb( mm, k ); lda = magma_roundup( mm, opts.align ); // multiple of 32 by default gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + magma_roundup( max(m,n), 32) )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + magma_roundup( max(m,n), 32 ) )*nb; } // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*k ); TESTING_MALLOC_CPU( hwork, double, lwork_max ); TESTING_MALLOC_CPU( tau, double, k ); TESTING_MALLOC_DEV( dC, double, ldc*n ); TESTING_MALLOC_DEV( dA, double, lda*k ); TESTING_MALLOC_DEV( dT, double, dt_size ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); magma_dsetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) size = lda*k; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_dsetmatrix( mm, k, A, lda, dA, lda ); magma_dgeqrf_gpu( mm, k, dA, lda, tau, dT, &info ); magma_dgetmatrix( mm, k, dA, lda, A, lda ); if (info != 0) { printf("magma_dgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); if (info != 0) { printf("magma_dormqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } // dormqr2 takes a copy of dA in CPU memory if ( opts.version == 2 ) { magma_dgetmatrix( mm, k, dA, lda, A, lda ); } magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); // sync needed for L,N and R,T cases if ( opts.version == 1 ) { magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); } else if ( opts.version == 2 ) { magma_dormqr2_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, A, lda, &info ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, work ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/***************************************************************************//** Purpose ------- DGELS solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by DGEQRF. @param[in] lda INTEGER The leading dimension of the array A, LDA >= M. @param[in,out] B DOUBLE PRECISION array, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= M. @param[out] hwork (workspace) DOUBLE PRECISION array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= max( N*NB, 2*NB*NB ), where NB is the blocksize given by magma_get_dgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_gels *******************************************************************************/ extern "C" magma_int_t magma_dgels( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr A, magma_int_t lda, magmaDouble_ptr B, magma_int_t ldb, double *hwork, magma_int_t lwork, magma_int_t *info) { /* Constants */ const double c_one = MAGMA_D_ONE; /* Local variables */ double *tau; magma_int_t min_mn; magma_int_t nb = magma_get_dgeqrf_nb( m, n ); magma_int_t lwkopt = max( n*nb, 2*nb*nb ); // (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_dmake_lwork( lwkopt ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now */ *info = -3; else if (nrhs < 0) *info = -4; else if (lda < max(1,m)) *info = -6; else if (ldb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = c_one; return *info; } magma_dmalloc_cpu( &tau, min_mn ); if ( tau == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgeqrf( m, n, A, lda, tau, hwork, lwork, info ); if ( *info == 0 ) { // B := Q' * B lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &m, &nrhs, &n, A, &lda, tau, B, &ldb, hwork, &lwork, info ); // Solve R*X = B(1:n,:) blasf77_dtrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &n, &nrhs, &c_one, A, &lda, B, &ldb ); } magma_free_cpu( tau ); return *info; }
extern "C" magma_int_t magma_dormqr(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *a, magma_int_t lda, double *tau, double *c, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- MAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= DORMQR overwrites the general real M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**T * C C * Q**T where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**T from the Left; = 'R': apply Q or Q**T from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**T. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. C (input/output) DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T * C or C * Q**T or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ double c_one = MAGMA_D_ONE; magma_side_t side_ = side; magma_trans_t trans_ = trans; /* Allocate work space on the GPU */ magmaDouble_ptr dwork, dc; magma_malloc( &dc, (m)*(n)*sizeof(double) ); magma_malloc( &dwork, (m + n + 64)*64*sizeof(double) ); /* Copy matrix C from the CPU to the GPU */ magma_dsetmatrix( m, n, c, 0, ldc, dc, 0, m, queue ); //dc -= (1 + m); size_t dc_offset = -(1+m); magma_int_t a_offset, c_offset, i__4, lddwork; magma_int_t i__; double t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; a_offset = 1 + lda; a -= a_offset; --tau; c_offset = 1 + ldc; c -= c_offset; *info = 0; left = lapackf77_lsame(lapack_const(side_), "L"); notran = lapackf77_lsame(lapack_const(trans_), "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(lapack_const(side_), "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(lapack_const(trans_), "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; MAGMA_D_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(lapack_const(side_), lapack_const(trans_), &m, &n, &k, &a[a_offset], &lda, &tau[1], &c[c_offset], &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i__ + 1; lapackf77_dlarft("F", "C", &i__4, &ib, &a[i__ + i__ * lda], &lda, &tau[i__], t, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ dpanel_to_q(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); magma_dsetmatrix( i__4, ib, &a[i__ + i__ * lda], 0, lda, dwork, 0, i__4, queue ); dq_to_panel(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i__ + 1; ic = i__; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i__ + 1; jc = i__; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_dsetmatrix( ib, ib, t, 0, ib, dwork, i__4*ib, ib, queue ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, 0, i__4, dwork, i__4*ib, ib, dc, dc_offset+(ic + jc * m), m, dwork, (i__4*ib + ib*ib), lddwork, queue); } magma_dgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], 0, ldc, queue ); } MAGMA_D_SET2REAL( work[0], lwkopt ); //dc += (1 + m); magma_free( dc ); magma_free( dwork ); return *info; } /* magma_dormqr */
/** Purpose ------- DORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A DOUBLE PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau DOUBLE PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[in,out] C DOUBLE PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dormqr( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define dV(i_,j_) (dV + (i_) + (j_)*nq_i) #define dT(i_,j_) (dT + (i_) + (j_)*ib) #define dwork(i_) (dwork + (i_)) double *T, *T2; magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; magma_int_t left, notran, lquery; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_dgelqf_nb( m, n ); lwkopt = max(1,nw)*nb; work[0] = magma_dmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_D_ONE; return *info; } ldwork = nw; if (nb >= k) { /* Use CPU code */ lapackf77_dormqr( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = magma_roundup( m, 32 ); magmaDouble_ptr dwork, dV, dT, dC; magma_dmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_dmalloc_cpu( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; /* Copy matrix C from the CPU to the GPU */ magma_dsetmatrix( m, n, C, ldc, dC(0,0), lddc, queue ); if ( (left && ! notran) || (! left && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_dlarft( "Forward", "Columnwise", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib ); /* 1) set upper triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ magma_dpanel_to_q( MagmaUpper, ib, A(i,i), lda, T2 ); magma_dsetmatrix( nq_i, ib, A(i,i), lda, dV(0,0), nq_i, queue ); magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, T2 ); if (left) { /* H or H**H is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H**H is applied to C(1:m,i:n) */ ni = n - i; jc = i; } /* Apply H or H**H; First copy T to the GPU */ magma_dsetmatrix( ib, ib, T, ib, dT(0,0), ib, queue ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dV(0,0), nq_i, dT(0,0), ib, dC(ic,jc), lddc, dwork(0), ldwork, queue ); } magma_dgetmatrix( m, n, dC(0,0), lddc, C, ldc, queue ); magma_queue_destroy( queue ); magma_free( dwork ); magma_free_cpu( T ); } work[0] = magma_dmake_lwork( lwkopt ); return *info; } /* magma_dormqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrs */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, gpu_perf, cpu_perf; double matnorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, (int) M, (int) N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrs_gpu -nrhs %d -M %d -N %d\n\n", (int) nrhs, 1024, 1024); M = N = size[9]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_dgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, lda*N ); TESTING_MALLOC( h_A2, double, lda*N ); TESTING_MALLOC( h_B, double, ldb*nrhs ); TESTING_MALLOC( h_X, double, ldb*nrhs ); TESTING_MALLOC( h_R, double, ldb*nrhs ); TESTING_DEVALLOC( d_A, double, ldda*N ); TESTING_DEVALLOC( d_B, double, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = -1; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC( h_work, double, lhwork ); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; flops = (FLOPS_GEQRF( (double)M, (double)N ) + FLOPS_GEQRS( (double)M, (double)N, (double)nrhs )) / 1000000; /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); start = get_current_time(); magma_dgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_dgels had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); // Get the solution in h_X magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); start = get_current_time(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); end = get_current_time(); cpu_perf = flops / GetTimerValue(start, end); if (info < 0) printf("Argument %d of lapackf77_dgels had an illegal value.\n", (int) -info); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_dlange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_dlange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_A2 ); TESTING_FREE( h_B ); TESTING_FREE( h_X ); TESTING_FREE( h_R ); TESTING_FREE( h_work ); TESTING_DEVFREE( d_A ); TESTING_DEVFREE( d_B ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/** Purpose ------- DORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A DOUBLE PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau DOUBLE PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[in,out] C DOUBLE PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dormqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) /* Constants */ double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; /* Local variables */ const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; double *T = NULL; magmaDouble_ptr dw[MagmaMaxGPUs] = { NULL }; magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }}; magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }}; magma_int_t ind_c; magma_device_t dev; magma_device_t orig_dev; magma_getdevice( &orig_dev ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_dmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = magma_roundup( m, 64 ); // TODO why 64 instead of 32 ? magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l; ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data. magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) if (MAGMA_SUCCESS != magma_dmalloc_pinned( &T, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_dmalloc( &dw[dev], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &events[dev][0] ); magma_event_create( &events[dev][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] ); nlocal[dev] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); // set upper triangular part of dA to identity magmablas_dlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_dsetmatrix_async(kb, kb, T, kb, dT(dev, ind_c), kb, queues[dev][0] ); } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); // check if the data was copied magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[dev], kb, dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb, dC(dev, i, 0), lddc, dwork(dev, ind_c), lddwork, queues[dev][1] ); magma_event_record(events[dev][ind_c], queues[dev][1] ); } ind_c = (ind_c+1)%2; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(dev, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc, queues[dev][1] ); // magma_dgetmatrix_async( m, kb, // dC(dev, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, queues[dev][0] ); } } else { *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); goto cleanup; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] ); magmablas_dlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork, queues[dev][1] ); } */ } cleanup: work[0] = magma_dmake_lwork( lwkopt ); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_destroy( events[dev][0] ); magma_event_destroy( events[dev][1] ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_free( dw[dev] ); } magma_setdevice( orig_dev ); magma_free_pinned( T ); return *info; } /* magma_dormqr */
/** Purpose ------- DGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace, for [cz]geqp3 only) DOUBLE PRECISION array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqp3( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *jpvt, double *tau, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, #endif magma_int_t *info ) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) double *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt=0, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_dgeqp3_nb(min(m, n)); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #ifdef REAL double *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_dswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_dgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_dormqr( MagmaLeftStr, MagmaConjTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_dsetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = magma_cblas_dnrm2( sm, A(nfxd,j), ione ); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j > nfxd) { // Get panel to the CPU magma_dgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_dgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_dlaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_dgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_dlaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* magma_dgeqp3 */
/** Purpose ------- DORMQR_GPU overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument dA. dA is modified by the routine but restored on exit. @param[in] ldda INTEGER The leading dimension of the array dA. If SIDE = MagmaLeft, LDDA >= max(1,M); if SIDE = MagmaRight, LDDA >= max(1,N). @param[in] tau DOUBLE PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[in,out] dC DOUBLE PRECISION array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q). @param[in] lddc INTEGER The leading dimension of the array DC. LDDC >= max(1,M). @param[out] hwork (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) \n Currently, dgetrs_gpu assumes that on exit, hwork contains the last block of A and C. This will change and *should not be relied on*! @param[in] lwork INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+NB) + N*NB if SIDE = MagmaLeft, and LWORK >= (N-K+NB)*(M+NB) + M*NB if SIDE = MagmaRight, where NB is the given blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. @param[in,out] dT DOUBLE PRECISION array on the GPU that is the output (the 9th argument) of magma_dgeqrf_gpu. Part used as workspace. @param[in] nb INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_dgeqrf_gpu. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dormqr_gpu( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDouble_const_ptr dA, magma_int_t ldda, double const *tau, magmaDouble_ptr dC, magma_int_t lddc, double *hwork, magma_int_t lwork, magmaDouble_ptr dT, magma_int_t nb, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_1) + (a_2)*ldda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) #define dT(a_1) (dT + (a_1)*nb) double c_one = MAGMA_D_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magmaDouble_ptr dwork; magma_int_t i, lddwork; magma_int_t i1, i2, step, ib, ic, jc, ma, mi, ni, nq, nw; magma_int_t lwkopt; *info = 0; bool left = (side == MagmaLeft); bool notran = (trans == MagmaNoTrans); bool lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = (nq - k + nb)*(nw + nb) + nw*nb; hwork[0] = magma_dmake_lwork( lwkopt ); if ( ! left && side != MagmaRight ) { *info = -1; } else if ( ! notran && trans != MagmaTrans ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < lwkopt && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork = k; dwork = dT(2*lddwork); if ( (left && (! notran)) || ((! left) && notran) ) { // left trans: Q^T C // right notrans: C Q // multiply from first block, i = 0, to next-to-last block, i < k-nb i1 = 0; i2 = k-nb; step = nb; } else { // left notrans: Q C // right trans: C Q^T // multiply from next-to-last block, i = floor((k-1-nb)/nb)*nb, to first block, i = 0 i1 = ((k - 1 - nb) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Use unblocked code to multiply last or only block (cases Q*C or C*Q^T). */ // workspace left: A(mi*nb) + C(mi*ni) + work(ni*nb_la) = (m-k-nb)*nb + (m-k-nb)*n + n*nb // workspace right: A(ni*nb) + C(mi*ni) + work(mi*nb_la) = (n-k-nb)*nb + m*(n-k-nb) + m*nb if ( step < 0 ) { // i is beginning of last block i = i1 - step; if ( i >= k ) { i = i1; } ib = k - i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } double* hA = hwork; double* hC = hwork + ma*ib; double* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_dgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma, queue ); magma_dgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi, queue ); lapackf77_dormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_dsetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc, queue ); } /* Use blocked code to multiply blocks */ if (nb < k) { for( i=i1; (step < 0 ? i >= i2 : i < i2); i += step ) { ib = min(nb, k - i); if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; jc = i; } magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, i ), ldda, dT(i), nb, dC(ic, jc), lddc, dwork, nw, queue ); } } else { i = i1; } /* Use unblocked code to multiply the last or only block (cases Q^T*C or C*Q). */ if ( step > 0 ) { ib = k-i; if (left) { // ni=n, jc=0, H or H^T is applied to C(i:m-1,0:n-1) mi = m - i; ma = mi; ic = i; } else { // mi=m, ic=0, H or H^T is applied to C(0:m-1,i:n-1) ni = n - i; ma = ni; jc = i; } double* hA = hwork; double* hC = hwork + ma*ib; double* hW = hwork + ma*ib + mi*ni; magma_int_t lhwork = lwork - (ma*ib + mi*ni); magma_dgetmatrix( ma, ib, dA(i, i ), ldda, hA, ma, queue ); magma_dgetmatrix( mi, ni, dC(ic, jc), lddc, hC, mi, queue ); lapackf77_dormqr( side_, trans_, &mi, &ni, &ib, hA, &ma, tau+i, hC, &mi, hW, &lhwork, info ); // send the updated part of C back to the GPU magma_dsetmatrix( mi, ni, hC, mi, dC(ic, jc), lddc, queue ); } // TODO sync. For cases Q*C and C*Q^T, last call is magma_dlarfb_gpu, // which is async magma_gemm calls, so dormqr can be unfinished. // TODO: dgeqrs_gpu ASSUMES that hwork contains the last block of A and C. // That needs to be fixed, but until then, don't modify hwork[0] here. // In LAPACK: On exit, if INFO = 0, HWORK[0] returns the optimal LWORK. //hwork[0] = magma_dmake_lwork( lwkopt ); magma_queue_destroy( queue ); return *info; } /* magma_dormqr_gpu */