void MAGMAF_ZLARFB_GPU(
    char *side, char *trans, char *direct, char *storev,
    magma_int_t *m, magma_int_t *n, magma_int_t *k,
    devptr_t *dv, magma_int_t *ldv,
    devptr_t *dt, magma_int_t *ldt,
    devptr_t *dc, magma_int_t *ldc,
    devptr_t *dwork, magma_int_t *ldwork )
{
    magma_zlarfb_gpu( side[0], trans[0], direct[0], storev[0],
                      *m, *n, *k,
                      DEVPTR(dv), *ldv,
                      DEVPTR(dt), *ldt,
                      DEVPTR(dc), *ldc,
                      DEVPTR(dwork), *ldwork );
}
extern "C" magma_int_t magma_zgeqlf(magma_int_t m, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SGEQLF computes a QL factorization of a COMPLEX_16 M-by-N matrix A: A = Q * L. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, if m >= n, the lower triangle of the subarray A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L; if m <= n, the elements on and below the (n-m)-th superdiagonal contain the M-by-N lower trapezoidal matrix L; the remaining elements, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB can be obtained through magma_get_zgeqlf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in A(1:m-k+i-1,n-k+i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) magmaDoubleComplex *da, *dwork; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib, nb; magma_int_t rows, cols; magma_int_t ib, ki, kk, mu, nu, iinfo, ldda; int lquery; nb = magma_get_zgeqlf_nb(m); *info = 0; lquery = (lwork == -1); // silence "uninitialized" warnings old_ib = nb; old_i = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } if (*info == 0) { k = min(m,n); if (k == 0) work[0] = c_one; else { work[0] = MAGMA_Z_MAKE( n*nb, 0 ); } if (lwork < max(1,n) && ! 
lquery) *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (k == 0) return *info; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = da + ldda*(n); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. The last kk columns are handled by the block method. First, copy the matrix on the GPU except the last kk columns */ magma_zsetmatrix_async( (m), (n-nb), a_ref(0, 0), lda, da_ref(0, 0), ldda, stream[0] ); ki = ((k - nb - 1) / nb) * nb; kk = min(k, ki + nb); for (i = k - kk + ki; i >= k -kk; i -= nb) { ib = min(k-i,nb); if (i < k - kk + ki) { /* 1. Copy asynchronously the current panel to the CPU. 2. Copy asynchronously the submatrix below the panel to the CPU) */ rows = m - k + i + ib; magma_zgetmatrix_async( rows, ib, da_ref(0, n-k+i), ldda, a_ref(0, n-k+i), lda, stream[1] ); magma_zgetmatrix_async( (m-rows), ib, da_ref(rows, n-k+i), ldda, a_ref(rows, n-k+i), lda, stream[0] ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the main update from the lookahead techniques. */ rows = m - k + old_i + old_ib; cols = n - k + old_i - old_ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, old_ib, da_ref(0, cols+old_ib), ldda, dwork, lddwork, da_ref(0, 0 ), ldda, dwork+old_ib, lddwork); } magma_queue_sync( stream[1] ); /* Compute the QL factorization of the current block A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */ rows = m - k + i + ib; cols = n - k + i; lapackf77_zgeqlf(&rows,&ib, a_ref(0,cols), &lda, tau+i, work, &lwork, &iinfo); if (cols > 0) { /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ lapackf77_zlarft( MagmaBackwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(0, cols), &lda, tau + i, work, &ib); zpanel_to_q( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib); magma_zsetmatrix( rows, ib, a_ref(0,cols), lda, da_ref(0,cols), ldda ); zq_to_panel( MagmaLower, ib, a_ref(rows-ib,cols), lda, work+ib*ib); // Send the triangular part on the GPU magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the update of first ib columns. */ if (i-ib >= k -kk) magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, ib, ib, da_ref(0, cols), ldda, dwork, lddwork, da_ref(0,cols-ib), ldda, dwork+ib, lddwork); else{ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, ib, da_ref(0, cols), ldda, dwork, lddwork, da_ref(0, 0 ), ldda, dwork+ib, lddwork); } old_i = i; old_ib = ib; } } mu = m - k + i + nb; nu = n - k + i + nb; magma_zgetmatrix( m, nu, da_ref(0,0), ldda, a_ref(0,0), lda ); } else { mu = m; nu = n; } /* Use unblocked code to factor the last or only block */ if (mu > 0 && nu > 0) lapackf77_zgeqlf(&mu, &nu, a_ref(0,0), &lda, tau, work, &lwork, &iinfo); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqlf */
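/* Illustrative driver, not part of the library: a minimal sketch of calling
 * magma_zgeqlf with the optimal workspace (LWORK = N*NB per the documentation
 * above) and pinned host memory for A and WORK, as the doc recommends.
 * "fill_matrix" is a hypothetical placeholder for the caller's data. */
static magma_int_t example_zgeqlf( magma_int_t m, magma_int_t n )
{
    magmaDoubleComplex *A, *tau, *work;
    magma_int_t info, lda = m;
    magma_int_t minmn = (m < n ? m : n);
    magma_int_t nb    = magma_get_zgeqlf_nb( m );
    magma_int_t lwork = n*nb;               // optimal size, see LWORK above

    magma_zmalloc_pinned( &A,    lda*n );   // pinned memory for performance
    magma_zmalloc_pinned( &work, lwork );
    magma_zmalloc_cpu   ( &tau,  minmn );

    /* fill_matrix( A, lda, m, n );  -- placeholder for user data */

    magma_zgeqlf( m, n, A, lda, tau, work, lwork, &info );
    if (info != 0)
        fprintf( stderr, "magma_zgeqlf returned info = %d\n", (int) info );

    magma_free_cpu( tau );
    magma_free_pinned( work );
    magma_free_pinned( A );
    return info;
}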
extern "C" magma_int_t magma_zgeqrf2_2q_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex *tau, magma_queue_t* queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDoubleComplex_ptr dwork; magmaDoubleComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_zgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_zmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaDoubleComplex)*lwork, NULL, NULL); work = (magmaDoubleComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL); nbmin = 2; nx = 2*nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), ldwork, queues[0], NULL); clFlush(queues[0]); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, 
queues[1]); magma_zsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, queues[1], NULL); clFlush(queues[1]); } magma_queue_sync(queues[0]); lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, queues[0]); zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_zsetmatrix( ib, ib, hwork, ib, dwork, 0, lddwork, queues[1]); if (i+nb < k-nx){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_queue_sync(queues[1]); }else { magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queues[1]); magma_zsetmatrix( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, queues[1]); clFlush(queues[1]); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows, queues[0]); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda, queues[0]); } clEnqueueUnmapMemObject(queues[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); // magma_free_cpu(work); return *info; } /* magma_zgeqrf2_gpu */
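/* Illustrative call site, not part of the library: a hedged sketch of invoking
 * the two-queue clMAGMA factorization. It assumes the caller has already
 * created two command queues on one device (the creation API varies across
 * clMAGMA versions) and that dA/dA_offset describe an m-by-n matrix resident
 * on the GPU, with ldda divisible by 16 as the documentation requires. */
static magma_int_t example_zgeqrf2_2q(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaDoubleComplex *tau, magma_queue_t queues[2] )
{
    magma_int_t info;
    magma_zgeqrf2_2q_gpu( m, n, dA, dA_offset, ldda, tau, queues, &info );
    return info;
}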
extern "C" magma_int_t magma_zunmqr_gpu( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dC, size_t dC_offset, magma_int_t lddc, magmaDoubleComplex *hwork, magma_int_t lwork, magmaDoubleComplex_ptr dT, size_t dT_offset, magma_int_t nb, magma_queue_t queue, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZUNMQR_GPU overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) COMPLEX_16 array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. LDDA (input) INTEGER The leading dimension of the array DA. If SIDE = 'L', LDDA >= max(1,M); if SIDE = 'R', LDDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. DC (input/output) COMPLEX_16 array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDDC (input) INTEGER The leading dimension of the array DC. LDDC >= max(1,M). HWORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK. LWORK >= (M-K+NB)*(N+2*NB) if SIDE = 'L', and LWORK >= (N-K+NB)*(M+2*NB) if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array, and no error message related to LWORK is issued by XERBLA. DT (input) COMPLEX_16 array on the GPU that is the output (the 9th argument) of magma_zgeqrf_gpu. NB (input) INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_zgeqrf_gpu. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset+(a_1)+(a_2)*(ldda)) #define c_ref(a_1,a_2) dC, (dC_offset+(a_1)+(a_2)*(lddc)) #define t_ref(a_1) dT, (dT_offset+(a_1)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex_ptr dwork; magma_int_t i, lddwork; magma_int_t i1, i2, i3, ib, ic, jc, mi, ni, nq, nw, ret; int left, notran, lquery; magma_int_t lwkopt; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); if (!left || notran) printf("zunmqr_gpu called with arguments not yet supported\n"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if ( (!left) && (side != MagmaRight) ) { *info = -1; } else if ( (!notran) && (trans != MagmaConjTrans) ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } lwkopt = (m-k+nb)*(n+2*nb); hwork[0] = MAGMA_Z_MAKE( lwkopt, 0 ); if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { hwork[0] = c_one; return *info; } lddwork= k; dwork = dT; size_t dwork_offset = 2*lddwork*nb; if ( (left && (! notran)) || ( (!left) && notran ) ) { i1 = 0; i2 = k-nb; i3 = nb; } else { i1 = (k - 1 - nb) / nb * nb; i2 = 0; i3 = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } if (nb < k) { for (i=i1; i3<0 ? i>i2 : i<i2; i+=i3) { ib = min(nb, k - i); if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } ret = magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, mi, ni, ib, a_ref(i, i ), ldda, t_ref(i), nb, c_ref(ic, jc), lddc, dwork, dwork_offset, nw, queue); if ( ret != MAGMA_SUCCESS ) return ret; } } else { i = i1; } /* Use unblocked code to multiply the last or only block. */ if (i < k) { ib = k-i; if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } magma_zgetmatrix(mi, ib, a_ref(i, i), ldda, hwork, mi, queue); magma_zgetmatrix(mi, ni, c_ref(ic, jc), lddc, hwork+mi*ib, mi, queue); magma_int_t lhwork = lwork - mi*(ib + ni); lapackf77_zunmqr( MagmaLeftStr, MagmaConjTransStr, &mi, &ni, &ib, hwork, &mi, tau+i, hwork+mi*ib, &mi, hwork+mi*(ib+ni), &lhwork, info); // send the updated part of c back to the GPU magma_zsetmatrix(mi, ni, hwork+mi*ib, mi, c_ref(ic, jc), lddc, queue); } return *info; /* End of MAGMA_ZUNMQR_GPU */ }
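/* Illustrative call site, not part of the library: a hedged sketch applying
 * Q**H from the left to an m-by-n matrix C on the GPU, reusing the dT factor
 * produced earlier by magma_zgeqrf_gpu with blocking size nb. The host
 * workspace size follows the SIDE = 'L' formula in the documentation above. */
static magma_int_t example_zunmqr_gpu(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dC, size_t dC_offset, magma_int_t lddc,
    magmaDoubleComplex_ptr dT, size_t dT_offset, magma_int_t nb,
    magma_queue_t queue )
{
    magma_int_t info;
    magma_int_t lwork = (m - k + nb)*(n + 2*nb);    // SIDE = 'L' bound above
    magmaDoubleComplex *hwork;
    magma_zmalloc_cpu( &hwork, lwork );             // host workspace

    magma_zunmqr_gpu( MagmaLeft, MagmaConjTrans, m, n, k,
                      dA, dA_offset, ldda, tau,
                      dC, dC_offset, lddc,
                      hwork, lwork, dT, dT_offset, nb, queue, &info );

    magma_free_cpu( hwork );
    return info;
}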
extern "C" void magma_zbulge_applyQ( magma_int_t WANTZ, magma_side_t SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t *INFO, magmaDoubleComplex *dV, magmaDoubleComplex *dT, magmaDoubleComplex *dE, magma_int_t copytype ) { //%=========================== //% local variables //%=========================== magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t LDT, LDV, firstcolj; magma_int_t bg, nbGblk, rownbm, k, m, n; magma_int_t st, ed, fst, vlen, vnb, colj, len; magma_int_t blkid, vpos, taupos, tpos; //magmaDoubleComplex *WORK; magma_int_t LWORK; magma_int_t cur_blksiz, avai_blksiz, ncolinvolvd; magma_int_t nbgr, colst, coled, versionL, versionR; magma_int_t blkcnt=-1; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *INFO=0; versionL = 113; versionR = 92; LDT = Vblksiz; LDV = NB+Vblksiz-1; //blklen = LDV*Vblksiz; nbGblk = plasma_ceildiv((N-1), Vblksiz); //magma_zmalloc_cpu( &WORK, LWORK ); /* find the size of the matrix T V*/ findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV); /* Copy E & V & T to the GPU in dE and dV and dT * depending on copytype: * 1: mean copy only V * 2: mean copy V and T * 3: mean copy V, T and E * */ if (copytype > 0) magma_zsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV ); if (copytype > 1) magma_zsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT ); if (copytype > 2) magma_zsetmatrix( N, NE, E, N, dE, N ); magmaDoubleComplex *dwork; //ldwork = NE; LWORK = 2*N*max(Vblksiz, 64); if (MAGMA_SUCCESS != magma_zmalloc( &dwork, LWORK )) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * Also E is splitten by col meaning each apply consist in a block of col (vertical block) */ /* WANTZ = 1 meaning E is IDENTITY so form Q using optimized update. * So we use the reverse order from small q to large one, * so from q_n to q_1 so Left update to Identity. * Use versionL 113 because in 114 we need to update the whole matrix and not in icreasing order. 
* WANTZ = 2 meaning E is a full matrix and need to be updated from Left or Right so use normal update * */ if (WANTZ == 1) { versionL=113; SIDE = MagmaLeft; //set the matrix to Identity here to avoid copying it from the CPU magmablas_zlaset( MagmaFull, N, N, c_zero, c_one, dE, N ); } printf(" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n", (int) N, (int) NB, (int) Vblksiz, SIDE, (int) versionL, (int) versionR, (int) WANTZ); #if defined(USESTREAM) magma_int_t N2=N/2; magma_int_t N1=N-N2; printf("using stream\n"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); #endif if (SIDE == MagmaLeft) { if (versionL == 113) { for (bg = nbGblk; bg > 0; bg--) { firstcolj = (bg-1)*Vblksiz + 1; if (bg == nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)), NB); // last blk has size=1 used for complex to handle A(N,N-1) else rownbm = plasma_ceildiv((N-(firstcolj+1)), NB); for (m = rownbm; m > 0; m--) { vlen = 0; vnb = 0; colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop fst = (rownbm -m)*NB+colj +1; for (k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=k+1; } colst = (bg-1)*Vblksiz; findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1); if ((vlen > 0) && (vnb > 0)) { if (WANTZ == 1) { len = N-colst; magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len); } else { magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE); } } } } } else if (versionL == 114) { rownbm = plasma_ceildiv((N-1), NB); for (m = rownbm; m > 0; m--) { ncolinvolvd = min(N-1, m*NB); avai_blksiz=min(Vblksiz, ncolinvolvd); nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz); for (n = nbgr; n > 0; n--) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz); colst = (n-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -m)*NB+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=vnb+1; } findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colst+1, vpos+1, taupos+1); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2); #else magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE); #endif } } } } } else if (SIDE == MagmaRight) { if (versionR == 91) { for (bg =1; bg <= nbGblk; bg++) { firstcolj = (bg-1)*Vblksiz + 1; rownbm = plasma_ceildiv((N-(firstcolj+1)), NB); if (bg == nbGblk) rownbm = 
plasma_ceildiv((N-(firstcolj)), NB); // last blk has size=1 used for complex to handle A(N,N-1) for (m = 1; m <= rownbm; m++) { vlen = 0; vnb = 0; // for k=0; I compute the fst and then can remove it from the loop colj = (bg-1)*Vblksiz; fst = (rownbm -m)*NB+colj +1; for (k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=k+1; } colj = (bg-1)*Vblksiz; findVTpos(N, NB, Vblksiz, colj, fst, &vpos, &taupos, &tpos, &blkid); //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colj, vpos, taupos); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2); #else magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE); #endif } } } } else if (versionR == 92) { rownbm = plasma_ceildiv((N-1), NB); for (m = 1; m <= rownbm; m++) { ncolinvolvd = min(N-1, m*NB); avai_blksiz=min(Vblksiz, ncolinvolvd); nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz); for (n = 1; n <= nbgr; n++) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz); colst = (n-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -m)*NB+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=vnb+1; } findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2); #else magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE); #endif } } } } } else { printf("ERROR SIDE %d\n", SIDE); } #if defined(USESTREAM) magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); #endif magmablasSetKernelStream( orig_stream ); }
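/* Illustrative call site, not part of the library: a hedged sketch of invoking
 * the bulge-chasing back-transformation for a full eigenvector matrix
 * (WANTZ = 2). It assumes E, V, TAU, T were produced by the preceding
 * reduction and that dV, dT, dE are device buffers sized as in the routine
 * above; copytype = 3 makes the routine upload V, T and E itself. */
static magma_int_t example_zbulge_applyQ(
    magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz,
    magmaDoubleComplex *E, magma_int_t LDE,
    magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T,
    magmaDoubleComplex *dV, magmaDoubleComplex *dT, magmaDoubleComplex *dE )
{
    magma_int_t info;
    magma_zbulge_applyQ( /* WANTZ */ 2, MagmaLeft,
                         NE, N, NB, Vblksiz,
                         E, LDE, V, TAU, T, &info,
                         dV, dT, dE,
                         /* copytype */ 3 );   // upload V, T and E to the GPU
    return info;
}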
extern "C" magma_int_t magma_zgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. This version assumes the computation runs through the NULL stream and therefore is not overlapping some computation with communication. Other versions (magma_zgeqrf_gpu and magma_zgeqrf3_gpu) store the intermediate T matrices. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). 
===================================================================== */ #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaDoubleComplex *dwork; magmaDoubleComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_zmalloc( &dwork, (n)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_zsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_zgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); zpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_zsetmatrix( rows, ib, work_ref(i), ldwork, dA(i,i), ldda ); zq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_zsetmatrix( ib, ib, hwork, ib, dwork, lddwork ); if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); } else { magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); magma_zsetmatrix( ib, ib, work_ref(i), ldwork, dA(i,i), ldda ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); return *info; } /* magma_zgeqrf2_gpu */
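/* Illustrative driver, not part of the library: a minimal sketch of the GPU
 * interface. A host matrix A is copied to the device with ldda rounded up to
 * a multiple of 32 (which also satisfies the divisible-by-16 requirement in
 * the documentation above), factored in place, and copied back. */
static magma_int_t example_zgeqrf2_gpu(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau )
{
    magma_int_t info;
    magma_int_t ldda = ((m + 31)/32)*32;     // padded leading dimension
    magmaDoubleComplex *dA;

    if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n ))
        return MAGMA_ERR_DEVICE_ALLOC;

    magma_zsetmatrix( m, n, A, lda, dA, ldda );      // host -> device
    magma_zgeqrf2_gpu( m, n, dA, ldda, tau, &info ); // factor in place
    magma_zgetmatrix( m, n, dA, ldda, A, lda );      // device -> host

    magma_free( dA );
    return info;
}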
extern "C" magma_int_t magma_zgeqrf(magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) cuDoubleComplex *da, *dwork; cuDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_zgeqrf_nb(min(m, n)); magma_int_t lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_zgeqrf4(num_gpus, m, n, a, lda, tau, work, lwork, info); } if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_zgeqrf_ooc(m, n, a, lda, tau, work, lwork, info); } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); dwork = da + ldda*(n); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ magma_zsetmatrix_async( (m), (n-nb), a_ref(0,nb), lda, da_ref(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i>0){ magma_zgetmatrix_async( (m-i), ib, da_ref(i,i), ldda, a_ref(i,i), lda, stream[1] ); magma_zgetmatrix_async( i, ib, da_ref(0,i), ldda, a_ref(0,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, da_ref(old_i, old_i), ldda, dwork, lddwork, da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); } magma_queue_sync( stream[1] ); magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &ib); zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib); magma_zsetmatrix( rows, ib, a_ref(i,i), lda, da_ref(i,i), ldda ); zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib); if (i + ib < n) { magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork ); if (i+ib < k-nb) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); else magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i!=0) magma_zgetmatrix( m, ib, da_ref(0,i), ldda, a_ref(0,i), lda ); magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqrf */
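/* Illustrative driver, not part of the library: a minimal sketch of the
 * workspace-query protocol documented above. Passing LWORK = -1 returns the
 * optimal workspace size in work[0]; the real factorization follows.
 * "fill_matrix" is a hypothetical placeholder for the caller's data. */
static magma_int_t example_zgeqrf_query( magma_int_t m, magma_int_t n )
{
    cuDoubleComplex *A, *tau, *work, query;
    magma_int_t info, lda = m;
    magma_int_t minmn = (m < n ? m : n);

    magma_zmalloc_pinned( &A,   lda*n );   // pinned memory for performance
    magma_zmalloc_cpu   ( &tau, minmn );

    /* fill_matrix( A, lda, m, n );  -- placeholder for user data */

    /* Workspace query: lwork = -1 only computes the optimal size. */
    magma_int_t lwork = -1;
    magma_zgeqrf( m, n, A, lda, tau, &query, lwork, &info );
    lwork = (magma_int_t) MAGMA_Z_REAL( query );

    magma_zmalloc_pinned( &work, lwork );
    magma_zgeqrf( m, n, A, lda, tau, work, lwork, &info );

    magma_free_pinned( work );
    magma_free_cpu( tau );
    magma_free_pinned( A );
    return info;
}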
extern "C" magma_int_t magma_zunmql2_gpu(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dc, magma_int_t lddc, magmaDoubleComplex *wa, magma_int_t ldwa, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZUNMQL overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'C': Q**H * C C * Q**H where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by ZGEQLF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'C': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQLF in the last k columns of its array argument A. The diagonal and the lower part are destroyed, the reflectors are not modified. LDDA (input) INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'. TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQLF. DC (device input/output) COMPLEX_16 array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. LDDC (input) INTEGER The leading dimension of the array C. LDDC >= max(1,M). WA (input/workspace) COMPLEX_16 array, dimension (LDWA,M) if SIDE = 'L' (LDWA,N) if SIDE = 'R' The vectors which define the elementary reflectors, as returned by ZHETRD_GPU. LDWA (input) INTEGER The leading dimension of the array A. LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; /* Allocate work space on the GPU */ magmaDoubleComplex *dwork; magma_zmalloc( &dwork, 2*(m + 64)*64 ); magma_int_t wa_offset, dc_offset, i__4; magma_int_t i__; magmaDoubleComplex t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, nb, mi, ni, nq, nw; magma_int_t ldwork; int left, notran; wa_offset = 1 + ldwa; wa -= wa_offset; --tau; dc_offset = 1 + lddc; dc -= dc_offset; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = max(1,n); } else { nq = n; nw = max(1,m); } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! 
lapackf77_lsame(trans_, "C")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } // size of the block nb = 64; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } ldwork = nw; /* Use hybrid CPU-GPU code */ if ((left && notran) || (! left && ! notran)) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } magmablas_zsetdiag1subdiag0('U', k, nb, da, ldda); for (i__ = i1; (i3 < 0 ? i__ >= i2 : i__ <= i2); i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ i__4 = nq - k + i__ + ib - 1; lapackf77_zlarft("Backward", "Columnwise", &i__4, &ib, &wa[i__ * ldwa + 1], &ldwa, &tau[i__], t, &ib); if (left) { /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i__ + ib - 1; } else { /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i__ + ib - 1; } /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, t, ib, dwork+i__4*ib, ib ); magma_zlarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, &da[(i__-1) * ldda], ldda, dwork+i__4*ib, ib, &dc[1+lddc], lddc, dwork+i__4*ib + ib*ib, ldwork); } magma_free( dwork ); return *info; } /* magma_zunmql */
/**
    Purpose
    -------
    ZUNMLQ overwrites the general complex M-by-N matrix C with

    @verbatim
                               SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:      Q * C               C * Q
    TRANS = Magma_ConjTrans:   Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(k)**H . . . H(2)**H H(1)**H

    as returned by ZGELQF.
    Q is of order M if SIDE = MagmaLeft
    and  of order N if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:  apply Q or Q**H from the Left;
      -     = MagmaRight: apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX_16 array, dimension
                (LDA,M) if SIDE = MagmaLeft,
                (LDA,N) if SIDE = MagmaRight.
            The i-th row must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGELQF in the first k rows of its array argument A.
            A is modified by the routine but restored on exit.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,K).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGELQF.

    @param[in,out]
    C       COMPLEX_16 array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance
            if SIDE = MagmaLeft,  LWORK >= N*NB;
            if SIDE = MagmaRight, LWORK >= M*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgelqf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmlq(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *C, magma_int_t ldc,
    magmaDoubleComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
#define  A(i_,j_) ( A + (i_) + (j_)*lda)
#define dC(i_,j_) (dC + (i_) + (j_)*lddc)
#define dV(i_,j_) (dV + (i_) + (j_)*ib)
#define dT(i_,j_) (dT + (i_) + (j_)*ib)
#define dwork(i_) (dwork + (i_))

    magmaDoubleComplex *T, *T2;
    magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step;
    magma_int_t iinfo, ldwork, lwkopt;
    magma_int_t left, notran, lquery;
    magma_trans_t transt;

    *info = 0;
    left   = (side == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    /* Test the input arguments */
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,k)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0) {
        nb = magma_get_zgelqf_nb( min( m, n ));
        lwkopt = max(1,nw)*nb;
        work[0] = MAGMA_Z_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = MAGMA_Z_ONE;
        return *info;
    }

    ldwork = nw;

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_zunmlq( lapack_side_const(side), lapack_trans_const(trans),
                          &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
    }
    else {
        /* Use hybrid CPU-GPU code */
        /* Allocate work space on the GPU.
         * nw*nb  for dwork (m or n) by nb
         * nq*nb  for dV    (n or m) by nb
         * nb*nb  for dT
         * lddc*n for dC.
         */
        magma_int_t lddc = ((m+31)/32)*32;
        magmaDoubleComplex_ptr dwork, dV, dT, dC;
        magma_zmalloc( &dwork, (nw + nq + nb)*nb + lddc*n );
        if ( dwork == NULL ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dV = dwork + nw*nb;
        dT = dV    + nq*nb;
        dC = dT    + nb*nb;

        /* work space on CPU.
         * nb*nb for T
         * nb*nb for T2, used to save and restore diagonal block of panel */
        magma_zmalloc_cpu( &T, 2*nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        T2 = T + nb*nb;

        /* Copy matrix C from the CPU to the GPU */
        magma_zsetmatrix( m, n, C, ldc, dC(0,0), lddc );

        if ( (left && notran) || (! left && ! notran) ) {
            i1 = 0;
            i2 = k;
            step = nb;
        } else {
            i1 = ((k - 1) / nb)*nb;
            i2 = 0;
            step = -nb;
        }

        // silence "uninitialized" warnings
        mi = 0;
        ni = 0;

        if (left) {
            ni = n;
            jc = 0;
        } else {
            mi = m;
            ic = 0;
        }

        if (notran) {
            transt = Magma_ConjTrans;
        } else {
            transt = MagmaNoTrans;
        }

        for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) {
            ib = min(nb, k - i);

            /* Form the triangular factor of the block reflector
               H = H(i) H(i + 1) . . . H(i + ib-1) */
            nq_i = nq - i;
            lapackf77_zlarft("Forward", "Rowwise", &nq_i, &ib,
                             A(i,i), &lda, &tau[i], T, &ib);

            /* 1) set upper triangle of panel in A to identity,
               2) copy the panel from A to the GPU, and
               3) restore A                                      */
            zpanel_to_q( MagmaLower, ib, A(i,i), lda, T2 );
            magma_zsetmatrix( ib, nq_i, A(i,i), lda, dV(0,0), ib );
            zq_to_panel( MagmaLower, ib, A(i,i), lda, T2 );

            if (left) {
                /* H or H**H is applied to C(i:m,1:n) */
                mi = m - i;
                ic = i;
            }
            else {
                /* H or H**H is applied to C(1:m,i:n) */
                ni = n - i;
                jc = i;
            }

            /* Apply H or H**H; First copy T to the GPU */
            magma_zsetmatrix( ib, ib, T, ib, dT(0,0), ib );
            magma_zlarfb_gpu( side, transt, MagmaForward, MagmaRowwise,
                              mi, ni, ib,
                              dV(0,0), ib,
                              dT(0,0), ib,
                              dC(ic,jc), lddc,
                              dwork(0), ldwork );
        }
        magma_zgetmatrix( m, n, dC(0,0), lddc, C, ldc );

        magma_free( dwork );
        magma_free_cpu( T );
    }
    work[0] = MAGMA_Z_MAKE( lwkopt, 0 );

    return *info;
} /* magma_zunmlq */
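/* Illustrative call site, not part of the library: a hedged sketch applying
 * Q**H from the left using the workspace-query protocol documented above.
 * It assumes A (k-by-m, lda >= max(1,k)) holds ZGELQF reflectors and C is the
 * m-by-n host matrix to update; magma_zgelqf is the assumed producer of A. */
static magma_int_t example_zunmlq(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *C, magma_int_t ldc )
{
    magma_int_t info;
    magmaDoubleComplex query;

    /* Workspace query: LWORK = -1 returns the optimal size in query. */
    magma_int_t lwork = -1;
    magma_zunmlq( MagmaLeft, Magma_ConjTrans, m, n, k,
                  A, lda, tau, C, ldc, &query, lwork, &info );
    lwork = (magma_int_t) MAGMA_Z_REAL( query );

    magmaDoubleComplex *work;
    magma_zmalloc_cpu( &work, lwork );
    magma_zunmlq( MagmaLeft, Magma_ConjTrans, m, n, k,
                  A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_cpu( work );
    return info;
}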
extern "C" magma_int_t magma_zunmqr_m(magma_int_t nrgpu, char side, char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *c, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ cuDoubleComplex c_one = MAGMA_Z_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; cuDoubleComplex* dw[MagmaMaxGPUs]; cudaStream_t stream [MagmaMaxGPUs][2]; magma_int_t ind_c, kb; magma_int_t i__4; magma_int_t i; cuDoubleComplex t[4160]; /* was [65][64] */ magma_int_t i1, i2, i3, ib, nb, nq, nw; magma_int_t left, notran, lquery; magma_int_t iinfo, lwkopt; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! 
lapackf77_lsame(trans_, "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; MAGMA_Z_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar =nb; magma_int_t lddwork = nw; magma_int_t n_l = (n+nrgpu-1)/nrgpu; // local n n_l = ((n_l+63)/64)*64; if (n_l<256) n_l=256; nrgpu = min(nrgpu, (n+n_l-1)/n_l); // Don't use GPU that will not have data. for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(NULL); if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], (n_l*lddc + 2*lddac*lddar + 2*(nb + 1 + lddwork)*nb))) { magma_xerbla( __func__, -(*info) ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); kb = min(n_l, n-igpu*n_l); magma_zsetmatrix_async( m, kb, C(0, igpu*n_l), ldc, dC(igpu, 0, 0), lddc, stream[igpu][0] ); } if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } kb = min(nb, k-i1); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_zsetmatrix_async( nq-i1, kb, A(i1, i1), lda, dA_c(igpu, 0, i1, 0), lddac, stream[igpu][0] ); } ind_c = 0; for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], t, &ib); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_zsetmatrix_async( ib, ib, t, ib, dt(igpu, ind_c), ib, stream[igpu][ind_c] ); magma_queue_sync( stream[igpu][ind_c] ); // Makes sure that we can change t next iteration. 
} // start the copy of next A panel kb = min(nb, k - i - i3); if (kb > 0 && i+i3 >= 0){ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_zsetmatrix_async( nq-(i+i3), kb, A(i+i3, i+i3), lda, dA_c(igpu, (ind_c+1)%2, i+i3, 0), lddac, stream[igpu][(ind_c+1)%2] ); } } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); // Put 0s in the upper triangular part of dA; magmablas_zsetdiag1subdiag0_stream('L', ib, ib, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][ind_c]); kb = min(n_l, n-igpu*n_l); magmablasSetKernelStream(stream[igpu][ind_c]); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, kb, ib, dA_c(igpu, ind_c, i, 0), lddac, dt(igpu, ind_c), ib, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); } ind_c = (ind_c+1)%2; } //copy C from mgpus for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); magma_queue_sync( stream[igpu][1] ); kb = min(n_l, n-igpu*n_l); //asynchronous copy gives problems sometimes... // magma_zgetmatrix_async( m, kb, // dC(igpu, 0, 0), lddc, // C(0, igpu*n_l), ldc, stream[igpu][0] ); magma_zgetmatrix( m, kb, dC(igpu, 0, 0), lddc, C(0, igpu*n_l), ldc ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); magma_xerbla( __func__, 1 ); return *info; /*if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; i3 < 0 ? i >= i2 : i < i2; i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], t, &ib); // 1) copy the panel from A to the GPU, and // 2) Put 0s in the upper triangular part of dA; magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_zsetdiag1subdiag0('L', ib, ib, dA(i, 0), ldda); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, t, ib, dt, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dt, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } } MAGMA_Z_SET2REAL( work[0], lwkopt ); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); magmablasSetKernelStream(NULL); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_zunmqr */
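/* Illustrative call site, not part of the library: a hedged sketch of the
 * multi-GPU path, applying Q**H from the left (the only side implemented
 * above). nrgpu is taken from magma_num_gpus(), as elsewhere in this file,
 * and LWORK uses the SIDE = 'L' optimum with the routine's NB = 64. */
static magma_int_t example_zunmqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    cuDoubleComplex *a, magma_int_t lda,
    cuDoubleComplex *tau,
    cuDoubleComplex *c, magma_int_t ldc )
{
    magma_int_t info;
    magma_int_t nrgpu = magma_num_gpus();
    magma_int_t lwork = max(1,n) * 64;        // N*NB with NB = 64, see above
    cuDoubleComplex *work;
    magma_zmalloc_pinned( &work, lwork );

    magma_zunmqr_m( nrgpu, 'L', 'T', m, n, k,
                    a, lda, tau, c, ldc, work, lwork, &info );

    magma_free_pinned( work );
    return info;
}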
/** Purpose ------- ZGEQRF2_MGPU computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalesced memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaDoubleComplex **dlA, magma_int_t ldda, magmaDoubleComplex *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaDoubleComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaDoubleComplex *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; magma_queue_t cqueue; magmablasGetKernelStream( &cqueue ); magma_device_t cdevice; magma_getdevice( &cdevice ); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; nb = magma_get_zgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and zlarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think zlarfb work could be smaller, max(n_local[:]). * Oddly, T and zlarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_zmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for zgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total.
*/ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_zgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_zgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); zpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_zsetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if zpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ zq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
*/ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_zsetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_zsetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_zgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for zgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_zgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_zsetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. // check that queues and events are non-zero before destroying them, though. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( stream[dev][0] ) { magma_queue_destroy( stream[dev][0] ); } if ( stream[dev][1] ) { magma_queue_destroy( stream[dev][1] ); } if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); } magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( cdevice ); magmablasSetKernelStream( cqueue ); return *info; } /* magma_zgeqrf2_mgpu */
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] nrgpu INTEGER Number of GPUs to use. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m(magma_int_t nrgpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T; magma_zmalloc_pinned(&T, nb*nb); //printf("calling zunmqr_m with nb=%d\n", (int) nb); magmaDoubleComplex* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_int_t igpu = 0; int gpu_b; magma_getdevice(&gpu_b); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+nrgpu-1)/nrgpu*nb_l; nrgpu = min(nrgpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_zmalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { // copy C to the GPUs for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/nrgpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band_stream( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_zsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } // copy C from the GPUs for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%nrgpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(igpu, 0, i/nrgpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_zgetmatrix_async( m, kb, // dC(igpu, 0, i/nrgpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . .
H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < nrgpu; ++igpu) { magma_setdevice(igpu); magmablasSetKernelStream(NULL); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return *info; } /* magma_zunmqr_m */
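/*
   Illustrative usage sketch (not part of the library): like LAPACK, the routine
   above supports an LWORK = -1 workspace query. Only the query protocol is
   shown; A, tau, and C are assumed set up with the documented dimensions, and
   the wrapper name is made up.
*/
static magma_int_t apply_q_mgpu( magma_int_t nrgpu,
                                 magma_int_t m, magma_int_t n, magma_int_t k,
                                 magmaDoubleComplex *A, magma_int_t lda,
                                 magmaDoubleComplex *tau,
                                 magmaDoubleComplex *C, magma_int_t ldc )
{
    magma_int_t info;
    magmaDoubleComplex query;
    // 1) workspace query: lwork = -1 returns the optimal size in work[0]
    magma_zunmqr_m( nrgpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    if ( info != 0 ) return info;
    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magmaDoubleComplex *work;
    if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork ) )   // pinned for speed
        return MAGMA_ERR_HOST_ALLOC;
    // 2) the real call: C := Q**H * C from the left
    magma_zunmqr_m( nrgpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_pinned( work );
    return info;
}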
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX_16 array A, dimension (LDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] dT COMPLEX_16 array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_zgeqrf_gpu. @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr(magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; magmaDoubleComplex *dA, *dV, *dW; magmaDoubleComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_zmalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magmaDoubleComplex *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block.
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but zungqr is slow for // k smaller than the zungqr's blocking size (new version can be up to 60x faster) lapackf77_zungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_zlacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_zlaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_zlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_zsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_zsetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_zgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zungqr */
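/*
   Illustrative usage sketch (not part of the library): forming Q explicitly is
   a two-step process -- factor with magma_zgeqrf_gpu (defined later in this
   file; it also produces dT) and then expand with magma_zungqr above. The dT
   size follows the formula in magma_zgeqrf_gpu's documentation; the sizes,
   random test data, and helper name are made up for the example.
*/
static magma_int_t form_q_example( magma_int_t m, magma_int_t n )
{
    magma_int_t info = 0, minmn = (m < n ? m : n);
    magma_int_t ldda = ((m + 31)/32)*32;
    magma_int_t nb   = magma_get_zgeqrf_nb( m );
    magmaDoubleComplex *A = NULL, *tau = NULL, *dA = NULL, *dT = NULL;

    if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &A, m*n ) ||
         MAGMA_SUCCESS != magma_zmalloc_cpu( &tau, minmn ) ||
         MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n ) ||
         MAGMA_SUCCESS != magma_zmalloc( &dT, (2*minmn + ((n + 31)/32)*32)*nb ) ) {
        info = MAGMA_ERR_DEVICE_ALLOC;
    }
    else {
        magma_int_t ione = 1, iseed[4] = {0, 0, 0, 1}, size = m*n;
        lapackf77_zlarnv( &ione, iseed, &size, A );          // random test matrix
        magma_zsetmatrix( m, n, A, m, dA, ldda );
        magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );  // R in dA, T factors in dT
        magma_zgetmatrix( m, n, dA, ldda, A, m );            // reflectors back on the CPU
        if ( info == 0 )
            magma_zungqr( m, n, minmn, A, m, tau, dT, nb, &info );  // A := Q (m-by-n)
    }
    magma_free( dA );  magma_free( dT );
    magma_free_pinned( A );  magma_free_cpu( tau );
    return info;
}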
extern "C" magma_int_t magma_zunmqr_gpu_2stages(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *dc, magma_int_t lddc, magmaDoubleComplex *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZUNMQR_GPU overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) COMPLEX_16 array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument DA. DA is modified by the routine but restored on exit. LDDA (input) INTEGER The leading dimension of the array DA. If SIDE = 'L', LDDA >= max(1,M); if SIDE = 'R', LDDA >= max(1,N). DC (input/output) COMPLEX_16 array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDDC (input) INTEGER The leading dimension of the array DC. LDDC >= max(1,M). DT (input) COMPLEX_16 array on the GPU that is the output (the 9th argument) of magma_zgeqrf_gpu. NB (input) INTEGER This is the blocking size that was used in pre-computing DT, e.g., the blocking size used in magma_zgeqrf_gpu. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magmaDoubleComplex *dwork; magma_int_t i1, i2, i3, ib, ic, jc, mi, ni, nq, nw, ret; int left, notran; //magma_int_t lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if ( (!left) && (!lapackf77_lsame(side_, "R")) ) { *info = -1; } else if ( (!notran) && (!lapackf77_lsame(trans_, MagmaConjTransStr)) ) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } if(MAGMA_SUCCESS != magma_zmalloc( &dwork, n*nb )) { printf ("!!!! zungqr_2stage magma_alloc failed for: dwork\n" ); exit(-1); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { return *info; } if ( (left && (! 
notran)) || ( (!left) && notran ) ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for (magma_int_t i=i1; (i3<0 ? i>=i2 : i<i2); i+=i3) { ib = min(nb, k - i); if (left){ mi = m - i; ic = i; } else { ni = n - i; jc = i; } ret = magma_zlarfb_gpu( MagmaLeft, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, da+i+i*ldda, ldda, dT+i*nb, nb, dc+ic+jc*lddc, lddc, dwork, nw); if ( ret != MAGMA_SUCCESS ){ magma_free(dwork); return ret; } } return MAGMA_SUCCESS; } /* End of MAGMA_ZUNMQR_GPU_2stages */
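/*
   Illustrative calling-sequence sketch (not part of the library): given a QR
   factorization whose reflectors are already on the GPU in dA, with the
   matching precomputed dT and block size nb, apply Q**H to an M-by-N matrix dC
   in place. Producing dA/dT is assumed to have happened earlier; the wrapper
   name is made up.
*/
static magma_int_t apply_qh_2stages( magma_int_t m, magma_int_t n, magma_int_t k,
                                     magmaDoubleComplex *dA, magma_int_t ldda,
                                     magmaDoubleComplex *dC, magma_int_t lddc,
                                     magmaDoubleComplex *dT, magma_int_t nb )
{
    magma_int_t info;
    // side = 'L', trans = 'C': dC := Q**H * dC, Q from the first k reflectors
    magma_zunmqr_gpu_2stages( 'L', 'C', m, n, k,
                              dA, ldda, dC, lddc, dT, nb, &info );
    return info;
}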
extern "C" magma_int_t magma_zbulge_applyQ_v2( magma_side_t side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex_ptr dE, magma_int_t ldde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t *info) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki; magma_int_t blkid,vpos,tpos; magma_int_t firstrow, nbcolinvolvd; magma_int_t versionL = 113; magma_int_t versionR = 92; magma_int_t Vchunksiz = 10; *info=0; /* Quick return */ if ( NE == 0 ) { return *info; } if ( N == 0 ) { return *info; } if ( NB == 0 ) { return *info; } /* ========================================== * some infos for developer * Initialisation and checking nb of cores * ==========================================*/ /* we have 2 algo for left (113 114) and 2 algo for right (91 92) * which correspond to versionL versionR. * They are very similar (detail explained in tech report and matlab code) * however version 114 and 92 improve locality. * while version 113 is used in case WNATZ=1 (construct Q2) which allow * the construction to be done in an optimized way taking into * consideration that the matrix is Identity so making less flops. * */ // Initialize streaming and events magma_device_sync(); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_t myevent[2]; cudaEventCreateWithFlags(&myevent[0],cudaEventDisableTiming); cudaEventCreateWithFlags(&myevent[1],cudaEventDisableTiming); // Azzam 21/11/2012 // NOTE THAT dwork was of size 2*NE*Vblksiz+... // but I am thinking why not modifing it to NE*Vblksiz+... // BUT NO because the 2* is used because of making 2 streams working and so // they might be using dwork in parallel magmaDoubleComplex *dwork, *dwork0, *dwork1, *dwvt0, *dwvt1; magmaDoubleComplex *dT0, *dV0, *dT1, *dV1; magma_int_t lddv = ldv; magma_int_t lddt = ldt; magma_int_t lddw = 0; magma_int_t lddwork = ((NE+31)/32)*32; magma_int_t dwVTsiz = lddv*Vblksiz; // lddv*lddv + lddv*lddwork; (v2) // lddv*Vblksiz; (v1,v3) magma_int_t dworksiz = lddwork*Vblksiz; // lddv*Vblksiz; (v2) // NE*Vblksiz=lddwork*Vblksiz; (v1,v3) if (MAGMA_SUCCESS != magma_zmalloc( &dwork, 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } dwork0 = dwork; // size = dworksiz; dwork1 = dwork0 + dworksiz; // size = dworksiz; dwvt0 = dwork + 2*dworksiz; // size = dwVTsiz; dwvt1 = dwvt0 + dwVTsiz; // size = dwVTsiz; dV0 = dwork + 2*dworksiz + 2*dwVTsiz; dT0 = dV0 + Vchunksiz*Vblksiz*lddv; dV1 = dT0 + Vchunksiz*Vblksiz*lddt; dT1 = dV1 + Vchunksiz*Vblksiz*lddv; // make overlapped copy magma_int_t ncpy = 0; magma_int_t copyed=0, copyst=0; magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos; findVTsiz(N, NB, Vblksiz, &blkcnt, ¬hing); flip = 0; // performance loss if the reflector are applied to a big number of eigenvectors (~10000) // => apply the reflectors to blocks of eigenvectors. 
//magma_int_t nr_bl = magma_ceildiv(NE,10000); //nr of blocks magma_int_t sz_bl = NE; //magma_ceildiv(NE,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse the Vs in reverse order, from q_n to q_1. * Also, E is split by rows, meaning each apply consists of a block of rows (horizontal block) */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so traverse the Vs in normal (forward) order, from q_1 to q_n. * Also, E is split by columns, meaning each apply consists of a block of columns (vertical block) */ #ifdef ENABLE_DEBUG printf(" APPLY Q_v22 GPU with N %d, NE %d, NB %d, Vblksiz %d, versionL %d versionR %d SIDE %c \n", N, NE, NB, Vblksiz, versionL, versionR, side); #endif /* * MagmaLeft */ if (side == MagmaLeft) { /* * Version 113: * loop over the block columns (nt) and for each find the * number of tiles (mt) in this block column. Then loop over mt, find * the size of the V's (Vm,Vn) and apply them to the corresponding * portion of E. */ if ( versionL == 113 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=nt-1; blkj >= 0; blkj--) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /* find the number of tiles for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /* loop over the tiles, find the size of the Vs, and apply them */ for (blki=mt; blki > 0; blki--) { /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + (mt-blki)*NB; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } /* calculate the pointers to the Vs and the Ts. * Note that the Vs and Ts have special storage done * by the bulge-chasing function */ //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos); magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid); // COPY Vchunksiz Vs and Vchunksiz Ts to the GPU and store them in dV0/dV1 and dT0/dT1 if (ncpy == 0) { // flip = 1 for this.
copyst = 0; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied if (mysiz > 0) { ncpy = 1; flip = 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; magmablasSetKernelStream(stream[1]); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1, vld, stream[1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1, tld, stream[1]); //printf("doing the first copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); } } if (blkid == copyst) { flip = ncpy % 2; copyst = copyed; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed); if (mysiz > 0) { ncpy = ncpy + 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); magmablasSetKernelStream(stream[1]); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1, vld, stream[1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1, tld, stream[1]); } else { // now I am working on dV1 so copy the next and put it on dV0 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos); magmablasSetKernelStream(stream[0]); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0, vld, stream[0]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0, tld, stream[0]); } } } if ((Vm > 0) && (Vn > 0)) { locpos = blkid%Vchunksiz; magma_int_t lcvpos = locpos*Vblksiz*lddv; magma_int_t lctpos = locpos*Vblksiz*lddt; //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d locvpos %5d loctpos %5d blkid %2d using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip); if (flip == 0) { magmablasSetKernelStream(stream[0]); magma_queue_wait_event( stream[0], myevent[1] ); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); lddw = min(lddwork,sz_bl); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0+lcvpos, lddv, dT0+lctpos, lddt, dE(myrow,i), ldde, dwork0, lddw); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0+lcvpos, lddv, dT0+lctpos, lddt, dE(myrow,i), ldde, dwork0, lddw, dwvt0, lddv); } magma_event_record( myevent[0], stream[0] ); } else { magmablasSetKernelStream(stream[1]); magma_queue_wait_event( stream[1], myevent[0] ); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); lddw = min(lddwork,sz_bl); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1+lcvpos, lddv, dT1+lctpos, lddt, dE(myrow,i), ldde, dwork1, lddw); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1+lcvpos, lddv, dT1+lctpos, lddt, dE(myrow,i), ldde, dwork1, lddw, dwvt1, lddv); } magma_event_record( myevent[1], stream[1] ); } } // end for (Vm &Vn) > 0 } // end for 
blki } // end for blkj } // end if version=113 /* * Version 114: * loop over the block rows (mt) and for each find diagonally the * number of tiles (nt) in this block row. Then loop over nt, find * the size of the V's (Vm,Vn) and apply them to the corresponding * portion of E. */ else { mt = magma_ceildiv((N-1),NB); for (blki = mt; blki > 0; blki--) { /* nbcolinvolvd = number of columns corresponding to this block row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /* find the number of tiles for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /* loop over the tiles, find the size of the Vs, and apply them */ for (blkj = nt-1; blkj >= 0; blkj--) { /* the index of the first row of the first column, meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /* calculate the pointers to the Vs and the Ts. * Note that the Vs and Ts have special storage done * by the bulge-chasing function */ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); //printf("voici blki %d rownbm %d mycol %d coled %d blkid %d vpos %d tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE); } } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end version 114 } // end LEFT /* * MagmaRight */ else { /* * Version 91: */ if ( versionR == 91 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=0; blkj < nt; blkj++) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /* find the number of tiles for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /* loop over the tiles, find the size of the Vs, and apply them */ for (blki=1; blki <= mt; blki++) { /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + (mt-blki)*NB; Vm = min( NB+Vblksiz-1, N-myrow); if ( (blkj == nt-1) && (blki == mt) ) { Vn = min( Vblksiz, Vm ); } else { Vn = min( Vblksiz, Vm-1 ); } mycol = blkj*Vblksiz; if ((Vm > 0) && (Vn > 0)) { /* calculate the pointers to the Vs and the Ts.
* Note that the Vs and Ts have special storage done * by the bulge-chasing function */ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); } // end for (Vm &Vn) > 0 } // end for blki } // end for blkj } // end of version 91 /* * Version 92: */ else { mt = magma_ceildiv((N-1),NB); for (blki = 1; blki <= mt; blki++) { /* nbcolinvolvd = number of columns corresponding to this block row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /* find the number of tiles for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /* loop over the tiles, find the size of the Vs, and apply them */ for (blkj = 0; blkj < nt; blkj++) { /* the index of the first row of the first column, meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /* calculate the pointers to the Vs and the Ts. * Note that the Vs and Ts have special storage done * by the bulge-chasing function */ magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end of version 92 } // end RIGHT magma_device_sync(); magmablasSetKernelStream( orig_stream ); magma_event_destroy( myevent[0] ); magma_event_destroy( myevent[1] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free(dwork); return *info; }
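/*
   Illustrative sketch (not part of the library): the (Vm,Vn) "lozenge" sizes
   that the version-113 loops above compute can be enumerated on the host to
   sanity-check the sweep pattern. Only the index arithmetic from the loops is
   reproduced; magma_ceildiv(a,b) is written out as (a+b-1)/b.
*/
#include <stdio.h>

static void print_lozenges( int N, int NB, int Vblksiz )
{
    int nt = (N-1 + Vblksiz-1) / Vblksiz;               // magma_ceildiv(N-1, Vblksiz)
    for( int blkj = nt-1; blkj >= 0; blkj-- ) {
        int firstrow = blkj * Vblksiz + 1;
        int mt = ( blkj == nt-1 )
               ? (N - firstrow     + NB-1) / NB         // last block column
               : (N - (firstrow+1) + NB-1) / NB;
        for( int blki = mt; blki > 0; blki-- ) {
            int myrow = firstrow + (mt-blki)*NB;
            int Vm = N - myrow;
            if ( Vm > NB + Vblksiz - 1 ) Vm = NB + Vblksiz - 1;
            int Vn = ( blkj == nt-1 && blki == mt )
                   ? ( Vblksiz < Vm   ? Vblksiz : Vm   )
                   : ( Vblksiz < Vm-1 ? Vblksiz : Vm-1 );
            if ( Vm > 0 && Vn > 0 )
                printf( "blkj %2d blki %2d row %4d col %4d Vm %3d Vn %3d\n",
                        blkj, blki, myrow, blkj*Vblksiz, Vm, Vn );
        }
    }
}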
/** Purpose ------- ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalesced memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) COMPLEX_16 array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). It starts with a MIN(M,N)*NB block that stores the triangular T matrices, followed by a MIN(M,N)*NB block that stores the diagonal inverses for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i).
@ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; magmaDoubleComplex *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(magmaDoubleComplex)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_zsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_zgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); zsplit_diag_block(ib, work(i), ldwork, ut); magma_zsetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_zsetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_zsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. 
*/ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_zgeqrf_gpu */
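/*
   Illustrative sketch (not part of the library): dT above is one allocation
   partitioned into three regions, which the dT()/d_ref()/dd_ref() macros
   address. Equivalent pointer helpers, written out for clarity; `icol` is a
   panel's starting column (a multiple of nb), as in the macros, and the
   helper names are made up.
*/
// triangular T factor of the panel starting at column icol
static magmaDoubleComplex* dT_T( magmaDoubleComplex *dT, magma_int_t nb, magma_int_t icol )
    { return dT + icol*nb; }
// inverted diagonal block of R for that panel
static magmaDoubleComplex* dT_Rinv( magmaDoubleComplex *dT, magma_int_t nb, magma_int_t minmn, magma_int_t icol )
    { return dT + (minmn + icol)*nb; }
// trailing ((N+31)/32*32)*NB region used as zlarfb workspace
static magmaDoubleComplex* dT_work( magmaDoubleComplex *dT, magma_int_t nb, magma_int_t minmn )
    { return dT + 2*minmn*nb; }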
extern "C" magma_int_t magma_zgeqrf_ooc(magma_int_t m, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF_OOC computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This is an out-of-core (ooc) version that is similar to magma_zgeqrf but the difference is that this version can use a GPU even if the matrix does not fit into the GPU memory at once. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) magmaDoubleComplex *da, *dwork; magmaDoubleComplex c_one = MAGMA_Z_ONE; int k, lddwork, ldda; *info = 0; int nb = magma_get_zgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Check how much memory do we have */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaDoubleComplex); magma_int_t IB, NB = (magma_int_t)(0.8*freeMem/m); NB = (NB / nb) * nb; if (NB >= n) return magma_zgeqrf(m, n, a, lda, tau, work, lwork, info); k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((NB+31)/32)*32+nb; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &da, (NB + nb)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); // magmablasSetKernelStream(stream[1]); magmaDoubleComplex *ptr = da + ldda * NB; dwork = da + ldda*(NB + nb); /* start the main loop over the blocks that fit in the GPU memory */ for(int i=0; i<n; i+=NB) { IB = min(n-i, NB); //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB); /* 1. Copy the next part of the matrix to the GPU */ magma_zsetmatrix_async( (m), IB, a_ref(0,i), lda, da_ref(0,0), ldda, stream[0] ); magma_queue_sync( stream[0] ); /* 2. Update it with the previous transformations */ for(int j=0; j<min(i,k); j+=nb) { magma_int_t ib = min(k-j, nb); /* Get a panel in ptr. */ // 1. Form the triangular factor of the block reflector // 2. Send it to the GPU. // 3. Put 0s in the upper triangular part of V. // 4. Send V to the GPU in ptr. // 5. Update the matrix. // 6. Restore the upper part of V. magma_int_t rows = m-j; lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(j,j), &lda, tau+j, work, &ib); magma_zsetmatrix_async( ib, ib, work, ib, dwork, lddwork, stream[1] ); zpanel_to_q(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib); magma_zsetmatrix_async( rows, ib, a_ref(j,j), lda, ptr, rows, stream[1] ); magma_queue_sync( stream[1] ); magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, IB, ib, ptr, rows, dwork, lddwork, da_ref(j, 0), ldda, dwork+ib, lddwork); zq_to_panel(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib); } /* 3. Do a QR on the current part */ if (i<k) magma_zgeqrf2_gpu(m-i, IB, da_ref(i,0), ldda, tau+i, info); /* 4. Copy the current part back to the CPU */ magma_zgetmatrix_async( (m), IB, da_ref(0,0), ldda, a_ref(0,i), lda, stream[0] ); } magma_queue_sync( stream[0] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqrf_ooc */
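/*
   Illustrative usage sketch (not part of the library): magma_zgeqrf_ooc has the
   same calling sequence as magma_zgeqrf, so a caller only needs to size WORK by
   the documented N*NB rule. The wrapper name is made up; A and tau are assumed
   allocated with the documented dimensions.
*/
static magma_int_t qr_ooc_example( magma_int_t m, magma_int_t n,
                                   magmaDoubleComplex *A, magma_int_t lda,
                                   magmaDoubleComplex *tau )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_zgeqrf_nb( m < n ? m : n );  // same nb rule as the routine
    magma_int_t lwork = n * nb;                                // LWORK >= N*NB
    magmaDoubleComplex *work;
    if ( MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork ) )  // pinned speeds transfers
        return MAGMA_ERR_HOST_ALLOC;
    magma_zgeqrf_ooc( m, n, A, lda, tau, work, lwork, &info );
    magma_free_pinned( work );
    return info;
}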
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] dA COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. The diagonal and the upper part are destroyed, the reflectors are not modified. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = MagmaLeft; LDDA >= max(1,N) if SIDE = MagmaRight. @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] dC COMPLEX_16 array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q). @param[in] lddc INTEGER The leading dimension of the array C. LDDC >= max(1,M). @param[in] wA (workspace) COMPLEX_16 array, dimension (LDWA,M) if SIDE = MagmaLeft (LDWA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by ZHETRD_GPU. @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,M) if SIDE = MagmaLeft; LDWA >= max(1,N) if SIDE = MagmaRight. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr2_gpu( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dC, magma_int_t lddc, magmaDoubleComplex *wA, magma_int_t ldwa, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define wA(i_,j_) (wA + (i_) + (j_)*ldwa) /* Allocate work space on the GPU */ magmaDoubleComplex_ptr dwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, i__4, lddwork; magmaDoubleComplex T[2*4160] /* was [65][64] */; magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq; int left, notran; wA -= 1 + ldwa; dC -= 1 + lddc; --tau; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; //nw = n; magma_zmalloc( &dwork, (n + 64)*64 ); // TODO after checking args, else memory leak! } else { nq = n; //nw = m; magma_zmalloc( &dwork, (m + 64)*64 ); // TODO after checking args, else memory leak! } if (! left && side != MagmaRight) { *info = -1; } else if (! 
notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } // size of the block nb = 64; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { return *info; } /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; step = nb; } else { i1 = ((k - 1)/nb)*nb + 1; i2 = 1; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } // set nb-1 super-diagonals to 0, and diagonal to 1. // This way we can copy V directly to the GPU, // with the upper triangle parts already set to identity. magmablas_zlaset_band( MagmaUpper, k, k, nb, c_zero, c_one, dA, ldda ); // for i=i1 to i2 by step for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) { ib = min(nb, k - i + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i + 1; lapackf77_zlarft("Forward", "Columnwise", &i__4, &ib, wA(i,i), &ldwa, &tau[i], T, &ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i + 1; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i + 1; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, T, ib, dwork, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i-1,i-1), ldda, dwork, ib, // dA using 0-based indices here dC(ic,jc), lddc, dwork + ib*ib, lddwork); } magma_free( dwork ); return *info; } /* magma_zunmqr2_gpu */
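/*
   Illustrative calling-sequence sketch (not part of the library): the routine
   above takes the reflectors twice -- on the GPU in dA (fed to zlarfb) and on
   the CPU in wA (used by zlarft to form each T factor). Assuming a QR
   factorization whose output has been mirrored to both sides, apply Q**H from
   the left; the wrapper name is made up.
*/
static magma_int_t apply_qh_example( magma_int_t m, magma_int_t n, magma_int_t k,
                                     magmaDoubleComplex_ptr dA, magma_int_t ldda,
                                     magmaDoubleComplex *tau,
                                     magmaDoubleComplex_ptr dC, magma_int_t lddc,
                                     magmaDoubleComplex *wA, magma_int_t ldwa )
{
    magma_int_t info;
    // dC := Q**H * dC; note the routine overwrites dA's upper triangle
    // with identity blocks (magmablas_zlaset_band) before applying.
    magma_zunmqr2_gpu( MagmaLeft, Magma_ConjTrans, m, n, k,
                       dA, ldda, tau, dC, lddc, wA, ldwa, &info );
    return info;
}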
extern "C" magma_int_t magma_zunmql( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *c, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_queue_t queue, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZUNMQL overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'C': Q**H * C C * Q**H where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by ZGEQLF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'C': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX*16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX*16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQLF. C (input/output) COMPLEX*16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ /* Allocate work space on the GPU */ magmaDoubleComplex_ptr dwork, dc; magma_zmalloc( &dc, (m)*(n) ); magma_zmalloc( &dwork, 2*(m + 64)*64 ); /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, c, ldc, dc, 0, m, queue ); //dc -= (1 + m); size_t dc_offset = -(1+m); magma_int_t a_offset, c_dim1, c_offset, i__4; magma_int_t i__; magmaDoubleComplex t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, nb, mi, ni, nq, nw; magma_int_t iinfo, ldwork, lwkopt; int lquery, left, notran; a_offset = 1 + lda; a -= a_offset; --tau; c_dim1 = ldc; c_offset = 1 + c_dim1; c -= c_offset; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = max(1,n); } else { nq = n; nw = max(1,m); } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } if (*info == 0) { if (m == 0 || n == 0) { lwkopt = 1; } else { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = nw * nb; } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); if (lwork < nw && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_zunmql(lapack_const(side), lapack_const(trans), &m, &n, &k, &a[a_offset], &lda, &tau[1], &c[c_offset], &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ((left && notran) || (! left && ! notran)) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } if (left) { ni = n; } else { mi = m; } for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ i__4 = nq - k + i__ + ib - 1; lapackf77_zlarft("Backward", "Columnwise", &i__4, &ib, &a[i__ * lda + 1], &lda, &tau[i__], t, &ib); /* 1) Put 0s in the lower triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ zpanel_to_q(MagmaLower, ib, &a[i__ + i__ * lda], lda, t+ib*ib); magma_zsetmatrix( i__4, ib, &a[1 + i__ * lda], lda, dwork, 0, i__4, queue ); zq_to_panel(MagmaLower, ib, &a[i__ + i__ * lda], lda, t+ib*ib); if (left) { /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i__ + ib - 1; } else { /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i__ + ib - 1; } /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, t, ib, dwork, i__4*ib, ib, queue ); magma_zlarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dwork, 0, i__4, dwork, i__4*ib, ib, dc, dc_offset+(1+m), m, dwork, (i__4*ib + ib*ib), ldwork, queue); } magma_zgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], ldc, queue ); } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); //dc += (1 + m); magma_free( dc ); magma_free( dwork ); return *info; } /* magma_zunmql */
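/*
   Example (illustrative): the LWORK = -1 workspace-query idiom described in
   the documentation above, applied to this magma_zunmql interface (which
   takes a queue). The helper name is hypothetical; error handling is
   omitted for brevity.
*/
#include "magma_v2.h"

void unmql_with_query(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *C, magma_int_t ldc,
    magma_queue_t queue )
{
    magma_int_t info;
    magmaDoubleComplex query;
    /* First call: lwork = -1 only computes the optimal size into query. */
    magma_zunmql( side, trans, m, n, k, A, lda, tau, C, ldc,
                  &query, -1, queue, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_Z_REAL( query );
    magmaDoubleComplex *work;
    magma_zmalloc_cpu( &work, lwork );
    /* Second call: the actual application of Q or Q**H. */
    magma_zunmql( side, trans, m, n, k, A, lda, tau, C, ldc,
                  work, lwork, queue, &info );
    magma_free_cpu( work );
}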
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX_16 array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] dT COMPLEX_16 array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_zgeqrf_gpu. @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dT, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; magma_int_t lddwork; magmaDoubleComplex *dA=NULL, *dV=NULL, *dW=NULL; magmaDoubleComplex *work=NULL; magma_queue_t queue=NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = magma_roundup( m, 32 ); lddwork = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space // n*nb for larfb work // m*nb for V // nb*nb for T lwork = (n + m + nb) * nb; magma_zmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } magmaDoubleComplex *work_T, *work_V; work_T = work + n*nb; work_V = work + n*nb + nb*nb; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; // zungqr requires less workspace (n*nb), but is slow if k < zungqr's block size. 
// replacing it with the 4 routines below is much faster (e.g., 60x). //magma_int_t iinfo; //lapackf77_zungqr( &m_kk, &n_kk, &k_kk, // A(kk, kk), &lda, // &tau[kk], work, &lwork, &iinfo ); lapackf77_zlacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk); lapackf77_zlaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, work_V, &m_kk, &tau[kk], work_T, &k_kk); lapackf77_zlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk ); if (kk > 0) { magma_zsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda, queue ); // Set A(1:kk,kk+1:n) to zero. magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda, queue ); } } if (kk > 0) { // Use blocked code // queue: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to dV on the GPU mi = m - i; lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_zsetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, queue ); // set panel to identity magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda, queue ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda, queue ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork, queue ); } } // copy result back to CPU magma_zgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda, queue ); } cleanup: magma_queue_destroy( queue ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_zungqr */
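/*
   Example (illustrative): forming the explicit m-by-n Q with magma_zungqr.
   Assumes A already holds the reflectors returned by ZGEQRF_GPU in its
   first k columns, dT holds the matching T factors on the device, and nb is
   the factorization's block size. The helper name is hypothetical.
*/
#include "magma_v2.h"

magma_int_t form_q(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dT )
{
    magma_int_t info;
    magma_int_t nb = magma_get_zgeqrf_nb( m < n ? m : n );
    magma_zungqr( m, n, k, A, lda, tau, dT, nb, &info );
    /* On success, columns 0..n-1 of A are the orthonormal columns of Q. */
    return info;
}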
extern "C" magma_int_t magma_zunmqr(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, cuDoubleComplex *A, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *C, magma_int_t ldc, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. C (input/output) COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define A(a_1,a_2) ( A + (a_1) + (a_2)*lda) #define dC(a_1,a_2) (dC + (a_1) + (a_2)*lddc) magma_int_t nb = magma_get_zgeqrf_nb( min( m, n )); cuDoubleComplex c_one = MAGMA_Z_ONE; char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t nq_i, lddwork; magma_int_t i; cuDoubleComplex T[ 2*nb*nb ]; magma_int_t i1, i2, step, ib, ic, jc, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } lwkopt = max(1,nw) * nb; work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! 
notran && ! lapackf77_lsame(trans_, MagmaConjTransStr)) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } /* Allocate work space on the GPU */ magma_int_t lddc = m; cuDoubleComplex *dwork, *dC; magma_zmalloc( &dC, lddc*n ); magma_zmalloc( &dwork, (m + n + nb)*nb ); /* Copy matrix C from the CPU to the GPU */ magma_zsetmatrix( m, n, C, ldc, dC, lddc ); if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, &tau[1], C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( (left && (! notran)) || ((! left) && notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } for( i=i1; (step<0 ? i>=i2 : i<i2); i += step ) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ nq_i = nq - i; lapackf77_zlarft("F", "C", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ zpanel_to_q('U', ib, A(i,i), lda, T+ib*ib); magma_zsetmatrix( nq_i, ib, A(i,i), lda, dwork, nq_i ); zq_to_panel('U', ib, A(i,i), lda, T+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_zsetmatrix( ib, ib, T, ib, dwork+nq_i*ib, ib ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, nq_i, dwork+nq_i*ib, ib, dC(ic,jc), lddc, dwork+nq_i*ib + ib*ib, lddwork); } magma_zgetmatrix( m, n, dC, lddc, C, ldc ); } work[0] = MAGMA_Z_MAKE( lwkopt, 0 ); magma_free( dC ); magma_free( dwork ); return *info; } /* magma_zunmqr */
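/*
   Example (illustrative): sizing the host workspace for the CPU-interface
   magma_zunmqr above with the documented rule LWORK >= N*NB for SIDE = 'L'.
   The helper name is hypothetical; magma_zmalloc_cpu accepting
   cuDoubleComplex** relies on the magmaDoubleComplex typedef of this
   MAGMA version.
*/
#include "magma_v2.h"

magma_int_t qh_times_c(
    magma_int_t m, magma_int_t n, magma_int_t k,
    cuDoubleComplex *A, magma_int_t lda,
    cuDoubleComplex *tau,
    cuDoubleComplex *C, magma_int_t ldc )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_zgeqrf_nb( m < n ? m : n );
    magma_int_t lwork = (n > 1 ? n : 1) * nb;   /* SIDE = 'L' */
    cuDoubleComplex *work;
    magma_zmalloc_cpu( &work, lwork );
    magma_zunmqr( 'L', 'C', m, n, k, A, lda, tau, C, ldc,
                  work, lwork, &info );   /* overwrites C with Q**H * C */
    magma_free_cpu( work );
    return info;
}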
void MAGMA_ZLARFB_GPU(
    char *side, char *trans, char *direct, char *storev,
    magma_int_t *m, magma_int_t *n, magma_int_t *k,
    double2 *dv, magma_int_t *ldv,
    double2 *dt, magma_int_t *ldt,
    double2 *dc, magma_int_t *ldc,
    double2 *dwork, magma_int_t *ldwork )
{
    magma_zlarfb_gpu( side[0], trans[0], direct[0], storev[0],
                      *m, *n, *k,
                      dv, *ldv, dt, *ldt, dc, *ldc,
                      dwork, *ldwork );
}
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX_16 array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] T COMPLEX_16 array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_zgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr_m( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t i, ib, ki, kk, iinfo; magmaDoubleComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = ((m + 31) / 32) * 32; magma_int_t lddwork = ((n + 31) / 32) * 32; magma_int_t min_lblocks = (n / nb) / ngpu; // min. 
blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magmaDoubleComplex *dA[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dT[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dV[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t stream[ MagmaMaxGPUs ] = { NULL }; for( int d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_zmalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( &stream[d] ); } trace_init( 1, ngpu, 1, stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for zungqr workspace lwork = n * nb; magma_zmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; dpanel = (kk / nb) % ngpu; di = ((kk / nb) / ngpu) * nb; magma_setdevice( dpanel ); lapackf77_zungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); magma_zsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(dpanel, kk, di), ldda ); // Set A(1:kk,kk+1:n) to zero. 
magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(dpanel, 0, di), ldda ); trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_zsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] ); trace_gpu_end( d, 0 ); } // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to the GPUs lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_zsetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, stream[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); magmablasSetKernelStream( stream[dpanel] ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( stream[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork ); trace_gpu_end( d, 0 ); } } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_zgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb ); trace_cpu_end( 0 ); #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "zungqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif CLEANUP: for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( stream[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zungqr */
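/*
   Example (illustrative, not from the MAGMA sources): the 1-D block-cyclic
   column layout used by magma_zungqr_m above. A block of nb columns starting
   at global column i lives on device (i/nb) % ngpu at local column offset
   ((i/nb) / ngpu) * nb, matching the dpanel/di computation in the routine.
*/
#include <stdio.h>

static void owner_of_block( int i, int nb, int ngpu, int *dev, int *di )
{
    int blk = i / nb;           /* global block index         */
    *dev = blk % ngpu;          /* device that owns the block */
    *di  = (blk / ngpu) * nb;   /* local column offset there  */
}

int main( void )
{
    /* Reproduces the layout in the routine's comment (n = 75, nb = 10,
       ngpu = 3): gpu 0 gets cols 0-9, 30-39, 60-69; gpu 1 gets 10-19,
       40-49, 70-74 (partial); gpu 2 gets 20-29, 50-59. */
    int dev, di;
    for (int i = 0; i < 75; i += 10) {
        owner_of_block( i, 10, 3, &dev, &di );
        int last = (i + 9 < 75 ? i + 9 : 74);
        printf( "cols %2d-%2d -> gpu %d, local col %2d\n", i, last, dev, di );
    }
    return 0;
}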
extern "C" magma_int_t magma_zgeqrf2(magma_context *cntxt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.5.0-beta3) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date July 2014 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= CNTXT (input) MAGMA_CONTEXT CNTXT specifies the MAGMA hardware context for this routine. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using cudaMallocHost. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -8, the GPU memory allocation failed Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) int cnt=-1; cuDoubleComplex c_one = MAGMA_Z_ONE; int i, k, lddwork, old_i, old_ib; int nbmin, nx, ib, ldda; *info = 0; magma_qr_params *qr_params = (magma_qr_params *)cntxt->params; int nb = qr_params->nb; int lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); long int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) return MAGMA_SUCCESS; k = min(m,n); if (k == 0) { work[0] = c_one; return MAGMA_SUCCESS; } cublasStatus status; static cudaStream_t stream[2]; cudaStreamCreate(&stream[0]); cudaStreamCreate(&stream[1]); nbmin = 2; nx = nb; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; cuDoubleComplex *da; status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da); if (status != CUBLAS_STATUS_SUCCESS) { *info = -8; return 0; } cuDoubleComplex *dwork = da + ldda*(n); if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex), a_ref(0,nb), lda *sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*(m), (n-nb), cudaMemcpyHostToDevice,stream[0]); old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); if (i>0){ cudaMemcpy2DAsync( a_ref(i,i), lda *sizeof(cuDoubleComplex), da_ref(i,i), ldda*sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*(m-i), ib, cudaMemcpyDeviceToHost,stream[1]); cudaMemcpy2DAsync( a_ref(0,i), lda *sizeof(cuDoubleComplex), da_ref(0,i), ldda*sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*i, ib, cudaMemcpyDeviceToHost,stream[0]); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, da_ref(old_i, old_i), ldda, dwork, lddwork, da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); } cudaStreamSynchronize(stream[1]); int rows = m-i; cnt++; cntxt->nb = qr_params->ib; magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); cntxt->nb = nb; /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib); if (cnt < qr_params->np_gpu) { qr_params->p[cnt]=a; } zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb); cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex), a_ref(i,i), lda, da_ref(i,i), ldda); if (qr_params->flag == 1) zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb); if (i + ib < n) { cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), qr_params->t+cnt*nb*nb, ib, dwork, lddwork); if (i+ib < k-nx) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); else magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i!=0) cublasGetMatrix(m, ib, sizeof(cuDoubleComplex), da_ref(0,i), ldda, a_ref(0,i), lda); int rows = m-i; cnt++; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); if (cnt < qr_params->np_gpu) { int ib2=min(ib,nb); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib2, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib2); qr_params->p[cnt]=a; } } cudaStreamDestroy( stream[0] ); cudaStreamDestroy( stream[1] ); cublasFree(da); return MAGMA_SUCCESS; } /* magma_zgeqrf */
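/*
   Example (illustrative): the pitched asynchronous copy pattern that
   magma_zgeqrf2 above uses to move panels between the host matrix a (leading
   dimension lda) and the device matrix da (leading dimension ldda). Pitches
   and widths are in bytes; the height counts columns, since both matrices
   are column-major. The helper name is hypothetical.
*/
#include <cuda_runtime.h>
#include <cuComplex.h>

/* Asynchronously copy the (m-i)-by-ib panel at position (i,i) from the
   device matrix to the host matrix on the given stream. */
static void get_panel_async(
    cuDoubleComplex *a, int lda,          /* host,   column-major */
    const cuDoubleComplex *da, int ldda,  /* device, column-major */
    int i, int ib, int m, cudaStream_t stream )
{
    cudaMemcpy2DAsync(
        a  + i + (size_t)i * lda,  lda  * sizeof(cuDoubleComplex),
        da + i + (size_t)i * ldda, ldda * sizeof(cuDoubleComplex),
        (size_t)(m - i) * sizeof(cuDoubleComplex),  /* width: rows in bytes */
        ib,                                         /* height: columns      */
        cudaMemcpyDeviceToHost, stream );
}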
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlarfb_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double error, work[1]; // test all combinations of input parameters const char side[] = { MagmaLeft, MagmaRight }; const char trans[] = { MagmaConjTrans, MagmaNoTrans }; const char direct[] = { MagmaForward, MagmaBackward }; const char storev[] = { MagmaColumnwise, MagmaRowwise }; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { M = opts.msize[i]; N = opts.nsize[i]; K = opts.ksize[i]; if ( M < K || N < K || K <= 0 ) { printf( "skipping M %d, N %d, K %d; requires M >= K, N >= K, K >= 0.\n", (int) M, (int) N, (int) K ); continue; } for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { ldc = ((M+31)/32)*32; ldt = ((K+31)/32)*32; ldw = (side[iside] == MagmaLeft ? N : M); // (ldv, nv) get swapped later if rowwise ldv = (side[iside] == MagmaLeft ? M : N); nv = K; // Allocate memory for matrices magmaDoubleComplex *C, *R, *V, *T, *W; TESTING_MALLOC( C, magmaDoubleComplex, ldc*N ); TESTING_MALLOC( R, magmaDoubleComplex, ldc*N ); TESTING_MALLOC( V, magmaDoubleComplex, ldv*K ); TESTING_MALLOC( T, magmaDoubleComplex, ldt*K ); TESTING_MALLOC( W, magmaDoubleComplex, ldw*K ); magmaDoubleComplex *dC, *dV, *dT, *dW; TESTING_DEVALLOC( dC, magmaDoubleComplex, ldc*N ); TESTING_DEVALLOC( dV, magmaDoubleComplex, ldv*K ); TESTING_DEVALLOC( dT, magmaDoubleComplex, ldt*K ); TESTING_DEVALLOC( dW, magmaDoubleComplex, ldw*K ); // C is M x N. size = ldc*N; lapackf77_zlarnv( &ione, ISEED, &size, C ); //printf( "C=" ); magma_zprint( M, N, C, ldc ); // V is ldv x nv. See larfb docs for description. 
// if column-wise and left, M x K // if column-wise and right, N x K // if row-wise and left, K x M // if row-wise and right, K x N size = ldv*nv; lapackf77_zlarnv( &ione, ISEED, &size, V ); if ( storev[istor] == MagmaColumnwise ) { if ( direct[idir] == MagmaForward ) { lapackf77_zlaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_zlaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( direct[idir] == MagmaForward ) { lapackf77_zlaset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv ); } else { lapackf77_zlaset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_zprint( ldv, nv, V, ldv ); // T is K x K, upper triangular for forward, and lower triangular for backward magma_int_t k1 = K-1; size = ldt*K; lapackf77_zlarnv( &ione, ISEED, &size, T ); if ( direct[idir] == MagmaForward ) { lapackf77_zlaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_zlaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_zprint( K, K, T, ldt ); magma_zsetmatrix( M, N, C, ldc, dC, ldc ); magma_zsetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_zsetmatrix( K, K, T, ldt, dT, ldt ); lapackf77_zlarfb( &side[iside], &trans[itran], &direct[idir], &storev[istor], &M, &N, &K, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_zprint( M, N, C, ldc ); magma_zlarfb_gpu( side[iside], trans[itran], direct[idir], storev[istor], M, N, K, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_zgetmatrix( M, N, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_zprint( M, N, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_zlange( "Fro", &M, &N, C, &ldc, work ); size = ldc*N; blasf77_zaxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_zlange( "Fro", &M, &N, R, &ldc, work ) / error; printf( "%5d %5d %5d %c %c %c %c %8.2e\n", (int) M, (int) N, (int) K, storev[istor], side[iside], direct[idir], trans[itran], error ); TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( V ); TESTING_FREE( T ); TESTING_FREE( W ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dV ); TESTING_DEVFREE( dT ); TESTING_DEVFREE( dW ); }}}} printf( "\n" ); } TESTING_FINALIZE(); return 0; }
/** Purpose ------- ZUNMQR overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX_16 array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF. @param[in,out] C COMPLEX_16 array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *C, magma_int_t ldc, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) /* Constants */ magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; /* Local variables */ const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); magma_int_t nb = 128; magmaDoubleComplex *T = NULL; magmaDoubleComplex_ptr dw[MagmaMaxGPUs] = { NULL }; magma_queue_t queues[MagmaMaxGPUs][2] = {{ NULL }}; magma_event_t events[MagmaMaxGPUs][2] = {{ NULL }}; magma_int_t ind_c; magma_device_t dev; magma_device_t orig_dev; magma_getdevice( &orig_dev ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = magma_zmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_zunmqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = magma_roundup( m, 64 ); // TODO why 64 instead of 32 ? magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = magma_ceildiv( n, nb_l ); // number of blocks magma_int_t maxnlocal = magma_ceildiv( nbl, ngpu )*nb_l; ngpu = min( ngpu, magma_ceildiv( n, nb_l )); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) if (MAGMA_SUCCESS != magma_zmalloc_pinned( &T, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dw[dev], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &events[dev][0] ); magma_event_create( &events[dev][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(dev, 0, i/ngpu*nb_l), lddc, queues[dev][0] ); nlocal[dev] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_sync( events[dev][ind_c] ); // check if the new data can be copied magma_zsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); // set upper triangular part of dA to identity magmablas_zlaset_band( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(dev, ind_c, i, 0), lddac, queues[dev][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_zlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_zsetmatrix_async(kb, kb, T, kb, dT(dev, ind_c), kb, queues[dev][0] ); } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); // check if the data was copied magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[dev], kb, dA_c(dev, ind_c, i, 0), lddac, dT(dev, ind_c), kb, dC(dev, i, 0), lddc, dwork(dev, ind_c), lddwork, queues[dev][1] ); magma_event_record(events[dev][ind_c], queues[dev][1] ); } ind_c = (ind_c+1)%2; } for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_queue_sync( queues[dev][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { dev = i % ngpu; magma_setdevice( dev ); magma_int_t kb = min(nb_l, n-i*nb_l); magma_zgetmatrix( m, kb, dC(dev, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc, queues[dev][1] ); // magma_zgetmatrix_async( m, kb, // dC(dev, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, queues[dev][0] ); } } else { *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); goto cleanup; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_zlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_zsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda, queues[dev][1] ); magmablas_zlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda, queues[dev][1] ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_zsetmatrix( ib, ib, T, ib, dT, ib, queues[dev][1] ); magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork, queues[dev][1] ); } */ } cleanup: work[0] = magma_zmake_lwork( lwkopt ); for (dev = 0; dev < ngpu; ++dev) { magma_setdevice( dev ); magma_event_destroy( events[dev][0] ); magma_event_destroy( events[dev][1] ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_free( dw[dev] ); } magma_setdevice( orig_dev ); magma_free_pinned( T ); return *info; } /* magma_zunmqr */
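/*
   Example (illustrative): the per-GPU device workspace budget that
   magma_zunmqr_m allocates above, written out for the SIDE = MagmaLeft case
   with the routine's own constants (nb = 128, nb_l = 256, lddc = m rounded
   up to 64, lddac = m, lddar = nb, lddwork = n). The function names are
   hypothetical.
*/
#include <stdio.h>

static long roundup64( long x ) { return ((x + 63) / 64) * 64; }

static long zunmqr_m_ldw( long m, long n, long ngpu )
{
    long nb = 128, nb_l = 256;
    long lddc      = roundup64( m );
    long lddac     = m, lddar = nb, lddwork = n;
    long nbl       = (n + nb_l - 1) / nb_l;            /* # column blocks */
    long maxnlocal = ((nbl + ngpu - 1) / ngpu) * nb_l;
    return maxnlocal * lddc                /* dC: this GPU's slice of C  */
         + 2 * lddac * lddar               /* double-buffered A panels   */
         + 2 * (nb + 1 + lddwork) * nb;    /* two (dT + dwork) sets      */
}

int main( void )
{
    printf( "elements per GPU for m = n = 10000, ngpu = 2: %ld\n",
            zunmqr_m_ldw( 10000, 10000, 2 ) );
    return 0;
}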
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA COMPLEX_16 array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] dT (workspace) COMPLEX_16 work space array on the GPU, dimension (2*MIN(M, N) + ceil(N/32)*32 )*NB. This must be the 6th argument of magma_zgeqrf_gpu [ note that if N here is bigger than N in magma_zgeqrf_gpu, the workspace requirement DT in magma_zgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr_gpu( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dT, magma_int_t nb, magma_int_t *info) { #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; magmaDoubleComplex_ptr dV, dW; magmaDoubleComplex *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for zungqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_zmalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_zmalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // roundup(n,32) * nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_zgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk, queue ); lapackf77_zungqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_zsetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda, queue ); // Set A(1:kk,kk+1:n) to zero. magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda, queue ); } if (kk > 0) { // Use blocked code // queue: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_zcopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, queue ); // set panel to identity magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda, queue ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda, queue ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork, queue ); } } } magma_queue_sync( queue ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( queue ); return *info; } /* magma_zungqr_gpu */
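/*
   Example (illustrative): a QR factorization followed by forming Q in place
   on the GPU. The dT sizing follows the documentation above,
   (2*min(M,N) + ceil(N/32)*32) * NB elements. magma_zgeqrf_gpu is assumed
   to take dT as its 6th argument, per the remark in the docs; the helper
   name is hypothetical.
*/
#include "magma_v2.h"

magma_int_t qr_and_form_q_gpu(
    magma_int_t m, magma_int_t n,       /* m >= n              */
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex *tau )           /* length min(m,n) = n */
{
    magma_int_t info;
    magma_int_t nb  = magma_get_zgeqrf_nb( n );
    magma_int_t ldt = 2*n + magma_roundup( n, 32 );
    magmaDoubleComplex_ptr dT;
    if (MAGMA_SUCCESS != magma_zmalloc( &dT, ldt * nb ))
        return MAGMA_ERR_DEVICE_ALLOC;

    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
    if (info == 0)
        magma_zungqr_gpu( m, n, n, dA, ldda, tau, dT, nb, &info );

    magma_free( dT );
    return info;  /* on success, dA holds the m-by-n Q */
}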
/** Purpose: --------- ZUNGLQ generates an M-by-N complex matrix Q with orthonormal rows, which is defined as the first M rows of a product of K elementary reflectors of order N Q = H(k)**H . . . H(2)**H H(1)**H as returned by ZGELQF. Arguments: --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. N >= M. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. M >= K >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the i-th row must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGELQF in the first k rows of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGELQF. @param[out] work COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= NB*NB, where NB is the optimal blocksize. If LWORK = -1, a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit; - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgelqf_comp ********************************************************************/ extern "C" magma_int_t magma_zunglq( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define tau(i_) (tau + (i_)) // Constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; // Local variables bool lquery; magma_int_t i, ib, ki, ldda, lddwork, lwkopt, mib, nb, n_i; magma_queue_t queue = NULL; magmaDoubleComplex_ptr dA = NULL; magmaDoubleComplex* work2 = NULL; // Test the input arguments *info = 0; nb = magma_get_zgelqf_nb( m, n ); lwkopt = nb*nb; work[0] = magma_zmake_lwork( lwkopt ); lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0 || n < m) { *info = -2; } else if (k < 0 || k > m) { *info = -3; } else if (lda < max( 1, m )) { *info = -5; } else if (lwork < max( 1, lwkopt ) && ! 
lquery) { *info = -8; //printf( "m %d, n %d, nb %d: lwork %d, required %d\n", m, n, nb, lwork, lwkopt ); //*info = 0; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } // Quick return if possible if (m <= 0) { work[0] = c_one; return *info; } //if (lwork < lwkopt) { // magma_zmalloc_cpu( &work2, lwkopt ); //} //else { // work2 = work; //} work2 = work; // Allocate GPU work space // ldda*n for matrix dA // nb*n for dV // lddwork*nb for dW larfb workspace ldda = magma_roundup( m, 32 ); lddwork = magma_roundup( m, 32 ); if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n + n*nb + lddwork*nb + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magmaDoubleComplex_ptr dV; dV = dA + ldda*n; magmaDoubleComplex_ptr dW; dW = dA + ldda*n + n*nb; magmaDoubleComplex_ptr dT; dT = dA + ldda*n + n*nb + lddwork*nb; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magmablas_zlaset( MagmaFull, m, n, MAGMA_Z_NAN, MAGMA_Z_NAN, dA, ldda, queue ); // all columns are handled by blocked method. // ki is start of last (partial) block ki = ((k - 1) / nb) * nb; // Use blocked code for( i=ki; i >= 0; i -= nb ) { ib = min( nb, k-i ); // first block has extra rows to update mib = ib; if ( i == ki ) { mib = m - i; } // Send current panel of V (block row) to the GPU lapackf77_zlaset( "Lower", &ib, &ib, &c_zero, &c_one, A(i,i), &lda ); // TODO: having this _async was causing numerical errors. Why? magma_zsetmatrix( ib, n-i, A(i,i), lda, dV, nb, queue ); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) n_i = n - i; lapackf77_zlarft( MagmaForwardStr, MagmaRowwiseStr, &n_i, &ib, A(i,i), &lda, &tau[i], work2, &nb ); magma_zsetmatrix_async( ib, ib, work2, nb, dT, nb, queue ); // set panel of A (block row) to identity magmablas_zlaset( MagmaFull, mib, i, c_zero, c_zero, dA(i,0), ldda, queue ); magmablas_zlaset( MagmaFull, mib, n-i, c_zero, c_one, dA(i,i), ldda, queue ); if (i < m) { // Apply H**H to A(i:m,i:n) from the right magma_zlarfb_gpu( MagmaRight, MagmaConjTrans, MagmaForward, MagmaRowwise, m-i, n-i, ib, dV, nb, dT, nb, dA(i,i), ldda, dW, lddwork, queue ); } } // copy result back to CPU magma_zgetmatrix( m, n, dA(0,0), ldda, A(0,0), lda, queue ); cleanup: magma_queue_destroy( queue ); magma_free( dA ); //if (work2 != work) { // magma_free_cpu( work2 ); //} work[0] = magma_zmake_lwork( lwkopt ); return *info; }
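/*
   Example (illustrative): an LQ factorization followed by forming the
   explicit rows of Q with magma_zunglq above. The LWORK >= NB*NB rule for
   zunglq comes from the documentation; magma_zgelqf's interface and its own
   workspace requirement (typically M*NB) are assumptions, and the helper
   name is hypothetical.
*/
#include "magma_v2.h"

magma_int_t lq_and_form_q(
    magma_int_t m, magma_int_t n,        /* n >= m          */
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,             /* length min(m,n) */
    magmaDoubleComplex *work, magma_int_t lwork )
{
    magma_int_t info;
    magma_zgelqf( m, n, A, lda, tau, work, lwork, &info );
    if (info == 0) {
        magma_int_t nb = magma_get_zgelqf_nb( m, n );
        if (lwork >= nb*nb)
            magma_zunglq( m, n, m, A, lda, tau, work, lwork, &info );
        else
            info = -8;  /* lwork too small for zunglq */
    }
    return info;  /* on success, A holds the m-by-n Q with orthonormal rows */
}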
/** Purpose ------- ZUNMQL overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by ZGEQLF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQLF in the last k columns of its array argument dA. The diagonal and the lower part are destroyed, the reflectors are not modified. @param[in] ldda INTEGER The leading dimension of the array dA. If SIDE = MagmaLeft, LDDA >= max(1,M); if SIDE = MagmaRight, LDDA >= max(1,N). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQLF. @param[in,out] dC COMPLEX_16 array on the GPU, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q). @param[in] lddc INTEGER The leading dimension of the array dC. LDDC >= max(1,M). @param[in] wA COMPLEX_16 array, dimension (LDWA,M) if SIDE = MagmaLeft (LDWA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by ZHETRD_GPU. (A copy of the upper or lower part of dA, on the host.) @param[in] ldwa INTEGER The leading dimension of the array wA. If SIDE = MagmaLeft, LDWA >= max(1,M); if SIDE = MagmaRight, LDWA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_zunmql2_gpu( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex_ptr dC, magma_int_t lddc, const magmaDoubleComplex *wA, magma_int_t ldwa, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define wA(i_,j_) (wA + (i_) + (j_)*ldwa) /* Constants */ const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magma_int_t nbmax = 64; /* Local variables */ magmaDoubleComplex_ptr dwork = NULL, dT = NULL; magmaDoubleComplex T[ nbmax*nbmax ]; magma_int_t i, i1, i2, step, ib, lddwork, nb, mi, ni, nq, nq_i, nw; magma_queue_t queue = NULL; // Parameter adjustments for Fortran indexing wA -= 1 + ldwa; dC -= 1 + lddc; --tau; *info = 0; bool left = (side == MagmaLeft); bool notran = (trans == MagmaNoTrans); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! 
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    // size of the block
    nb = nbmax;
    lddwork = nw;

    /* Use hybrid CPU-GPU code */
    if ( ( left && notran) || (! left && ! notran) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
    } else {
        mi = m;
    }

    // dwork is (n or m) x nb + nb x nb, for left or right respectively
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, lddwork*nb + nb*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        goto cleanup;
    }
    dT = dwork + lddwork*nb;

    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    // In the bottom k x k portion of dA,
    // set the nb-1 sub-diagonals to 0, and the diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the lower triangle parts already set to identity.
    // A is nq x k, either m x k (left) or n x k (right).
    magmablas_zlaset_band( MagmaLower, k, k, nb, c_zero, c_one, dA(nq-k,0), ldda, queue );

    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min( nb, k - i + 1 );

        /* Form the triangular factor of the block reflector
           H = H(i+ib-1) . . . H(i+1) H(i) */
        nq_i = nq - k + i + ib - 1;
        lapackf77_zlarft( "Backward", "Columnwise", &nq_i, &ib,
                          wA(1,i), &ldwa, &tau[i], T, &ib );

        if (left) {
            /* H or H**H is applied to C(1:m-k+i+ib-1,1:n) */
            mi = m - k + i + ib - 1;
        } else {
            /* H or H**H is applied to C(1:m,1:n-k+i+ib-1) */
            ni = n - k + i + ib - 1;
        }

        /* Apply H or H**H; first copy T to the GPU */
        magma_zsetmatrix( ib, ib, T, ib, dT, ib, queue );
        magma_zlarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise,
                          mi, ni, ib,
                          dA(0,i-1), ldda, dT, ib,   // dA uses 0-based indices here
                          dC(1,1), lddc,
                          dwork, lddwork, queue );
    }

cleanup:
    magma_queue_destroy( queue );
    magma_free( dwork );

    return *info;
}   /* magma_zunmql2_gpu */
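/*
    Illustrative usage sketch (not part of the library source): applying
    Q**H from a QL factorization to an m-by-n matrix C on the GPU with
    magma_zunmql2_gpu, using the v2-style queue API seen above. A and tau
    are assumed to hold the reflectors from a prior magma_zgeqlf of an
    m-by-k matrix; A doubles as the host copy wA that the routine needs for
    the CPU-side zlarft. The helper name is hypothetical and error checking
    is elided.
*/
static void example_zunmql2( magma_int_t m, magma_int_t n, magma_int_t k,
                             magmaDoubleComplex *A,   magma_int_t lda,
                             magmaDoubleComplex *tau,
                             magmaDoubleComplex *C,   magma_int_t ldc )
{
    magmaDoubleComplex_ptr dA, dC;
    magma_int_t ldda = magma_roundup( m, 32 );
    magma_int_t lddc = magma_roundup( m, 32 );
    magma_int_t info;
    magma_queue_t queue;
    magma_device_t cdev;

    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_zmalloc( &dA, ldda*k );
    magma_zmalloc( &dC, lddc*n );
    magma_zsetmatrix( m, k, A, lda, dA, ldda, queue );   // reflectors to the GPU
    magma_zsetmatrix( m, n, C, ldc, dC, lddc, queue );   // right-hand sides to the GPU

    // Overwrite dC with Q**H * C; A serves as the host copy wA of the reflectors
    magma_zunmql2_gpu( MagmaLeft, Magma_ConjTrans, m, n, k,
                       dA, ldda, tau, dC, lddc, A, lda, &info );

    magma_zgetmatrix( m, n, dC, lddc, C, ldc, queue );   // result back to the host

    magma_queue_destroy( queue );
    magma_free( dA );
    magma_free( dC );
}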
extern "C" magma_int_t magma_zgeqrf(magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) magmaDoubleComplex *dA, *dwork, *dT; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_zgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! 
        *info = -7;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    // largest N for larfb is n-nb (trailing matrix lacks 1st panel)
    lddwork = ((n+31)/32)*32 - nb;
    ldda    = ((m+31)/32)*32;

    magma_int_t num_gpus = magma_num_gpus();
    if (num_gpus > 1) {
        /* call multiple-GPU interface */
        return magma_zgeqrf4(num_gpus, m, n, A, lda, tau, work, lwork, info);
    }

    // allocate space for dA, dwork, and dT
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) {
        /* Switch to the "out-of-core" (out of GPU-memory) version */
        return magma_zgeqrf_ooc(m, n, A, lda, tau, work, lwork, info);
    }

    /* Define user stream if current stream is NULL */
    magma_queue_t stream[3], current_stream;
    magmablasGetKernelStream( &current_stream );

    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[2] );
    if (current_stream == NULL) {
        magma_queue_create( &stream[1] );
        magmablasSetKernelStream(stream[1]);
    }
    else
        stream[1] = current_stream;

    dwork = dA + n*ldda;
    dT    = dA + n*ldda + nb*lddwork;

    if ( (nb > 1) && (nb < k) ) {
        /* Use blocked code initially.
           Asynchronously send the matrix to the GPU except the first panel. */
        magma_zsetmatrix_async( m, n-nb,
                                 A(0,nb), lda,
                                dA(0,nb), ldda, stream[2] );

        old_i = 0;
        old_ib = nb;
        for (i = 0; i < k-nb; i += nb) {
            ib = min(k-i, nb);
            if (i > 0) {
                /* download i-th panel */
                magma_queue_sync( stream[1] );
                magma_zgetmatrix_async( m-i, ib,
                                        dA(i,i), ldda,
                                         A(i,i), lda, stream[0] );

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i         ), ldda, dT,    nb,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork);

                magma_zgetmatrix_async( i, ib,
                                        dA(0,i), ldda,
                                         A(0,i), lda, stream[2] );
                magma_queue_sync( stream[0] );
            }

            magma_int_t rows = m-i;
            lapackf77_zgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(i,i), &lda, tau+i, work, &ib);

            zpanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib);

            /* download the i-th V matrix */
            magma_zsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] );

            /* download the T matrix */
            magma_zsetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] );
            magma_queue_sync( stream[0] );

            if (i + ib < n) {
                if (i+ib < k-nb) {
                    /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */
                    magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dT,    nb,
                                      dA(i, i+ib), ldda, dwork, lddwork);
                    zq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }
                else {
                    /* After last panel, update whole trailing matrix. */
                    /* Apply H' to A(i:m,i+ib:n) from the left */
                    magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dT,    nb,
                                      dA(i, i+ib), ldda, dwork, lddwork);
                    zq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib);
                }

                old_i = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib = n-i;
        if (i != 0) {
            magma_zgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda );
        }
        magma_int_t rows = m-i;
        lapackf77_zgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info);
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[2] );
    if (current_stream == NULL) {
        magma_queue_destroy( stream[1] );
        magmablasSetKernelStream(NULL);
    }

    magma_free( dA );

    return *info;
}   /* magma_zgeqrf */
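/*
    Illustrative usage sketch (not part of the library source): calling
    magma_zgeqrf with a workspace sized from magma_get_zgeqrf_nb, per the
    LWORK requirement in the documentation above. The helper name is
    hypothetical, the same headers as above are assumed, and error checking
    of info is elided.
*/
static void example_zgeqrf( magma_int_t m, magma_int_t n )
{
    magmaDoubleComplex *A, *tau, *work;
    magma_int_t lda = m, info;
    magma_int_t minmn = (m < n ? m : n);
    magma_int_t nb    = magma_get_zgeqrf_nb( minmn );
    magma_int_t lwork = (n*nb > 2*nb*nb ? n*nb : 2*nb*nb);  // max( N*NB, 2*NB*NB )

    magma_zmalloc_pinned( &A,    lda*n );   // pinned memory per the performance note
    magma_zmalloc_pinned( &work, lwork );
    magma_zmalloc_cpu( &tau, minmn );
    // ... fill A with the matrix to factor ...

    // Factor A = Q * R; R overwrites the upper triangle of A,
    // the reflectors are stored below the diagonal and in tau
    magma_zgeqrf( m, n, A, lda, tau, work, lwork, &info );

    magma_free_pinned( A );
    magma_free_pinned( work );
    magma_free_cpu( tau );
}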