void magma_sprint_gpu(
    magma_int_t m, magma_int_t n,
    const float *dA, magma_int_t ldda )
{
    magma_int_t info = 0;
    if ( m < 0 )
        info = -1;
    else if ( n < 0 )
        info = -2;
    else if ( ldda < max(1,m) )
        info = -4;

    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return;
    }

    magma_int_t lda = m;
    float* A;
    magma_smalloc_cpu( &A, lda*n );

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_sgetmatrix( m, n, dA, ldda, A, lda, queue );
    magma_queue_destroy( queue );

    magma_sprint( m, n, A, lda );

    magma_free_cpu( A );
}
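// ----------------------------------------------------------------------
// Illustrative sketch (not part of the library): how a caller might use
// magma_sprint_gpu to inspect a device matrix while debugging. All names
// and values below are hypothetical; assumes the usual MAGMA headers.
static void example_sprint_gpu()
{
    float hA[] = { 1, 3, 2, 4 };   // 2 x 2, column-major
    magma_int_t m = 2, n = 2, ldda = magma_roundup( m, 32 );

    magmaFloat_ptr dA;
    magma_smalloc( &dA, ldda*n );

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_ssetmatrix( m, n, hA, m, dA, ldda, queue );
    magma_sprint_gpu( m, n, dA, ldda );   // prints the 2 x 2 matrix

    magma_queue_destroy( queue );
    magma_free( dA );
}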
/**
    Purpose
    -------
    DGEQR2 computes a QR factorization of a real m by n matrix A:
    A = Q * R.

    This expert routine requires two more arguments than the standard
    dgeqr2, namely dT and ddA, explained below. The storage for A is
    also not as in LAPACK's dgeqr2 routine (see below).

    The first is used to output the triangular n x n factor T of the
    block reflector used in the factorization. The second holds the
    diagonal n x n blocks of A, i.e., the diagonal submatrices of R.

    This version implements the right-looking QR. A hard-coded
    requirement is that N <= min(M, 128). For larger N, a blocked QR
    version should be used.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  0 <= N <= min(M, 128).

    @param[in,out]
    dA      DOUBLE PRECISION array, dimension (LDDA,N)
            On entry, the m by n matrix A.
            On exit, the orthogonal matrix Q as a product of elementary
            reflectors (see Further Details).
    \n
            The elements on and above the diagonal of the array contain
            the min(m,n) by n upper trapezoidal matrix R (R is upper
            triangular if m >= n); the elements below the diagonal, with
            the array TAU, represent the orthogonal matrix Q as a product
            of elementary reflectors (see Further Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    dtau    DOUBLE PRECISION array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    dT      DOUBLE PRECISION array, dimension N x N.
            Stores the triangular N x N factor T of the block reflector
            used in the factorization. The lower triangular part is 0.

    @param[out]
    ddA     DOUBLE PRECISION array, dimension N x N.
            Stores the elements of the upper N x N diagonal block of A.
            LAPACK stores this array in A. There are 0s below the diagonal.

    @param
    dwork   (workspace) DOUBLE PRECISION array, dimension (3 N).
            The first 2*N entries hold column norms; the remainder is
            scratch space for applying the reflectors.

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    @ingroup magma_dgeqrf_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dgeqr2x_gpu(
    magma_int_t m, magma_int_t n,
    magmaDouble_ptr dA, magma_int_t ldda,
    magmaDouble_ptr dtau,
    magmaDouble_ptr dT, magmaDouble_ptr ddA,
    magmaDouble_ptr dwork,
    magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)

    magma_int_t i, min_mn;

    magmaDouble_ptr dnorm  = dwork;
    magmaDouble_ptr dwork2 = (magmaDouble_ptr)(dwork + 2*n);

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0 || n > min(m, 128)) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    // create the queue only after the argument checks, so it is not
    // leaked on early return
    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    /* Compute the norms of the trailing columns */
    min_mn = min(m,n);
    // magmablas_dnrm2_cols( m, min_mn, dA(0,0), ldda, dnorm, queue );

    for (i = 0; i < min_mn; ++i) {
        /* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
        magmablas_dnrm2_cols( m-i, 1, dA(i,i), ldda, dnorm+i, queue );
        magma_dlarfgx_gpu( m-i, dA(i, i), dA(min(i+1,m), i), dtau+i,
                           dnorm+i, ddA + i + i*n, i, queue );

        if (i < n) {
            /* Apply H(i)' to A(i:m,i+1:n) from the left */
            magma_dlarfx_gpu( m-i, n-i-1, dA(i, i), dtau+i,
                              //dA(i, i+1), ldda, dnorm+i+1,
                              dA(i, 0), ldda, dnorm+i+1,
                              dT, i, dwork2, queue );
        }
    }

    magma_queue_destroy( queue );

    return *info;
} /* magma_dgeqr2x_gpu */
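// ----------------------------------------------------------------------
// Hedged usage sketch (not library code): a possible call sequence for
// magma_dgeqr2x_gpu on a device-resident m x n panel (n <= min(m,128)).
// Buffer sizes follow the documentation above; the 3*n dwork size is an
// assumption based on the dnorm/dwork2 layout inside the routine.
static magma_int_t example_dgeqr2x( magma_int_t m, magma_int_t n )
{
    magma_int_t info = 0, ldda = magma_roundup( m, 32 );
    magmaDouble_ptr dA, dtau, dT, ddA, dwork;

    magma_dmalloc( &dA,    ldda*n );
    magma_dmalloc( &dtau,  n );        // n = min(m,n) since n <= m here
    magma_dmalloc( &dT,    n*n );
    magma_dmalloc( &ddA,   n*n );
    magma_dmalloc( &dwork, 3*n );      // 2*n column norms + n scratch (assumed)

    // ... fill dA with the panel to factor ...
    magma_dgeqr2x_gpu( m, n, dA, ldda, dtau, dT, ddA, dwork, &info );

    magma_free( dA );  magma_free( dtau ); magma_free( dT );
    magma_free( ddA ); magma_free( dwork );
    return info;
}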
// ----------------------------------------------------------------------
// TODO info is unused
extern "C" magma_int_t
magma_zhtodhe(
    magma_int_t ngpu, magma_uplo_t uplo,
    magma_int_t n, magma_int_t nb,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex_ptr dA[], magma_int_t ldda,
    magma_queue_t queues[][10],
    magma_int_t *info)
{
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    magma_int_t k;
    if (uplo == MagmaLower) {
        /* go through each block-column */
        magma_int_t j, jj, jb, mj;
        for (j=0; j < n; j += nb) {
            jj =  j/(nb*ngpu);
            k  = (j/nb)%ngpu;

            jb = min(nb, (n-j));
            mj = n-j;

            magma_setdevice( k );
            magma_zsetmatrix_async( mj, jb,
                                    A(j,j),          lda,
                                    dA(k, j, jj*nb), ldda,
                                    queues[k][0] );
        }
    }
    else {
        /* go through each block-column */
        magma_int_t j, jj, jb, mj;
        for (j=0; j < n; j += nb) {
            jj =  j/(nb*ngpu);
            k  = (j/nb)%ngpu;

            jb = min(nb, (n-j));
            mj = j+jb;

            magma_setdevice( k );
            magma_zsetmatrix_async( mj, jb,
                                    A(0, j),         lda,
                                    dA(k, 0, jj*nb), ldda,
                                    queues[k][0] );
        }
    }

    for( k=0; k < ngpu; k++ ) {
        magma_setdevice( k );
        magma_queue_sync( queues[k][0] );
    }
    magma_setdevice( orig_dev );

    return *info;
}
extern "C" magma_int_t magma_shtodhe(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, magma_int_t nb, float *A, magma_int_t lda, float **dA, magma_int_t ldda, magma_queue_t stream[][10], magma_int_t *info) { magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_int_t k; if (uplo == MagmaLower) { /* go through each block-column */ magma_int_t j, jj, jb, mj; for (j=0; j < n; j += nb) { jj = j/(nb*num_gpus); k = (j/nb)%num_gpus; jb = min(nb, (n-j)); mj = n-j; magma_setdevice(k); magma_ssetmatrix_async( mj, jb, A(j,j), lda, dA(k, j, jj*nb), ldda, stream[k][0] ); } } else { /* go through each block-column */ magma_int_t j, jj, jb, mj; for (j=0; j < n; j += nb) { jj = j/(nb*num_gpus); k = (j/nb)%num_gpus; jb = min(nb, (n-j)); mj = j+jb; magma_setdevice(k); magma_ssetmatrix_async( mj, jb, A(0, j), lda, dA(k, 0, jj*nb), ldda, stream[k][0] ); } } for( k=0; k < num_gpus; k++ ) { magma_setdevice(k); magma_queue_sync(stream[k][0]); } magma_setdevice( orig_dev ); return *info; }
magma_int_t magma_dnan_inf_gpu(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaDouble_const_ptr dA, magma_int_t ldda,
    magma_int_t *cnt_nan,
    magma_int_t *cnt_inf )
{
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper && uplo != MagmaFull )
        info = -1;
    else if ( m < 0 )
        info = -2;
    else if ( n < 0 )
        info = -3;
    else if ( ldda < max(1,m) )
        info = -5;

    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magma_int_t lda = m;
    double* A;
    magma_dmalloc_cpu( &A, lda*n );

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_dgetmatrix( m, n, dA, ldda, A, lda, queue );
    magma_queue_destroy( queue );

    magma_int_t cnt = magma_dnan_inf( uplo, m, n, A, lda, cnt_nan, cnt_inf );

    magma_free_cpu( A );
    return cnt;
}
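// ----------------------------------------------------------------------
// Hedged usage sketch (hypothetical caller): count NaN and Inf entries of
// a device matrix, e.g. as a sanity check before a factorization.
static void example_check_finite(
    magma_int_t m, magma_int_t n,
    magmaDouble_const_ptr dA, magma_int_t ldda )
{
    magma_int_t cnt_nan = 0, cnt_inf = 0;
    magma_int_t cnt = magma_dnan_inf_gpu( MagmaFull, m, n, dA, ldda,
                                          &cnt_nan, &cnt_inf );
    if ( cnt > 0 ) {
        printf( "matrix has %lld NaN and %lld Inf entries\n",
                (long long) cnt_nan, (long long) cnt_inf );
    }
}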
/**
    Purpose
    -------
    ZUNMQL overwrites the general complex M-by-N matrix C with

    @verbatim
                             SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:    Q * C               C * Q
    TRANS = Magma_ConjTrans: Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

        Q = H(k) . . . H(2) H(1)

    as returned by ZGEQLF.
    Q is of order M if SIDE = MagmaLeft
    and  of order N if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:  apply Q or Q**H from the Left;
      -     = MagmaRight: apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQLF in the last k columns of its array argument dA.
            The diagonal and the lower part are destroyed; the reflectors
            are not modified.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.
            If SIDE = MagmaLeft,  LDDA >= max(1,M);
            if SIDE = MagmaRight, LDDA >= max(1,N).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQLF.

    @param[in,out]
    dC      COMPLEX_16 array on the GPU, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or
            (C * Q**H) or (C*Q).

    @param[in]
    lddc    INTEGER
            The leading dimension of the array dC. LDDC >= max(1,M).

    @param[in]
    wA      COMPLEX_16 array, dimension
                (LDWA,M) if SIDE = MagmaLeft
                (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by ZHETRD_GPU. (A copy of the upper or lower part of
            dA, on the host.)

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            If SIDE = MagmaLeft,  LDWA >= max(1,M);
            if SIDE = MagmaRight, LDWA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqlf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmql2_gpu(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dC, magma_int_t lddc,
    const magmaDoubleComplex *wA, magma_int_t ldwa,
    magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    /* Constants */
    const magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    const magma_int_t nbmax = 64;

    /* Local variables */
    magmaDoubleComplex_ptr dwork = NULL, dT = NULL;
    magmaDoubleComplex T[ nbmax*nbmax ];
    magma_int_t i, i1, i2, step, ib, lddwork, nb, mi, ni, nq, nq_i, nw;
    magma_queue_t queue = NULL;

    // Parameter adjustments for Fortran indexing
    wA -= 1 + ldwa;
    dC -= 1 + lddc;
    --tau;

    *info = 0;
    bool left   = (side == MagmaLeft);
    bool notran = (trans == MagmaNoTrans);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    /* Test the input arguments */
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    // size of the block
    nb = nbmax;

    lddwork = nw;

    /* Use hybrid CPU-GPU code */
    if ( (left && notran) || (! left && ! notran) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
    } else {
        mi = m;
    }

    // dwork is (n or m) x nb + nb x nb, for left or right respectively
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, lddwork*nb + nb*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        goto cleanup;
    }
    dT = dwork + lddwork*nb;

    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    // In the bottom k x k portion of dA,
    // set the nb-1 sub-diagonals to 0, and the diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the lower triangular part already set to identity.
    // A is nq x k, either m x k (left) or n x k (right)
    magmablas_zlaset_band( MagmaLower, k, k, nb, c_zero, c_one,
                           dA(nq-k,0), ldda, queue );

    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min( nb, k - i + 1 );

        /* Form the triangular factor of the block reflector
           H = H(i+ib-1) . . . H(i+1) H(i) */
        nq_i = nq - k + i + ib - 1;
        lapackf77_zlarft( "Backward", "Columnwise", &nq_i, &ib,
                          wA(1,i), &ldwa, &tau[i], T, &ib );

        if (left) {
            /* H or H^H is applied to C(1:m-k+i+ib-1,1:n) */
            mi = m - k + i + ib - 1;
        }
        else {
            /* H or H^H is applied to C(1:m,1:n-k+i+ib-1) */
            ni = n - k + i + ib - 1;
        }

        /* Apply H or H^H; first copy T to the GPU */
        magma_zsetmatrix( ib, ib, T, ib, dT, ib, queue );
        magma_zlarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise,
                          mi, ni, ib,
                          dA(0,i-1), ldda, dT, ib,  // dA using 0-based indices here
                          dC(1,1), lddc,
                          dwork, lddwork, queue );
    }

cleanup:
    magma_queue_destroy( queue );
    magma_free( dwork );

    return *info;
} /* magma_zunmql2_gpu */
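// ----------------------------------------------------------------------
// Minimal sketch (an illustration, not library code) of the block sweep
// used above: for (left, no-trans) or (right, trans) the blocks of
// reflectors are visited forward, i = 1, 1+nb, ...; otherwise backward.
// Indices are 1-based, matching the Fortran-style loop in the routine.
static void example_block_sweep( int k, int nb, int forward )
{
    int i1, i2, step;
    if (forward) { i1 = 1;                  i2 = k; step =  nb; }
    else         { i1 = (k - 1)/nb*nb + 1;  i2 = 1; step = -nb; }

    for (int i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        int ib = (nb < k - i + 1 ? nb : k - i + 1);
        printf( "apply block of %d reflectors starting at H(%d)\n", ib, i );
    }
}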
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges. This version does not
    require work space on the GPU passed as input. GPU memory is
    allocated in the routine.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    It uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory,
            e.g. allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value
                 or another error occurred, such as memory allocation
                 failed.
      -     > 0: if INFO = i, U(i,i) is exactly zero. The factorization
                 has been completed, but the factor U is exactly
                 singular, and division by zero will occur if it is used
                 to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf(
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda,
    magma_int_t *ipiv,
    magma_int_t *info)
{
    #ifdef HAVE_clBLAS
    #define  dA(i_, j_)  dA,    ((i_)*nb       + (j_)*nb*ldda + dA_offset)
    #define dAT(i_, j_)  dAT,   ((i_)*nb*lddat + (j_)*nb      + dAT_offset)
    #define dwork(i_)    dwork, (i_)
    #else
    #define  dA(i_, j_) ( dA   + (i_)*nb       + (j_)*nb*ldda)
    #define dAT(i_, j_) ( dAT  + (i_)*nb*lddat + (j_)*nb)
    #define dwork(i_)   (dwork + (i_))
    #endif

    // Constants
    const float c_one     = MAGMA_S_ONE;
    const float c_neg_one = MAGMA_S_NEG_ONE;

    // Local variables
    float *work;
    magmaFloat_ptr dA, dAT, dwork;
    magma_int_t iinfo, nb;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_sgetrf_nb( m, n );

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_sgetrf( &m, &n, A, &lda, ipiv, info );
    }
    else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, lddat, maxdim;
        magma_int_t i, j, rows, cols, s = min(m, n)/nb;

        maxm = magma_roundup( m, 32 );
        maxn = magma_roundup( n, 32 );
        maxdim = max( maxm, maxn );

        lddat = maxn;
        ldda  = maxm;

        /* set number of GPUs */
        magma_int_t ngpu = magma_num_gpus();
        if ( ngpu > 1 ) {
            /* call multi-GPU non-GPU-resident interface */
            magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }

        magma_queue_t queues[2] = { NULL, NULL };
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queues[0] );
        magma_queue_create( cdev, &queues[1] );

        /* check the memory requirement */
        size_t mem_size = magma_queue_mem_size( queues[0] );
        mem_size /= sizeof(float);

        magma_int_t h = 1+(2+ngpu);
        magma_int_t ngpu2 = ngpu;
        magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
        if ( ngr_nb_char != NULL )
            NB = max( nb, min( NB, atoi(ngr_nb_char) ) );

        if ( ngpu > ceil((float)NB/nb) ) {
            ngpu2 = (magma_int_t)ceil((float)NB/nb);
            h = 1+(2+ngpu2);
            NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        }

        if ( ngpu2*NB < n ) {
            /* requires too much memory, so call non-GPU-resident version */
            magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }

        work = A;
        if (maxdim*maxdim < 2*maxm*maxn) {
            // if close to square, allocate square matrix and transpose in-place
            // dwork is nb*maxm for panel, and maxdim*maxdim for A
            if (MAGMA_SUCCESS != magma_smalloc( &dwork, nb*maxm + maxdim*maxdim )) {
                /* alloc failed so call non-GPU-resident version */
                magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;

            ldda = lddat = maxdim;
            magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );

            dAT = dA;
            magmablas_stranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
        }
        else {
            // if very rectangular, allocate dA and dAT and transpose out-of-place
            // dwork is nb*maxm for panel, and maxm*maxn for A
            if (MAGMA_SUCCESS != magma_smalloc( &dwork, (nb + maxn)*maxm )) {
                /* alloc failed so call non-GPU-resident version */
                magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;

            magma_ssetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );

            if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn )) {
                /* alloc failed so call non-GPU-resident version */
                magma_free( dwork );
                magma_sgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }

            magmablas_stranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }

        lapackf77_sgetrf( &m, &nb, work, &lda, ipiv, &iinfo );

        for( j = 0; j < s; j++ ) {
            // get j-th panel from device
            cols = maxm - j*nb;

            if (j > 0) {
                magmablas_stranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] );
                magma_queue_sync( queues[0] );

                magma_sgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] );

                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat, queues[0] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queues[0] );

                // do the cpu part
                rows = m - j*nb;
                magma_queue_sync( queues[1] );
                lapackf77_sgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo );
            }
            if (*info == 0 && iinfo > 0)
                *info = iinfo + j*nb;

            // put j-th panel onto device
            magma_ssetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] );

            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_slaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] );

            magma_queue_sync( queues[1] );
            magmablas_stranspose( cols, nb, dwork(0), cols, dAT(j,j), lddat, queues[0] );

            // do the small non-parallel computations (next panel update)
            if (s > (j+1)) {
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
            else {
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
        }

        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;

            magmablas_stranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] );
            magma_sgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] );
            magma_queue_sync( queues[0] );

            // do the cpu part
            lapackf77_sgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo );
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;

            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_slaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] );

            // put last panel onto device
            magma_ssetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] );
            magmablas_stranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] );

            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s, s),     lddat,
                                dAT(s, s)+nb0, lddat, queues[0] );
        }

        // undo transpose
        if (maxdim*maxdim < 2*maxm*maxn) {
            magmablas_stranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
            magma_sgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] );
        }
        else {
            magmablas_stranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] );
            magma_sgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] );
            magma_free( dAT );
        }
        magma_free( dwork );

        magma_queue_destroy( queues[0] );
        magma_queue_destroy( queues[1] );
    }

    return *info;
} /* magma_sgetrf */
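// ----------------------------------------------------------------------
// Hedged driver sketch (hypothetical, not library code): LU-factor a host
// matrix with magma_sgetrf, using pinned memory for faster transfers as
// the documentation above suggests.
static magma_int_t example_sgetrf( magma_int_t m, magma_int_t n )
{
    magma_int_t info = 0, lda = m;
    float *A;
    magma_int_t *ipiv;

    magma_smalloc_pinned( &A, lda*n );      // pinned => faster H2D/D2H copies
    magma_imalloc_cpu( &ipiv, min(m,n) );

    // ... fill A (column-major) ...
    magma_sgetrf( m, n, A, lda, ipiv, &info );
    // info > 0 flags an exactly singular U(i,i)

    magma_free_pinned( A );
    magma_free_cpu( ipiv );
    return info;
}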
/**
    Purpose
    -------
    ZGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges. This version does not
    require work space on the GPU passed as input. GPU memory is
    allocated in the routine. The matrix may exceed the GPU memory.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of the big panel is done by calling the
    multi-GPU interface. Pivots are applied on the GPU within the big
    panel.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory,
            e.g. allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value
                 or another error occurred, such as memory allocation
                 failed.
      -     > 0: if INFO = i, U(i,i) is exactly zero. The factorization
                 has been completed, but the factor U is exactly
                 singular, and division by zero will occur if it is used
                 to solve a system of equations.
    @ingroup magma_zgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zgetrf_m(
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv,
    magma_int_t *info)
{
    #define     A(i,j) (A      + (j)*lda + (i))
    #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
    #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm)

    magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0;
    timer_start( time_total );
    real_Double_t flops;

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs];
    magma_int_t        iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local;
    magma_int_t        N, M, NB, NBk, I, d, ngpu0 = ngpu;
    magma_int_t        ii, jj, h, offset, ib, rows;

    magma_queue_t stream[MagmaMaxGPUs][2];
    magma_event_t  event[MagmaMaxGPUs][2];

    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    /* initialize nb */
    nb = magma_get_zgetrf_nb(m);
    maxm = ((m + 31)/32)*32;

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaDoubleComplex);

    /* number of columns in the big panel */
    h = 1+(2+ngpu0);
    NB = (magma_int_t)(0.8*freeMem/maxm - h*nb);
    const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
    if ( ngr_nb_char != NULL )
        NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
    //NB = 5*max(nb,32);

    if ( ngpu0 > ceil((double)NB/nb) ) {
        ngpu = (int)ceil((double)NB/nb);
        h = 1+(2+ngpu);
        NB = (magma_int_t)(0.8*freeMem/maxm - h*nb);
    } else {
        ngpu = ngpu0;
    }
    if ( ngpu*NB >= n ) {
        #ifdef CHECK_ZGETRF_OOC
        printf( " * still fits in GPU memory.\n" );
        #endif
        NB = n;
    } else {
        #ifdef CHECK_ZGETRF_OOC
        printf( " * doesn't fit in GPU memory.\n" );
        #endif
        NB = ngpu*NB;
        NB = max( nb, (NB / nb) * nb ); /* make sure it's divisible by nb (x64) */
    }

    #ifdef CHECK_ZGETRF_OOC
    if ( NB != n )
        printf( " * running in out-of-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem );
    else
        printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem );
    #endif

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code for a single tile. */
        lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */

        /* allocate memory on GPU to store the big panel */
        timer_start( time_alloc );
        n_local[0] = (NB/nb)/ngpu;
        if ( NB%(nb*ngpu) != 0 )
            n_local[0]++;
        n_local[0] *= nb;
        ldn_local = ((n_local[0]+31)/32)*32;

        for( d=0; d < ngpu; d++ ) {
            magma_setdevice(d);
            if (MAGMA_SUCCESS != magma_zmalloc( &dA[d], (ldn_local+h*nb)*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            dPT[d] = dA[d] + nb*maxm;   /* for storing the previous panel from CPU */
            dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel               */
            magma_queue_create( &stream[d][0] );
            magma_queue_create( &stream[d][1] );
            magma_event_create( &event[d][0] );
            magma_event_create( &event[d][1] );
        }
        //magma_setdevice(0);
        timer_stop( time_alloc );

        for( I=0; I < n; I += NB ) {
            M = m;
            N = min( NB, n-I ); /* number of columns in this big panel */
            //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */

            maxm = ((M + 31)/32)*32;
            if ( ngpu0 > ceil((double)N/nb) ) {
                ngpu = (int)ceil((double)N/nb);
            } else {
                ngpu = ngpu0;
            }

            for( d=0; d < ngpu; d++ ) {
                n_local[d] = ((N/nb)/ngpu)*nb;
                if (d < (N/nb)%ngpu)
                    n_local[d] += nb;
                else if (d == (N/nb)%ngpu)
                    n_local[d] += N%nb;
            }
            ldn_local = ((n_local[0]+31)/32)*32;

            /* upload the next big panel into GPU, transpose (A->A'), and pivot it */
            timer_start( time );
            magmablas_zsetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda,
                                                dAT, ldn_local, dA, maxm, M, N, nb);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
                magmablasSetKernelStream(NULL);
            }
            time_set += timer_stop( time );

            timer_start( time );
            /* == --------------------------------------------------------------- == */
            /* == loop around the previous big-panels to update the new big-panel == */
            for( offset = 0; offset < min(m,I); offset += NB ) {
                NBk = min( m-offset, NB );
                /* start sending the first tile from the previous big-panels to gpus */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    nbi = min( nb, NBk );
                    magma_zsetmatrix_async( (M-offset), nbi,
                                            A(offset,offset), lda,
                                            dA[d],            (maxm-offset), stream[d][0] );

                    /* make sure the previous update finished */
                    magmablasSetKernelStream(stream[d][0]);
                    //magma_queue_sync( stream[d][1] );
                    magma_queue_wait_event( stream[d][0], event[d][0] );

                    /* transpose */
                    magmablas_ztranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb );
                }

                /* apply the pivots from the previous big-panel */
                for( d=0; d < ngpu; d++ ) {
                    magma_setdevice(d);
                    magmablas_zlaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] );
                }

                /* == go through each block-column of the previous big-panels == */
                for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) {
                    ii   = offset+jj;
                    rows = maxm - ii;
                    nbi  = min( nb, NBk-jj );
                    for( d=0; d < ngpu; d++ ) {
                        magma_setdevice(d);

                        /* wait for a block-column on GPU */
                        magma_queue_sync( stream[d][0] );

                        /* start sending next column */
                        if ( jj+nb < NBk ) {
                            magma_zsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
                                                    A(ii+nb,ii+nb), lda,
                                                    dA[d],          (rows-nb), stream[d][0] );

                            /* make sure the previous update finished */
                            magmablasSetKernelStream(stream[d][0]);
                            //magma_queue_sync( stream[d][1] );
                            magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] );

                            /* transpose next column */
                            magmablas_ztranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb );
                        }

                        /* update with the block column */
                        magmablasSetKernelStream(stream[d][1]);
                        magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                     n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local );
                        if ( M > ii+nb ) {
                            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                                         n_local[d], M-(ii+nb), nbi,
                                         c_neg_one, dAT(d,ib,0),        ldn_local,
                                                    dPT(d,1,(jj/nb)%2), nb,
                                         c_one,     dAT(d,ib+1,0),      ldn_local );
                        }
                        magma_event_record( event[d][(jj/nb)%2], stream[d][1] );
                    }
                } /* end of for each block-column in a big-panel */
            } /* end of for each previous big-panel */

            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
                magmablasSetKernelStream(NULL);
            }

            /* call the multi-GPU interface to panel-factorize the big panel */
            if ( M > I ) {
                magma_zgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda,
                                   stream, &iinfo);
                if ( iinfo < 0 ) {
                    *info = iinfo;
                    break;
                } else if ( iinfo != 0 ) {
                    *info = iinfo + I * NB;
                    //break;
                }
                /* adjust pivots */
                for( ii=I; ii < min(I+N,m); ii++ )
                    ipiv[ii] += I;
            }
            time_comp += timer_stop( time );

            /* download the current big panel to CPU */
            timer_start( time );
            magmablas_zgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb);
            for( d=0; d < ngpu; d++ ) {
                magma_setdevice(d);
                magma_queue_sync( stream[d][0] );
                magma_queue_sync( stream[d][1] );
                magmablasSetKernelStream(NULL);
            }
            time_get += timer_stop( time );
        } /* end of for each big panel */

        timer_stop( time_total );
        flops = FLOPS_ZGETRF( m, n ) / 1e9;
        timer_printf(" memory-allocation time: %e\n", time_alloc );
        timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb );
        timer_printf(" memcopy and transpose %e seconds\n", time_set );
        timer_printf(" total time %e seconds\n", time_total );
        timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp );
        timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set );
        timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get );
        timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc );

        for( d=0; d < ngpu0; d++ ) {
            magma_setdevice(d);
            magma_free( dA[d] );
            magma_event_destroy( event[d][0] );
            magma_event_destroy( event[d][1] );
            magma_queue_destroy( stream[d][0] );
            magma_queue_destroy( stream[d][1] );
        }
        magma_setdevice( orig_dev );
        magmablasSetKernelStream( orig_stream );
    }
    if ( *info >= 0 )
        magma_zgetrf_piv(m, n, NB, A, lda, ipiv, info);

    return *info;
} /* magma_zgetrf_m */
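// ----------------------------------------------------------------------
// Standalone sketch (an illustration) of the big-panel sizing logic used
// above: NB is chosen so that the panel plus h*nb staging columns fit in
// roughly 80% of the free GPU memory, then rounded down to a multiple of nb.
static magma_int_t example_big_panel_width(
    size_t freeMem_elems,   // free GPU memory, in matrix elements
    magma_int_t maxm, magma_int_t nb, magma_int_t ngpu )
{
    magma_int_t h  = 1 + (2 + ngpu);  // staging buffers per GPU
    magma_int_t NB = (magma_int_t)(0.8*freeMem_elems/maxm - h*nb);
    NB = max( nb, (NB/nb)*nb );       // keep NB a positive multiple of nb
    return NB;
}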
/**
    Purpose
    -------
    ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal
    columns, which is defined as the first N columns of a product of K
    elementary reflectors of order M

        Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF_GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    dA      COMPLEX_16 array A on the GPU, dimension (LDDA,N).
            On entry, the i-th column must contain the vector which
            defines the elementary reflector H(i), for i = 1,2,...,k, as
            returned by ZGEQRF_GPU in the first k columns of its array
            argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    ldda    INTEGER
            The first dimension of the array A. LDDA >= max(1,M).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF_GPU.

    @param[in]
    dT      (workspace) COMPLEX_16 work space array on the GPU,
            dimension (2*MIN(M, N) + ceil(N/32)*32 )*NB.
            This must be the 6th argument of magma_zgeqrf_gpu
            [ note that if N here is bigger than N in magma_zgeqrf_gpu,
              the workspace requirement DT in magma_zgeqrf_gpu must be
              as specified in this routine ].

    @param[in]
    nb      INTEGER
            This is the block size used in ZGEQRF_GPU, and
            correspondingly the size of the T matrices, used in the
            factorization, and stored in DT.

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zungqr_gpu(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dT, magma_int_t nb,
    magma_int_t *info)
{
    #define dA(i,j) (dA + (i) + (j)*ldda)
    #define dT(j)   (dT + (j)*nb)

    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, lpanel;
    magma_int_t i, ib, ki, kk, iinfo;
    magma_int_t lddwork;
    magmaDoubleComplex_ptr dV, dW;
    magmaDoubleComplex *work, *panel;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (ldda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min( k, ki+nb );
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb for zungqr workspace
    // (m - kk)*(n - kk) for last block's panel
    lwork = n*nb;
    lpanel = (m - kk)*(n - kk);
    magma_zmalloc_cpu( &work, lwork + lpanel );
    if ( work == NULL ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    panel = work + lwork;

    // Allocate work space on GPU
    if (MAGMA_SUCCESS != magma_zmalloc( &dV, ldda*nb )) {
        magma_free_cpu( work );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    // dT workspace has:
    // 2*min(m,n)*nb      for T and R^{-1} matrices from geqrf
    // roundup(n,32) * nb for dW larfb workspace.
    lddwork = min(m,n);
    dW = dT + 2*lddwork*nb;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    // Use unblocked code for the last or only block.
    if (kk < n) {
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        magma_zgetmatrix( m_kk, k_kk,
                          dA(kk, kk), ldda, panel, m_kk, queue );

        lapackf77_zungqr( &m_kk, &n_kk, &k_kk,
                          panel, &m_kk,
                          &tau[kk], work, &lwork, &iinfo );

        magma_zsetmatrix( m_kk, n_kk,
                          panel, m_kk, dA(kk, kk), ldda, queue );

        // Set A(1:kk,kk+1:n) to zero.
        magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda, queue );
    }

    if (kk > 0) {
        // Use blocked code
        // queue: copy Aii to V --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for (i = ki; i >= 0; i -= nb) {
            ib = min( nb, k-i );
            mi = m - i;

            // Copy current panel on the GPU from dA to dV
            magma_zcopymatrix_async( mi, ib,
                                     dA(i,i), ldda,
                                     dV,      ldda, queue );

            // set panel to identity
            magmablas_zlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(0, i), ldda, queue );
            magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(i, i), ldda, queue );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  mi, n-i, ib,
                                  dV,       ldda, dT(i), nb,
                                  dA(i, i), ldda, dW, lddwork, queue );
            }
        }
    }
    magma_queue_sync( queue );

    magma_free( dV );
    magma_free_cpu( work );
    magma_queue_destroy( queue );

    return *info;
} /* magma_zungqr_gpu */
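// ----------------------------------------------------------------------
// Hedged usage sketch (not library code): form Q explicitly after a GPU
// QR factorization. The dT size follows the documentation above; the
// block-size query magma_get_zgeqrf_nb( m, n ) is assumed to match the
// MAGMA release in use.
static magma_int_t example_form_q( magma_int_t m, magma_int_t n )
{
    magma_int_t info = 0, ldda = magma_roundup( m, 32 );
    magma_int_t nb = magma_get_zgeqrf_nb( m, n );
    magma_int_t min_mn = min( m, n );

    magmaDoubleComplex_ptr dA, dT;
    magmaDoubleComplex *tau;
    magma_zmalloc( &dA, ldda*n );
    magma_zmalloc( &dT, (2*min_mn + magma_roundup( n, 32 ))*nb );
    magma_zmalloc_cpu( &tau, min_mn );

    // ... set dA to the matrix whose Q factor is wanted ...
    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
    if (info == 0)
        magma_zungqr_gpu( m, n, min_mn, dA, ldda, tau, dT, nb, &info );

    magma_free( dA ); magma_free( dT ); magma_free_cpu( tau );
    return info;
}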
/**
    Purpose
    -------
    SORGQR generates an M-by-N REAL matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q = H(1) H(2) . . . H(k)

    as returned by SGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       REAL array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector which
            defines the elementary reflector H(i), for i = 1,2,...,k, as
            returned by SGEQRF_GPU in the first k columns of its array
            argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     REAL array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by SGEQRF_GPU.

    @param[in]
    T       REAL array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_sgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in SGEQRF_GPU, and
            correspondingly the size of the T matrices, used in the
            factorization, and stored in T.

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_sgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sorgqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    float *A, magma_int_t lda,
    float *tau,
    float *T, magma_int_t nb,
    magma_int_t *info)
{
    #define  A(i,j)   ( A    + (i) + (j)*lda )
    #define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
    #define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    float c_zero = MAGMA_S_ZERO;
    float c_one  = MAGMA_S_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t d, i, ib, j, jb, ki, kk;
    float *work=NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );

    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = magma_roundup( m, 32 );
    magma_int_t lddwork = magma_roundup( n, 32 );
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block

    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    float      *dA[ MagmaMaxGPUs ] = { NULL };
    float      *dT[ MagmaMaxGPUs ] = { NULL };
    float      *dV[ MagmaMaxGPUs ] = { NULL };
    float      *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t queues[ MagmaMaxGPUs ] = { NULL };

    for( d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 2  blocks, cols: 0- 9, 30-39, 60-69
        // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 1  block,  cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }

        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_smalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;

        magma_queue_create( d, &queues[d] );
    }

    trace_init( 1, ngpu, 1, queues );

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb  for larfb work
    // m*nb  for V
    // nb*nb for T
    lwork = (n + m + nb) * nb;
    magma_smalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    float *work_T, *work_V;
    work_T = work + n*nb;
    work_V = work + n*nb + nb*nb;

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;

        // sorgqr requires less workspace (n*nb), but is slow if k < sorgqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        //magma_int_t iinfo;
        //lapackf77_sorgqr( &m_kk, &n_kk, &k_kk,
        //                  A(kk, kk), &lda,
        //                  &tau[kk], work, &lwork, &iinfo );

        lapackf77_slacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk);
        lapackf77_slaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );

        lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk);
        lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );

        if (kk > 0) {
            for( j=kk; j < n; j += nb ) {
                jb = min( n-j, nb );
                d  =  (j / nb) % ngpu;
                di = ((j / nb) / ngpu) * nb;
                magma_setdevice( d );
                magma_ssetmatrix( m_kk, jb,
                                  A(kk, j),      lda,
                                  dA(d, kk, di), ldda, queues[d] );

                // Set A(1:kk,kk+1:n) to zero.
                magmablas_slaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] );
            }
        }
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_ssetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] );
            trace_gpu_end( d, 0 );
        }

        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to dV on the GPUs
            lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_ssetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, queues[d] );
                trace_gpu_end( d, 0 );
            }

            // set panel to identity
            magma_setdevice( dpanel );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_slaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] );
            magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda, queues[dpanel] );
            trace_gpu_end( dpanel, 0 );

            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork, queues[d] );
                    trace_gpu_end( d, 0 );
                }
            }
        }

        // copy result back to CPU
        trace_cpu_start( 0, "get", "get A" );
        magma_sgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb, queues );
        trace_cpu_end( 0 );
    }

    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "sorgqr-n%d-ngpu%d.svg", m, ngpu );
    trace_finalize( name, "trace.css" );
    #endif

cleanup:
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( queues[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );

    return *info;
} /* magma_sorgqr */
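// ----------------------------------------------------------------------
// Standalone sketch (an illustration) of the 1-D block-cyclic layout
// described in the comment inside the allocation loop above; with
// n=75, nb=10, ngpu=3 it reproduces that example distribution.
static void example_bcyclic_layout( int n, int nb, int ngpu )
{
    for (int d = 0; d < ngpu; ++d) {
        printf( "gpu %d:", d );
        for (int j = d*nb; j < n; j += ngpu*nb) {
            int jend = (j+nb < n ? j+nb : n);
            printf( "  cols %d-%d", j, jend-1 );
        }
        printf( "\n" );
    }
}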
/**
    Purpose
    -------
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper: Upper triangle of dA is stored;
      -     = MagmaLower: Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA. N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced. If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of dA contains the
            lower triangular part of the matrix dA, and the strictly
            upper triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA. LDDA >= max(1,N).
            To benefit from coalesced memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value
      -     > 0: if INFO = i, the leading minor of order i is not
                 positive definite, and the factorization could not be
                 completed.

    @ingroup magma_zposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zpotrf_gpu(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magma_int_t *info )
{
    #ifdef HAVE_clBLAS
    #define dA(i_, j_)  dA, ((i_) + (j_)*ldda + dA_offset)
    #else
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda)
    #endif

    /* Constants */
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    const double d_one     =  1.0;
    const double d_neg_one = -1.0;

    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    bool upper = (uplo == MagmaUpper);

    magma_int_t j, jb, nb;
    magmaDoubleComplex *work;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zpotrf_nb( n );

    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, nb*nb )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    if (nb <= 1 || nb >= n) {
        /* Use unblocked code. */
        magma_zgetmatrix( n, n, dA(0,0), ldda, work, n, queues[0] );
        lapackf77_zpotrf( uplo_, &n, work, &n, info );
        magma_zsetmatrix( n, n, work, n, dA(0,0), ldda, queues[0] );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            //=========================================================
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                // apply all previous updates to diagonal block,
                // then transfer it to CPU
                jb = min( nb, n-j );
                magma_zherk( MagmaUpper, MagmaConjTrans, jb, j,
                             d_neg_one, dA(0, j), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                magma_queue_sync( queues[1] );
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, queues[0] );

                // apply all previous updates to block row right of diagonal block
                if (j+jb < n) {
                    magma_zgemm( MagmaConjTrans, MagmaNoTrans,
                                 jb, n-j-jb, j,
                                 c_neg_one, dA(0, j   ), ldda,
                                            dA(0, j+jb), ldda,
                                 c_one,     dA(j, j+jb), ldda, queues[1] );
                }

                // simultaneous with above zgemm, transfer diagonal block,
                // factor it on CPU, and test for positive definiteness
                magma_queue_sync( queues[0] );
                lapackf77_zpotrf( MagmaUpperStr, &jb, work, &jb, info );
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, queues[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                // apply diagonal block to block row right of diagonal block
                if (j+jb < n) {
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, n-j-jb,
                                 c_one, dA(j, j),    ldda,
                                        dA(j, j+jb), ldda, queues[1] );
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                // apply all previous updates to diagonal block,
                // then transfer it to CPU
                jb = min( nb, n-j );
                magma_zherk( MagmaLower, MagmaNoTrans, jb, j,
                             d_neg_one, dA(j, 0), ldda,
                             d_one,     dA(j, j), ldda, queues[1] );
                magma_queue_sync( queues[1] );
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, queues[0] );

                // apply all previous updates to block column below diagonal block
                if (j+jb < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 n-j-jb, jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda, queues[1] );
                }

                // simultaneous with above zgemm, transfer diagonal block,
                // factor it on CPU, and test for positive definiteness
                magma_queue_sync( queues[0] );
                lapackf77_zpotrf( MagmaLowerStr, &jb, work, &jb, info );
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, queues[1] );
                if (*info != 0) {
                    *info = *info + j;
                    break;
                }

                // apply diagonal block to block column below diagonal
                if (j+jb < n) {
                    magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                 n-j-jb, jb,
                                 c_one, dA(j,    j), ldda,
                                        dA(j+jb, j), ldda, queues[1] );
                }
            }
        }
    }

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );

    magma_free_pinned( work );

    return *info;
} /* magma_zpotrf_gpu */
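// ----------------------------------------------------------------------
// Hedged usage sketch (hypothetical caller): Cholesky-factor a Hermitian
// positive definite matrix on the GPU, then solve A X = B with the
// companion routine magma_zpotrs_gpu.
static magma_int_t example_posv(
    magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_ptr dB, magma_int_t lddb )
{
    magma_int_t info = 0;
    magma_zpotrf_gpu( MagmaLower, n, dA, ldda, &info );
    if (info == 0)
        magma_zpotrs_gpu( MagmaLower, n, nrhs, dA, ldda, dB, lddb, &info );
    return info;   // info > 0: leading minor not positive definite
}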
/**
    Purpose
    -------
    DORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                            SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:   Q * C               C * Q
    TRANS = MagmaTrans:     Q**T * C            C * Q**T
    @endverbatim

    where Q is a real orthogonal matrix defined as the product of k
    elementary reflectors

        Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF.
    Q is of order M if SIDE = MagmaLeft
    and  of order N if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    side    magma_side_t
      -     = MagmaLeft:  apply Q or Q**T from the Left;
      -     = MagmaRight: apply Q or Q**T from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans: No transpose, apply Q;
      -     = MagmaTrans:   Transpose, apply Q**T.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       DOUBLE PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQRF in the first k columns of its array argument A.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.
            If SIDE = MagmaLeft,  LDA >= max(1,M);
            if SIDE = MagmaRight, LDA >= max(1,N).

    @param[in]
    tau     DOUBLE PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF.

    @param[in,out]
    C       DOUBLE PRECISION array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance
            LWORK >= N*NB if SIDE = MagmaLeft, and
            LWORK >= M*NB if SIDE = MagmaRight,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.
    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dormqr_m(
    magma_int_t ngpu,
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *A,    magma_int_t lda,
    double *tau,
    double *C,    magma_int_t ldc,
    double *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i, j) (A + (j)*lda  + (i))
    #define  C(i, j) (C + (j)*ldc  + (i))

    #define    dC(gpui,      i, j) (dw[gpui] + (j)*lddc + (i))
    #define  dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac)
    #define  dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar)
    #define    dT(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb))
    #define dwork(gpui, ind)       (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb))

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    const char* side_  = lapack_side_const( side );
    const char* trans_ = lapack_trans_const( trans );

    // TODO fix memory leak (alloc after argument checks)
    magma_int_t nb = 128;
    double *T;
    magma_dmalloc_pinned(&T, nb*nb);
    //printf("calling dormqr_m with nb=%d\n", (int) nb);

    double* dw[MagmaMaxGPUs];
    magma_queue_t stream [MagmaMaxGPUs][2];
    magma_event_t  event [MagmaMaxGPUs][2];

    magma_int_t ind_c;
    magma_device_t igpu;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );

    *info = 0;

    magma_int_t left   = (side == MagmaLeft);
    magma_int_t notran = (trans == MagmaNoTrans);
    magma_int_t lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    magma_int_t nq, nw;
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,nq)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    magma_int_t lwkopt = max(1,nw) * nb;
    if (*info == 0) {
        work[0] = MAGMA_D_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = c_one;
        return *info;
    }

    if (nb >= k) {
        /* Use CPU code */
        lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, tau,
                         C, &ldc, work, &lwork, info);
        return *info;
    }

    magma_int_t lddc = (m+63)/64*64;
    magma_int_t lddac = nq;
    magma_int_t lddar = nb;
    magma_int_t lddwork = nw;

    magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 };

    magma_int_t nb_l=256;
    magma_int_t nbl = (n-1)/nb_l+1; // number of blocks
    magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l;

    ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data.
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_dlaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_dsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(igpu, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_dgetmatrix_async( m, kb, // dC(igpu, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_dlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, T, ib, dT, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dormqr */
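/*
    Editorial usage sketch, not part of the MAGMA sources: a minimal call
    sequence for magma_dormqr_m, applying Q**T from a previous dgeqrf
    factorization to the left of C on ngpu GPUs. The workspace query
    (lwork = -1) follows the lquery path in the routine above; the
    example_* wrapper name is hypothetical and the matrix contents are
    placeholders.
*/
void example_dormqr_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t k,
                       double *A, magma_int_t lda, double *tau,
                       double *C, magma_int_t ldc )
{
    magma_int_t info;
    double query;
    // workspace query: the optimal lwork (max(1,n)*nb) is returned in work[0]
    magma_dormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_D_REAL( query );
    double *work;
    magma_dmalloc_cpu( &work, lwork );
    // C := Q**T * C; only side = MagmaLeft is implemented, per the code above
    magma_dormqr_m( ngpu, MagmaLeft, MagmaTrans, m, n, k,
                    A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_cpu( work );
}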
/** Purpose ------- SSYTRD reduces a real symmetric matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**T * A * Q = T. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] num_streams INTEGER The number of GPU streams used for update. 10 >= num_streams > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the diagonal and first subdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] d REAL array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i).
The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_ssyev_comp ********************************************************************/ extern "C" magma_int_t magma_ssytrd_mgpu( magma_int_t num_gpus, magma_int_t num_streams, magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, float *d, float *e, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(id, i, j) (dA[(id)] + (j)*ldda + (i)) #define dW(id, i, j) (dwork[(id)] + (j)*ldda + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ln, ldda; magma_int_t nb = magma_get_ssytrd_nb(n), ib; float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float d_one = MAGMA_D_ONE; //float mv_time = 0.0; #ifdef PROFILE_SY2RK float up_time = 0.0; #endif magma_int_t kk, nx; magma_int_t i = 0, ii, iii, j, did, i_n; magma_int_t iinfo; magma_int_t ldwork, lddwork, lwkopt, ldwork2; magma_int_t lquery; magma_queue_t stream[MagmaMaxGPUs][10]; float *dx[MagmaMaxGPUs], *dy[MagmaMaxGPUs], *hwork; float *dwork2[MagmaMaxGPUs]; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } else if ( num_streams > 2 ) { *info = 2; // TODO fix } /* Determine the block size. */ ldwork = lddwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); float *dA[MagmaMaxGPUs]; float *dwork[MagmaMaxGPUs]; float times[11]; for( did=0; did < 11; did++ ) times[did] = 0; //#define PROFILE_SY2RK #ifdef PROFILE_SY2RK magma_event_t start, stop; float etime; magma_setdevice(0); magma_event_create( &start ); magma_event_create( &stop ); #endif ldda = lda; ln = ((nb*(1+n/(nb*num_gpus))+31)/32)*32; ldwork2 = (1+ n / nb + (n % nb != 0)) * ldda; for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); // TODO fix memory leak if ( MAGMA_SUCCESS != magma_smalloc(&dA[did], ln*ldda+3*lddwork*nb) || MAGMA_SUCCESS != magma_smalloc(&dx[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dy[did], num_streams*n) || MAGMA_SUCCESS != magma_smalloc(&dwork2[did], ldwork2 ) ) { for( i=0; i < did; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[did] = dA[did] + ln*ldda; for( kk=0; kk < num_streams; kk++ ) magma_queue_create(&stream[did][kk]); } magma_setdevice(0); // TODO fix memory leak dwork2 if ( MAGMA_SUCCESS != magma_smalloc_pinned( &hwork, num_streams*num_gpus*n ) ) { for( i=0; i < num_gpus; i++ ) { magma_setdevice(i); magma_free(dA[i]); magma_free(dx[i]); magma_free(dy[i]); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (n < 2048) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the 
upper triangle of A. Columns 1:kk are handled by the unblocked method. */ for (i = nb*((n-1)/nb); i >= nx; i -= nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* wait for the next panel */ if (i != nb*((n-1)/nb)) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } magma_slatrd_mgpu(num_gpus, uplo, n, i+ib, ib, nb, A(0, 0), lda, e, tau, work, ldwork, dA, ldda, 0, dwork, i+ib, dwork2, ldwork2, 1, dx, dy, hwork, stream, times); magma_ssyr2k_mgpu(num_gpus, MagmaUpper, MagmaNoTrans, nb, i, ib, c_neg_one, dwork, i+ib, 0, d_one, dA, ldda, 0, num_streams, stream); /* get the next panel */ if (i-nb >= nx ) { ib = min(nb, n-(i-nb)); ii = nb*((i-nb)/(nb*num_gpus)); did = ((i-nb)/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( (i-nb)+ib, ib, dA(did, 0, ii), ldda, A(0, i-nb), lda, stream[did][0] ); } /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j > 0 ) { *A(j-1,j) = MAGMA_S_MAKE( e[j - 1], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* end of for i=... */ if ( nx > 0 ) { if (1 <= n-nx) { /* else A is already on CPU */ for (i=0; i < nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( nx, ib, dA(did, 0, ii), ldda, A(0, i), lda, stream[did][0] ); } } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } /* Use unblocked code to reduce the last or only block */ lapackf77_ssytd2(uplo_, &nx, A(0, 0), &lda, d, e, tau, &iinfo); } } else { trace_init( 1, num_gpus, num_streams, (CUstream_st**)stream ); /* Copy the matrix to the GPU */ if (1 <= n-nx) { magma_shtodhe(num_gpus, uplo, n, nb, A, lda, dA, ldda, stream, &iinfo ); } /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; /* Reduce columns i:i+ib-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i != 0) { magma_setdevice(did); trace_gpu_start( did, 0, "comm", "get" ); magma_sgetmatrix_async( n-i, ib, dA(did, i, ii), ldda, A(i,i), lda, stream[did][0] ); trace_gpu_end( did, 0 ); magma_queue_sync(stream[did][0]); magma_setdevice(0); } magma_slatrd_mgpu(num_gpus, uplo, n, n-i, ib, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA, ldda, i, dwork, (n-i), dwork2, ldwork2, 1, dx, dy, hwork, stream, times ); #ifdef PROFILE_SY2RK magma_setdevice(0); if ( i > 0 ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_record(start, 0); #endif magma_ssyr2k_mgpu(num_gpus, MagmaLower, MagmaNoTrans, nb, n-i-ib, ib, c_neg_one, dwork, n-i, ib, d_one, dA, ldda, i+ib, num_streams, stream); #ifdef PROFILE_SY2RK magma_setdevice(0); magma_event_record(stop, 0); #endif /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+ib; ++j) { if ( j+1 < n ) { *A(j+1,j) = MAGMA_S_MAKE( e[j], 0 ); } d[j] = MAGMA_S_REAL( *A(j, j) ); } } /* for i=... 
*/ /* Use unblocked code to reduce the last or only block */ if ( i < n ) { iii = i; i_n = n-i; if ( i > 0 ) { for (; i < n; i += nb) { ib = min(nb, n-i); ii = nb*(i/(nb*num_gpus)); did = (i/nb)%num_gpus; magma_setdevice(did); magma_sgetmatrix_async( i_n, ib, dA(did, iii, ii), ldda, A(iii, i), lda, stream[did][0] ); } for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); magma_queue_sync(stream[did][0]); } } lapackf77_ssytrd(uplo_, &i_n, A(iii, iii), &lda, &d[iii], &e[iii], &tau[iii], work, &lwork, &iinfo); } } #ifdef PROFILE_SY2RK magma_setdevice(0); if ( n > nx ) { cudaEventElapsedTime(&etime, start, stop); up_time += (etime/1000.0); } magma_event_destroy( start ); magma_event_destroy( stop ); #endif trace_finalize( "ssytrd.svg", "trace.css" ); for( did=0; did < num_gpus; did++ ) { magma_setdevice(did); for( kk=0; kk < num_streams; kk++ ) magma_queue_sync(stream[did][kk]); for( kk=0; kk < num_streams; kk++ ) magma_queue_destroy(stream[did][kk]); magma_free(dA[did]); magma_free(dx[did]); magma_free(dy[did]); magma_free(dwork2[did]); } magma_free_pinned(hwork); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); work[0] = MAGMA_S_MAKE( lwkopt, 0 ); #ifdef PROFILE_SY2RK printf( " n=%d nb=%d\n", n, nb ); printf( " Time in SLARFG: %.2e seconds\n", times[0] ); //printf( " Time in SSYMV : %.2e seconds\n", mv_time ); printf( " Time in SSYR2K: %.2e seconds\n", up_time ); #endif return *info; } /* magma_ssytrd */
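/*
    Editorial usage sketch, not part of the MAGMA sources: reducing a real
    symmetric matrix to tridiagonal form across num_gpus GPUs. The lwork
    sizing follows the LWORK >= N*NB check in the routine above; at most two
    update streams are accepted (see the num_streams > 2 check). The
    example_* wrapper name is hypothetical.
*/
void example_ssytrd_mgpu( magma_int_t num_gpus, magma_int_t n, float *A, magma_int_t lda )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_ssytrd_nb( n );
    magma_int_t lwork = n * nb;
    float *d, *e, *tau, *work;
    magma_smalloc_cpu( &d,    n     );
    magma_smalloc_cpu( &e,    n-1   );
    magma_smalloc_cpu( &tau,  n-1   );
    magma_smalloc_cpu( &work, lwork );
    // lower-triangular storage, two update streams per GPU
    magma_ssytrd_mgpu( num_gpus, 2, MagmaLower, n, A, lda,
                       d, e, tau, work, lwork, &info );
    // ... use d (diagonal) and e (off-diagonal) of T here ...
    magma_free_cpu( work );  magma_free_cpu( tau );
    magma_free_cpu( e );     magma_free_cpu( d );
}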
/** Purpose ------- CSYTRF_nopiv_gpu computes the LDLt factorization of a complex symmetric matrix A. The factorization has the form A = U^T * D * U, if UPLO = MagmaUpper, or A = L * D * L^T, if UPLO = MagmaLower, where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the LDLt factorization A = U^T D U or A = L D L^T. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, D(i,i) is exactly zero, and the factorization could not be completed. @ingroup magma_csysv_comp ******************************************************************* */ extern "C" magma_int_t magma_csytrf_nopiv_gpu( magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define A(i, j) (A) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Constants */ const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; /* Local variables */ bool upper = (uplo == MagmaUpper); magma_int_t j, k, jb, nb, ib, iinfo; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_chetrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization magma_queue_t queues[2]; magma_event_t event; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event ); trace_init( 1, 1, 2, queues ); // CPU workspace magmaFloatComplex *A; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &A, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } // GPU workspace magmaFloatComplex_ptr dW; if (MAGMA_SUCCESS != magma_cmalloc( &dW, (1+nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting.
// main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_event_sync( event ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j,j), nb, queues[1] ); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync( queues[1] ); trace_cpu_start( 0, "potrf", "potrf" ); magma_csytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), nb, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_csetmatrix_async( jb, jb, A(j, j), nb, dA(j, j), ldda, queues[0] ); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaUnit, jb, (n-j-jb), c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0] ); magma_ccopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] ); // update the trailing submatrix with D magmablas_clascl_diag( MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0], &iinfo ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_cgemm( MagmaTrans, MagmaNoTrans, kb, n-k, jb, c_neg_one, dWt(0, k), nb, dA(j, k), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) magma_event_record( event, queues[0] ); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_event_sync( event ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j,j), nb, queues[1] ); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync( queues[1] ); trace_cpu_start( 0, "potrf", "potrf" ); magma_csytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), nb, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_csetmatrix_async( jb, jb, A(j, j), nb, dA(j, j), ldda, queues[0] ); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0] ); magma_ccopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] ); // update the trailing submatrix with D magmablas_clascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0], &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_cgemm( MagmaNoTrans, MagmaTrans, n-k, kb, jb, c_neg_one, dA(k, j), ldda, dW(k, 0), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) magma_event_record( event, queues[0] ); } trace_gpu_end( 0, 0 ); } } } trace_finalize( "chetrf.svg","trace.css" ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_event_destroy( event ); magma_free( dW ); magma_free_pinned( A ); return *info; } /* magma_csytrf_nopiv */
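/*
    Editorial usage sketch, not part of the MAGMA sources: LDLt factorization
    of a complex symmetric matrix that already resides on the GPU. dA is an
    ldda-by-n device array with ldda >= n; the factor and the diagonal D
    overwrite the requested triangle. The example_* wrapper name is
    hypothetical.
*/
magma_int_t example_csytrf_nopiv_gpu( magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda )
{
    magma_int_t info;
    magma_csytrf_nopiv_gpu( MagmaLower, n, dA, ldda, &info );
    if (info != 0)
        fprintf( stderr, "csytrf_nopiv_gpu returned info = %d\n", (int) info );
    return info;   // info > 0 flags a zero pivot in D
}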
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = MagmaUpper, or dA = L * L**T, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA DOUBLE_PRECISION array of pointers on the GPU, dimension (ngpu) On entry, the symmetric matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalesced memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); double *work; int upper = (uplo == MagmaUpper); double *dwork[MagmaMaxGPUs]; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_dpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code.
*/ magma_setdevice(0); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], ldda, work, n ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix( n, n, work, n, d_lA[0], ldda ); magma_free_pinned( work ); } else { lddp = nb*((n+nb-1)/nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); } magma_setdevice(0); h = 1; //ngpu; //(n+nb-1)/nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three streams */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, stream, event, info); } else { /* with three streams */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, stream, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( stream[d][j] ); magma_queue_destroy( stream[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_dpotrf_mgpu */
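/*
    Editorial sketch, not part of the MAGMA sources, of the 1-D block column
    cyclic layout magma_dpotrf_mgpu expects for UPLO = MagmaUpper: block
    column j (width nb) lives on GPU (j/nb) % ngpu. The local-width formula
    mirrors the n_local[] computation in magma_cgeqrf2_mgpu below; error
    handling is elided and the example_* wrapper name is hypothetical.
*/
void example_dpotrf_mgpu( magma_int_t ngpu, magma_int_t n, double *A, magma_int_t lda )
{
    magma_int_t info;
    magma_int_t nb   = magma_get_dpotrf_nb( n );
    magma_int_t ldda = magma_roundup( n, 16 );   // LDDA divisible by 16, per the docs
    double *d_lA[ MagmaMaxGPUs ];
    magma_int_t nlocal[ MagmaMaxGPUs ];
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        // local number of columns owned by GPU dev, last block possibly partial
        nlocal[dev] = ((n/nb)/ngpu)*nb;
        if      (dev <  (n/nb) % ngpu) nlocal[dev] += nb;
        else if (dev == (n/nb) % ngpu) nlocal[dev] += n % nb;
        magma_setdevice( dev );
        magma_dmalloc( &d_lA[dev], ldda*max(1, nlocal[dev]) );
    }
    // distribute: block column j goes to GPU (j/nb) % ngpu, local column (j/(nb*ngpu))*nb
    for( magma_int_t j = 0; j < n; j += nb ) {
        magma_int_t dev  = (j/nb) % ngpu;
        magma_int_t jloc = (j/(nb*ngpu))*nb;
        magma_int_t jb   = min( nb, n-j );
        magma_setdevice( dev );
        magma_dsetmatrix( n, jb, A + j*lda, lda, d_lA[dev] + jloc*ldda, ldda );
    }
    magma_dpotrf_mgpu( ngpu, MagmaUpper, n, d_lA, ldda, &info );
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        magma_free( d_lA[dev] );
    }
}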
/** Purpose ------- CGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the K-th column of A. @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors. @param[out] dwork (workspace) COMPLEX array on the GPU, dimension (MAX(1,LWORK)) @param[in] lwork INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. \n Note: unlike the CPU interface of this routine, the GPU interface does not support a workspace query. @param rwork (workspace, for [cz]geqp3 only) REAL array on the GPU, dimension (2*N) @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i).
@ingroup magma_cgeqp3_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqp3_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaFloatComplex *tau, magmaFloatComplex_ptr dwork, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define dA(i_, j_) (dA + (i_) + (j_)*ldda) const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magma_int_t ione = 1; //magma_int_t na; magma_int_t n_j; magma_int_t j, jb, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, lwkopt; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } nb = magma_get_cgeqp3_nb( m, n ); minmn = min(m,n); if (*info == 0) { if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #ifdef REAL lwkopt += 2*n; #endif } if (lwork < lwkopt) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (minmn == 0) return *info; #ifdef REAL float *rwork = dwork + (n + 1)*nb; #endif magmaFloatComplex_ptr df; if (MAGMA_SUCCESS != magma_cmalloc( &df, (n+1)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloat_ptr dlsticcs; if (MAGMA_SUCCESS != magma_smalloc( &dlsticcs, 1+256*(n+255)/256 )) { magma_free( df ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magmablas_claset( MagmaFull, n+1, nb, c_zero, c_zero, df, n+1, queue ); nfxd = 0; /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_cswap(&m, dA(0, j), &ione, dA(0, nfxd), &ione); // TODO: ERROR, matrix not on CPU! jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* // TODO: Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. if (nfxd > 0) { na = min(m,nfxd); lapackf77_cgeqrf(&m, &na, dA, &ldda, tau, dwork, &lwork, info); if (na < n) { n_j = n - na; lapackf77_cunmqr( MagmaLeftStr, MagmaConjTransStr, &m, &n_j, &na, dA, &ldda, tau, dA(0, na), &ldda, dwork, &lwork, info ); } }*/ /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; //sminmn = minmn - nfxd; /* Initialize partial column norms. */ magmablas_scnrm2_cols( sm, sn, dA(nfxd,nfxd), ldda, &rwork[nfxd], queue ); magma_scopymatrix( sn, 1, &rwork[nfxd], sn, &rwork[n+nfxd], sn, queue ); j = nfxd; //if (nb < sminmn) { /* Use blocked code initially. */ /* Compute factorization: while loop. */ topbmn = minmn; // - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; //magma_claqps_gpu // this is a cpp-file magma_claqps2_gpu // this is a cuda-file ( m, n_j, j, jb, &fjb, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], dwork, &df[jb], n_j, dlsticcs, queue ); j += fjb; /* fjb is actual number of columns factored */ } } /* // Use unblocked code to factor the last or only block. if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_cgetmatrix( m-j, n_j, dA(j,j), ldda, A(j,j), lda, queue ); } lapackf77_claqp2(&m, &n_j, &j, dA(0, j), &ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], dwork ); }*/ } magma_queue_destroy( queue ); magma_free( df ); magma_free( dlsticcs ); return *info; } /* magma_cgeqp3_gpu */
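/*
    Editorial usage sketch, not part of the MAGMA sources: QR with column
    pivoting on the GPU. jpvt is zeroed so every column is free; lwork
    follows the (N+1)*NB rule checked above. rwork is allocated in device
    memory since the routine passes it to magmablas_scnrm2_cols, and tau is
    kept on the CPU per the argument description; both placements are this
    sketch's reading of the code. The example_* wrapper name is hypothetical.
*/
void example_cgeqp3_gpu( magma_int_t m, magma_int_t n,
                         magmaFloatComplex_ptr dA, magma_int_t ldda )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_cgeqp3_nb( m, n );
    magma_int_t lwork = (n+1)*nb;
    magma_int_t *jpvt;
    magmaFloatComplex *tau;
    magmaFloatComplex_ptr dwork;
    float *rwork;
    magma_imalloc_cpu( &jpvt, n );
    magma_cmalloc_cpu( &tau, min(m,n) );
    magma_cmalloc( &dwork, lwork );
    magma_smalloc( &rwork, 2*n );
    for( magma_int_t j = 0; j < n; ++j )
        jpvt[j] = 0;   // all columns free
    magma_cgeqp3_gpu( m, n, dA, ldda, jpvt, tau, dwork, lwork, rwork, &info );
    // on exit, jpvt holds the permutation (1-based) and tau the reflector scalars
    magma_free( rwork );    magma_free( dwork );
    magma_free_cpu( tau );  magma_free_cpu( jpvt );
}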
extern "C" magma_int_t magma_cgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloatComplex **dlA, magma_int_t ldda, magmaFloatComplex *tau, magma_int_t *info ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CGEQRF2_MGPU computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the unitary matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalesced memory accesses LDDA must be divisible by 16. TAU (output) COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaFloatComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaFloatComplex *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; magma_queue_t cqueue; magmablasGetKernelStream( &cqueue ); magma_device_t cdevice; magma_getdevice( &cdevice ); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; nb = magma_get_cgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and clarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think clarfb work could be smaller, max(n_local[:]). * Oddly, T and clarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_cmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for cgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total.
*/ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j==i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_cgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_cgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); cpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_csetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if cpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ cq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
*/ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_csetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_csetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_cgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for cgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_cgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_csetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. // check that queues and events are non-zero before destroying them, though. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( stream[dev][0] ) { magma_queue_destroy( stream[dev][0] ); } if ( stream[dev][1] ) { magma_queue_destroy( stream[dev][1] ); } if ( panel_event[dev] ) { magma_event_destroy( panel_event[dev] ); } magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( cdevice ); magmablasSetKernelStream( cqueue ); return *info; } /* magma_cgeqrf2_mgpu */
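/*
    Editorial usage sketch, not part of the MAGMA sources: multi-GPU QR of a
    matrix already distributed in the 1-D block column cyclic layout indexed
    by the dlA(dev,i,j) macro above, with nb = magma_get_cgeqrf_nb(m) and
    block column j on GPU (j/nb) % num_gpus. Distribution of the matrix is
    elided; the example_* wrapper name is hypothetical.
*/
void example_cgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                           magmaFloatComplex **dlA, magma_int_t ldda )
{
    magma_int_t info;
    magmaFloatComplex *tau;
    magma_cmalloc_cpu( &tau, min(m,n) );
    magma_cgeqrf2_mgpu( num_gpus, m, n, dlA, ldda, tau, &info );
    if (info != 0)
        fprintf( stderr, "cgeqrf2_mgpu returned info = %d\n", (int) info );
    // ... R is in the upper triangles of dlA; the reflectors and tau define Q ...
    magma_free_cpu( tau );
}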
/** Purpose ------- SSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U^T * D * U, if UPLO = MagmaUpper, or A = L * D * L^T, if UPLO = MagmaLower, where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the LDLt factorization A = U^T D U or A = L D L^T. \n Higher performance is achieved if A is in pinned memory. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, D(i,i) is exactly zero, and the factorization could not be completed. @ingroup magma_ssysv_comp ******************************************************************* */ extern "C" magma_int_t magma_ssytrf_nopiv( magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A +(j)*lda + (i)) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Constants */ const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; /* Local variables */ bool upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaFloat_ptr dA; magmaFloat_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = magma_roundup( n, 32 ); nb = magma_get_ssytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_smalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_smalloc(&dW, nb*ldda))) { /* alloc failed: return an error (no fallback to a non-GPU-resident version here) */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_device_t cdev; magma_queue_t queues[2]; magma_event_t event; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event ); trace_init( 1, 1, 2, queues ); /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting.
// copy matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if ( j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0] ); magma_scopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0], &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, c_neg_one, dWt(0, k), nb, dA(j, k), ldda, c_one, dA(k, k), ldda, queues[0]); if (k == j+jb) { // magma_event_record( event, queues[0] ); magma_queue_sync( queues[0] ); } } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. 
// copy the matrix to GPU for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); if (j != 0) { //magma_event_sync(event); magma_sgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, queues[1]); } trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(queues[1]); trace_cpu_start( 0, "potrf", "potrf" ); magma_ssytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), lda, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_ssetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU trace_gpu_start( 0, 1, "get", "get" ); magma_sgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[1]); trace_gpu_end( 0, 1 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0] ); magma_scopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] ); // update the trailing submatrix with D magmablas_slascl_diag( MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0], &iinfo ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, c_neg_one, dA(k, j), ldda, dW(k, 0), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) { //magma_event_record( event, queues[0] ); magma_queue_sync(queues[0]); } } trace_gpu_end( 0, 0 ); } } } trace_finalize( "ssytrf.svg","trace.css" ); magma_queue_destroy(queues[0]); magma_queue_destroy(queues[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_ssytrf_nopiv */
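/*
    Editorial usage sketch, not part of the MAGMA sources: CPU-interface LDLt
    factorization without pivoting. A is an lda-by-n host array (pinned
    memory gives faster transfers, per the docs); the routine allocates and
    frees its own GPU workspace. The example_* wrapper name is hypothetical.
*/
magma_int_t example_ssytrf_nopiv( magma_int_t n, float *A, magma_int_t lda )
{
    magma_int_t info;
    magma_ssytrf_nopiv( MagmaLower, n, A, lda, &info );
    return info;   // 0 on success; info > 0 flags a zero pivot in D
}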
/** Purpose ------- ZUNGQR generates an M-by-N COMPLEX_16 matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by ZGEQRF. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by ZGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau COMPLEX_16 array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by ZGEQRF_GPU. @param[in] T COMPLEX_16 array, dimension (NB, min(M,N)). T contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_zgeqrf_gpu (except stored on the CPU, not the GPU). @param[in] nb INTEGER This is the block size used in ZGEQRF_GPU, and correspondingly the size of the T matrices used in the factorization and stored in T. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zungqr_m( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *T, magma_int_t nb, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(d,i,j) (dA[d] + (i) + (j)*ldda) #define dT(d,i,j) (dT[d] + (i) + (j)*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldwork; magma_int_t i, ib, ki, kk, iinfo; magmaDoubleComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_int_t di, dn; magma_int_t dpanel; magma_int_t ngpu = magma_num_gpus(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // Allocate memory on GPUs for A and workspaces magma_int_t ldda = ((m + 31) / 32) * 32; magma_int_t lddwork = ((n + 31) / 32) * 32; magma_int_t min_lblocks = (n / nb) / ngpu; // min.
blocks per gpu magma_int_t last_dev = (n / nb) % ngpu; // device with last block magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magmaDoubleComplex *dA[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dT[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dV[ MagmaMaxGPUs ] = { NULL }; magmaDoubleComplex *dW[ MagmaMaxGPUs ] = { NULL }; magma_queue_t stream[ MagmaMaxGPUs ] = { NULL }; for( int d = 0; d < ngpu; ++d ) { // example with n = 75, nb = 10, ngpu = 3 // min_lblocks = 2 // last_dev = 1 // gpu 0: 2 blocks, cols: 0- 9, 30-39, 60-69 // gpu 1: 1+ blocks, cols: 10-19, 40-49, 70-74 (partial) // gpu 2: 1 block, cols: 20-29, 50-59 magma_setdevice( d ); nlocal[d] = min_lblocks*nb; if ( d < last_dev ) { nlocal[d] += nb; } else if ( d == last_dev ) { nlocal[d] += (n % nb); } ldwork = nlocal[d]*ldda // dA + nb*m // dT + nb*ldda // dV + nb*lddwork; // dW if ( MAGMA_SUCCESS != magma_zmalloc( &dA[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } dT[d] = dA[d] + nlocal[d]*ldda; dV[d] = dT[d] + nb*m; dW[d] = dV[d] + nb*ldda; magma_queue_create( &stream[d] ); } trace_init( 1, ngpu, 1, stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for zungqr workspace lwork = n * nb; magma_zmalloc_cpu( &work, lwork ); if (work == NULL) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } // Use unblocked code for the last or only block. if (kk < n) { trace_cpu_start( 0, "ungqr", "ungqr last block" ); m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; dpanel = (kk / nb) % ngpu; di = ((kk / nb) / ngpu) * nb; magma_setdevice( dpanel ); lapackf77_zungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); magma_zsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(dpanel, kk, di), ldda ); // Set A(1:kk,kk+1:n) to zero. 
magmablas_zlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(dpanel, 0, di), ldda ); trace_cpu_end( 0 ); } if (kk > 0) { // Use blocked code // send T to all GPUs for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set T" ); magma_zsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, stream[d] ); trace_gpu_end( d, 0 ); } // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation for( i = ki; i >= 0; i -= nb ) { ib = min(nb, k - i); mi = m - i; dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; // Send current panel to the GPUs lapackf77_zlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); trace_gpu_start( d, 0, "set", "set V" ); magma_zsetmatrix_async( mi, ib, A(i, i), lda, dV[d], ldda, stream[d] ); trace_gpu_end( d, 0 ); } // set panel to identity magma_setdevice( dpanel ); magmablasSetKernelStream( stream[dpanel] ); trace_gpu_start( dpanel, 0, "laset", "laset" ); magmablas_zlaset( MagmaFull, i, ib, c_zero, c_zero, dA(dpanel, 0, di), ldda ); magmablas_zlaset( MagmaFull, mi, ib, c_zero, c_one, dA(dpanel, i, di), ldda ); trace_gpu_end( dpanel, 0 ); if (i < n) { // Apply H to A(i:m,i:n) from the left for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( stream[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn ); trace_gpu_start( d, 0, "larfb", "larfb" ); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, dn-di, ib, dV[d], ldda, dT(d,0,i), nb, dA(d, i, di), ldda, dW[d], lddwork ); trace_gpu_end( d, 0 ); } } } } // copy result back to CPU trace_cpu_start( 0, "get", "get A" ); magma_zgetmatrix_1D_col_bcyclic( m, n, dA, ldda, A, lda, ngpu, nb ); trace_cpu_end( 0 ); #ifdef TRACING char name[80]; snprintf( name, sizeof(name), "zungqr-n%d-ngpu%d.svg", m, ngpu ); trace_finalize( name, "trace.css" ); #endif CLEANUP: for( int d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( dA[d] ); magma_queue_destroy( stream[d] ); } magma_free_cpu( work ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zungqr */
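/*
    Editorial usage sketch, not part of the MAGMA sources: forming the
    explicit Q factor on however many GPUs magma_num_gpus() reports. A, tau,
    and the block-reflector factors T must come from a blocked QR with block
    size nb; per the docs above, T can be the 6th argument of
    magma_zgeqrf_gpu copied back to the CPU, and obtaining it that way is an
    assumption of this sketch. The example_* wrapper name is hypothetical.
*/
void example_zungqr_m( magma_int_t m, magma_int_t n, magma_int_t k,
                       magmaDoubleComplex *A, magma_int_t lda,
                       magmaDoubleComplex *tau,
                       magmaDoubleComplex *T, magma_int_t nb )
{
    magma_int_t info;
    // overwrites the reflectors in A(:,1:n) with the first n columns of Q
    magma_zungqr_m( m, n, k, A, lda, tau, T, nb, &info );
    if (info != 0)
        fprintf( stderr, "zungqr_m returned info = %d\n", (int) info );
}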
void magmablas_ssymm_mgpu_com( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, float alpha, float *dA[], magma_int_t ldda, magma_int_t offset, float *dB[], magma_int_t lddb, float beta, float *dC[], magma_int_t lddc, float *dwork[], magma_int_t dworksiz, float *C, magma_int_t ldc, float *work[], magma_int_t worksiz, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) //printf("####################################################\n"); //printf(" start ssymm \n"); //printf("####################################################\n"); if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nstream >= ngpu ); assert( nbevents >= ngpu*ngpu ); float c_one = MAGMA_S_ONE; float *dwork1[MagmaMaxGPUs]; float *dwork2[MagmaMaxGPUs]; magma_int_t maxgsize = n*m; magma_int_t lddwork = lddc; magma_int_t ldwork = m; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork1[dev] = dwork[dev]; // size of dwork1 is n*lddwork dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu } assert( dworksiz >= (n*lddwork+maxgsize*ngpu) ); assert( worksiz >= (n*ldwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gmaster; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(float) ); // put all dC on all dev to 0 except the one which // hold i==0 because this one has to multiply by beta. if(dev!=stdev){ cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(float) ); } } magma_int_t newoffset = offset; // 1. 
symmetrize if(blockoffset>0){ newoffset = offset+fstblksiz; // newoffset is adjusted over nb magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0); //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz); magma_setdevice( stdev ); magmablasSetKernelStream( streams[ stdev ][ 0 ] ); magmablas_ssymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb ); } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t newstdev = (newoffset/nb)%ngpu; magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 ); magma_int_t devperm = (dev-newstdev+ngpu)%ngpu; magma_int_t nbblkoffst = newoffset/nb; magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk); magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magmablas_ssymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb ); if(remm%nb>0){ magma_int_t nblstblks = (nbblk+1)%ngpu; magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu; //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb); if(devperm==devlstblk) magmablas_ssymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda ); // last partial tile } } /* magma_int_t siz = m+offset; float *R; magma_smalloc_cpu( &R, siz*siz ); // collecte back A magmablas_sgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb ); magma_setdevice( 0 ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); //magma_sgetmatrix( siz, siz, dA[0], ldda, R, siz ); FILE *trace_file; trace_file = fopen("AJETE/Aafter", "w"); for (int j = 0; j < siz ; j++) for (int i = 0; i < siz ; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]); fclose(trace_file); return; */ // ROW GEMM transpose a row and make a gemm with a block // if only 1 GPU used the ROW GEMM is integrated with the // COL GEMM (better accuracy observed) and better perf if(ngpu>1){ for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix //magma_int_t dev = (ioff / nb) % ngpu; magma_int_t nbblkoffst = offset/nb; magma_int_t nbblk = magma_ceildiv(i, nb); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ? 
1:0 ); magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); magma_int_t myrowsize = myblk * nb; magma_int_t coloffset = myblkoffst*nb; if(dev==stdev) { myrowsize = myrowsize - blockoffset; coloffset = myblkoffst*nb+blockoffset; } //printf("ROW GEMM: i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib, alpha, dA(dev,ioff,coloffset), ldda, dB(dev,i,0), lddb, c_one, dwork(dev,0,0), lddwork ); } } } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_event_record(redevents[dev][1], streams[dev][1]); } } // COL GEMM // blockoffset is offset within first block; for subsequent blocks it is 0 if(blockoffset>0){ magma_int_t ib = min( nb-blockoffset, m ); // block size magma_int_t iblock = (offset / nb) / ngpu; // local block id magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix magma_setdevice( stdev ); magmablasSetKernelStream( streams[ stdev ][ 0 ] ); //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib, alpha, dA(stdev,offset,di), ldda, dB(stdev,0,0), lddb, beta, dC(stdev,0,0), lddc ); } // COL GEMM for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix magma_int_t iblock = (ioff / nb) / ngpu; // local block id magma_int_t dev = (ioff / nb) % ngpu; magma_int_t di = iblock*nb; // local index in parent matrix //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib); magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); if(i==0){ magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, beta, dC(dev,i,0), lddc ); }else{ magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, c_one, dC(dev,i,0), lddc ); } magma_event_record(redevents[dev][0], streams[dev][0]); // if only 1 GPU is used, do the ROW GEMM if(ngpu==1){ // NOTE: because the COL GEMM writes dC from the diagonal (i) down // and the ROW GEMM writes dC from 0 to diag-1, they could in principle // run in parallel on different streams. // // They must not, however, because // the col might finish i and start i+1 while the row is still at i. // magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_sgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib, alpha, dA(dev,ioff,offset), ldda, dB(dev,i,0), lddb, c_one, dC(dev,0,0), lddc ); } } if(ngpu>1){ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t nbblkrow = nbblk-1; magma_int_t devperm = (dev-stdev+ngpu)%ngpu; magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ?
1:0 ); magma_int_t myrowsize = myblk * nb; if(dev==stdev) { myrowsize = myrowsize - blockoffset; } //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][1]); //magma_queue_sync( streams[ dev ][ 1 ] ); // for each dev, add the computed ROW block at its placement in dC for( magma_int_t blki = 0; blki < myblk; ++blki){ magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset; magma_int_t lcblki = blki*nb; magma_int_t ib = nb;// min(nb, m-gbblki); if(dev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } magmablas_sgeadd(ib, n, c_one, &dwork[dev][lcblki], lddwork, &dC[dev][gbblki] , lddc ); } magma_event_record(redevents[dev][0], streams[dev][0]); } } } // =========================================================== // COMMUNICATION ALL_REDUCE_SUM // =========================================================== if(ngpu==1){ return; } // INITIALIZE COMM for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ gpuisactive[dev] = mycolsize; if(masterdev==-1) { masterdev = dev; nbcmplxactive = nbcmplxactive + 1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //******************************* // each GPU sends its result // to its master. The master does // the addition, then sends it to // the masters of the other // complexes, receives theirs, // adds them in, and finally // broadcasts the final result // locally. //******************************* //printf("=======================================================================\n"); //printf(" sending to my master \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if this complex is active if(masterdev!=-1){ for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU; if I am not the master, send my result to my master.
// store result on dwork[masterdev][dev*maxgsize] if(dev!=masterdev){ magma_setdevice( dev ); //printf(" GPU %d sending to my master %d\n", dev, masterdev); // wait until the geadd of my ROW and COL GEMM is done magma_queue_wait_event(streams[ dev ][ 0 ], redevents[dev][0]); // send to the master of my complex magma_scopymatrix_async( m, n, &dC[dev][0], lddc, &dwork2[masterdev][maxgsize*dev], m, streams[dev][0] ); magma_event_record(redevents[dev][masterdev], streams[dev][0]); } // end I am not the masterdev }// end if mycolsize>0 }// for idev }// end of if masterdev!=-1, meaning the complex is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master does the addition of local results and broadcasts to the other masters \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if this complex is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( streams[ masterdev ][ 0 ] ); // wait until the geadd of my ROW and COL GEMM is done magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][0]); // ======================================== // local addition // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" master %d receiving from %d and adding \n", masterdev, lcdev); // this is an active GPU of my complex; // wait until I have received what it sent me, then do the addition. magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[lcdev][masterdev]); magmablas_sgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*lcdev], m, &dC[masterdev][0] , lddc ); } }// for l=1:myngpu // because the addition is done sequentially on stream 0, // I have to record this to be able to sync using it magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]); // ======================================== // // ======================================== // send to other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //the complex is active //The master has to wait until the local addition finishes, then send using the gmaster stream. //Use stream 0 to make it sequential or stream gmaster to make it parallel. //Currently both are the same.
//printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(streams[ masterdev ][ gmaster ], redevents[masterdev][masterdev]); magma_scopymatrix_async( m, n, &dC[masterdev][0], lddc, &dwork2[gmaster][maxgsize*masterdev], m, streams[masterdev][gmaster] ); magma_event_record(redevents[masterdev][gmaster], streams[masterdev][gmaster]); magma_event_record(redevents[masterdev][masterdev], streams[masterdev][gmaster]); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // ======================================== }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( streams[ masterdev ][ 0 ] ); // master has to wait until finishing all the send to other masters. magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]); // ======================================== // addition of results from other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until receiving from gmaster, then do addition using stream 0 //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[gmaster][masterdev]); magmablas_sgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*gmaster], m, &dC[masterdev][0] , lddc ); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], streams[masterdev][0]); // ======================================== // ======================================== // local broadcast of final results // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ // this is an active GPU of my real. // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now. 
// to make it parallel, use stream lcdev instead of stream 0 //printf(" master %d broadcasting local to %d \n", masterdev, lcdev); magma_queue_wait_event(streams[ masterdev ][ 0 ], redevents[masterdev][masterdev]); magma_scopymatrix_async( m, n, &dC[masterdev][0], lddc, &dC[lcdev][0], lddc, streams[masterdev][0] ); magma_event_record(redevents[masterdev][lcdev], streams[masterdev][0]); } }// for l=1:myngpu // ======================================== }// end of if masterdev!=-1, meaning the complex is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if this complex is active if(masterdev!=-1){ for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if(lccolsize>0){ magma_setdevice( lcdev ); magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[lcdev][0]); magma_queue_wait_event(streams[ lcdev ][ 0 ], redevents[masterdev][lcdev]); } }// for l=1:myngpu }// end of if masterdev!=-1, meaning the complex is active }// for cmplxid //printf("****************************************************\n"); //printf(" finish ssymm \n"); //printf("****************************************************\n"); magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
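/* Illustration (not from the MAGMA sources): the 1-D block-cyclic layout used
   throughout the routine above maps a global index to the device that owns it
   and to a local offset, mirroring the expressions dev = (ioff/nb)%ngpu and
   di = ((ioff/nb)/ngpu)*nb that appear in the code. A minimal standalone
   sketch; the helper name bcyclic_owner is hypothetical. */
static void bcyclic_owner( int ioff, int nb, int ngpu, int *dev, int *di )
{
    int blk = ioff / nb;       /* global block index                       */
    *dev = blk % ngpu;         /* device that owns this block              */
    *di  = (blk / ngpu) * nb;  /* local offset of the block on that device */
}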
/***************************************************************************//** Purpose ------- DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[in,out] dR_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDR, N/NB) dR should be of size (LDDR, N) when provide_RT > 0 and of size (LDDT, NB) otherwise. NB is the local blocking size. On exit, the elements of R are stored in dR only when provide_RT > 0. @param[in] lddr INTEGER The leading dimension of the array dR. LDDR >= min(M,N) when provide_RT == 1 otherwise LDDR >= min(NB, min(M,N)). NB is the local blocking size. To benefit from coalescent memory accesses LDDR must be divisible by 16. @param[in,out] dT_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDT, N/NB) dT should be of size (LDDT, N) when provide_RT > 0 and of size (LDDT, NB) otherwise. NB is the local blocking size. On exit, the elements of T are stored in dT only when provide_RT > 0. @param[in] lddt INTEGER The leading dimension of the array dT. LDDT >= min(NB,min(M,N)). NB is the local blocking size. To benefit from coalescent memory accesses LDDR must be divisible by 16. @param[out] dtau_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[in] provide_RT INTEGER provide_RT = 0 no R and no T in output. dR and dT are used as local workspace to store the R and T of each step. provide_RT = 1 the whole R of size (min(M,N), N) and the nbxnb block of T are provided in output. provide_RT = 2 the nbxnb diag block of R and of T are provided in output. @param[out] info_array Array of INTEGERs, dimension (batchCount), for corresponding matrices. - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @param[in] batchCount INTEGER The number of matrices to operate on. @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). 
@ingroup magma_geqrf_batched *******************************************************************************/ extern "C" magma_int_t magma_dgeqrf_expert_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, double **dR_array, magma_int_t lddr, double **dT_array, magma_int_t lddt, double **dtau_array, magma_int_t provide_RT, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) /* Local Parameter */ magma_int_t nb = magma_get_dgeqrf_batched_nb(m); magma_int_t nnb = 8; magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; else if (lddr < min_mn && provide_RT == 1) arginfo = -6; else if (lddr < min(min_mn, nb)) arginfo = -6; else if (lddt < min(min_mn, nb)) arginfo = -8; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if (min_mn == 0 ) return arginfo; if ( m > 2048 || n > 2048 ) { printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t i, k, ib=nb, jb=nnb, offset_RT=0, use_stream; magma_int_t ldw, offset; double **dW0_displ = NULL; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dW4_displ = NULL; double **dW5_displ = NULL; double **dR_displ = NULL; double **dT_displ = NULL; double *dwork = NULL; double **cpuAarray = NULL; double **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_displ, batchCount * sizeof(*dR_displ)); magma_malloc((void**)&dT_displ, batchCount * sizeof(*dT_displ)); magma_dmalloc(&dwork, (2 * nb * n) * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(double*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_displ == NULL || dT_displ == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_ddisplace_pointers(dR_displ, dR_array, lddr, 0, 0, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, 0, batchCount, queue); // set dwork to zero because our GEMM routine does propagate NAN when C=betaC+alphaA*B and beta=0 magmablas_dlaset_q( MagmaFull, 2*nb, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, 2*nb, queue ); // set dR and dT 
to zero. if provide_RT == 0 only a tile of size nbxnb is used and overwritten at each step magmablas_dlaset_batched( MagmaFull, lddr, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dR_displ, lddr, batchCount, queue ); magmablas_dlaset_batched( MagmaFull, lddt, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dT_displ, lddt, batchCount, queue ); /* if ( provide_RT > 0 ) { magmablas_dlaset_q( MagmaFull, lddr, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } else { magmablas_dlaset_q( MagmaFull, lddr, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } */ magma_int_t streamid; const magma_int_t nbstreams=10; magma_queue_t queues[nbstreams]; for (i=0; i < nbstreams; i++) { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[i] ); } magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1, queue); magma_getvector( batchCount, sizeof(double*), dT_array, 1, cpuTarray, 1, queue); for (i=0; i < min_mn; i += nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); if ( provide_RT > 0 ) { offset_RT = i; magma_ddisplace_pointers(dR_displ, dR_array, lddr, (provide_RT == 1 ? offset_RT:0), offset_RT, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, offset_RT, batchCount, queue); } //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_dgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dR_displ, lddr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, queue); //=============================================== // end of panel //=============================================== //=============================================== // update trailing matrix //=============================================== if ( (n-ib-i) > 0) { //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; magma_dset_pointer( dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; magma_dset_pointer( dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); // set the diagonal of v as one and the upper triangular part as zero already set inside geqrf_panel //magmablas_dlaset_batched( MagmaUpper, ib, ib, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); //magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_dlarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dW4_displ, nb*lddt, batchCount, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- use_stream = magma_drecommend_cublas_gemm_stream(MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib); if ( use_stream ) { magma_queue_sync(queue); for (k=0; k < batchCount; k++) { streamid = k%nbstreams; // the queue gemm must take cpu pointer 
magma_dlarfb_gpu_gemm( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, cpuAarray[k] + i + i * ldda, ldda, cpuTarray[k] + offset_RT*lddt, lddt, cpuAarray[k] + i + (i+ib) * ldda, ldda, dwork + nb * n * k, -1, dwork + nb * n * batchCount + nb * n * k, -1, queues[streamid] ); } // need to synchronize to be sure that the next panel does not start // before at least its part of the update has finished // if queue is NULL, no need to sync if ( queue != NULL ) { for (magma_int_t s=0; s < nbstreams; s++) magma_queue_sync(queues[s]); } } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { //direct the trailing matrix update to dW1_displ magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_dlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, (const double**)dW0_displ, ldda, (const double**)dT_displ, lddt, dW1_displ, ldda, dW4_displ, ldw, dW5_displ, ldw, batchCount, queue ); } }// update the trailing matrix //=============================================== // copy dR back to V after the trailing matrix update, // but only when provide_RT == 0; otherwise the nb x nb block of V keeps its diag = 1 / upper = 0 form. // The upper portion of V could be set totally to 0 here. if ( provide_RT == 0 ) { magmablas_dlacpy_batched( MagmaUpper, ib, ib, dR_displ, lddr, dW0_displ, ldda, batchCount, queue ); } } magma_queue_sync(queue); for (k=0; k < nbstreams; k++) { magma_queue_destroy( queues[k] ); } magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); return arginfo; }
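/* Usage sketch (illustrative only, not part of MAGMA): driving the expert
   routine above with provide_RT = 0, where dR and dT act purely as nb x nb
   per-matrix workspaces. Assumes a contiguous batch on the device and an
   existing queue; the helper name example_dgeqrf_batched is hypothetical and
   error checking is omitted for brevity. */
static magma_int_t example_dgeqrf_batched(
    magma_int_t m, magma_int_t n, magma_int_t batchCount,
    double *dA, magma_int_t ldda, magma_queue_t queue )
{
    magma_int_t nb     = magma_get_dgeqrf_batched_nb( m );
    magma_int_t min_mn = min( m, n );
    magma_int_t ldw    = min( min_mn, nb );  /* satisfies LDDR/LDDT for provide_RT = 0 */
    double *dR, *dT, *dtau;
    double **dA_array, **dR_array, **dT_array, **dtau_array;
    magma_int_t *info_array;

    magma_dmalloc( &dR,   ldw*nb*batchCount );
    magma_dmalloc( &dT,   ldw*nb*batchCount );
    magma_dmalloc( &dtau, min_mn*batchCount );
    magma_malloc( (void**)&dA_array,   batchCount*sizeof(double*) );
    magma_malloc( (void**)&dR_array,   batchCount*sizeof(double*) );
    magma_malloc( (void**)&dT_array,   batchCount*sizeof(double*) );
    magma_malloc( (void**)&dtau_array, batchCount*sizeof(double*) );
    magma_malloc( (void**)&info_array, batchCount*sizeof(magma_int_t) );

    /* strided pointer arrays: matrix k starts at base + k*batch_offset */
    magma_dset_pointer( dA_array,   dA,   ldda, 0, 0, ldda*n, batchCount, queue );
    magma_dset_pointer( dR_array,   dR,   ldw,  0, 0, ldw*nb, batchCount, queue );
    magma_dset_pointer( dT_array,   dT,   ldw,  0, 0, ldw*nb, batchCount, queue );
    magma_dset_pointer( dtau_array, dtau, 1,    0, 0, min_mn, batchCount, queue );

    magma_int_t arginfo = magma_dgeqrf_expert_batched(
        m, n, dA_array, ldda, dR_array, ldw, dT_array, ldw,
        dtau_array, 0, info_array, batchCount, queue );

    magma_free( dR ); magma_free( dT ); magma_free( dtau );
    magma_free( dA_array ); magma_free( dR_array );
    magma_free( dT_array ); magma_free( dtau_array ); magma_free( info_array );
    return arginfo;
}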
/** Purpose ------- CGEQRF_OOC computes a QR factorization of a COMPLEX M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This is an out-of-core (ooc) version that is similar to magma_cgeqrf but the difference is that this version can use a GPU even if the matrix does not fit into the GPU memory at once. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_cgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqrf_ooc( magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info ) { #define A(i_,j_) ( A + (i_) + (j_)*lda ) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) /* Constants */ const magmaFloatComplex c_one = MAGMA_C_ONE; /* Local variables */ magmaFloatComplex_ptr dA, dwork; magma_int_t i, ib, IB, j, min_mn, lddwork, ldda, rows; magma_int_t nb = magma_get_cgeqrf_nb( m, n ); magma_int_t lwkopt = n * nb; work[0] = magma_cmake_lwork( lwkopt ); bool lquery = (lwork == -1); *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Check how much memory do we have */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); magma_int_t NB = magma_int_t(0.8*freeMem/m); NB = (NB / nb) * nb; if (NB >= n) return magma_cgeqrf(m, n, A, lda, tau, work, lwork, info); min_mn = min(m,n); if (min_mn == 0) { work[0] = c_one; return *info; } lddwork = magma_roundup( NB, 32 ) + nb; ldda = magma_roundup( m, 32 ); if (MAGMA_SUCCESS != magma_cmalloc( &dA, (NB + nb)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magmaFloatComplex_ptr ptr = dA + ldda*NB; dwork = dA + ldda*(NB + nb); /* start the main loop over the blocks that fit in the GPU memory */ for (i=0; i < n; i += NB) { IB = min( n-i, NB ); //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB); /* 1. Copy the next part of the matrix to the GPU */ magma_csetmatrix_async( m, IB, A(0,i), lda, dA(0,0), ldda, queues[0] ); magma_queue_sync( queues[0] ); /* 2. Update it with the previous transformations */ for (j=0; j < min(i,min_mn); j += nb) { ib = min( min_mn-j, nb ); /* Get a panel in ptr. */ // 1. Form the triangular factor of the block reflector // 2. Send it to the GPU. // 3. Put 0s in the upper triangular part of V. // 4. Send V to the GPU in ptr. // 5. Update the matrix. // 6. Restore the upper part of V. rows = m-j; lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(j,j), &lda, tau+j, work, &ib); magma_csetmatrix_async( ib, ib, work, ib, dwork, lddwork, queues[1] ); magma_cpanel_to_q( MagmaUpper, ib, A(j,j), lda, work+ib*ib ); magma_csetmatrix_async( rows, ib, A(j,j), lda, ptr, rows, queues[1] ); magma_queue_sync( queues[1] ); magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, IB, ib, ptr, rows, dwork, lddwork, dA(j, 0), ldda, dwork+ib, lddwork, queues[1] ); magma_cq_to_panel( MagmaUpper, ib, A(j,j), lda, work+ib*ib ); } /* 3. Do a QR on the current part */ if (i < min_mn) magma_cgeqrf2_gpu( m-i, IB, dA(i,0), ldda, tau+i, info ); /* 4. Copy the current part back to the CPU */ magma_cgetmatrix_async( m, IB, dA(0,0), ldda, A(0,i), lda, queues[0] ); } magma_queue_sync( queues[0] ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_cgeqrf_ooc */
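/* Usage sketch (illustrative only, not part of MAGMA): the LAPACK-style
   workspace query followed by the actual out-of-core factorization. As the
   documentation above notes, pinned host memory for A and work improves
   transfer rates. The helper name example_cgeqrf_ooc is hypothetical and
   error checking is omitted. */
static magma_int_t example_cgeqrf_ooc(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau )
{
    magma_int_t info, lwork;
    magmaFloatComplex query, *work;

    /* lwork = -1: only the optimal workspace size is returned in query */
    magma_cgeqrf_ooc( m, n, A, lda, tau, &query, -1, &info );
    lwork = (magma_int_t) MAGMA_C_REAL( query );

    magma_cmalloc_pinned( &work, lwork );
    magma_cgeqrf_ooc( m, n, A, lda, tau, work, lwork, &info );
    magma_free_pinned( work );
    return info;
}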
/** Purpose ------- SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] k INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. @param[in] n INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N > K). @param[in] n1 INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. @param[out] d REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. @param[out] Q REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. @param[in] ldq INTEGER The leading dimension of the array Q. LDQ >= max(1,N). @param[in] rho REAL The value of the parameter in the rank one update equation. RHO >= 0 required. @param[in,out] dlamda REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. @param[in] Q2 REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. @param[in] indx INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. @param[in] ctot INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. @param[in,out] w REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. @param s (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. @param[out] indxq INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. @param dwork (devices workspaces) REAL array of arrays, dimension NRGPU. 
if NRGPU = 1 the dimension of the first workspace should be (3*N*N/2+3*N) otherwise the NRGPU workspaces should have the size ceil((N-N1) * (N-N1) / floor(ngpu/2)) + NB * ((N-N1) + (N-N1) / floor(ngpu/2)) @param queues (device queues) magma_queue_t array, dimension (MagmaMaxGPUs,2) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from slaex1. @param[in] vl REAL @param[in] vu REAL if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slaex3_m( magma_int_t ngpu, magma_int_t k, magma_int_t n, magma_int_t n1, float *d, float *Q, magma_int_t ldq, float rho, float *dlamda, float *Q2, magma_int_t *indx, magma_int_t *ctot, float *w, float *s, magma_int_t *indxq, magmaFloat_ptr dwork[], magma_queue_t queues[MagmaMaxGPUs][2], magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) #define dQ2(id) (dwork[id]) #define dS(id, ii) (dwork[id] + n2*n2_loc + (ii)*(n2*nb)) #define dQ(id, ii) (dwork[id] + n2*n2_loc + 2*(n2*nb) + (ii)*(n2_loc*nb)) if (ngpu == 1) { magma_setdevice(0); magma_slaex3(k, n, n1, d, Q, ldq, rho, dlamda, Q2, indx, ctot, w, s, indxq, *dwork, range, vl, vu, il, iu, info ); return *info; } float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; magma_int_t n1_loc, n2_loc, ib, nb, ib2, igpu; magma_int_t ni_loc[MagmaMaxGPUs]; magma_int_t i, ind, iq2, j, n12, n2, n23, tmp; float temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). 
The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ //#define CHECK_CPU #ifdef CHECK_CPU float *hwS[2][MagmaMaxGPUs], *hwQ[2][MagmaMaxGPUs], *hwQ2[MagmaMaxGPUs]; #define hQ2(id) (hwQ2[id]) #define hS(id, ii) (hwS[ii][id]) #define hQ(id, ii) (hwQ[ii][id]) #endif n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; //lq2 = iq2 + n2 * n23; n1_loc = (n1-1) / (ngpu/2) + 1; n2_loc = (n2-1) / (ngpu/2) + 1; nb = magma_get_slaex3_m_nb(); if (n1 >= magma_get_slaex3_m_k()) { #ifdef CHECK_CPU for (igpu = 0; igpu < ngpu; ++igpu) { magma_smalloc_pinned( &(hwS[0][igpu]), n2*nb ); magma_smalloc_pinned( &(hwS[1][igpu]), n2*nb ); magma_smalloc_pinned( &(hwQ2[igpu]), n2*n2_loc ); magma_smalloc_pinned( &(hwQ[0][igpu]), n2_loc*nb ); magma_smalloc_pinned( &(hwQ[1][igpu]), n2_loc*nb ); } #endif for (igpu = 0; igpu < ngpu-1; igpu += 2) { ni_loc[igpu] = min(n1_loc, n1 - igpu/2 * n1_loc); #ifdef CHECK_CPU lapackf77_slacpy("A", &ni_loc[igpu], &n12, Q2+n1_loc*(igpu/2), &n1, hQ2(igpu), &n1_loc); #endif magma_setdevice(igpu); magma_ssetmatrix_async( ni_loc[igpu], n12, Q2+n1_loc*(igpu/2), n1, dQ2(igpu), n1_loc, queues[igpu][0] ); ni_loc[igpu+1] = min(n2_loc, n2 - igpu/2 * n2_loc); #ifdef CHECK_CPU lapackf77_slacpy("A", &ni_loc[igpu+1], &n23, Q2+iq2+n2_loc*(igpu/2), &n2, hQ2(igpu+1), &n2_loc); #endif magma_setdevice(igpu+1); magma_ssetmatrix_async( ni_loc[igpu+1], n23, Q2+iq2+n2_loc*(igpu/2), n2, dQ2(igpu+1), n2_loc, queues[igpu+1][0] ); } } // #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) { #pragma omp critical (info) *info = iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. 
magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return *info; timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_svrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_sirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. 
timer_start( time ); if (rk > 0) { if (n1 < magma_get_slaex3_m_k()) { // stay on the CPU if ( n23 != 0 ) { lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } else { //use the gpus ib = min(nb, rk); for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { magma_setdevice(igpu+1); magma_ssetmatrix_async( n23, ib, Q(ctot[0],iil-1), ldq, dS(igpu+1,0), n23, queues[igpu+1][0] ); } if (n12 != 0) { magma_setdevice(igpu); magma_ssetmatrix_async( n12, ib, Q(0,iil-1), ldq, dS(igpu,0), n12, queues[igpu][0] ); } } for (i = 0; i < rk; i += nb) { ib = min(nb, rk - i); ind = (i/nb)%2; if (i+nb < rk) { ib2 = min(nb, rk - i - nb); for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { magma_setdevice(igpu+1); magma_ssetmatrix_async( n23, ib2, Q(ctot[0],iil-1+i+nb), ldq, dS(igpu+1,(ind+1)%2), n23, queues[igpu+1][(ind+1)%2] ); } if (n12 != 0) { magma_setdevice(igpu); magma_ssetmatrix_async( n12, ib2, Q(0,iil-1+i+nb), ldq, dS(igpu,(ind+1)%2), n12, queues[igpu][(ind+1)%2] ); } } } // Ensure that the data is copied on gpu since we will overwrite it. for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { #ifdef CHECK_CPU lapackf77_slacpy("A", &n23, &ib, Q(ctot[0],iil-1+i), &ldq, hS(igpu+1,ind), &n23); #endif magma_setdevice(igpu+1); magma_queue_sync( queues[igpu+1][ind] ); } if (n12 != 0) { #ifdef CHECK_CPU lapackf77_slacpy("A", &n12, &ib, Q(0,iil-1+i), &ldq, hS(igpu,ind), &n12); #endif magma_setdevice(igpu); magma_queue_sync( queues[igpu][ind] ); } } for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { #ifdef CHECK_CPU blasf77_sgemm("N", "N", &ni_loc[igpu+1], &ib, &n23, &d_one, hQ2(igpu+1), &n2_loc, hS(igpu+1,ind), &n23, &d_zero, hQ(igpu+1, ind), &n2_loc); #endif magma_setdevice(igpu+1); magmablasSetKernelStream(queues[igpu+1][ind]); magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu+1], ib, n23, d_one, dQ2(igpu+1), n2_loc, dS(igpu+1, ind), n23, d_zero, dQ(igpu+1, ind), n2_loc); #ifdef CHECK_CPU printf("norm Q %d: %f\n", igpu+1, cpu_gpu_sdiff(ni_loc[igpu+1], ib, hQ(igpu+1, ind), n2_loc, dQ(igpu+1, ind), n2_loc)); #endif } if (n12 != 0) { #ifdef CHECK_CPU blasf77_sgemm("N", "N", &ni_loc[igpu], &ib, &n12, &d_one, hQ2(igpu), &n1_loc, hS(igpu,ind%2), &n12, &d_zero, hQ(igpu, ind%2), &n1_loc); #endif magma_setdevice(igpu); magmablasSetKernelStream(queues[igpu][ind]); magma_sgemm(MagmaNoTrans, MagmaNoTrans, ni_loc[igpu], ib, n12, d_one, dQ2(igpu), n1_loc, dS(igpu, ind), n12, d_zero, dQ(igpu, ind), n1_loc); #ifdef CHECK_CPU printf("norm Q %d: %f\n", igpu, cpu_gpu_sdiff(ni_loc[igpu], ib, hQ(igpu, ind), n1_loc, dQ(igpu, ind), n1_loc)); #endif } } for (igpu = 0; igpu < ngpu-1; igpu += 2) { if (n23 != 0) { magma_setdevice(igpu+1); magma_sgetmatrix( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc, Q(n1+n2_loc*(igpu/2),iil-1+i), ldq ); // magma_sgetmatrix_async( ni_loc[igpu+1], ib, dQ(igpu+1, ind), n2_loc, // Q(n1+n2_loc*(igpu/2),iil-1+i), ldq, queues[igpu+1][ind] ); } if (n12 != 0) { magma_setdevice(igpu); magma_sgetmatrix( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc, Q(n1_loc*(igpu/2),iil-1+i), ldq ); // magma_sgetmatrix_async( ni_loc[igpu], ib, dQ(igpu, ind), n1_loc, // 
Q(n1_loc*(igpu/2),iil-1+i), ldq, queues[igpu][ind] ); } } } for (igpu = 0; igpu < ngpu; ++igpu) { #ifdef CHECK_CPU magma_free_pinned( hwS[1][igpu] ); magma_free_pinned( hwS[0][igpu] ); magma_free_pinned( hwQ2[igpu] ); magma_free_pinned( hwQ[1][igpu] ); magma_free_pinned( hwQ[0][igpu] ); #endif magma_setdevice(igpu); magma_queue_sync( queues[igpu][0] ); magma_queue_sync( queues[igpu][1] ); } if ( n23 == 0 ) lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 == 0 ) lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } } timer_stop( time ); timer_printf( "gemms = %6.2f\n", time ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_slaex3_m */
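/* Illustration (not from the MAGMA sources) of the stabilized weight update
   used in both branches above. After slaed4, column j of delta holds
   dlamda(i) - d(j); the products below rebuild |w(i)| in the Gu/Eisenstat
   fashion and the sign is recovered from the saved copy s. The helper name
   rebuild_w is hypothetical. */
#include <math.h>
static void rebuild_w( int k, const float *delta, int ldd,
                       const float *dlamda, const float *s, float *w )
{
    for (int i = 0; i < k; ++i)
        w[i] = delta[i + i*ldd];                    /* W(I) = Q(I,I) */
    for (int j = 0; j < k; ++j) {                   /* skip the i == j term */
        for (int i = 0; i < j; ++i)
            w[i] *= delta[i + j*ldd] / (dlamda[i] - dlamda[j]);
        for (int i = j+1; i < k; ++i)
            w[i] *= delta[i + j*ldd] / (dlamda[i] - dlamda[j]);
    }
    for (int i = 0; i < k; ++i)
        w[i] = copysignf( sqrtf( -w[i] ), s[i] );   /* sign from saved copy */
}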
extern "C" magma_int_t magma_sgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, float **dlA, magma_int_t ldda, float *tau, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dlA(gpu,a_1,a_2) ( dlA[gpu]+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hwrk_ref(a_1) ( local_work + (a_1)) #define lhwrk ( local_work + (nb)*(m)) float *dwork[4], *panel[4], *local_work; magma_int_t i, j, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; magma_device_t cdevice; magma_getdevice(&cdevice); int panel_gpunum, i_local, n_local[4], la_gpu, displacement; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_sgeqrf_nb(m); displacement = n * nb; lwork = (m+n+64) * nb; lhwork = lwork - (m)*nb; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif if (MAGMA_SUCCESS != magma_smalloc( &(dwork[i]), (n + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } /* Set the number of local n for each GPU */ for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; } if (MAGMA_SUCCESS != magma_smalloc_pinned( &local_work, lwork )) { *info = -9; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_free( dwork[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } cudaStream_t streaml[4][2]; for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); } nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; 
i < k-nx; i += nb) { /* Set the GPU number that holds the current panel */ panel_gpunum = (i/nb)%num_gpus; /* Set the local index where the current panel is */ i_local = i/(nb*num_gpus)*nb; ib = min(k-i, nb); rows = m -i; /* Send current panel to the CPU */ #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_sgetmatrix_async( rows, ib, dlA(panel_gpunum, i, i_local), ldda, hwrk_ref(i), ldwork, streaml[panel_gpunum][1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left; this is the look-ahead application to the trailing matrix */ la_gpu = panel_gpunum; /* only the GPU that has next panel is done look-ahead */ #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n_local[la_gpu]-i_local-old_ib, old_ib, panel[la_gpu], ldda, dwork[la_gpu], lddwork, dlA(la_gpu, old_i, i_local+old_ib), ldda, dwork[la_gpu]+old_ib, lddwork); la_gpu = ((i-nb)/nb)%num_gpus; #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_ssetmatrix_async( old_ib, old_ib, hwrk_ref(old_i), ldwork, panel[la_gpu], ldda, streaml[la_gpu][0] ); } #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_queue_sync( streaml[panel_gpunum][1] ); lapackf77_sgeqrf(&rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &lhwork, info); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hwrk_ref(i), &ldwork, tau+i, lhwrk, &ib); spanel_to_q( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib ); // Send the current panel back to the GPUs // Has to be done with asynchronous copies for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif if (j == panel_gpunum) panel[j] = dlA(j, i, i_local); else panel[j] = dwork[j]+displacement; magma_ssetmatrix_async( rows, ib, hwrk_ref(i), ldwork, panel[j], ldda, streaml[j][0] ); } for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif magma_queue_sync( streaml[j][0] ); } /* Restore the panel */ sq_to_panel( MagmaUpper, ib, hwrk_ref(i), ldwork, lhwrk+ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. 
Has to be done with asynchronous copies */ for(j=0; j<num_gpus; j++) { #ifdef MultiGPUs magma_setdevice(j); #endif magma_ssetmatrix_async( ib, ib, lhwrk, ib, dwork[j], lddwork, streaml[j][0] ); } if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left; This is update for the next panel; part of the look-ahead */ la_gpu = (panel_gpunum+1)%num_gpus; int i_loc = (i+nb)/(nb*num_gpus)*nb; for(j=0; j<num_gpus; j++){ #ifdef MultiGPUs magma_setdevice(j); #endif //magma_queue_sync( streaml[j][0] ); if (j==la_gpu) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_loc), ldda, dwork[j]+ib, lddwork); else if (j<=panel_gpunum) magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local-ib, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_local+ib), ldda, dwork[j]+ib, lddwork); else magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[j]-i_local, ib, panel[j], ldda, dwork[j], lddwork, dlA(j, i, i_local), ldda, dwork[j]+ib, lddwork); } } else { /* do the entire update as we exit and there would be no lookahead */ la_gpu = (panel_gpunum+1)%num_gpus; int i_loc = (i+nb)/(nb*num_gpus)*nb; #ifdef MultiGPUs magma_setdevice(la_gpu); #endif magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n_local[la_gpu]-i_loc, ib, panel[la_gpu], ldda, dwork[la_gpu], lddwork, dlA(la_gpu, i, i_loc), ldda, dwork[la_gpu]+ib, lddwork); #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_ssetmatrix( ib, ib, hwrk_ref(i), ldwork, dlA(panel_gpunum, i, i_local), ldda ); } old_i = i; old_ib = ib; } } } else { i = 0; } for(j=0; j<num_gpus; j++){ #ifdef MultiGPUs magma_setdevice(j); #endif magma_free( dwork[j] ); } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; lhwork = lwork - rows*ib; panel_gpunum = (panel_gpunum+1)%num_gpus; int i_loc = (i)/(nb*num_gpus)*nb; #ifdef MultiGPUs magma_setdevice(panel_gpunum); #endif magma_sgetmatrix( rows, ib, dlA(panel_gpunum, i, i_loc), ldda, lhwrk, rows ); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, lhwrk, &rows, tau+i, lhwrk+ib*rows, &lhwork, info); magma_ssetmatrix( rows, ib, lhwrk, rows, dlA(panel_gpunum, i, i_loc), ldda ); } for(i=0; i<num_gpus; i++){ #ifdef MultiGPUs magma_setdevice(i); #endif magma_queue_destroy( streaml[i][0] ); magma_queue_destroy( streaml[i][1] ); } magma_setdevice(cdevice); magma_free_pinned( local_work ); return *info; } /* magma_sgeqrf2_mgpu */
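/* Illustration (not from the MAGMA sources): how the routine above splits the
   n columns over num_gpus in a 1-D block-cyclic fashion with block size nb --
   an even share of full blocks per device, one extra full block for the first
   (n/nb)%num_gpus devices, and the trailing partial block for the next device
   in the cycle. The helper name split_columns is hypothetical. */
static void split_columns( int n, int nb, int num_gpus, int n_local[] )
{
    for (int d = 0; d < num_gpus; ++d) {
        n_local[d] = ((n/nb) / num_gpus) * nb;  /* even share of full blocks  */
        if (d < (n/nb) % num_gpus)
            n_local[d] += nb;                   /* one extra full block       */
        else if (d == (n/nb) % num_gpus)
            n_local[d] += n % nb;               /* the trailing partial block */
    }
}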
void magmablas_cher2k_mgpu2( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaFloatComplex_ptr dB[], magma_int_t lddb, magma_int_t b_offset, float beta, magmaFloatComplex_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda + (a_offset) ) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb + (b_offset) ) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower ) { info = -1; // upper not yet handled } else if ( trans != MagmaNoTrans ) { info = -2; // conj not yet handled } else if ( n < 0 ) { info = -3; } else if ( k < 0 ) { info = -4; } else if ( ((trans == MagmaNoTrans) && ldda < max(1,n)) || ((trans == Magma_ConjTrans) && ldda < max(1,k)) ) { info = -7; } else if ( a_offset < 0 || a_offset > ldda ) { info = -8; } else if ( ((trans == MagmaNoTrans) && lddb < max(1,n)) || ((trans == Magma_ConjTrans) && lddb < max(1,k)) ) { info = -10; } else if ( b_offset < 0 || b_offset > lddb ) { info = -11; } else if ( lddc < max(1,n) ) { info = -13; } else if ( c_offset < 0 || c_offset > lddc ) { info = -14; } else if ( ngpu <= 0 ) { info = -15; } else if ( nb <= 0 ) { info = -16; } else if ( nqueue <= 0 ) { info = -18; } if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } const magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex cbeta = MAGMA_C_MAKE( beta, 0. ); magma_int_t ib, ioff, iblock, idev, di, s; magma_device_t orig_dev; magma_getdevice( &orig_dev ); // loop over all blocks // Faster to have two loops: first loop does C_hat = alpha*A*B**H + beta*C // blockoffset is offset within first block; for subsequent blocks it is 0 magma_int_t blockoffset = c_offset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + c_offset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nqueue; // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i] //printf( "cgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_cgemm( MagmaNoTrans, Magma_ConjTrans, n-i, ib, k, alpha, dA(idev,i,0), ldda, dB(idev,i,0), lddb, cbeta, dC(idev,ioff,di), lddc, queues[idev][s] ); blockoffset = 0; } // second loop does C = conj(alpha)*B*A**H + C_hat alpha = MAGMA_C_CONJ( alpha ); blockoffset = c_offset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + c_offset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nqueue; // C[i:n,i] += conj(alpha) * B[i:n,0] * A[i,0]' //printf( "cgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_cgemm( MagmaNoTrans, Magma_ConjTrans, n-i, ib, k, alpha, dB(idev,i,0), lddb, dA(idev,i,0), ldda, c_one, dC(idev,ioff,di), lddc, queues[idev][s] ); blockoffset = 0; } magma_setdevice( orig_dev ); }
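/* Illustration (not from the MAGMA sources): on a single device the two-pass
   scheme of magmablas_cher2k_mgpu2 collapses to two GEMMs,
   C = alpha*A*B**H + beta*C followed by C += conj(alpha)*B*A**H. The sketch
   below updates the full n x n matrix rather than only the lower-trapezoidal
   blocks the mgpu routine touches; the helper name cher2k_ref is hypothetical. */
static void cher2k_ref(
    magma_int_t n, magma_int_t k, magmaFloatComplex alpha,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magmaFloatComplex_ptr dB, magma_int_t lddb,
    float beta,
    magmaFloatComplex_ptr dC, magma_int_t lddc,
    magma_queue_t queue )
{
    magmaFloatComplex cbeta = MAGMA_C_MAKE( beta, 0. );
    /* first pass: C_hat = alpha*A*B**H + beta*C */
    magma_cgemm( MagmaNoTrans, Magma_ConjTrans, n, n, k,
                 alpha, dA, ldda, dB, lddb, cbeta, dC, lddc, queue );
    /* second pass: C = conj(alpha)*B*A**H + C_hat */
    magma_cgemm( MagmaNoTrans, Magma_ConjTrans, n, n, k,
                 MAGMA_C_CONJ( alpha ), dB, lddb, dA, ldda,
                 MAGMA_C_ONE, dC, lddc, queue );
}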
/** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = MagmaUpper, or dA = L * L**T, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. NGPU > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA REAL array of pointers on the GPU, dimension (ngpu) On entry, the symmetric matrix dA distributed over the GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalesced memory accesses, LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf_mgpu_right( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr d_lA[], magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0) float z_one = MAGMA_S_MAKE( 1.0, 0.0 ); float mz_one = MAGMA_S_MAKE( -1.0, 0.0 ); float one = 1.0; float m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevtrsmrows=0, nqueue = 5; float *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; float *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, igpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t queues[MagmaMaxGPUs][10]; *info = 0; if ( ngpu <= 0 ) { *info = -1; } else if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_spotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_smalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code.
magma_sgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_spotrf( uplo_, &n, panel, &ldpanel, info); magma_ssetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < ngpu; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / ngpu) * nb; if (d < (n / nb) % ngpu) n_local[d] += nb; else if (d == (n / nb) % ngpu) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_smalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < nqueue; j++ ) { magma_queue_create( &queues[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (ngpu == 4) crosspoint = n; else if (ngpu == 3) crosspoint = n; else if (ngpu == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; // n --> gpu always does the next panel, 0 --> cpu always does the next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = magma_wtime(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_sgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, queues[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = magma_wtime()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % ngpu; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( queues[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = magma_wtime(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ?
tmppanel0 : tmppanel1; blasf77_sgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = magma_wtime() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = magma_wtime(); #endif lapackf77_spotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = magma_wtime() - tcchol; ttot_cchol += tcchol; tctrsm = magma_wtime(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_strsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = magma_wtime() - tctrsm; ttot_ctrsm += tctrsm; tctm = magma_wtime(); #endif d = (id + 1) % ngpu; // send current panel to gpus for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_ssetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, queues[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % ngpu; for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); } #if defined (ENABLE_TIMER) tctm = magma_wtime() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % ngpu; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * ngpu) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = magma_wtime(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream( queues[d][STREAM_ID(j_local2)] ); #define SSYRK_ON_DIAG #ifdef SSYRK_ON_DIAG magma_ssyrk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_sgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_sgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = magma_wtime() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( queues[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_sgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, queues[d][0] ); tmppanel = (blkid % 2 == 0) ? 
tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < ngpu; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) therk[d] = magma_wtime(); #endif //magmablasSetKernelStream( queues[d] ); //magma_ssyrk(MagmaLower, MagmaNoTrans, n - offset, nb, // m_one, dlpanel, ldda, // one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef SSYRK_ON_DIAG magma_ssyrk_mgpu #else magma_ssyrk_mgpu2 #endif (ngpu, MagmaLower, MagmaNoTrans, nb, n - offset, nb, m_one, dlpanels, ldda, 0, one, d_lA, ldda, offset, nqueue, queues ); #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = magma_wtime() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % ngpu, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( id=0; id < nqueue; id++ ) { magma_queue_sync( queues[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % ngpu; // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = magma_wtime(); #endif magma_sgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); lapackf77_spotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info); magma_ssetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = magma_wtime() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < nqueue; j++ ) { magma_queue_destroy( queues[d][j] ); } magma_free( d_lP[d] ); } } // 
end of blocked code (the non-unblocked-LAPACK path) // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_spotrf_mgpu_right */
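/*
   Hedged calling sketch for magma_spotrf_mgpu_right (illustrative only; the
   local column count mirrors the n_local computation inside the routine):

       magma_int_t ngpu = 2, n = 1024, ldda = n, info;  // ldda divisible by 16
       magma_int_t nb = magma_get_spotrf_nb( n );
       float *d_lA[MagmaMaxGPUs];
       for( magma_int_t d = 0; d < ngpu; ++d ) {
           magma_int_t n_loc = ((n/nb)/ngpu)*nb;
           if      (d <  (n/nb)%ngpu) n_loc += nb;
           else if (d == (n/nb)%ngpu) n_loc += n%nb;
           magma_setdevice( d );
           magma_smalloc( &d_lA[d], ldda*n_loc );
           // ... scatter this GPU's nb-wide block columns of A into d_lA[d] ...
       }
       magma_spotrf_mgpu_right( ngpu, MagmaLower, n, d_lA, ldda, &info );
*/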
/** Purpose ------- CLAHRU is an auxiliary MAGMA routine that is used in CGEHRD to update the trailing sub-matrices after the reductions of the corresponding panels. See further details below. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ihi INTEGER Last row to update. Same as IHI in cgehrd. @param[in] k INTEGER Number of rows of the matrix Am (see details below). @param[in] nb INTEGER Block size. @param[in,out] A COMPLEX array, dimension (LDA,N-K) On entry, the N-by-(N-K) general matrix to be updated. The computation is done on the GPU. After Am is updated on the GPU, only Am(1:NB) is transferred to the CPU to update the corresponding Am matrix. See Further Details below. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- This implementation follows the algorithm and notations described in: S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. The difference is that here Am is computed on the GPU. M is renamed Am, G is renamed Ag. @ingroup magma_cgeev_aux ********************************************************************/ extern "C" magma_int_t magma_clahru_m( magma_int_t n, magma_int_t ihi, magma_int_t k, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, struct cgehrd_data* data ) { #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dW( d, i, j ) (data->W [d] + (i) + (j)*ldda) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t d; magma_int_t dk, dkhi, dknb, dn; magma_int_t info = 0; if (n < 0) { info = -1; } else if (ihi < 0 || ihi > n) { info = -2; } else if (k < 0 || k > n) { info = -3; } else if (nb < 1 || nb > n) { info = -4; } else if (lda < max(1,n)) { info = -6; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k, ihi, &dk, &dkhi ); magma_indices_1D_bcyclic( nb, ngpu, d, k+nb, n, &dknb, &dn ); // ----- // on right, A := A Q = A - A V T V' // Update Am = Am - Am V T Vd' = Am - Ym Wd', with Wd = Vd T' // Wd = Vd T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)' // Vd and Wd are the portions corresponding to the block cyclic distribution magma_cgemm( MagmaNoTrans, MagmaConjTrans, dkhi-dk, nb, nb, c_one, dVd(d, dk, 0), ldvd, dTi(d), nb, c_zero, dW (d, dk, 0), ldda ); // Am = Am - Ym Wd' = A(0:k-1, k:ihi-1) - Ym(0:k-1, 0:nb-1) * W(k:ihi-1, 0:nb-1)' magma_cgemm( MagmaNoTrans, MagmaConjTrans, k, dkhi-dk, nb, c_neg_one, dY(d, 0, 0), ldda, dW(d, dk, 0), ldda, c_one, dA(d, 0, dk), ldda ); // ----- // on right, A := A Q = A - A V T V' // Update Ag = Ag - Ag V T V' = Ag - Yg Wd' //
Ag = Ag - Yg Wd' = A(k:ihi-1, nb:ihi-k-1) - Y(k:ihi-1, 0:nb-1) * W(k+nb:ihi-1, 0:nb-1)' magma_cgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, dkhi-dknb, nb, c_neg_one, dY(d, k, 0), ldda, dW(d, dknb, 0), ldda, c_one, dA(d, k, dknb), ldda ); // ----- // on left, A := Q' A = A - V T' V' A // Ag2 = Ag2 - V T' V' Ag2 = W Yg, with W = V T' and Yg = V' Ag2 // Note that Ag is A(k:ihi, nb+1:ihi-k) // while Ag2 is A(k:ihi, nb+1: n -k) // here V and W are the whole matrices, not just block cyclic portion // W = V T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)' // TODO would it be cheaper to compute the whole matrix and // copy the block cyclic portions to another workspace? magma_cgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, nb, nb, c_one, dV (d, k, 0), ldv, dTi(d), nb, c_zero, dW (d, k, 0), ldda ); // Z = V(k:ihi-1, 0:nb-1)' * A(k:ihi-1, nb:n-k-1); Z is stored over Y magma_cgemm( MagmaConjTrans, MagmaNoTrans, nb, dn-dknb, ihi-k, c_one, dV(d, k, 0), ldv, dA(d, k, dknb), ldda, c_zero, dY(d, 0, 0), nb ); // Ag2 = Ag2 - W Z = A(k:ihi-1, k+nb:n-1) - W(k+nb:n-1, 0:nb-1) * Z(0:nb-1, k+nb:n-1) magma_cgemm( MagmaNoTrans, MagmaNoTrans, ihi-k, dn-dknb, nb, c_neg_one, dW(d, k, 0), ldda, dY(d, 0, 0), nb, c_one, dA(d, k, dknb), ldda ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return info; }
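/*
   The magma_indices_1D_bcyclic calls above convert a global column range into
   the local range owned by each device under the nb-block-cyclic layout.  An
   illustrative inspection loop (sketch only; the casts assume magma_int_t
   fits in long long):

       for( magma_int_t d = 0; d < ngpu; ++d ) {
           magma_int_t dk, dkhi;
           // local range [dk, dkhi) on device d covering global columns [k, ihi)
           magma_indices_1D_bcyclic( nb, ngpu, d, k, ihi, &dk, &dkhi );
           printf( "dev %lld: local cols [%lld, %lld)\n",
                   (long long) d, (long long) dk, (long long) dkhi );
       }
*/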
extern "C" void magma_ssyr2k_mgpu( magma_int_t num_gpus, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k, float alpha, float **db, magma_int_t lddb, magma_int_t offset_b, float beta, float **dc, magma_int_t lddc, magma_int_t offset, magma_int_t num_streams, magma_queue_t stream[][10]) { #define dB(id, i, j) (db[(id)]+(j)*lddb + (i)+offset_b) #define dB1(id, i, j) (db[(id)]+(j)*lddb + (i)+offset_b)+k*lddb #define dC(id, i, j) (dc[(id)]+(j)*lddc + (i)) magma_int_t i, id, ib, ii, kk, n1; float c_one = MAGMA_S_ONE; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* diagonal update */ for( i=0; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = (i/(nb*num_gpus))%num_streams; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); /* ssyr2k on diagonal block */ trace_gpu_start( id, kk, "syr2k", "syr2k" ); magma_ssyr2k(uplo, trans, ib, k, alpha, dB1(id, i, 0 ), lddb, dB(id, i, 0 ), lddb, beta, dC(id, i+offset, ii), lddc); trace_gpu_end( id, kk ); } /* off-diagonal update */ if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = (i/(nb*num_gpus))%num_streams; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); magma_sgemm(MagmaNoTrans, MagmaConjTrans, i, ib, k, alpha, dB1(id, 0, 0 ), lddb, dB(id, i, 0 ), lddb, c_one, dC(id, 0, ii), lddc); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = (i/(nb*num_gpus))%num_streams; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); n1 = n-i-ib; // sgemm on off-diagonal blocks trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_sgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, alpha, dB1(id, i+ib, 0 ), lddb, dB(id, i, 0 ), lddb, c_one, dC(id, i+offset+ib, ii), lddc); trace_gpu_end( id, kk ); } } if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = (i/(nb*num_gpus))%num_streams; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); magma_sgemm(MagmaNoTrans, MagmaConjTrans, i, ib, k, alpha, dB( id, 0, 0 ), lddb, dB1(id, i, 0 ), lddb, c_one, dC(id, 0, ii), lddc); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = (i/(nb*num_gpus))%num_streams; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); ib = min(nb, n-i); ii = nb*((i+offset)/(nb*num_gpus)); n1 = n-i-ib; /* sgemm on off-diagonal blocks */ trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_sgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, alpha, dB(id, i+ib, 0 ), lddb, dB1(id, i, 0 ), lddb, c_one, dC(id, i+offset+ib, ii), lddc); trace_gpu_end( id, kk ); } } for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); for( kk=0; kk < num_streams; kk++ ) { magma_queue_sync(stream[id][kk]); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); }
extern "C" magma_int_t magma_zbulge_applyQ_v2_m( magma_int_t ngpu, magma_side_t side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t lde, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t *info) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki; magma_int_t blkid,vpos,tpos; magma_int_t firstrow, nbcolinvolvd; magma_int_t versionL = 113; magma_int_t versionR = 92; magma_int_t Vchunksiz = 10; *info=0; /* Quick return */ if ( NE == 0 ) { return *info; } if ( N == 0 ) { return *info; } if ( NB == 0 ) { return *info; } /* ========================================== * some infos for developer * Initialisation and checking nb of cores * ==========================================*/ /* we have 2 algo for left (113 114) and 2 algo for right (91 92) * which correspond to versionL versionR. * They are very similar (detail explained in tech report and matlab code) * however version 114 and 92 improve locality. * while version 113 is used in case WNATZ=1 (construct Q2) which allow * the construction to be done in an optimized way taking into * consideration that the matrix is Identity so making less flops. * */ // Initialize streaming and events magma_device_sync(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_int_t nbevents =2, nstream=2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t myevent[MagmaMaxGPUs][20]; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&myevent[dev][i],cudaEventDisableTiming); } } // Azzam 21/11/2012 // NOTE THAT dwork was of size 2*NE*Vblksiz+... // but I am thinking why not modifing it to NE*Vblksiz+... // BUT NO because the 2* is used because of making 2 streams working and so // they might be using dwork in parallel magmaDoubleComplex *dE[MagmaMaxGPUs]; magmaDoubleComplex *dwork[MagmaMaxGPUs], *dwork0[MagmaMaxGPUs], *dwork1[MagmaMaxGPUs]; //magmaDoubleComplex *dwvt[MagmaMaxGPUs]; magmaDoubleComplex *dwvt0[MagmaMaxGPUs], *dwvt1[MagmaMaxGPUs]; magmaDoubleComplex *dT0[MagmaMaxGPUs], *dV0[MagmaMaxGPUs], *dT1[MagmaMaxGPUs], *dV1[MagmaMaxGPUs]; magma_int_t dev; magma_int_t ldde = N; magma_int_t lddv = ldv; magma_int_t lddt = ldt; magma_int_t ne_loc = magma_ceildiv(NE, ngpu); if (ne_loc < 256) ne_loc=256; magma_int_t dwVTsiz = lddv*Vblksiz; // lddv*lddv + lddv*NE; // lddv*Vblksiz; magma_int_t dworksiz = ne_loc*Vblksiz; // lddv*Vblksiz; // NE*Vblksiz; ngpu = min(ngpu, magma_ceildiv(NE,ne_loc)); // Don't use GPU that will not have data. // copy dE to GPUs for (dev=0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_zmalloc( &dE[dev], ldde * ne_loc)) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dE\n" ); exit(-1); } if (MAGMA_SUCCESS != magma_zmalloc( &dwork[dev], 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) { printf ("!!!! 
magma_zbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } dwork0[dev] = dwork[dev]; // size = dworksiz; dwork1[dev] = dwork0[dev] + dworksiz; // size = dworksiz; dwvt0[dev] = dwork[dev] + 2*dworksiz; // size = dwVTsiz; dwvt1[dev] = dwvt0[dev] + dwVTsiz; // size = dwVTsiz; dV0[dev] = dwork[dev] + 2*dworksiz + 2*dwVTsiz; dT0[dev] = dV0[dev] + Vchunksiz*Vblksiz*lddv; dV1[dev] = dT0[dev] + Vchunksiz*Vblksiz*lddt; dT1[dev] = dV1[dev] + Vchunksiz*Vblksiz*lddv; magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_zsetmatrix_async( N, ie_loc, E+lde*ne_loc*dev, lde, dE(dev, 0, 0), ldde, streams[dev][1] ); } // make overlapped copies magma_int_t ncpy = 0; magma_int_t copyed=0, copyst=0; magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos; findVTsiz(N, NB, Vblksiz, &blkcnt, &nothing); flip = 0; /* SIDE LEFT: apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> traverse the Vs in reverse order, from q_n to q_1. * E is split by rows, so each apply acts on a block of rows (horizontal block). */ /* SIDE RIGHT: apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> traverse the Vs in normal order, from q_1 to q_n. * E is split by columns, so each apply acts on a block of columns (vertical block). */ #ifdef ENABLE_DEBUG printf(" APPLY Q_v22_m GPU with NGPU %d N %d, NE %d, NB %d, Vblksiz %d, versionL %d versionR %d SIDE %c \n", ngpu, N, NE, NB, Vblksiz, versionL, versionR, side); #endif /* * MagmaLeft */ if (side == MagmaLeft) { /* * Version 113: * loop over the block columns (nt) and for each find the * number of tiles (mt) in that block column. Then loop over mt, find * the size of the V's (Vm,Vn), and apply them to the corresponding * portion of E. */ if ( versionL == 113 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=nt-1; blkj >= 0; blkj--) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /* find the number of tiles for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /* loop over the tiles, find the size of the Vs, and apply them */ for (blki=mt; blki > 0; blki--) { /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + (mt-blki)*NB; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } /* calculate the pointers to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulge-chasing function */ //printf("here: blkj %d blki %d Vm %d Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos); magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid); // COPY Vchunksiz Vs and Vchunksiz Ts to the GPU and store them in dV0/dV1 and dT0/dT1 if (ncpy == 0) { // flip = 1 for this.
copyst = 0; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied if (mysiz > 0) { ncpy = 1; flip = 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]); } //printf("doing the first copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); } } if (blkid == copyst) { flip = ncpy % 2; copyst = copyed; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed); if (mysiz > 0) { ncpy = ncpy + 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]); } } else { // now I am working on dV1 so copy the next and put it on dV0 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos); for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_zsetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0[dev], vld, streams[dev][0]); magma_zsetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0[dev], tld, streams[dev][0]); } } } } if ((Vm > 0) && (Vn > 0)) { locpos = blkid%Vchunksiz; magma_int_t lcvpos = locpos*Vblksiz*lddv; magma_int_t lctpos = locpos*Vblksiz*lddt; //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d locvpos %5d loctpos %5d blkid %2d using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip); if (flip == 0) { for( dev = 0; dev < ngpu; ++dev ) { magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_int_t nr_bl = magma_ceildiv(ie_loc,10000); //nr of blocks magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][1] ); for (magma_int_t i=0; i < ie_loc; i += sz_bl) { ib = min(sz_bl, ie_loc-i); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], 
ib, dwvt0[dev], Vm); } magma_event_record( myevent[dev][0], streams[dev][0] ); } } else { for( dev = 0; dev < ngpu; ++dev ) { magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_int_t nr_bl = magma_ceildiv(ie_loc,10000); // number of blocks magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; // maximum block size (blocks of roughly equal size, a multiple of 64) magma_int_t ib; // size of current block magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][1]); magma_queue_wait_event( streams[dev][1], myevent[dev][0] ); for (magma_int_t i=0; i < ie_loc; i += sz_bl) { ib = min(sz_bl, ie_loc-i); //magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib); magma_zlarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib, dwvt1[dev], Vm); } magma_event_record( myevent[dev][1], streams[dev][1] ); } } } // end for (Vm &Vn) > 0 } // end for blki } // end for blkj } // end if version=113 /* * Version 114: * loop over the block rows (mt) and for each find diagonally the * number of tiles (nt) in that block row. Then loop over nt, find * the size of the V's (Vm,Vn), and apply them to the corresponding * portion of E. */ else { printf("versionL 114 not implemented in zbulge_applyQ_v2_m\n"); exit(-1); mt = magma_ceildiv((N-1),NB); for (blki = mt; blki > 0; blki--) { /* nbcolinvolvd = number of columns corresponding to this block row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /* find the number of tiles for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /* loop over the tiles, find the size of the Vs, and apply them */ for (blkj = nt-1; blkj >= 0; blkj--) { /* the index of the first row of the first column, i.e., * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /* calculate the pointers to the Vs and the Ts.
* Note that Vs and Ts have special storage done * by the bulge-chasing function */ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); //printf("here: blki %d rownbm %d mycol %d coled %d blkid %d vpos %d tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE); } */ } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end version 114 } // end LEFT /* * MagmaRight */ else { printf("Side 'R' not implemented in zbulge_applyQ_v2_m\n"); exit(-1); /* * Version 91: */ if ( versionR == 91 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=0; blkj < nt; blkj++) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /* find the number of tiles for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /* loop over the tiles, find the size of the Vs, and apply them */ for (blki=1; blki <= mt; blki++) { /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + (mt-blki)*NB; Vm = min( NB+Vblksiz-1, N-myrow); if ( (blkj == nt-1) && (blki == mt) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } mycol = blkj*Vblksiz; if ((Vm > 0) && (Vn > 0)) { /* calculate the pointers to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulge-chasing function */ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); */ } // end for (Vm &Vn) > 0 } // end for blki } // end for blkj } // end of version 91 /* * Version 92: */ else { mt = magma_ceildiv((N-1),NB); for (blki = 1; blki <= mt; blki++) { /* nbcolinvolvd = number of columns corresponding to this block row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /* find the number of tiles for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /* loop over the tiles, find the size of the Vs, and apply them */ for (blkj = 0; blkj < nt; blkj++) { /* the index of the first row of the first column, i.e., * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /* calculate the size of each lozenge of Vs = (Vm,Vn) */ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /* calculate the pointers to the Vs and the Ts.
* Note that Vs and Ts have special storage done * by the bulge-chasing function */ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_zsetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_zsetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); */ } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end of version 92 } // end RIGHT // copy back the dE from each GPU for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][1] ); magma_queue_wait_event( streams[dev][0], myevent[dev][0] ); magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_zgetmatrix_async( N, ie_loc, dE(dev, 0, 0), ldde, E+lde*ne_loc*dev, lde, streams[dev][0] ); magma_event_record( myevent[dev][0], streams[dev][0] ); } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][0] ); magma_device_sync(); // full device sync before freeing (the event waits above should make this redundant) magma_free(dwork[dev]); magma_free(dE[dev]); for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( myevent[dev][i] ); } for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; }
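/*
   The V/T staging in the loops above double-buffers chunks of Vchunksiz block
   reflectors: while one stream applies the chunk held in dV0/dT0, the other
   prefetches the next chunk into dV1/dT1, with `flip` selecting the active
   buffer.  Stripped-down pattern (illustrative sketch; it assumes chunk 0 was
   staged before the loop, omits the event synchronization the real code uses,
   and chunk_vpos()/chunk_tpos() are hypothetical helpers standing in for the
   copyst*Vblksiz*ldv and copyst*Vblksiz*ldt offsets computed above):

       magma_int_t flip = 0;
       for( magma_int_t c = 0; c < nchunk; ++c ) {
           if (c+1 < nchunk) {  // prefetch chunk c+1 on the other stream
               magma_zsetmatrix_async( vld, Vblksiz, V( chunk_vpos(c+1) ), vld,
                                       flip ? dV0[dev] : dV1[dev], vld,
                                       streams[dev][1-flip] );
               magma_zsetmatrix_async( tld, Vblksiz, T( chunk_tpos(c+1) ), tld,
                                       flip ? dT0[dev] : dT1[dev], tld,
                                       streams[dev][1-flip] );
           }
           magmablasSetKernelStream( streams[dev][flip] );
           // ... apply chunk c via magma_zlarfb_gpu_gemm using
           //     (flip ? dV1 : dV0)[dev] and (flip ? dT1 : dT0)[dev] ...
           flip = 1 - flip;
       }
*/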
/** Purpose ------- ZGETRS solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by ZGETRF_NOPIV_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) The factors L and U from the factorization A = L*U as computed by ZGETRF_NOPIV_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrs_nopiv_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magma_int_t *info) { // Constants const magmaDoubleComplex c_one = MAGMA_Z_ONE; // Local variables bool notran = (trans == MagmaNoTrans); *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if (notran) { /* Solve A * X = B. */ if ( nrhs == 1) { magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1, queue ); magma_ztrsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); } else { magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } else { /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_ztrsv( MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); magma_ztrsv( MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1, queue ); } else { magma_ztrsm( MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_ztrsm( MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } magma_queue_destroy( queue ); return *info; }
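/*
   Typical use (sketch): factor once with the no-pivoting LU, then solve one or
   more right-hand sides.  Assumes dA and dB already hold A and B on the
   device; magma_zgetrf_nopiv_gpu is the factorization this solver expects.

       magma_int_t info;
       magma_zgetrf_nopiv_gpu( n, n, dA, ldda, &info );         // A = L*U (no pivoting)
       if (info == 0)
           magma_zgetrs_nopiv_gpu( MagmaNoTrans, n, nrhs,
                                   dA, ldda, dB, lddb, &info ); // B is overwritten by X
*/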