/***************************************************************************//** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U**H * U, if uplo = MagmaUpper, or A = L * L**H, if uplo = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. This uses multiple queues to overlap communication and computation. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If uplo = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If uplo = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_potrf *******************************************************************************/ extern "C" magma_int_t magma_spotrf( magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info ) { #define A(i_, j_) (A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const float d_one = 1.0; const float d_neg_one = -1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); bool upper = (uplo == MagmaUpper); magma_int_t j, jb, ldda, nb; magmaFloat_ptr dA = NULL; /* Check arguments */ *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_spotrf_nb( n ); if (nb <= 1 || nb >= n) { lapackf77_spotrf( uplo_, &n, A, &lda, info ); } else { /* Use hybrid blocked code. */ ldda = magma_roundup( n, 32 ); magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ return magma_spotrf_m( ngpu, uplo, n, A, lda, info ); } if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda )) { /* alloc failed so call the non-GPU-resident version */ return magma_spotrf_m( ngpu, uplo, n, A, lda, info ); } magma_queue_t queues[2] = { NULL, NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. */ jb = min( nb, n-j ); magma_ssetmatrix_async( jb, n-j, A(j, j), lda, dA(j, j), ldda, queues[1] ); magma_ssyrk( MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_sgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, queues[0] ); if (j+jb < n) { magma_sgemm( MagmaConjTrans, MagmaNoTrans, jb, n-j-jb, j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // this could be on any queue; it isn't needed until exit. magma_sgetmatrix_async( j, jb, dA(0, j), ldda, A(0, j), lda, queues[0] ); lapackf77_spotrf( MagmaUpperStr, &jb, A(j, j), &lda, info ); if (*info != 0) { *info = *info + j; break; } magma_ssetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0] ); magma_queue_sync( queues[0] ); if (j+jb < n) { magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, n-j-jb, c_one, dA(j, j ), ldda, dA(j, j+jb), ldda, queues[1] ); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. jb = min( nb, n-j ); magma_ssetmatrix_async( n-j, jb, A(j, j), lda, dA(j, j), ldda, queues[1] ); magma_ssyrk( MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_sgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, queues[0] ); if (j+jb < n) { magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-j-jb, jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // this could be on any queue; it isn't needed until exit. magma_sgetmatrix_async( jb, j, dA(j, 0), ldda, A(j, 0), lda, queues[0] ); lapackf77_spotrf( MagmaLowerStr, &jb, A(j, j), &lda, info ); if (*info != 0) { *info = *info + j; break; } magma_ssetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, queues[0] ); magma_queue_sync( queues[0] ); if (j+jb < n) { magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-j-jb, jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[1] ); } } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); } return *info; } /* magma_spotrf */
/** @deprecated Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA COMPLEX_16 array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv COMPLEX_16 array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF COMPLEX_16 array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex_ptr dauxv, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; magmaDoubleComplex z__1; magma_int_t k, rk; magmaDoubleComplex_ptr dAks; magmaDoubleComplex tauk = MAGMA_Z_ZERO; magma_int_t pvt; double tol3z; magma_int_t itemp; double lsticc; magmaDouble_ptr dlsticcs; magma_dmalloc( &dlsticcs, 1+256*(n+255)/256 ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &dAks, nb ); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS idamax; pvt, k are 0-based. pvt = k + magma_idamax( n-k, &vn1[k], ione, queue ) - 1; if (pvt != k) { /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; magmablas_zswap( m, dA(0, pvt), ione, dA(0, k), ione, queue ); magmablas_zswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue ); } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione, queue ); #endif } /* Generate elementary reflector H(k). */ magma_zlarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, dA(rk, k), 1, queue ); else magma_zcopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1, queue ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Multiply on GPU */ magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1, queue ); } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ if (k > 0) { z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione, queue ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione, queue ); /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione, queue ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda, queue ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda, queue ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs, queue ); //magma_device_sync(); magma_dgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue ); } ++k; } magma_zcopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs, queue ); magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue ); } magma_free( dAks ); magma_free( dlsticcs ); magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_zlaqps */
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; magmaFloatComplex *d_lAT[MagmaMaxGPUs]; magmaFloatComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t queues[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* create the queues */ for( d=0; d < ngpu; d++ ) { magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); } /* Function Body */ nb = magma_get_cgetrf_nb( m, n ); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ magma_device_t orig_dev; magma_getdevice( &orig_dev ); maxm = magma_roundup( m, 32 ); if ( ngpu > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 ); lddat = magma_ceildiv( n, nb ); /* number of block columns */ lddat = magma_ceildiv( lddat, ngpu ); /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = magma_roundup( lddat, 32 ); /* make it a multiple of 32 */ for (i=0; i < ngpu; i++) { magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] ); } for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_queue_sync(queues[i][0]); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*ngpu )) { for (i=0; i < ngpu; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and queues */ magma_cgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, queues, info); /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ctranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] ); magma_queue_sync(queues[d][0]); magma_queue_sync(queues[d][1]); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); } /* end of for d=1,..,ngpu */ magma_setdevice( orig_dev ); magma_free_pinned( work ); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_destroy( queues[d][0] ); magma_queue_destroy( queues[d][1] ); } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_dopts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); real_Double_t res; magma_d_matrix A={Magma_CSR}, AT={Magma_CSR}, A2={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; int i=1; real_Double_t start, end; CHECK( magma_dparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_dm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_d_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", int(A.num_rows), int(A.num_cols), int(A.nnz) ); // scale matrix CHECK( magma_dmscale( &A, zopts.scaling, queue )); // remove nonzeros in matrix start = magma_sync_wtime( queue ); for (int j=0; j<10; j++) CHECK( magma_dmcsrcompressor( &A, queue )); end = magma_sync_wtime( queue ); printf( " > MAGMA CPU: %.2e seconds.\n", (end-start)/10 ); // transpose CHECK( magma_dmtranspose( A, &AT, queue )); // convert, copy back and forth to check everything works CHECK( magma_dmconvert( AT, &B, Magma_CSR, Magma_CSR, queue )); magma_dmfree(&AT, queue ); CHECK( magma_dmtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&B, queue ); start = magma_sync_wtime( queue ); for (int j=0; j<10; j++) CHECK( magma_dmcsrcompressor_gpu( &B_d, queue )); end = magma_sync_wtime( queue ); printf( " > MAGMA GPU: %.2e seconds.\n", (end-start)/10 ); CHECK( magma_dmtransfer( B_d, &B, Magma_DEV, Magma_CPU, queue )); magma_dmfree(&B_d, queue ); CHECK( magma_dmconvert( B, &AT, Magma_CSR, Magma_CSR, queue )); magma_dmfree(&B, queue ); // transpose back CHECK( magma_dmtranspose( AT, &A2, queue )); magma_dmfree(&AT, queue ); CHECK( magma_dmdiff( A, A2, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester matrix compressor: ok\n"); else printf("%% tester matrix compressor: failed\n"); magma_dmfree(&A, queue ); magma_dmfree(&A2, queue ); i++; } cleanup: magma_dmfree(&AT, queue ); magma_dmfree(&B, queue ); magma_dmfree(&A, queue ); magma_dmfree(&A2, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/** Purpose ------- ZGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) COMPLEX_16 array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_zgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_zgeqrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; magmaDoubleComplex *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_zgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(magmaDoubleComplex)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_zgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_zsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_zgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); zsplit_diag_block(ib, work(i), ldwork, ut); magma_zsetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_zsetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_zsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_zgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_zgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_zsetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_zgeqrf_gpu */
extern "C" magma_int_t magma_dgeqrf_ooc(magma_int_t m, magma_int_t n, double *a, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEQRF_OOC computes a QR factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This is an out-of-core (ooc) version that is similar to magma_dgeqrf but the difference is that this version can use a GPU even if the matrix does not fit into the GPU memory at once. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) double *da, *dwork; double c_one = MAGMA_D_ONE; int k, lddwork, ldda; *info = 0; int nb = magma_get_dgeqrf_nb(min(m, n)); int lwkopt = n * nb; work[0] = MAGMA_D_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Check how much memory do we have */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(double); magma_int_t IB, NB = (magma_int_t)(0.8*freeMem/m); NB = (NB / nb) * nb; if (NB >= n) return magma_dgeqrf(m, n, a, lda, tau, work, lwork, info); k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((NB+31)/32)*32+nb; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_dmalloc( &da, (NB + nb)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); // magmablasSetKernelStream(stream[1]); double *ptr = da + ldda * NB; dwork = da + ldda*(NB + nb); /* start the main loop over the blocks that fit in the GPU memory */ for(int i=0; i<n; i+=NB) { IB = min(n-i, NB); //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB); /* 1. Copy the next part of the matrix to the GPU */ magma_dsetmatrix_async( (m), IB, a_ref(0,i), lda, da_ref(0,0), ldda, stream[0] ); magma_queue_sync( stream[0] ); /* 2. Update it with the previous transformations */ for(int j=0; j<min(i,k); j+=nb) { magma_int_t ib = min(k-j, nb); /* Get a panel in ptr. */ // 1. Form the triangular factor of the block reflector // 2. Send it to the GPU. // 3. Put 0s in the upper triangular part of V. // 4. Send V to the GPU in ptr. // 5. Update the matrix. // 6. Restore the upper part of V. magma_int_t rows = m-j; lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(j,j), &lda, tau+j, work, &ib); magma_dsetmatrix_async( ib, ib, work, ib, dwork, lddwork, stream[1] ); dpanel_to_q(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib); magma_dsetmatrix_async( rows, ib, a_ref(j,j), lda, ptr, rows, stream[1] ); magma_queue_sync( stream[1] ); magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, IB, ib, ptr, rows, dwork, lddwork, da_ref(j, 0), ldda, dwork+ib, lddwork); dq_to_panel(MagmaUpper, ib, a_ref(j,j), lda, work+ib*ib); } /* 3. Do a QR on the current part */ if (i<k) magma_dgeqrf2_gpu(m-i, IB, da_ref(i,0), ldda, tau+i, info); /* 4. Copy the current part back to the CPU */ magma_dgetmatrix_async( (m), IB, da_ref(0,0), ldda, a_ref(0,i), lda, stream[0] ); } magma_queue_sync( stream[0] ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_dgeqrf_ooc */
/** Purpose ------- SSYEVD_GPU computes all eigenvalues and, optionally, eigenvectors of a real symmetric matrix A. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA, N). On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[out] w REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param wA (workspace) REAL array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_ssytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_ssyev_driver ********************************************************************/ extern "C" magma_int_t magma_ssyevd_gpu( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, float *w, float *wA, magma_int_t ldwa, float *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { magma_int_t ione = 1; float d__1; float eps; magma_int_t inde; float anrm; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indwrk, liwmin; magma_int_t llwork; float smlnum; magma_int_t lquery; magmaFloat_ptr dwork; magma_int_t lddc = ldda; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (lower || (uplo == MagmaUpper))) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } magma_int_t nb = magma_get_ssytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } work[0] = magma_smake_lwork( lwmin ); iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -10; } else if ((liwork < liwmin) && ! lquery) { *info = -12; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { magma_int_t lda = n; float *A; magma_smalloc_cpu( &A, lda*n ); magma_sgetmatrix( n, n, dA, ldda, A, lda, queue ); lapackf77_ssyevd( lapack_vec_const(jobz), lapack_uplo_const(uplo), &n, A, &lda, w, work, &lwork, iwork, &liwork, info ); magma_ssetmatrix( n, n, A, lda, dA, ldda, queue ); magma_free_cpu( A ); magma_queue_destroy( queue ); return *info; } // ssytrd2_gpu requires ldda*ceildiv(n,64) + 2*ldda*nb // sormtr_gpu requires lddc*n // slansy requires n magma_int_t ldwork = max( ldda*magma_ceildiv(n,64) + 2*ldda*nb, lddc*n ); ldwork = max( ldwork, n ); if ( wantz ) { // sstedx requires 3n^2/2 ldwork = max( ldwork, 3*n*(n/2 + 1) ); } if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt( smlnum ); rmax = magma_ssqrt( bignum ); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_slansy( MagmaMaxNorm, uplo, n, dA, ldda, dwork, ldwork, queue ); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_slascl( uplo, 0, 0, 1., sigma, n, n, dA, ldda, queue, info ); } /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */ // ssytrd work: e (n) + tau (n) + llwork (n*nb) ==> 2n + n*nb // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2) ==> 1 + 6n + 2n^2 inde = 0; indtau = inde + n; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); #ifdef FAST_SYMV magma_ssytrd2_gpu( uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dwork, ldwork, &iinfo ); #else magma_ssytrd_gpu( uplo, n, dA, ldda, w, &work[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo ); #endif timer_stop( time ); #ifdef FAST_SYMV timer_printf( "time ssytrd2 = %6.2f\n", time ); #else timer_printf( "time ssytrd = %6.2f\n", time ); #endif /* For eigenvalues only, call SSTERF. For eigenvectors, first call SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call SORMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf( &n, w, &work[inde], info ); } else { timer_start( time ); magma_sstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde], &work[indwrk], n, &work[indwk2], llwrk2, iwork, liwork, dwork, info ); timer_stop( time ); timer_printf( "time sstedx = %6.2f\n", time ); timer_start( time ); magma_ssetmatrix( n, n, &work[indwrk], n, dwork, lddc, queue ); magma_sormtr_gpu( MagmaLeft, uplo, MagmaNoTrans, n, n, dA, ldda, &work[indtau], dwork, lddc, wA, ldwa, &iinfo ); magma_scopymatrix( n, n, dwork, lddc, dA, ldda, queue ); timer_stop( time ); timer_printf( "time sormtr + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { d__1 = 1. / sigma; blasf77_sscal( &n, &d__1, w, &ione ); } work[0] = magma_smake_lwork( lwmin ); iwork[0] = liwmin; magma_queue_destroy( queue ); magma_free( dwork ); return *info; } /* magma_ssyevd_gpu */
/** Purpose ------- ZHEGST_GPU reduces a complex Hermitian-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U^H)*A*inv(U) or inv(L)*A*inv(L^H) If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U^H or L^H*A*L. B must have been previously factorized as U^H*U or L*L^H by ZPOTRF. Arguments --------- @param[in] itype INTEGER = 1: compute inv(U^H)*A*inv(U) or inv(L)*A*inv(L^H); = 2 or 3: compute U*A*U^H or L^H*A*L. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored and B is factored as U^H*U; - = MagmaLower: Lower triangle of A is stored and B is factored as L*L^H. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] dA COMPLEX_16 array, on the GPU device, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the transformed matrix, stored in the same format as A. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in] dB COMPLEX_16 array, on the GPU device, dimension (LDDB,N) The triangular factor from the Cholesky factorization of B, as returned by ZPOTRF. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zheev_comp ********************************************************************/ extern "C" magma_int_t magma_zhegst_gpu( magma_int_t itype, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_const_ptr dB, magma_int_t lddb, magma_int_t *info) { #define A(i_, j_) (work + (i_) + (j_)*lda ) #define B(i_, j_) (work + (i_) + (j_)*ldb + nb*ldb) #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dB(i_, j_) (dB + (i_) + (j_)*lddb) /* Constants */ const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const magmaDoubleComplex c_half = MAGMA_Z_HALF; const magmaDoubleComplex c_neg_half = MAGMA_Z_NEG_HALF; const double d_one = 1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t k, kb, kb2, nb; magma_int_t lda; magma_int_t ldb; magmaDoubleComplex *work; bool upper = (uplo == MagmaUpper); /* Test the input parameters. */ *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! upper && uplo != MagmaLower) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_zhegst_nb( n ); lda = nb; ldb = nb; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, 2*nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); /* Use hybrid blocked code */ if (itype == 1) { if (upper) { kb = min( n, nb ); /* Compute inv(U^H)*A*inv(U) */ magma_zgetmatrix_async( kb, kb, dA(0, 0), ldda, A(0, 0), lda, queues[0] ); magma_zgetmatrix_async( kb, kb, dB(0, 0), lddb, B(0, 0), ldb, queues[0] ); for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); kb2 = min( n-k-nb, nb ); magma_queue_sync( queues[0] ); // finish get dA(k,k) -> A(0,0) and dB(k,k) -> B(0,0) /* Update the upper triangle of A(k:n,k:n) */ lapackf77_zhegst( &itype, uplo_, &kb, A(0,0), &lda, B(0,0), &ldb, info ); magma_zsetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, queues[0] ); if (k+kb < n) { magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k,k), lddb, dA(k,k+kb), ldda, queues[1] ); magma_queue_sync( queues[0] ); // finish set dA(k,k) // Start copying next B block magma_zgetmatrix_async( kb2, kb2, dB(k+kb, k+kb), lddb, B(0, 0), ldb, queues[0] ); magma_zhemm( MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k,k+kb), ldda, queues[1] ); magma_zher2k( MagmaUpper, MagmaConjTrans, n-k-kb, kb, c_neg_one, dA(k,k+kb), ldda, dB(k,k+kb), lddb, d_one, dA(k+kb,k+kb), ldda, queues[1] ); // Start copying next A block magma_queue_sync( queues[1] ); magma_zgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(0, 0), lda, queues[0] ); magma_zhemm( MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k,k+kb), ldda, queues[1] ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k+kb,k+kb), lddb, dA(k,k+kb), ldda, queues[1] ); } } } else { kb = min( n, nb ); /* Compute inv(L)*A*inv(L^H) */ magma_zgetmatrix_async( kb, kb, dA(0, 0), ldda, A(0, 0), lda, queues[0] ); magma_zgetmatrix_async( kb, kb, dB(0, 0), lddb, B(0, 0), ldb, queues[0] ); for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); kb2 = min( n-k-nb, nb ); magma_queue_sync( queues[0] ); // finish get dA(k,k) -> A(0,0) and dB(k,k) -> B(0,0) /* Update the lower triangle of A(k:n,k:n) */ lapackf77_zhegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info ); magma_zsetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, queues[0] ); if (k+kb < n) { magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k,k), lddb, dA(k+kb,k), ldda, queues[1] ); magma_queue_sync( queues[0] ); // finish set dA(k,k) // Start copying next B block magma_zgetmatrix_async( kb2, kb2, dB(k+kb, k+kb), lddb, B(0, 0), ldb, queues[0] ); magma_zhemm( MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda, queues[1] ); magma_zher2k( MagmaLower, MagmaNoTrans, n-k-kb, kb, c_neg_one, dA(k+kb,k), ldda, dB(k+kb,k), lddb, d_one, dA(k+kb,k+kb), ldda, queues[1] ); // Start copying next A block magma_queue_sync( queues[1] ); magma_zgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(0, 0), lda, queues[0] ); magma_zhemm( MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb,k), ldda, queues[1] ); magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k+kb,k+kb), lddb, dA(k+kb,k), ldda, queues[1] ); } } } } else { // itype == 2 or 3 if (upper) { /* Compute U*A*U^H */ for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); magma_zgetmatrix_async( kb, kb, dA(k, k), ldda, A(0, 0), lda, queues[0] ); magma_zgetmatrix_async( kb, kb, dB(k, k), lddb, B(0, 0), ldb, queues[0] ); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ztrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, k, kb, c_one, dB(0,0), lddb, dA(0,k), ldda, queues[1] ); magma_zhemm( MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0,k), ldda, queues[1] ); magma_zher2k( MagmaUpper, MagmaNoTrans, k, kb, c_one, dA(0,k), ldda, dB(0,k), lddb, d_one, dA(0,0), ldda, queues[1] ); magma_zhemm( MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0,k), ldda, queues[1] ); magma_ztrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, k, kb, c_one, dB(k,k), lddb, dA(0,k), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // finish get dA(k,k) -> A(0,0) and dB(k,k) -> B(0,0) lapackf77_zhegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info ); magma_zsetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, queues[1] ); magma_queue_sync( queues[1] ); // wait for A(0,0) before getting next panel } } else { /* Compute L^H*A*L */ for (k = 0; k < n; k += nb) { kb = min( n-k, nb ); magma_zgetmatrix_async( kb, kb, dA(k, k), ldda, A(0, 0), lda, queues[0] ); magma_zgetmatrix_async( kb, kb, dB(k, k), lddb, B(0, 0), ldb, queues[0] ); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ztrmm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, kb, k, c_one, dB(0,0), lddb, dA(k,0), ldda, queues[1] ); magma_zhemm( MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k,0), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_zher2k( MagmaLower, MagmaConjTrans, k, kb, c_one, dA(k,0), ldda, dB(k,0), lddb, d_one, dA(0,0), ldda, queues[1] ); magma_zhemm( MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k,0), ldda, queues[1] ); magma_ztrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, kb, k, c_one, dB(k,k), lddb, dA(k,0), ldda, queues[1] ); } magma_queue_sync( queues[0] ); // finish get dA(k,k) -> A(0,0) and dB(k,k) -> B(0,0) lapackf77_zhegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info ); magma_zsetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, queues[1] ); magma_queue_sync( queues[1] ); // wait for A(0,0) before getting next panel } } } magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free_pinned( work ); return *info; } /* magma_zhegst_gpu */
/** Purpose ------- CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H) If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L. B must have been previously factorized as U**H*U or L*L**H by CPOTRF. Arguments --------- @param[in] itype INTEGER = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H); = 2 or 3: compute U*A*U**H or L**H*A*L. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored and B is factored as U**H*U; - = MagmaLower: Lower triangle of A is stored and B is factored as L*L**H. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the transformed matrix, stored in the same format as A. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] B COMPLEX array, dimension (LDB,N) The triangular factor from the Cholesky factorization of B, as returned by CPOTRF. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_chegst(magma_int_t itype, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define B(i, j) (B + (j)*ldb + (i)) #define dA(i, j) (dw + (j)*ldda + (i)) #define dB(i, j) (dw + n*ldda + (j)*lddb + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nb; magma_int_t k, kb, kb2; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF; magmaFloatComplex *dw; magma_int_t ldda = n; magma_int_t lddb = n; float d_one = 1.0; int upper = (uplo == MagmaUpper); /* Test the input parameters. */ *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! upper && uplo != MagmaLower) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; if (MAGMA_SUCCESS != magma_cmalloc( &dw, 2*n*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } nb = magma_get_chegst_nb(n); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_csetmatrix( n, n, A(0, 0), lda, dA(0, 0), ldda ); magma_csetmatrix( n, n, B(0, 0), ldb, dB(0, 0), lddb ); /* Use hybrid blocked code */ if (itype == 1) { if (upper) { /* Compute inv(U')*A*inv(U) */ for (k = 0; k < n; k += nb) { kb = min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the upper triangle of A(k:n,k:n) */ lapackf77_chegst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { magma_ctrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k,k), lddb, dA(k,k+kb), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_cher2k(MagmaUpper, MagmaConjTrans, n-k-kb, kb, c_neg_one, dA(k,k+kb), ldda, dB(k,k+kb), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(k+kb, k+kb), lda, stream[1] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_ctrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k+kb,k+kb), lddb, dA(k,k+kb), ldda); magma_queue_sync( stream[1] ); } } magma_queue_sync( stream[0] ); } else { /* Compute inv(L)*A*inv(L') */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the lower triangle of A(k:n,k:n) */ lapackf77_chegst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { magma_ctrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k,k), lddb, dA(k+kb,k), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_cher2k(MagmaLower, MagmaNoTrans, n-k-kb, kb, c_neg_one, dA(k+kb,k), ldda, dB(k+kb,k), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(k+kb, k+kb), lda, stream[1] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k+kb,k+kb), lddb, dA(k+kb,k), ldda); } magma_queue_sync( stream[1] ); } } magma_queue_sync( stream[0] ); } else { if (upper) { /* Compute U*A*U' */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(k, k), lda, stream[0] ); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, k, kb, c_one, dB(0,0), lddb, dA(0,k), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_queue_sync( stream[1] ); magma_cher2k(MagmaUpper, MagmaNoTrans, k, kb, c_one, dA(0,k), ldda, dB(0,k), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_ctrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, k, kb, c_one, dB(k,k), lddb, dA(0,k), ldda); } magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(k, k), &lda, B(k, k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } else { /* Compute L'*A*L */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(k, k), lda, stream[0] ); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, kb, k, c_one, dB(0,0), lddb, dA(k,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_queue_sync( stream[1] ); magma_cher2k(MagmaLower, MagmaConjTrans, k, kb, c_one, dA(k,0), ldda, dB(k,0), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_ctrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, kb, k, c_one, dB(k,k), lddb, dA(k,0), ldda); } magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } } magma_cgetmatrix( n, n, dA(0, 0), ldda, A(0, 0), lda ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dw ); return *info; } /* magma_chegst_gpu */
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_sgeqrf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) float *dA, *dwork, *dT; float c_one = MAGMA_S_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_sgeqrf_nb(min(m, n)); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max(n*nb, 2*nb*nb); work[0] = MAGMA_S_MAKE( (float)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = ((n+31)/32)*32 - nb; ldda = ((m+31)/32)*32; magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multiple-GPU interface */ return magma_sgeqrf4(ngpu, m, n, A, lda, tau, work, lwork, info); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_sgeqrf_ooc(m, n, A, lda, tau, work, lwork, info); } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_ssetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i > 0) { /* download i-th panel */ magma_queue_sync( stream[1] ); magma_sgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT, nb, dA(old_i, old_i+2*old_ib), ldda, dwork, lddwork); magma_sgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[0] ); } magma_int_t rows = m-i; lapackf77_sgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib); spanel_to_q(MagmaUpper, ib, A(i,i), lda, work+ib*ib); /* download the i-th V matrix */ magma_ssetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_ssetmatrix_async( ib, ib, work, ib, dT, nb, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+ib < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); sq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT, nb, dA(i, i+ib), ldda, dwork, lddwork); sq_to_panel(MagmaUpper, ib, A(i,i), lda, work+ib*ib); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i != 0) { magma_sgetmatrix_async( m, ib, dA(0,i), ldda, A(0,i), lda, stream[1] ); magma_queue_sync( stream[1] ); } magma_int_t rows = m-i; lapackf77_sgeqrf(&rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); magma_free( dA ); return *info; } /* magma_sgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_copts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magma_c_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_c_matrix x={Magma_CSR}, b={Magma_CSR}, t={Magma_CSR}; magma_c_matrix x1={Magma_CSR}, x2={Magma_CSR}; //Chronometry real_Double_t tempo1, tempo2; int i=1; CHECK( magma_cparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; CHECK( magma_csolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } printf( "\n%% matrix info: %d-by-%d with %d nonzeros\n\n", int(A.num_rows), int(A.num_cols), int(A.nnz) ); // for the eigensolver case zopts.solver_par.ev_length = A.num_rows; CHECK( magma_ceigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix CHECK( magma_cmscale( &A, zopts.scaling, queue )); CHECK( magma_cmconvert( A, &B, Magma_CSR, zopts.output_format, queue )); CHECK( magma_cmtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess CHECK( magma_cvinit( &b, Magma_DEV, A.num_cols, 1, one, queue )); CHECK( magma_cvinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); CHECK( magma_cvinit( &t, Magma_DEV, A.num_cols, 1, zero, queue )); CHECK( magma_cvinit( &x1, Magma_DEV, A.num_cols, 1, zero, queue )); CHECK( magma_cvinit( &x2, Magma_DEV, A.num_cols, 1, zero, queue )); //preconditioner CHECK( magma_c_precondsetup( B_d, b, &zopts.solver_par, &zopts.precond_par, queue ) ); float residual; CHECK( magma_cresidual( B_d, b, x, &residual, queue )); zopts.solver_par.init_res = residual; printf("data = [\n"); printf("%%runtime left preconditioner:\n"); tempo1 = magma_sync_wtime( queue ); info = magma_c_applyprecond_left( MagmaNoTrans, B_d, b, &x1, &zopts.precond_par, queue ); tempo2 = magma_sync_wtime( queue ); if( info != 0 ){ printf("error: preconditioner returned: %s (%d).\n", magma_strerror( info ), int(info) ); } CHECK( magma_cresidual( B_d, b, x1, &residual, queue )); printf("%.8e %.8e\n", tempo2-tempo1, residual ); printf("%%runtime right preconditioner:\n"); tempo1 = magma_sync_wtime( queue ); info = magma_c_applyprecond_right( MagmaNoTrans, B_d, b, &x2, &zopts.precond_par, queue ); tempo2 = magma_sync_wtime( queue ); if( info != 0 ){ printf("error: preconditioner returned: %s (%d).\n", magma_strerror( info ), int(info) ); } CHECK( magma_cresidual( B_d, b, x2, &residual, queue )); printf("%.8e %.8e\n", tempo2-tempo1, residual ); printf("];\n"); info = magma_c_applyprecond_left( MagmaNoTrans, B_d, b, &t, &zopts.precond_par, queue ); info = magma_c_applyprecond_right( MagmaNoTrans, B_d, t, &x, &zopts.precond_par, queue ); CHECK( magma_cresidual( B_d, b, x, &residual, queue )); zopts.solver_par.final_res = residual; magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); magma_cmfree(&B_d, queue ); magma_cmfree(&B, queue ); magma_cmfree(&A, queue ); magma_cmfree(&x, queue ); magma_cmfree(&x1, queue ); magma_cmfree(&x2, queue ); magma_cmfree(&b, queue ); magma_cmfree(&t, queue ); i++; } cleanup: magma_cmfree(&B_d, queue ); magma_cmfree(&B, queue ); magma_cmfree(&A, queue ); magma_cmfree(&x, queue ); magma_cmfree(&x1, queue ); magma_cmfree(&x2, queue ); magma_cmfree(&b, queue ); magma_cmfree(&t, queue ); magma_csolverinfo_free( &zopts.solver_par, &zopts.precond_par, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/** Purpose ------- CGEQRS solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by CGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by CGEQRF_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau COMPLEX array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_CGEQRF_GPU. @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in,out] dT COMPLEX array that is the output (the 6th argument) of magma_cgeqrf_gpu of size 2*MIN(M, N)*NB + ceil(N/32)*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size (ceil(N/32)*32)* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_cgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgels_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqrs_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_const_ptr dA, magma_int_t ldda, magmaFloatComplex const *tau, magmaFloatComplex_ptr dT, magmaFloatComplex_ptr dB, magma_int_t lddb, magmaFloatComplex *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_) (dT + (lddwork + (i_))*nb) /* Constants */ const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const magma_int_t ione = 1; /* Local variables */ magmaFloatComplex_ptr dwork; magma_int_t i, min_mn, lddwork, rows, ib; magma_int_t nb = magma_get_cgeqrf_nb( m, n ); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; bool lquery = (lwork == -1); hwork[0] = magma_cmake_lwork( lwkopt ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min(m,n); if (min_mn == 0) { hwork[0] = c_one; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* B := Q^H * B */ magma_cunmqr_gpu( MagmaLeft, Magma_ConjTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { magma_queue_destroy( queue ); return *info; } /* Solve R*X = B(1:n,:) */ lddwork= min_mn; if (nb < min_mn) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (min_mn - 1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_cunmqr_gpu, hwork contains // the last block of A and B (i.e., C in cunmqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_ctrsm and drop the csetmatrix. if ( nrhs == 1 ) { blasf77_ctrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ctrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_csetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork, queue ); // update c if (nrhs == 1) { magma_cgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1, queue ); } else { magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb, queue ); } magma_int_t start = i-nb; if (nb < min_mn) { for (i = start; i >= 0; i -= nb) { ib = min(min_mn - i, nb); rows = m - i; if (i + ib < n) { if (nrhs == 1) { magma_cgemv( MagmaNoTrans, ib, ib, c_one, dT(i), ib, dB+i, 1, c_zero, dwork+i, 1, queue ); magma_cgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1, queue ); } else { magma_cgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, dT(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork, queue ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb, queue ); } } } } magma_ccopymatrix( n, nrhs, dwork, lddwork, dB, lddb, queue ); magma_queue_destroy( queue ); return *info; }
/** Purpose ------- CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu) On entry, the Hermitian matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); magmaFloatComplex *work; bool upper = (uplo == MagmaUpper); magmaFloatComplex *dwork[MagmaMaxGPUs]; magma_queue_t queues[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_cpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. */ magma_setdevice(0); magma_queue_create( 0, &queues[0][0] ); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( n, n, d_lA[0], ldda, work, n, queues[0][0] ); lapackf77_cpotrf(uplo_, &n, work, &n, info); magma_csetmatrix( n, n, work, n, d_lA[0], ldda, queues[0][0] ); magma_free_pinned( work ); magma_queue_destroy( queues[0][0] ); } else { lddp = magma_roundup( n, nb ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) { magma_queue_create( d, &queues[d][j] ); } for( j=0; j < 5; j++ ) { magma_event_create( &event[d][j] ); } } magma_setdevice(0); h = 1; //ngpu; //magma_ceildiv( n, nb ); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three queues */ magma_cpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, queues, event, info); } else { /* with three queues */ magma_cpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, queues, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( queues[d][j] ); magma_queue_destroy( queues[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_cpotrf_mgpu */
/** Purpose ------- DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by DGEQRF. This version recomputes the T matrices on the CPU and sends them to the GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A DOUBLE_PRECISION array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF_GPU. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dorgqr2( magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; magma_int_t nb = magma_get_dgeqrf_nb(min(m, n)); magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; double *dA, *dV, *dW, *dT, *T; double *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; dT = dA + ldda*n + ldda*nb + lddwork*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_dmalloc_cpu( &work, lwork ); T = work; if (work == NULL) { magma_free( dA ); magma_free_cpu( work ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } double *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* lapackf77_dorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_dlacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_dlaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_dlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_dsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_dlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_dlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_dsetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &mi, &ib, A(i,i), &lda, &tau[i], T, &nb); magma_dsetmatrix_async( ib, ib, T, nb, dT, nb, stream ); // set panel to identity magmablas_dlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_dlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); magma_queue_sync( stream ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT, nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_dgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dorgqr */
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magmablas_cgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
extern "C" magma_int_t magma_sorgqr(magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) REAL array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. DT (input) REAL array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_sgeqrf_gpu. NB (input) INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; float *dA, *dV, *dW; float *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n + ldda*nb + lddwork*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_smalloc_cpu( &work, lwork ); if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } float *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* // Replacing this with the following 4 routines works but sorgqr is slow for // k smaller than the sorgqr's blocking size (new version can be up to 60x faster) lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_slacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_slaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_slarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_ssetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_slaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_ssetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_slaset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_sgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_sorgqr */
/** Purpose ------- DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version has LAPACK-complaint arguments. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Other versions (magma_dgeqrf_gpu and magma_dgeqrf3_gpu) store the intermediate T matrices. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf2_gpu( magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, double *tau, magma_int_t *info ) { #define dA(a_1,a_2) ( dA+(a_2)*(ldda) + (a_1)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) double *dwork; double *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; /* Function Body */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; /* download i-th panel */ magma_queue_sync( stream[1] ); magma_dgetmatrix_async( rows, ib, dA(i,i), ldda, work_ref(i), ldwork, stream[0] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); magma_dsetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, stream[1] ); } magma_queue_sync( stream[0] ); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); dpanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ magma_dsetmatrix_async( rows, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[0] ); /* download the T matrix */ magma_queue_sync( stream[1] ); magma_dsetmatrix_async( ib, ib, hwork, ib, dwork, lddwork, stream[0] ); magma_queue_sync( stream[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); } else { magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork, lddwork, dA(i, i+ib), ldda, dwork+ib, lddwork); dq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_dsetmatrix_async( ib, ib, work_ref(i), ldwork, dA(i,i), ldda, stream[1] ); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free( dwork ); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix_async( rows, ib, dA(i, i), ldda, work, rows, stream[1] ); magma_queue_sync( stream[1] ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix_async( rows, ib, work, rows, dA(i, i), ldda, stream[1] ); } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } return *info; } /* magma_dgeqrf2_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_copts zopts; magma_queue_t queue; magma_queue_create( 0, &queue ); magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0); magma_c_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_c_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; TESTING_CHECK( magma_cparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; TESTING_CHECK( magma_csolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } // for the eigensolver case zopts.solver_par.ev_length = A.num_cols; TESTING_CHECK( magma_ceigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix TESTING_CHECK( magma_cmscale( &A, zopts.scaling, queue )); // preconditioner if ( zopts.solver_par.solver != Magma_ITERREF ) { TESTING_CHECK( magma_c_precondsetup( A, b, &zopts.solver_par, &zopts.precond_par, queue ) ); } TESTING_CHECK( magma_cmconvert( A, &B, Magma_CSR, zopts.output_format, queue )); printf( "\n%% matrix info: %lld-by-%lld with %lld nonzeros\n\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); printf("matrixinfo = [\n"); printf("%% size (m x n) || nonzeros (nnz) || nnz/m || stored nnz\n"); printf("%%============================================================================%%\n"); printf(" %8lld %8lld %10lld %4lld %10lld\n", (long long) B.num_rows, (long long) B.num_cols, (long long) B.true_nnz, (long long) (B.true_nnz/B.num_rows), (long long) B.nnz ); printf("%%============================================================================%%\n"); printf("];\n"); TESTING_CHECK( magma_cmtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess TESTING_CHECK( magma_cvinit( &b, Magma_DEV, A.num_rows, 1, one, queue )); //magma_cvinit( &x, Magma_DEV, A.num_cols, 1, one, queue ); //magma_c_spmv( one, B_d, x, zero, b, queue ); // b = A x //magma_cmfree(&x, queue ); TESTING_CHECK( magma_cvinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); info = magma_c_solver( B_d, b, &x, &zopts, queue ); if( info != 0 ) { printf("%%error: solver returned: %s (%lld).\n", magma_strerror( info ), (long long) info ); } printf("convergence = [\n"); magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); printf("];\n\n"); zopts.solver_par.verbose = 0; printf("solverinfo = [\n"); magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); printf("];\n\n"); printf("precondinfo = [\n"); printf("%% setup runtime\n"); printf(" %.6f %.6f\n", zopts.precond_par.setuptime, zopts.precond_par.runtime ); printf("];\n\n"); magma_cmfree(&B_d, queue ); magma_cmfree(&B, queue ); magma_cmfree(&A, queue ); magma_cmfree(&x, queue ); magma_cmfree(&b, queue ); i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
extern "C" magma_int_t magma_cpotrf_m(magma_int_t num_gpus0, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may not fit entirely in the GPU memory. The factorization has the form A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ float d_one = 1.0; float d_neg_one = -1.0; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; char uplo_[2] = {uplo, 0}; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus; magma_int_t j, jj, jb, J, JB, NB, MB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; #ifdef ROW_MAJOR_PROFILE magma_timestr_t start, end, start0, end0; float chol_time = 1.0; #endif *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_dpotrf_nb(n); if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if( NB >= n ) { #ifdef CHECK_CPOTRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CPOTRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_CPOTRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); fflush(stdout); #endif for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda]; for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE start0 = get_current_time(); #endif if (nb <= 1 || nb >= n) { lapackf77_cpotrf(uplo_, &n, a, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. * * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-rows */ magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, stream, &iinfo); #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif /* update with the previous big-panels */ for( j=0; j<J; j+=nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj<JB; jj+=nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_cgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_cherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if( n > J+JB ) { for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*num_gpus))*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, // dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal (and diagonal!!!) big panel */ magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, stream, &iinfo); //magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, 0, a, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-columns */ magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif for( j=0; j<J; j+=nb ) { /* upload the diagonal of big panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj<JB; jj+=nb ) { /* diagonal */ d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_cgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_cherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if( n > J+JB ) { /* off-diagonal */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/num_gpus)*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, // dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal big panel */ magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ #ifdef ROW_MAJOR_PROFILE end0 = get_current_time(); #endif if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { if( stream[d][j] != NULL ) magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j<5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE printf("\n n=%d NB=%d nb=%d\n",n,NB,nb); printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, GetTimerValue(start0, end0), FLOPS_CPOTRF(n)/(1000000*GetTimerValue(start0, end0))); printf(" Performance %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, chol_time, FLOPS_CPOTRF(n)/(1000000*chol_time)); #endif return *info; } /* magma_cpotrf_ooc */
/***************************************************************************//** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_potrf *******************************************************************************/ extern "C" magma_int_t magma_spotrf_gpu( magma_uplo_t uplo, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda + dA_offset) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const float c_one = MAGMA_S_ONE; const float c_neg_one = MAGMA_S_NEG_ONE; const float d_one = 1.0; const float d_neg_one = -1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); bool upper = (uplo == MagmaUpper); magma_int_t j, jb, nb; float *work; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_spotrf_nb( n ); if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= n) { /* Use unblocked code. */ magma_sgetmatrix( n, n, dA(0,0), ldda, work, n, queues[0] ); lapackf77_spotrf( uplo_, &n, work, &n, info ); magma_ssetmatrix( n, n, work, n, dA(0,0), ldda, queues[0] ); } else { /* Use blocked code. */ if (upper) { //========================================================= /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { // apply all previous updates to diagonal block, // then transfer it to CPU jb = min( nb, n-j ); magma_ssyrk( MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_sgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[0] ); // apply all previous updates to block row right of diagonal block if (j+jb < n) { magma_sgemm( MagmaConjTrans, MagmaNoTrans, jb, n-j-jb, j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda, queues[1] ); } // simultaneous with above sgemm, transfer diagonal block, // factor it on CPU, and test for positive definiteness magma_queue_sync( queues[0] ); lapackf77_spotrf( MagmaUpperStr, &jb, work, &jb, info ); magma_ssetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, queues[1] ); if (*info != 0) { *info = *info + j; break; } // apply diagonal block to block row right of diagonal block if (j+jb < n) { magma_strsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, n-j-jb, c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[1] ); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // apply all previous updates to diagonal block, // then transfer it to CPU jb = min( nb, n-j ); magma_ssyrk( MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_sgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[0] ); // apply all previous updates to block column below diagonal block if (j+jb < n) { magma_sgemm( MagmaNoTrans, MagmaConjTrans, n-j-jb, jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda, queues[1] ); } // simultaneous with above sgemm, transfer diagonal block, // factor it on CPU, and test for positive definiteness magma_queue_sync( queues[0] ); lapackf77_spotrf( MagmaLowerStr, &jb, work, &jb, info ); magma_ssetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, queues[1] ); if (*info != 0) { *info = *info + j; break; } // apply diagonal block to block column below diagonal if (j+jb < n) { magma_strsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-j-jb, jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[1] ); } } } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free_pinned( work ); return *info; } /* magma_spotrf_gpu */
/* //////////////////////////////////////////////////////////////////////////// -- testing zdot */ int main( int argc, char** argv ) { magma_int_t info = 0; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); const double one = MAGMA_D_MAKE(1.0, 0.0); const double zero = MAGMA_D_MAKE(0.0, 0.0); double alpha; TESTING_INIT(); magma_d_matrix a={Magma_CSR}, b={Magma_CSR}, x={Magma_CSR}, y={Magma_CSR}, skp={Magma_CSR}; printf("%%=======================================================================================================================================================================\n"); printf("\n"); printf(" | runtime | GFLOPS\n"); printf("%% n num_vecs | CUDOT CUGEMV MAGMAGEMV MDOT MDGM MDGM_SHFL | CUDOT CUGEMV MAGMAGEMV MDOT MDGM MDGM_SHFL\n"); printf("%%------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); printf("\n"); for( magma_int_t num_vecs=1; num_vecs <= 32; num_vecs += 1 ) { for( magma_int_t n=500000; n < 500001; n += 10000 ) { int iters = 10; double computations = (2.* n * iters * num_vecs); #define ENABLE_TIMER #ifdef ENABLE_TIMER real_Double_t mdot1, mdot2, mdgm1, mdgm2, magmagemv1, magmagemv2, cugemv1, cugemv2, cudot1, cudot2; real_Double_t mdot_time, mdgm_time, mdgmshf_time, magmagemv_time, cugemv_time, cudot_time; #endif CHECK( magma_dvinit( &a, Magma_DEV, n, num_vecs, one, queue )); CHECK( magma_dvinit( &b, Magma_DEV, n, 1, one, queue )); CHECK( magma_dvinit( &x, Magma_DEV, n, 8, one, queue )); CHECK( magma_dvinit( &y, Magma_DEV, n, 8, one, queue )); CHECK( magma_dvinit( &skp, Magma_DEV, 1, num_vecs, zero, queue )); // warm up CHECK( magma_dgemvmdot( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); // CUDOT #ifdef ENABLE_TIMER cudot1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { for( int l=0; l<num_vecs; l++){ alpha = magma_ddot( n, a.dval+l*a.num_rows, 1, b.dval, 1, queue ); //cudaDeviceSynchronize(); } //cudaDeviceSynchronize(); } #ifdef ENABLE_TIMER cudot2 = magma_sync_wtime( queue ); cudot_time=cudot2-cudot1; #endif // CUGeMV #ifdef ENABLE_TIMER cugemv1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { magma_dgemv( MagmaTrans, n, num_vecs, one, a.dval, n, b.dval, 1, zero, skp.dval, 1, queue ); } #ifdef ENABLE_TIMER cugemv2 = magma_sync_wtime( queue ); cugemv_time=cugemv2-cugemv1; #endif // MAGMAGeMV #ifdef ENABLE_TIMER magmagemv1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { magmablas_dgemv( MagmaTrans, n, num_vecs, one, a.dval, n, b.dval, 1, zero, skp.dval, 1, queue ); } #ifdef ENABLE_TIMER magmagemv2 = magma_sync_wtime( queue ); magmagemv_time=magmagemv2-magmagemv1; #endif // MDOT #ifdef ENABLE_TIMER mdot1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { for( int c = 0; c<num_vecs/2; c++ ){ CHECK( magma_dmdotc( n, 2, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); } for( int c = 0; c<num_vecs%2; c++ ){ CHECK( magma_dmdotc( n, 1, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); } //h++; } #ifdef ENABLE_TIMER mdot2 = magma_sync_wtime( queue ); mdot_time=mdot2-mdot1; #endif // MDGM #ifdef ENABLE_TIMER mdgm1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { CHECK( magma_dgemvmdot( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); //h++; } #ifdef ENABLE_TIMER mdgm2 = magma_sync_wtime( queue ); mdgm_time=mdgm2-mdgm1; #endif // MDGM_shfl #ifdef ENABLE_TIMER mdgm1 = magma_sync_wtime( queue ); #endif for( int h=0; h < iters; h++) { CHECK( magma_dgemvmdot_shfl( n, num_vecs, a.dval, b.dval, x.dval, y.dval, skp.dval, queue )); } #ifdef ENABLE_TIMER mdgm2 = magma_sync_wtime( queue ); mdgmshf_time=mdgm2-mdgm1; #endif //magma_dprint_gpu(num_vecs,1,skp.dval,num_vecs); //Chronometry #ifdef ENABLE_TIMER printf("%d %d %e %e %e %e %e %e || %e %e %e %e %e %e\n", int(n), int(num_vecs), cudot_time/iters, (cugemv_time)/iters, (magmagemv_time)/iters, (mdot_time)/iters, (mdgm_time)/iters, (mdgmshf_time)/iters, computations/(cudot_time*1e9), computations/(cugemv_time*1e9), computations/(magmagemv_time*1e9), computations/(mdot_time*1e9), computations/(mdgm_time*1e9), computations/(mdgmshf_time*1e9) ); #endif magma_dmfree(&a, queue ); magma_dmfree(&b, queue ); magma_dmfree(&x, queue ); magma_dmfree(&y, queue ); magma_dmfree(&skp, queue ); } //printf("%%================================================================================================================================================\n"); //printf("\n"); //printf("\n"); } // use alpha to silence compiler warnings if ( isnan( real( alpha ))) { info = -1; } cleanup: magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
extern "C" magma_int_t magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) //#define PROFILE #ifdef PROFILE float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (magma_queue_t **)stream, &iinfo); magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS_CGETRF( m, n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
/** Purpose ------- SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U**T * U, if uplo = MagmaUpper, or A = L * L**T, if uplo = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If uplo = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If uplo = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**T * U or A = L * L**T. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_sposv_comp ********************************************************************/ extern "C" magma_int_t magma_spotrf(magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (work + (j)*ldda + (i)) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ldda, nb; magma_int_t j, jb; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *work; float d_one = 1.0; float d_neg_one = -1.0; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_spotrf_m(num_gpus, uplo, n, A, lda, info); } ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_smalloc( &work, (n)*ldda )) { /* alloc failed so call the non-GPU-resident version */ return magma_spotrf_m(num_gpus, uplo, n, A, lda, info); } /* Define user stream if current stream is NULL */ cudaStream_t stream[3], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); magma_queue_create( &stream[2] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; nb = magma_get_spotrf_nb(n); if (nb <= 1 || nb >= n) { lapackf77_spotrf(uplo_, &n, A, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_ssetmatrix_async( jb, (n-j), A(j, j), lda, dA(j, j), ldda, stream[1]); magma_ssyrk(MagmaUpper, MagmaTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_sgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, stream[0] ); if ( (j+jb) < n) { magma_sgemm(MagmaTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_sgetmatrix_async( j, jb, dA(0, j), ldda, A (0, j), lda, stream[2] ); magma_queue_sync( stream[0] ); lapackf77_spotrf(MagmaUpperStr, &jb, A(j, j), &lda, info); if (*info != 0) { *info = *info + j; break; } magma_ssetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0] ); magma_queue_sync( stream[0] ); if ( (j+jb) < n ) { magma_strsm(MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_ssetmatrix_async( (n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[1]); magma_ssyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_sgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, stream[0] ); if ( (j+jb) < n) { magma_sgemm( MagmaNoTrans, MagmaTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_sgetmatrix_async( jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[2] ); magma_queue_sync( stream[0] ); lapackf77_spotrf(MagmaLowerStr, &jb, A(j, j), &lda, info); if (*info != 0) { *info = *info + j; break; } magma_ssetmatrix_async( jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0] ); magma_queue_sync( stream[0] ); if ( (j+jb) < n) { magma_strsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[2] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } magma_free( work ); return *info; } /* magma_spotrf */
extern "C" magma_int_t magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, float *A, magma_int_t lda, float *dA, magma_int_t ldda, magma_int_t *jpvt, float *tau, float *vn1, float *vn2, float *auxv, float *F, magma_int_t ldf, float *dF, magma_int_t lddf) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= SLAQPS computes a step of QR factorization with column pivoting of a real M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0 OFFSET (input) INTEGER The number of rows of A that have been factorized in previous steps. NB (input) INTEGER The number of columns to factorize. KB (output) INTEGER The number of columns actually factorized. A (input/output) REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. TAU (output) REAL array, dimension (KB) The scalar factors of the elementary reflectors. VN1 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. VN2 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. AUXV (input/output) REAL array, dimension (NB) Auxiliar vector. F (input/output) REAL array, dimension (LDF,NB) Matrix F' = L*Y'*A. LDF (input) INTEGER The leading dimension of the array F. LDF >= max(1,N). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dA + (i) + (j)*(ldda)) #define F(i, j) (F + (i) + (j)*(ldf )) #define dF(i, j) (dF + (i) + (j)*(lddf)) float c_zero = MAGMA_S_MAKE( 0.,0.); float c_one = MAGMA_S_MAKE( 1.,0.); float c_neg_one = MAGMA_S_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; float d__1; float z__1; magma_int_t j, k, rk; float Akk; magma_int_t pvt; float temp, temp2, tol3z; magma_int_t itemp; magma_int_t lsticc; magma_int_t lastrk; lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); magma_queue_t stream; magma_queue_create( &stream ); lsticc = 0; k = 0; while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k. // C: pvt, k, isamax are all 0-based; don't subtract 1. pvt = k + cblas_isamax( n-k, &vn1[k], ione ); if (pvt != k) { if (pvt >= nb) { /* 1. Start copy from GPU */ magma_sgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); } /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; vn1[pvt] = vn1[k]; vn2[pvt] = vn2[k]; if (pvt < nb){ /* no need of transfer if pivot is within the panel */ blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { /* 1. Finish copy from GPU */ magma_queue_sync( stream ); /* 2. Swap as usual on CPU */ blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione); /* 3. Restore the GPU */ magma_ssetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); } } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { #if defined(PRECISION_c) || defined(PRECISION_z) for (j = 0; j < k; ++j){ *F(k,j) = MAGMA_S_CNJG( *F(k,j) ); } #endif i__1 = m - rk; i__2 = k; blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); #if defined(PRECISION_c) || defined(PRECISION_z) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_S_CNJG( *F(k,j) ); } #endif } /* Generate elementary reflector H(k). */ if (rk < m-1) { i__1 = m - rk; lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] ); } else { lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] ); } Akk = *A(rk, k); *A(rk, k) = c_one; /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) magma_int_t i__3 = nb-k-1; magma_int_t i__4 = i__2 - i__3; magma_int_t i__5 = nb-k; magma_sgemv( MagmaTrans, i__1 - i__5, i__2 - i__3, tau[k], dA(rk +i__5, k+1+i__3), ldda, dA(rk +i__5, k ), ione, c_zero, dF(k+1+i__3, k ), ione ); magma_sgetmatrix_async( i__2-i__3, 1, dF(k + 1 +i__3, k), i__2, F (k + 1 +i__3, k), i__2, stream ); blasf77_sgemv( MagmaTransStr, &i__1, &i__3, &tau[k], A(rk, k+1), &lda, A(rk, k ), &ione, &c_zero, F(k+1, k ), &ione ); magma_queue_sync( stream ); blasf77_sgemv( MagmaTransStr, &i__5, &i__4, &tau[k], A(rk, k+1+i__3), &lda, A(rk, k ), &ione, &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. */ for (j = 0; j < k; ++j) { *F(j, k) = c_zero; } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */ if (k > 0) { i__1 = m - rk; i__2 = k; z__1 = MAGMA_S_NEGATE( tau[k] ); blasf77_sgemv( MagmaTransStr, &i__1, &i__2, &z__1, A(rk, 0), &lda, A(rk, k), &ione, &c_zero, auxv, &ione ); i__1 = k; blasf77_sgemv( MagmaNoTransStr, &n, &i__1, &c_one, F(0,0), &ldf, auxv, &ione, &c_one, F(0,k), &ione ); } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; blasf77_sgemm( MagmaNoTransStr, MagmaTransStr, &ione, &i__1, &i__2, &c_neg_one, A(rk, 0 ), &lda, F(k+1,0 ), &ldf, &c_one, A(rk, k+1), &lda ); } /* Update partial column norms. */ if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { /* NOTE: The following 4 lines follow from the analysis in Lapack Working Note 176. */ temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } } *A(rk, k) = Akk; ++k; } // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU */ magma_ssetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 ); magma_sgemm( MagmaNoTrans, MagmaTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), i__2, c_one, dA(rk+1, *kb), ldda ); } /* Recomputation of difficult columns. */ while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = cblas_snrm2(i__1, A(rk + 1, lsticc), ione); else { /* Where is the data, CPU or GPU ? */ float r1, r2; r1 = cblas_snrm2(nb-k, A(rk + 1, lsticc), ione); r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione); vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2); } /* NOTE: The computation of VN1( LSTICC ) relies on the fact that SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */ vn2[lsticc] = vn1[lsticc]; lsticc = itemp; } magma_queue_destroy( stream ); return MAGMA_SUCCESS; } /* magma_slaqps */
/** Purpose ------- DGEQRF2_MGPU computes a QR factorization of a real M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf2_mgpu( magma_int_t num_gpus, magma_int_t m, magma_int_t n, double **dlA, magma_int_t ldda, double *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. double *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; double *hwork=NULL, *hpanel=NULL; magma_queue_t stream[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_dgeqrf_nb( m ); /* dwork is (n*nb) --- for T (nb*nb) and dlarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think dlarfb work could be smaller, max(n_local[:]). * Oddly, T and dlarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_dmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for dgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_dmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < num_gpus; dev++ ) { n_local[dev] = ((n/nb)/num_gpus)*nb; if (dev < (n/nb) % num_gpus) n_local[dev] += nb; else if (dev == (n/nb) % num_gpus) n_local[dev] += n % nb; } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_create( &stream[dev][0] ); magma_queue_create( &stream[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % num_gpus; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*num_gpus)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( stream[panel_dev][1], panel_event[panel_dev] ); magma_dgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, stream[panel_dev][1] ); magma_queue_sync( stream[panel_dev][1] ); // Factor panel lapackf77_dgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); dpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_dsetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, stream[dev][0] ); } for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_sync( stream[dev][0] ); } // TODO: if dpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ dq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_dsetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, stream[dev][0] ); } la_dev = (panel_dev+1) % num_gpus; for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magmablasSetKernelStream( stream[dev][0] ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*num_gpus)*nb; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work magma_event_record( panel_event[dev], stream[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork ); // work } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork ); // work } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_dsetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, stream[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_dgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for dgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_dgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %d\n", (int) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % num_gpus; i_local = j/(nb*num_gpus)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_dsetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda ); } } CLEANUP: // free(NULL) does nothing. for( dev=0; dev < num_gpus; dev++ ) { magma_setdevice( dev ); magma_queue_destroy( stream[dev][0] ); magma_queue_destroy( stream[dev][1] ); magma_event_destroy( panel_event[dev] ); magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dgeqrf2_mgpu */
/** Purpose ------- DSYGVDX computes selected eigenvalues and, optionally, eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] mout INTEGER The total number of eigenvalues found. 0 <= MOUT <= N. If RANGE = MagmaRangeAll, MOUT = N, and if RANGE = MagmaRangeI, MOUT = IU-IL+1. @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: DPOTRF or DSYEVD returned an error code: <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if DSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. @ingroup magma_dsygv_driver ********************************************************************/ extern "C" magma_int_t magma_dsygvdx( magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *B, magma_int_t ldb, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t *mout, double *w, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); double d_one = MAGMA_D_ONE; double *dA=NULL, *dB=NULL; magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t lddb = ldda; magma_int_t lower; magma_trans_t trans; magma_int_t wantz, lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin, liwmin; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -3; } else if (! (lower || (uplo == MagmaUpper))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } work[0] = magma_dmake_lwork( lwmin ); iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -17; } else if (liwork < liwmin && ! lquery) { *info = -19; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { lapackf77_dsygvd( &itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, iwork, &liwork, info ); *mout = n; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Form a Cholesky factorization of B. */ magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue ); magma_dsetmatrix_async( n, n, A, lda, dA, ldda, queue ); magma_timer_t time=0; timer_start( time ); magma_dpotrf_gpu( uplo, n, dB, lddb, info ); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time dpotrf_gpu = %6.2f\n", time ); magma_queue_sync( queue ); magma_dgetmatrix_async( n, n, dB, lddb, B, ldb, queue ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_dsygst_gpu( itype, uplo, n, dA, ldda, dB, lddb, info ); timer_stop( time ); timer_printf( "time dsygst_gpu = %6.2f\n", time ); /* simple fix to be able to run bigger size. * set dB=NULL so we know to re-allocate below * TODO: have dwork here that will be used as dB and then passed to dsyevd. */ if (n > 5000) { magma_queue_sync( queue ); magma_free( dB ); dB=NULL; } timer_start( time ); magma_dsyevdx_gpu( jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, mout, w, A, lda, work, lwork, iwork, liwork, info ); timer_stop( time ); timer_printf( "time dsyevdx_gpu = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* allocate and copy dB back */ if (dB == NULL) { if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) { magma_free( dA ); dA=NULL; *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaTrans; } else { trans = MagmaNoTrans; } magma_dtrsm( MagmaLeft, uplo, trans, MagmaNonUnit, n, *mout, d_one, dB, lddb, dA, ldda, queue ); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaTrans; } magma_dtrmm( MagmaLeft, uplo, trans, MagmaNonUnit, n, *mout, d_one, dB, lddb, dA, ldda, queue ); } magma_dgetmatrix( n, *mout, dA, ldda, A, lda, queue ); timer_stop( time ); timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time ); } magma_queue_sync( queue ); magma_queue_destroy( queue ); work[0] = magma_dmake_lwork( lwmin ); iwork[0] = liwmin; magma_free( dA ); dA=NULL; magma_free( dB ); dB=NULL; return *info; } /* magma_dsygvd */
extern "C" magma_int_t magma_zgetrf_gpu(magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( m == n ) { lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose2( dAT, lddat, dA, ldda, m, n ); } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if ( ! (m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; //magmablas_ztranspose( dAP, cols, dAT(i,i), lddat, nb, cols ); magmablas_ztranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb ); // make sure that that the transpose has completed magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork, stream[0]); if ( i>0 ){ magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), lddat, dAT(i-1,i+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), lddat, dAT(i, i-1), lddat, c_one, dAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_zsetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm, stream[0]); magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); magma_queue_sync( stream[0] ); //magmablas_ztranspose(dAT(i,i), lddat, dAP, maxm, cols, nb); magmablas_ztranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb); // do the small non-parallel computations (next panel update) if ( s > (i+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows); magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_ztranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose2( dA, ldda, dAT, lddat, n, m ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* End of MAGMA_ZGETRF_GPU */
extern "C" magma_int_t magma_zlauum(char uplo, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZLAUUM computes the product U * U' or L' * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. If UPLO = 'U' or 'u' then the upper triangle of the result is stored, overwriting the factor U in A. If UPLO = 'L' or 'l' then the lower triangle of the result is stored, overwriting the factor L in A. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the triangular factor stored in the array A is upper or lower triangular: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the triangular factor U or L. N >= 0. A (input/output) COPLEX_16 array, dimension (LDA,N) On entry, the triangular factor U or L. On exit, if UPLO = 'U', the upper triangle of A is overwritten with the upper triangle of the product U * U'; if UPLO = 'L', the lower triangle of A is overwritten with the lower triangle of the product L' * L. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -k, the k-th argument had an illegal value ===================================================================== */ /* Local variables */ char uplo_[2] = {uplo, 0}; magma_int_t ldda, nb; magma_int_t i, ib; cuDoubleComplex c_one = MAGMA_Z_ONE; double d_one = MAGMA_D_ONE; cuDoubleComplex *work; int upper = lapackf77_lsame(uplo_, "U"); *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &work, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nb = magma_get_zpotrf_nb(n); if (nb <= 1 || nb >= n) lapackf77_zlauum(uplo_, &n, a, &lda, info); else { if (upper) { /* Compute the product U * U'. */ for (i=0; i<n; i=i+nb) { ib=min(nb,n-i); //cublasSetMatrix(ib, (n-i), sizeof(cuDoubleComplex), A(i, i), lda, dA(i, i), ldda); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( ib, (n-i-ib), A(i,i+ib), lda, dA(i,i+ib), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0, i),ldda); lapackf77_zlauum(MagmaUpperStr, &ib, A(i,i), &lda, info); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, i, ib, (n-i-ib), c_one, dA(0,i+ib), ldda, dA(i, i+ib),ldda, c_one, dA(0,i), ldda); magma_queue_sync( stream[0] ); magma_zherk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib), d_one, dA(i, i+ib), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( i+ib, ib, dA(0, i), ldda, A(0, i), lda ); } } else { /* Compute the product L' * L. */ for(i=0; i<n; i=i+nb) { ib=min(nb,n-i); //cublasSetMatrix((n-i), ib, sizeof(cuDoubleComplex), // A(i, i), lda, dA(i, i), ldda); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( (n-i-ib), ib, A(i+ib, i), lda, dA(i+ib, i), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i, 0),ldda); lapackf77_zlauum(MagmaLowerStr, &ib, A(i,i), &lda, info); //cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), // A(i, i), lda, dA(i, i), ldda); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm(MagmaConjTrans, MagmaNoTrans, ib, i, (n-i-ib), c_one, dA( i+ib,i), ldda, dA(i+ib, 0),ldda, c_one, dA(i,0), ldda); magma_queue_sync( stream[0] ); magma_zherk(MagmaLower, MagmaConjTrans, ib, (n-i-ib), d_one, dA(i+ib, i), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( ib, i+ib, dA(i, 0), ldda, A(i, 0), lda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( work ); return *info; }
/** Purpose ------- CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((float)NB/nb) ) { ngpu = (int)ceil((float)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((float)N/nb) ) { ngpu = (int)ceil((float)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_csetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_claswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_cgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_cgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_CGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_cgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgeqrs_gpu */ int main( int argc, char** argv) { //#if defined(PRECISION_s) /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float matnorm, work[1]; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A, d_B; /* Matrix size */ magma_int_t M = 0, N = 0, n2; magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork; magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000}; magma_int_t i, info, min_mn, nb, l1, l2; magma_int_t ione = 1; magma_int_t nrhs = 3; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-nrhs", argv[i])==0) nrhs = atoi(argv[++i]); } if (N>0 && M>0 && M >= N) printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N); printf(" M has to be >= N, exit.\n"); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, 1024, 1024); M = N = size[6]; } ldda = ((M+31)/32)*32; lddb = ldda; n2 = M * N; min_mn = min(M, N); nb = magma_get_sgeqrf_nb(M); lda = ldb = M; lworkgpu = (M-N + nb)*(nrhs+2*nb); /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( tau, float, min_mn ); TESTING_MALLOC_PIN( h_A, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_B, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_X, float, ldb*nrhs ); TESTING_MALLOC_PIN( h_R, float, ldb*nrhs ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* * Get size for host workspace */ lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); l1 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = -1; lapackf77_sormqr( MagmaLeftStr, MagmaTransStr, &M, &nrhs, &min_mn, h_A, &lda, tau, h_X, &ldb, tmp, &lhwork, &info); l2 = (magma_int_t)MAGMA_S_REAL( tmp[0] ); lhwork = max( max( l1, l2 ), lworkgpu ); TESTING_MALLOC_PIN( hwork, float, lhwork ); printf("\n"); printf(" ||b-Ax|| / (N||A||)\n"); printf(" M N CPU GFlop/s GPU GFlop/s CPU GPU \n"); printf("============================================================\n"); for(i=0; i<7; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); ldb = lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_GEQRF( (float)M, (float)N ) + FLOPS_GEQRS( (float)M, (float)N, (float)nrhs )) / 1e9; /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); n2 = M*nrhs; lapackf77_slarnv( &ione, ISEED, &n2, h_B ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Warm up to measure the performance */ magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); magma_ssetmatrix( M, N, h_A, 0, lda, d_A, 0, ldda, queue ); magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue ); gpu_time = magma_wtime(); magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda, d_B, 0, lddb, hwork, lworkgpu, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgels had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; // Get the solution in h_X magma_sgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue ); // compute the residual blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); matnorm = lapackf77_slange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, hwork, &lhwork, &info); cpu_time = magma_wtime()-cpu_time; cpu_perf = gflops / cpu_time; if (info < 0) printf("Argument %d of lapackf77_sgels had an illegal value.\n", -info); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); printf("%5d %5d %6.1f %6.1f %7.2e %7.2e\n", M, N, cpu_perf, gpu_perf, lapackf77_slange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm), lapackf77_slange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) ); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( hwork ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }