extern "C" magma_int_t
magma_ztrtri_gpu(magma_uplo_t uplo, magma_diag_t diag, magma_int_t n,
                 magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
                 magma_int_t *info)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2012

    Purpose
    =======
    ZTRTRI computes the inverse of an upper or lower triangular matrix dA,
    in place on the GPU.  This is the Level 3 BLAS (blocked) version of the
    algorithm.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  A is upper triangular;
            = 'L':  A is lower triangular.

    DIAG    (input) CHARACTER*1
            = 'N':  A is non-unit triangular;
            = 'U':  A is unit triangular.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the triangular matrix A (only the referenced triangle
            is read; if DIAG = 'U' the diagonal is assumed to be 1 and is
            not referenced).  On exit, the triangular inverse of A in the
            same storage format.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
            > 0: if INFO = i, dA(i,i) is exactly zero.  The triangular
                 matrix is singular and its inverse can not be computed.
    ===================================================================== */

    /* Local variables */
    magma_uplo_t uplo_ = uplo;
    magma_diag_t diag_ = diag;
    magma_int_t  nb, nn, j, jb, lwork;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *work;

    int upper  = lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaUpper));
    int nounit = lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaNonUnit));

    /* Argument checking */
    *info = 0;
    if ((! upper) && (! lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaLower))))
        *info = -1;
    else if ((! nounit) && (! lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaUnit))))
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    nb = magma_get_zpotrf_nb(n);

    /* Create two queues: queues[0] for compute + synchronous transfers,
       queues[1] for the asynchronous diagonal-block downloads. */
    magma_queue_t queues[2];
    magma_device_t device;
    int num = 0;
    magma_err_t err;

    err = magma_get_devices( &device, 1, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queues[0] );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create 0 failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queues[1] );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create 1 failed: %d\n", err );
        exit(-1);
    }

    /* BUG FIX: the unblocked path below copies the full n-by-n matrix into
       `work`, so an nb*nb buffer overflows when nb <= 1 and n > 1.  Size the
       host workspace for whichever path will actually run. */
    lwork = (nb <= 1 || nb >= n) ? n*n : nb*nb;
    if (MAGMA_SUCCESS != magma_malloc_host( (void**)&work, lwork*sizeof(magmaDoubleComplex) )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        /* BUG FIX: don't leak the queues on the error path. */
        magma_queue_destroy(queues[0]);
        magma_queue_destroy(queues[1]);
        return *info;
    }

    if (nb <= 1 || nb >= n) {
        /* Unblocked path: factor the whole matrix on the CPU. */
        magma_zgetmatrix( n, n, dA, dA_offset, ldda, work, 0, n, queues[0] );
        lapackf77_ztrtri(lapack_const(uplo_), lapack_const(diag_), &n, work, &n, info);
        magma_zsetmatrix( n, n, work, 0, n, dA, dA_offset, ldda, queues[0] );
    }
    else {
        if (upper) {
            /* Compute inverse of upper triangular matrix, left to right. */
            for (j=0; j < n; j = j + nb) {
                jb = min(nb, (n-j));

                /* Update rows 0:j-1 of the current block column:
                   A(0:j-1, j:j+jb-1) := A(0:j-1,0:j-1)^{-1} * A(0:j-1,j:j+jb-1)
                   (the leading block is already inverted), then scale by
                   -A(j:j+jb-1,j:j+jb-1)^{-1} from the right. */
                magma_ztrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                            j, jb, c_one,
                            dA(0,0), ldda,
                            dA(0, j), ldda,
                            queues[0]);
                magma_ztrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                            j, jb, c_neg_one,
                            dA(j,j), ldda,
                            dA(0, j), ldda,
                            queues[0]);

                /* Download the diagonal block and invert it on the CPU. */
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work, 0, jb, queues[1], NULL );
                magma_queue_sync( queues[1] );

                lapackf77_ztrtri(MagmaUpperStr, lapack_const(diag_), &jb, work, &jb, info);

                magma_zsetmatrix( jb, jb,
                                  work, 0, jb,
                                  dA(j, j), ldda, queues[0]);
            }
        }
        else {
            /* Compute inverse of lower triangular matrix, right to left. */
            nn = ((n-1)/nb)*nb + 1;     /* start of the last (possibly short) block */
            for (j = nn-1; j >= 0; j = j - nb) {
                jb = min(nb, (n-j));

                if ((j+jb) < n) {
                    /* Update rows j+jb:n-1 of the current block column. */
                    magma_ztrmm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                                (n-j-jb), jb, c_one,
                                dA(j+jb,j+jb), ldda,
                                dA(j+jb, j), ldda,
                                queues[0]);
                    magma_ztrsm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit,
                                (n-j-jb), jb, c_neg_one,
                                dA(j,j), ldda,
                                dA(j+jb, j), ldda,
                                queues[0]);
                }

                /* Download the diagonal block and invert it on the CPU. */
                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work, 0, jb, queues[1], NULL );
                magma_queue_sync( queues[1] );

                lapackf77_ztrtri(MagmaLowerStr, lapack_const(diag_), &jb, work, &jb, info);

                magma_zsetmatrix( jb, jb,
                                  work, 0, jb,
                                  dA(j, j), ldda, queues[0] );
            }
        }
    }

    magma_free_host( work );
    magma_queue_destroy(queues[0]);
    magma_queue_destroy(queues[1]);
    return *info;
}
magma_err_t
magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n,
                   magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda,
                   float *tau, magma_err_t *info, magma_queue_t queue)
{
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       April 2012

    Purpose
    =======
    SGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    dA      (input/output) REAL array on the GPU, dimension (LDDA,N)
            On entry, the M-by-N matrix dA.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further Details).

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalescent memory accesses LDDA must be
            dividable by 16.

    TAU     (output) REAL array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -9, internal GPU memory allocation failed.

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    ===================================================================== */

    /* dA(i,j) expands to the (pointer, offset) pair clMAGMA BLAS wrappers
       take; work_ref(i) is row i of the host panel; hwork is the tail of
       the host buffer past the m-by-nb panel, used for T factors and the
       LAPACK workspace. */
    #define dA(a_1,a_2)    dA, (dA_offset + (a_1) + (a_2)*(ldda))
    #define work_ref(a_1)  ( work + (a_1))
    #define hwork          ( work + (nb)*(m))

    magmaFloat_ptr dwork;    /* device scratch: T factor + V workspace for slarfb */
    float  *work;            /* pinned host buffer: panel + hwork */

    magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows;
    magma_int_t nbmin, nx, ib, nb;
    magma_int_t lhwork, lwork;

    /* Argument checking */
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    k = min(m,n);
    if (k == 0)
        return MAGMA_SUCCESS;   /* quick return for an empty factorization */

    nb = magma_get_sgeqrf_nb(m);

    lwork  = (m+n) * nb;        /* m*nb panel + n*nb hwork */
    lhwork = lwork - (m)*nb;

    if ( MAGMA_SUCCESS != magma_malloc( &dwork, n*nb*sizeof(float))) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    if ( MAGMA_SUCCESS != magma_malloc_host((void**)&work, lwork*sizeof(float)) ) {
        *info = MAGMA_ERR_HOST_ALLOC;
        magma_free( dwork );
        return *info;
    }

    /* event[0]: panel download complete; event[1]: previous diagonal block
       uploaded back.  Both are used to overlap CPU panel factorization with
       the GPU trailing-matrix update (lookahead of one panel). */
    magma_event_t event[2] = {NULL, NULL};

    nbmin = 2;
    nx    = nb;    /* crossover: last nx columns are done unblocked on the CPU */
    ldwork = m;
    lddwork= n;

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            rows = m -i;

            /* Download the current panel; overlaps with the update below. */
            magma_queue_sync( queue );
            magma_sgetmatrix_async(rows, ib, dA(i, i), ldda,
                                   work_ref(i), 0, ldwork, queue, &event[0]);
            if (i>0){
                /* Apply H' to A(i:m,i+2*ib:n) from the left
                   (the far trailing matrix, deferred from the previous
                   iteration so the panel download could start first). */
                magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                  m-old_i, n-old_i-2*old_ib, old_ib,
                                  dA(old_i, old_i         ), ldda, dwork,0,      lddwork,
                                  dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue);

                /* Restore the previous diagonal block (R part) on the GPU. */
                magma_ssetmatrix_async( old_ib, old_ib,
                                        work_ref(old_i), 0, ldwork,
                                        dA(old_i, old_i), ldda, queue, &event[1]);
            }

            /* Wait for the panel, then factor it on the CPU. */
            magma_event_sync(event[0]);
            lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);

            if (i > 0) {
                magma_event_sync(event[1]);
            }

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib,
                              work_ref(i), &ldwork, tau+i, hwork, &ib);

            /* Temporarily overwrite the panel's upper triangle with the unit
               V vectors, upload V, then restore R (spanel_to_q / sq_to_panel
               save and restore via hwork+ib*ib). */
            spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );
            magma_ssetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue);
            sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib );

            if (i + ib < n) {
                /* Upload the T factor for slarfb. */
                magma_ssetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queue);

                if (i+nb < k-nx)
                    /* Lookahead: apply H' only to the next panel
                       A(i:m,i+ib:i+2*ib); the rest is deferred. */
                    magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, ib, ib,
                                      dA(i, i   ), ldda, dwork,0,  lddwork,
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queue);
                else {
                    /* Last blocked step: apply H' to the whole remainder and
                       restore the diagonal block now. */
                    magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise,
                                      rows, n-i-ib, ib,
                                      dA(i, i   ), ldda, dwork,0,  lddwork,
                                      dA(i, i+ib), ldda, dwork,ib, lddwork, queue);
                    magma_ssetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue);
                }
                old_i  = i;
                old_ib = ib;
            }
        }
    } else {
        i = 0;
    }
    magma_free(dwork);

    /* Use unblocked code to factor the last or only block. */
    if (i < k) {
        ib   = n-i;
        rows = m-i;
        magma_sgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queue);
        lhwork = lwork - rows*ib;
        lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info);

        magma_ssetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queue);
    }

    magma_free_host(work);
    return *info;
} /* magma_sgeqrf2_gpu */
magma_int_t magma_zsytrf_stapiv_gpu(char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, double criteria, PASTIX_INT * npiv, PASTIX_FLOAT * tmp4, magma_int_t *info) { /* -- MAGMA (version 1.2.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver May 2012 Purpose ======= ZSYTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. 
===================================================================== */ magma_int_t j, jb, nb; char uplo_[2] = {uplo, 0}; cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; cuDoubleComplex *work; double d_one = 1.0; double d_neg_one = -1.0; long int upper = uplo_lapackf77_lsame(uplo_, "U"); *info = 0; if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_zsytrf_nb(n); if (MAGMA_SUCCESS != magma_zmalloc_host( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } static cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_zgetmatrix( n, n, dA, ldda, work, n ); assert(!upper); /* PaStiX only works with lower */ PASTIX_sytrf_block((PASTIX_FLOAT*)work, n, n, npiv, criteria, tmp4); magma_zsetmatrix( n, n, work, n, dA, ldda ); } else { /* Use blocked code. */ if (upper) { assert(0); /* PaStiX only works with lower */ /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j<n; j+=nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] ); if ( (j+jb) < n) { /* Compute the current block row. 
*/ magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[1] ); /* lapackf77_zsytrf(MagmaUpperStr, &jb, work, &jb, info); */ magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j<n; j+=nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_zsyrk(MagmaLower, MagmaNoTrans, jb, j, c_neg_one, dA(j, 0), ldda, c_one, dA(j, j), ldda); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] ); if ( (j+jb) < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[1] ); PASTIX_sytrf_block((PASTIX_FLOAT*)work, jb, jb, npiv, criteria, tmp4); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_host( work ); return *info; } /* magma_zsytrf_gpu */