/*  -- MAGMA (version 1.6.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2015

    Purpose
    =======
    ZGETRF computes an LU factorization of a general COMPLEX_16 M-by-N
    matrix A using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This routine does not factor anything itself: it inserts panel-update,
    GEMM, panel-factorization and row-swap tasks into the QUARK scheduler
    held by the context, waits on a barrier, and then converts the
    block-local pivot indices into global ones.

    Arguments
    =========
    CNTXT   (input) MAGMA_CONTEXT
            Hardware context; this routine reads cntxt->nb (block size,
            -1 selects magma_get_zpotrf_nb(*n)) and cntxt->quark.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            NOTE(review): the panel tasks receive a local `iinfo` buffer
            as their status output, and nothing in this routine copies it
            back into INFO, so a positive (singular U) status is not
            reported through INFO here -- confirm whether that is intended.

    Returns
    =======
    MAGMA_ERR_ILLEGAL_VALUE if an argument is invalid, otherwise *info.
    =====================================================================    */
extern "C" magma_int_t
magma_zgetrf_mc(magma_context *cntxt,
                int *m, int *n,
                cuDoubleComplex *a, int *lda,
                int *ipiv, int *info)
{
    /* A single-core / single-GPU shortcut through magma_zgetrf used to be
       selected here; it was disabled, so every configuration is scheduled
       through QUARK below. */

    int EN_BEE   = cntxt->nb;
    Quark* quark = cntxt->quark;

    int i, j, l;
    int ii, jj, ll;

    /* Integer encoded as a pointer: used only as a dependency key. */
    void *fakedep;

    int M, N, MM, NN, MMM, K;

    int priority = 0;

    *info = 0;

    int nb = (EN_BEE == -1) ? magma_get_zpotrf_nb(*n) : EN_BEE;

    /* Check arguments */
    if (*m < 0) {
        *info = -1;
    } else if (*n < 0) {
        *info = -2;
    } else if (*lda < max(1,*m)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    int k = min(*m,*n);

    /* Status buffer handed to the panel factorization tasks. */
    int iinfo[2];
    iinfo[1] = 0;

    char label[10000];

    ii = -1;

    /* Loop across diagonal blocks */
    for (i = 0; i < k; i += nb) {
        ii++;
        jj = -1;

        /* Earlier panels get a higher scheduling priority. */
        priority = 10000 - ii;

        /* Update panels in left looking fashion */
        for (j = 0; j < i; j += nb) {
            jj++;

            NN = min(nb,(*n)-i);
            MM = min(nb,(*m)-j);

            l = j + nb;
            MMM = min(nb,(*m)-l);

            sprintf(label, "UPDATE %d %d", ii, jj);

            QUARK_Insert_Task(quark, SCHED_panel_update, 0,
                sizeof(int),                        &NN,       VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(j,i),    INOUT,
                sizeof(int),                        lda,       VALUE,
                sizeof(int),                        &MM,       VALUE,
                sizeof(cuDoubleComplex)*nb,         &ipiv[j],  INPUT,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(j,j),    INPUT,
                sizeof(int),                        &MMM,      VALUE,
                sizeof(int),                        &nb,       VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,j),    INPUT,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,i),    INOUT,
                sizeof(int),                        &priority, VALUE | TASK_PRIORITY,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(i,i),    OUTPUT,
                strlen(label)+1,                    label,     VALUE | TASKLABEL,
                5,                                  "cyan",    VALUE | TASKCOLOR,
                0);

            ll = jj + 1;

            /* Split gemm into tiles */
            for (l = j + (2*nb); l < (*m); l += nb) {
                ll++;

                MMM = min(nb,(*m)-l);

                fakedep = (void *)(intptr_t)(j+1);

                sprintf(label, "GEMM %d %d %d", ii, jj, ll);

                QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                    sizeof(int),                        &MMM,      VALUE,
                    sizeof(int),                        &NN,       VALUE,
                    sizeof(int),                        &nb,       VALUE,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,j),    INPUT,
                    sizeof(int),                        lda,       VALUE,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(j,i),    INPUT,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,i),    INOUT,
                    sizeof(int),                        &priority, VALUE | TASK_PRIORITY,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(i,i),    OUTPUT | GATHERV,
                    sizeof(void*),                      fakedep,   OUTPUT | GATHERV,
                    strlen(label)+1,                    label,     VALUE | TASKLABEL,
                    5,                                  "blue",    VALUE | TASKCOLOR,
                    0);
            }
        }

        /* Factor the current panel: all remaining rows, up to nb columns. */
        M = (*m)-i;
        N = min(nb,(*n)-i);

        iinfo[0] = i;

        sprintf(label, "GETRF %d", ii);

        QUARK_Insert_Task(quark, SCHED_zgetrf, 0,
            sizeof(int),                        &M,        VALUE,
            sizeof(int),                        &N,        VALUE,
            sizeof(cuDoubleComplex)*(*m)*(*n),  A(i,i),    INOUT,
            sizeof(int),                        lda,       VALUE,
            sizeof(cuDoubleComplex)*nb,         &ipiv[i],  OUTPUT,
            sizeof(int),                        iinfo,     OUTPUT,
            sizeof(int),                        &priority, VALUE | TASK_PRIORITY,
            strlen(label)+1,                    label,     VALUE | TASKLABEL,
            6,                                  "green",   VALUE | TASKCOLOR,
            0);
    }

    /* First column not covered by the panels above (m rounded up to a
       multiple of nb), and the tile index of the last panel so that the
       labels of the remaining updates continue the numbering. */
    K = (*m)/nb;

    if ((K*nb) == (*m)) {
        ii = K - 1;
        K = *m;
    } else {
        /* The panel loop above ended on tile index K (it ran K+1 times).
           This was previously seeded from k = min(m,n), which only
           produced misleading task labels. */
        ii = K;
        K = (K+1)*nb;
    }

    priority = 0;

    /* If n > m */
    for (i = K; i < (*n); i += nb) {
        ii++;
        jj = -1;

        /* Update remaining panels in left looking fashion */
        for (j = 0; j < (*m); j += nb) {
            jj++;

            NN = min(nb,(*n)-i);
            MM = min(nb,(*m)-j);

            l = j + nb;
            MMM = min(nb,(*m)-l);

            sprintf(label, "UPDATE %d %d", ii, jj);

            QUARK_Insert_Task(quark, SCHED_panel_update, 0,
                sizeof(int),                        &NN,       VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(j,i),    INOUT,
                sizeof(int),                        lda,       VALUE,
                sizeof(int),                        &MM,       VALUE,
                sizeof(cuDoubleComplex)*nb,         &ipiv[j],  INPUT,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(j,j),    INPUT,
                sizeof(int),                        &MMM,      VALUE,
                sizeof(int),                        &nb,       VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,j),    INPUT,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,i),    INOUT,
                sizeof(int),                        &priority, VALUE | TASK_PRIORITY,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(i,i),    OUTPUT,
                strlen(label)+1,                    label,     VALUE | TASKLABEL,
                5,                                  "cyan",    VALUE | TASKCOLOR,
                0);

            ll = jj + 1;

            /* Split gemm into tiles */
            for (l = j + (2*nb); l < (*m); l += nb) {
                ll++;

                MMM = min(nb,(*m)-l);

                fakedep = (void *)(intptr_t)(j+1);

                sprintf(label, "GEMM %d %d %d", ii, jj, ll);

                QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                    sizeof(int),                        &MMM,      VALUE,
                    sizeof(int),                        &NN,       VALUE,
                    sizeof(int),                        &nb,       VALUE,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,j),    INPUT,
                    sizeof(int),                        lda,       VALUE,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(j,i),    INPUT,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(l,i),    INOUT,
                    sizeof(int),                        &priority, VALUE | TASK_PRIORITY,
                    sizeof(cuDoubleComplex)*(*m)*(*n),  A(i,i),    OUTPUT | GATHERV,
                    sizeof(void*),                      fakedep,   OUTPUT | GATHERV,
                    strlen(label)+1,                    label,     VALUE | TASKLABEL,
                    5,                                  "blue",    VALUE | TASKCOLOR,
                    0);
            }
        }
    }

    ii = -1;

    /* Swap behinds: apply each panel's row interchanges to the block
       columns to its left. */
    for (i = 0; i < k; i += nb) {
        ii++;
        jj = -1;

        MM = min(nb,(*m)-i);
        MM = min(MM,(*n)-i);

        for (j = 0; j < i; j += nb) {
            jj++;

            fakedep = (void *)(intptr_t)(j+1);

            sprintf(label, "LASWPF %d %d", ii, jj);

            QUARK_Insert_Task(quark, SCHED_zlaswp, 0,
                sizeof(int),                        &nb,        VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(i,j),     INOUT,
                sizeof(int),                        lda,        VALUE,
                sizeof(int),                        &MM,        VALUE,
                sizeof(cuDoubleComplex)*nb,         &ipiv[i],   INPUT,
                sizeof(int),                        &priority,  VALUE | TASK_PRIORITY,
                sizeof(void*),                      fakedep,    INPUT,
                sizeof(cuDoubleComplex)*(*m)*(*n),  A(i+nb,j),  OUTPUT,
                strlen(label)+1,                    label,      VALUE | TASKLABEL,
                7,                                  "purple",   VALUE | TASKCOLOR,
                0);
        }
    }

    /* Synchronization point */
    QUARK_Barrier(quark);

    /* Fix pivot: shift each block's pivot indices by the block's first
       row so they index rows of the whole matrix. */
    ii = -1;

    for (i = 0; i < k; i += nb) {
        ii++;
        for (j = 0; j < min(nb,(k-i)); j++) {
            ipiv[ii*nb+j] += ii*nb;
        }
    }

    QUARK_Barrier(quark);

    /* The function is declared to return magma_int_t; it previously fell
       off the end without returning a value. */
    return *info;
}
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. 
@ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_gpu( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda + dA_offset) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const double d_one = 1.0; const double d_neg_one = -1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); bool upper = (uplo == MagmaUpper); magma_int_t j, jb, nb; magmaDoubleComplex *work; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_zpotrf_nb( n ); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= n) { /* Use unblocked code. */ magma_zgetmatrix( n, n, dA(0,0), ldda, work, n, queues[0] ); lapackf77_zpotrf( uplo_, &n, work, &n, info ); magma_zsetmatrix( n, n, work, n, dA(0,0), ldda, queues[0] ); } else { /* Use blocked code. */ if (upper) { //========================================================= /* Compute the Cholesky factorization A = U'*U. 
*/ for (j=0; j < n; j += nb) { // apply all previous updates to diagonal block, // then transfer it to CPU jb = min( nb, n-j ); magma_zherk( MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[0] ); // apply all previous updates to block row right of diagonal block if (j+jb < n) { magma_zgemm( MagmaConjTrans, MagmaNoTrans, jb, n-j-jb, j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda, queues[1] ); } // simultaneous with above zgemm, transfer diagonal block, // factor it on CPU, and test for positive definiteness magma_queue_sync( queues[0] ); lapackf77_zpotrf( MagmaUpperStr, &jb, work, &jb, info ); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, queues[1] ); if (*info != 0) { *info = *info + j; break; } // apply diagonal block to block row right of diagonal block if (j+jb < n) { magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, n-j-jb, c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[1] ); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. 
for (j=0; j < n; j += nb) { // apply all previous updates to diagonal block, // then transfer it to CPU jb = min( nb, n-j ); magma_zherk( MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[0] ); // apply all previous updates to block column below diagonal block if (j+jb < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, n-j-jb, jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda, queues[1] ); } // simultaneous with above zgemm, transfer diagonal block, // factor it on CPU, and test for positive definiteness magma_queue_sync( queues[0] ); lapackf77_zpotrf( MagmaLowerStr, &jb, work, &jb, info ); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, queues[1] ); if (*info != 0) { *info = *info + j; break; } // apply diagonal block to block column below diagonal if (j+jb < n) { magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-j-jb, jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[1] ); } } } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free_pinned( work ); return *info; } /* magma_zpotrf_gpu */
/**
    Purpose
    -------
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    This version assumes the computation runs through the NULL stream
    and therefore is not overlapping some computation with communication.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.
            For N = 0 the routine returns immediately with INFO = 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of dA contains the
            lower triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value,
                  or MAGMA_ERR_HOST_ALLOC if the pinned workspace could
                  not be allocated
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @return the value stored in *info.

    @ingroup magma_zposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zpotrf_gpu(magma_uplo_t uplo, magma_int_t n,
                 magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
{
#define dA(i, j)  (dA + (j)*ldda + (i))

    magma_int_t     j, jb, nb, lwork;
    const char* uplo_ = lapack_uplo_const( uplo );
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *work;
    double          d_one     =  1.0;
    double          d_neg_one = -1.0;
    int upper = (uplo == MagmaUpper);
    int unblocked;

    *info = 0;
    if (! upper && uplo != MagmaLower) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (ldda < max(1,n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return.  Without it the unblocked path below would call
       LAPACK with a leading dimension of 0, which LAPACK rejects as an
       illegal argument even though N = 0 is a valid input here. */
    if (n == 0) {
        return *info;
    }

    nb = magma_get_zpotrf_nb(n);

    /* The unblocked path stages the whole n-by-n matrix in `work`, the
       blocked path only one nb-by-nb diagonal block; size the buffer for
       the path that will actually run. */
    unblocked = ((nb <= 1) || (nb >= n));
    lwork     = unblocked ? n*n : nb*nb;

    if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if (unblocked) {
        /* Use unblocked code. */
        magma_zgetmatrix( n, n, dA, ldda, work, n );
        lapackf77_zpotrf(uplo_, &n, work, &n, info);
        magma_zsetmatrix( n, n, work, n, dA, ldda );
    }
    else {
        /* Use blocked code. */
        if (upper) {
            /* Compute the Cholesky factorization A = U'*U. */
            for (j=0; j < n; j += nb) {
                /* Update and factorize the current diagonal block and test
                   for non-positive-definiteness. Computing MIN */
                jb = min(nb, (n-j));

                magma_zherk(MagmaUpper, MagmaConjTrans, jb, j,
                            d_neg_one, dA(0, j), ldda,
                            d_one,     dA(j, j), ldda);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    /* Compute the current block row. */
                    magma_zgemm(MagmaConjTrans, MagmaNoTrans,
                                jb, (n-j-jb), j,
                                c_neg_one, dA(0, j   ), ldda,
                                           dA(0, j+jb), ldda,
                                c_one,     dA(j, j+jb), ldda);
                }

                /* Wait for the diagonal block, factor it on the CPU and
                   send the factor back. */
                magma_queue_sync( stream[1] );

                lapackf77_zpotrf(MagmaUpperStr, &jb, work, &jb, info);
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                    /* convert the block-local index into a global one */
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit,
                                 jb, (n-j-jb),
                                 c_one, dA(j, j   ), ldda,
                                        dA(j, j+jb), ldda);
                }
            }
        }
        else {
            //=========================================================
            // Compute the Cholesky factorization A = L*L'.
            for (j=0; j < n; j += nb) {
                //  Update and factorize the current diagonal block and test
                //  for non-positive-definiteness. Computing MIN
                jb = min(nb, (n-j));

                magma_zherk(MagmaLower, MagmaNoTrans, jb, j,
                            d_neg_one, dA(j, 0), ldda,
                            d_one,     dA(j, j), ldda);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work,     jb, stream[1] );

                if ( (j+jb) < n) {
                    magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                                 (n-j-jb), jb, j,
                                 c_neg_one, dA(j+jb, 0), ldda,
                                            dA(j,    0), ldda,
                                 c_one,     dA(j+jb, j), ldda);
                }

                // Wait for the diagonal block, factor it on the CPU and
                // send the factor back.
                magma_queue_sync( stream[1] );
                lapackf77_zpotrf(MagmaLowerStr, &jb, work, &jb, info);
                magma_zsetmatrix_async( jb, jb,
                                        work,     jb,
                                        dA(j, j), ldda, stream[0] );
                if (*info != 0) {
                    // convert the block-local index into a global one
                    *info = *info + j;
                    break;
                }

                if ( (j+jb) < n) {
                    magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                                (n-j-jb), jb,
                                c_one, dA(j,    j), ldda,
                                       dA(j+jb, j), ldda);
                }
            }
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );

    magma_free_pinned( work );

    return *info;
} /* magma_zpotrf_gpu */
/**
    Purpose
    -------
    ZTRTRI computes the inverse of a complex upper or lower triangular
    matrix A.

    This is the Level 3 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  A is upper triangular;
      -     = MagmaLower:  A is lower triangular.

    @param[in]
    diag    magma_diag_t
      -     = MagmaNonUnit:  A is non-unit triangular;
      -     = MagmaUnit:     A is unit triangular.

    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the triangular matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of the array A contains
            the upper triangular matrix, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of the array A contains
            the lower triangular matrix, and the strictly upper
            triangular part of A is not referenced.  If DIAG = MagmaUnit,
            the diagonal elements of A are also not referenced and are
            assumed to be 1.
            On exit, the (triangular) inverse of the original matrix, in
            the same storage format.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -i, the i-th argument had an illegal value,
                 or MAGMA_ERR_DEVICE_ALLOC if the device workspace could
                 not be allocated
      -     > 0: if INFO = i, A(i,i) is exactly zero.  The triangular
                    matrix is singular and its inverse cannot be computed.

    @return the value stored in *info.

    @ingroup magma_zgesv_aux
    ********************************************************************/
extern "C" magma_int_t
magma_ztrtri(
    magma_uplo_t uplo, magma_diag_t diag, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magma_int_t *info)
{
    #define  A(i, j) ( A + (i) + (j)*lda )
    #define dA(i, j) (dA + (i) + (j)*ldda)

    /* Local variables */
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* diag_ = lapack_diag_const( diag );
    magma_int_t     ldda, nb, nn, j, jb;
    magmaDoubleComplex c_zero     = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one      = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one  = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *dA;

    int upper  = (uplo == MagmaUpper);
    int nounit = (diag == MagmaNonUnit);

    *info = 0;

    if (! upper && uplo != MagmaLower)
        *info = -1;
    else if (! nounit && diag != MagmaUnit)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (lda < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ( n == 0 )
        return *info;

    /* Check for singularity if non-unit */
    if (nounit) {
        for (j=0; j < n; ++j) {
            if ( MAGMA_Z_EQUAL( *A(j,j), c_zero )) {
                *info = j+1;  // Fortran index
                return *info;
            }
        }
    }

    /* Determine the block size for this environment */
    nb = magma_get_zpotrf_nb(n);

    /* Device copy of A; leading dimension rounded up to a multiple of 32. */
    ldda = ((n+31)/32)*32;
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if (nb <= 1 || nb >= n)
        lapackf77_ztrtri(uplo_, diag_, &n, A, &lda, info);
    else {
        if (upper) {
            /* Compute inverse of upper triangular matrix */
            for (j=0; j < n; j += nb) {
                jb = min(nb, (n-j));
                /* Upload block row j (columns j..n-1). */
                magma_zsetmatrix( jb, (n-j),
                                  A(j, j),  lda,
                                  dA(j, j), ldda );

                /* Compute rows 1:j-1 of current block column.
                   `diag` is forwarded, as LAPACK's ZTRTRI does: for a
                   unit-triangular matrix the stored diagonal must not be
                   used.  This was previously hard-coded to MagmaNonUnit. */
                magma_ztrmm( MagmaLeft, MagmaUpper,
                             MagmaNoTrans, diag, j, jb,
                             c_one, dA(0,0), ldda, dA(0, j), ldda );

                magma_ztrsm( MagmaRight, MagmaUpper,
                             MagmaNoTrans, diag, j, jb,
                             c_neg_one, dA(j,j), ldda, dA(0, j), ldda );

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        A(j, j),  lda, stream[1] );

                magma_zgetmatrix_async( j, jb,
                                        dA(0, j), ldda,
                                        A(0, j),  lda, stream[0] );

                /* Only the diagonal block is needed on the host now. */
                magma_queue_sync( stream[1] );

                /* Compute inverse of current diagonal block */
                lapackf77_ztrtri(MagmaUpperStr, diag_, &jb, A(j,j), &lda, info);

                magma_zsetmatrix( jb, jb,
                                  A(j, j),  lda,
                                  dA(j, j), ldda );
            }
        }
        else {
            /* Compute inverse of lower triangular matrix */
            /* nn-1 is the first row of the last block. */
            nn=((n-1)/nb)*nb+1;

            for (j=nn-1; j >= 0; j -= nb) {
                jb=min(nb,(n-j));

                if ((j+jb) < n) {
                    /* Upload block column j (rows j..n-1). */
                    magma_zsetmatrix( (n-j), jb,
                                      A(j, j),  lda,
                                      dA(j, j), ldda );

                    /* Compute rows j+jb:n of current block column.
                       `diag` is forwarded for the same reason as in the
                       upper case. */
                    magma_ztrmm( MagmaLeft, MagmaLower,
                                 MagmaNoTrans, diag, (n-j-jb), jb,
                                 c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda );

                    magma_ztrsm( MagmaRight, MagmaLower,
                                 MagmaNoTrans, diag, (n-j-jb), jb,
                                 c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda );

                    magma_zgetmatrix_async( n-j-jb, jb,
                                            dA(j+jb, j), ldda,
                                            A(j+jb, j),  lda, stream[1] );

                    magma_zgetmatrix_async( jb, jb,
                                            dA(j,j), ldda,
                                            A(j,j),  lda, stream[0] );

                    /* Only the diagonal block is needed on the host now. */
                    magma_queue_sync( stream[0] );
                }

                /* Compute inverse of current diagonal block */
                lapackf77_ztrtri(MagmaLowerStr, diag_, &jb, A(j,j), &lda, info);

                magma_zsetmatrix( jb, jb,
                                  A(j, j),  lda,
                                  dA(j, j), ldda );
            }
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );

    return *info;
}
/**
    Purpose
    -------
    ZLAUUM forms the product U * U' or L' * L from a triangular factor
    U or L held in the upper or lower triangle of the array A.

    With UPLO = MagmaUpper the upper triangle of the product replaces
    the factor U in A; with UPLO = MagmaLower the lower triangle of the
    product replaces the factor L in A.

    Blocked algorithm built on Level 3 BLAS: diagonal blocks are handled
    on the host by LAPACK, the remaining updates are issued to the GPU.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
            Which triangle of A holds the factor:
      -     = MagmaUpper:  Upper triangular
      -     = MagmaLower:  Lower triangular

    @param[in]
    n       INTEGER
            The order of the triangular factor U or L.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the triangular factor U or L.
            On exit, if UPLO = MagmaUpper, the upper triangle of A is
            overwritten with the upper triangle of the product U * U';
            if UPLO = MagmaLower, the lower triangle of A is overwritten
            with the lower triangle of the product L' * L.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if INFO = -k, the k-th argument had an illegal value

    @ingroup magma_zposv_aux
    ***************************************************************************/
extern "C" magma_int_t
magma_zlauum(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda,
    magma_int_t *info)
{
#define A(i, j)  (A  + (j)*lda  + (i))
#define dA(i, j) (dA + (j)*ldda + (i))

    /* Host-side LAPACK wants the triangle as a string. */
    const char* uplo_str = lapack_uplo_const( uplo );
    const int   is_upper = (uplo == MagmaUpper);

    magmaDoubleComplex  one_z = MAGMA_Z_ONE;
    double              one_d = MAGMA_D_ONE;
    magmaDoubleComplex *dA;
    magma_int_t         ldda;
    magma_int_t         blk;

    /* Argument validation: the first failing argument wins. */
    *info = (! is_upper && uplo != MagmaLower) ? -1
          : (n < 0)                            ? -2
          : (lda < max(1,n))                   ? -4
          :                                       0;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Nothing to do for an empty matrix. */
    if ( n == 0 ) {
        return *info;
    }

    /* Device workspace, leading dimension padded to a multiple of 32. */
    ldda = ((n+31)/32)*32;

    if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    blk = magma_get_zpotrf_nb(n);

    if (blk <= 1 || blk >= n) {
        /* Small problem: let LAPACK do everything on the host. */
        lapackf77_zlauum(uplo_str, &n, A, &lda, info);
    }
    else if (is_upper) {
        /* Compute the product U * U'. */
        magma_int_t pos = 0;
        while (pos < n) {
            magma_int_t width = min(blk, n-pos);
            magma_int_t tail  = n - pos - width;

            /* Diagonal block on stream[1], block row to its right on stream[0]. */
            magma_zsetmatrix_async( width, width,
                                    A(pos,pos),   lda,
                                    dA(pos, pos), ldda, stream[1] );

            magma_zsetmatrix_async( width, tail,
                                    A(pos,pos+width),  lda,
                                    dA(pos,pos+width), ldda, stream[0] );

            magma_queue_sync( stream[1] );

            magma_ztrmm( MagmaRight, MagmaUpper,
                         MagmaConjTrans, MagmaNonUnit, pos, width,
                         one_z, dA(pos,pos), ldda, dA(0, pos), ldda );

            /* Diagonal block product on the host, then back to the device. */
            lapackf77_zlauum(MagmaUpperStr, &width, A(pos,pos), &lda, info);

            magma_zsetmatrix_async( width, width,
                                    A(pos, pos),  lda,
                                    dA(pos, pos), ldda, stream[0] );

            if (tail > 0) {
                magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                             pos, width, tail, one_z, dA(0,pos+width),
                             ldda, dA(pos, pos+width), ldda, one_z,
                             dA(0,pos), ldda );

                magma_queue_sync( stream[0] );

                magma_zherk( MagmaUpper, MagmaNoTrans, width, tail,
                             one_d, dA(pos, pos+width), ldda,
                             one_d, dA(pos, pos),       ldda );
            }

            /* Finished block column goes back to the host. */
            magma_zgetmatrix( pos+width, width,
                              dA(0, pos), ldda,
                              A(0, pos),  lda );

            pos += blk;
        }
    }
    else {
        /* Compute the product L' * L. */
        magma_int_t pos = 0;
        while (pos < n) {
            magma_int_t width = min(blk, n-pos);
            magma_int_t tail  = n - pos - width;

            /* Diagonal block on stream[1], block column below it on stream[0]. */
            magma_zsetmatrix_async( width, width,
                                    A(pos,pos),   lda,
                                    dA(pos, pos), ldda, stream[1] );

            magma_zsetmatrix_async( tail, width,
                                    A(pos+width, pos),  lda,
                                    dA(pos+width, pos), ldda, stream[0] );

            magma_queue_sync( stream[1] );

            magma_ztrmm( MagmaLeft, MagmaLower,
                         MagmaConjTrans, MagmaNonUnit, width,
                         pos, one_z, dA(pos,pos), ldda,
                         dA(pos, 0), ldda );

            /* Diagonal block product on the host, then back to the device. */
            lapackf77_zlauum(MagmaLowerStr, &width, A(pos,pos), &lda, info);

            magma_zsetmatrix_async( width, width,
                                    A(pos, pos),  lda,
                                    dA(pos, pos), ldda, stream[0] );

            if (tail > 0) {
                magma_zgemm( MagmaConjTrans, MagmaNoTrans,
                             width, pos, tail, one_z, dA( pos+width,pos),
                             ldda, dA(pos+width, 0), ldda, one_z,
                             dA(pos,0), ldda );

                magma_queue_sync( stream[0] );

                magma_zherk( MagmaLower, MagmaConjTrans, width, tail,
                             one_d, dA(pos+width, pos), ldda,
                             one_d, dA(pos, pos),       ldda );
            }

            /* Finished block row goes back to the host. */
            magma_zgetmatrix( width, pos+width,
                              dA(pos, 0), ldda,
                              A(pos, 0),  lda );

            pos += blk;
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );

    magma_free( dA );

    return *info;
}
/*  -- clMAGMA (version 1.0.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2012

    Purpose
    =======
    ZTRTRI computes the inverse of a complex upper or lower triangular
    matrix dA that resides on the GPU.

    This is the Level 3 BLAS version of the algorithm.  Off-diagonal
    blocks are updated on the device; each diagonal block is copied to a
    host buffer, inverted with LAPACK, and copied back.

    Arguments
    =========
    UPLO    (input) magma_uplo_t
            = MagmaUpper:  A is upper triangular;
            = MagmaLower:  A is lower triangular.

    DIAG    (input) magma_diag_t
            = MagmaNonUnit:  A is non-unit triangular;
            = MagmaUnit:     A is unit triangular.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.
            For N = 0 the routine returns immediately with INFO = 0.

    dA      (input/output) COMPLEX_16 array ON THE GPU, dimension (LDDA,N)
            On entry, the triangular matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of the array dA contains
            the upper triangular matrix, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of the array dA
            contains the lower triangular matrix, and the strictly upper
            triangular part of A is not referenced.  If DIAG = MagmaUnit,
            the diagonal elements of A are also not referenced and are
            assumed to be 1.
            On exit, the (triangular) inverse of the original matrix, in
            the same storage format.

    dA_offset (input) size_t
            Offset of the matrix inside the buffer dA; it is forwarded to
            the transfer routines.  NOTE(review): presumably counted in
            elements rather than bytes -- confirm against the dA(i,j)
            macro and the transfer routines.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value,
                 or MAGMA_ERR_HOST_ALLOC if the host workspace could not
                 be allocated
            > 0: if INFO = i, dA(i,i) is exactly zero.  The triangular
                 matrix is singular and its inverse can not be computed.
            NOTE(review): in the blocked path INFO is overwritten by each
            diagonal-block inversion and holds a block-local index, so
            only the status of the last processed block survives --
            confirm whether callers rely on INFO > 0 here.

    The process is terminated with exit(-1) if no device or command queue
    can be obtained.
    =====================================================================    */
extern "C" magma_int_t
magma_ztrtri_gpu(magma_uplo_t uplo, magma_diag_t diag, magma_int_t n,
                 magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
                 magma_int_t *info)
{
    /* Local variables */
    magma_uplo_t uplo_ = uplo;
    magma_diag_t diag_ = diag;
    magma_int_t nb, nn, j, jb, lwork;
    magmaDoubleComplex c_one      = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one  = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *work;

    int upper  = lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaUpper));
    int nounit = lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaNonUnit));
    int unblocked;

    *info = 0;

    if ((! upper) && (! lapackf77_lsame(lapack_const(uplo_), lapack_const(MagmaLower))))
        *info = -1;
    else if ((! nounit) && (! lapackf77_lsame(lapack_const(diag_), lapack_const(MagmaUnit))))
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return.  Without it the unblocked path below would call
       LAPACK with a leading dimension of 0, which LAPACK rejects as an
       illegal argument even though N = 0 is a valid input here. */
    if (n == 0)
        return *info;

    nb = magma_get_zpotrf_nb(n);

    /* The unblocked path stages the whole n-by-n matrix in `work`, the
       blocked path only one nb-by-nb diagonal block; size the buffer for
       the path that will actually run. */
    unblocked = (nb <= 1 || nb >= n);
    lwork     = unblocked ? n*n : nb*nb;

    /* Create Queues */
    magma_queue_t  queues[2];
    magma_device_t device;
    int num = 0;
    magma_err_t err;

    err = magma_get_devices( &device, 1, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queues[0] );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create 0 failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queues[1] );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create 1 failed: %d\n", err );
        exit(-1);
    }

    if (MAGMA_SUCCESS != magma_malloc_host( (void**)&work, lwork*sizeof(magmaDoubleComplex) )) {
        /* Release the queues created above; they used to leak here. */
        magma_queue_destroy(queues[0]);
        magma_queue_destroy(queues[1]);
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    if (unblocked) {
        /* Small problem: invert the whole matrix on the host. */
        magma_zgetmatrix( n, n, dA, dA_offset, ldda, work, 0, n, queues[0] );
        lapackf77_ztrtri(lapack_const(uplo_), lapack_const(diag_), &n, work, &n, info);
        magma_zsetmatrix( n, n, work, 0, n, dA, dA_offset, ldda, queues[0] );
    }
    else {
        if (upper) {
            /* Compute inverse of upper triangular matrix */
            for (j=0; j < n; j += nb) {
                jb = min(nb, (n-j));

                /* Compute rows 1:j-1 of current block column.
                   `diag` is forwarded, as LAPACK's ZTRTRI does: for a
                   unit-triangular matrix the stored diagonal must not be
                   used.  This was previously hard-coded to MagmaNonUnit. */
                magma_ztrmm(MagmaLeft, MagmaUpper,
                            MagmaNoTrans, diag, j, jb,
                            c_one, dA(0,0), ldda, dA(0, j), ldda,
                            queues[0]);

                magma_ztrsm(MagmaRight, MagmaUpper,
                            MagmaNoTrans, diag, j, jb,
                            c_neg_one, dA(j,j), ldda, dA(0, j), ldda,
                            queues[0]);

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work, 0, jb, queues[1], NULL );

                magma_queue_sync( queues[1] );

                /* Compute inverse of current diagonal block */
                lapackf77_ztrtri(MagmaUpperStr, lapack_const(diag_), &jb, work, &jb, info);

                magma_zsetmatrix( jb, jb,
                                  work, 0, jb,
                                  dA(j, j), ldda, queues[0]);
            }
        }
        else {
            /* Compute inverse of lower triangular matrix */
            /* nn-1 is the first row of the last block. */
            nn=((n-1)/nb)*nb+1;

            for (j=nn-1; j >= 0; j -= nb) {
                jb=min(nb,(n-j));

                if ((j+jb) < n) {
                    /* Compute rows j+jb:n of current block column.
                       `diag` is forwarded for the same reason as in the
                       upper case. */
                    magma_ztrmm(MagmaLeft, MagmaLower,
                                MagmaNoTrans, diag, (n-j-jb), jb,
                                c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda,
                                queues[0]);

                    magma_ztrsm(MagmaRight, MagmaLower,
                                MagmaNoTrans, diag, (n-j-jb), jb,
                                c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda,
                                queues[0]);
                }

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work, 0, jb, queues[1], NULL );

                magma_queue_sync( queues[1] );

                /* Compute inverse of current diagonal block */
                lapackf77_ztrtri(MagmaLowerStr, lapack_const(diag_), &jb, work, &jb, info);

                magma_zsetmatrix( jb, jb,
                                  work, 0, jb,
                                  dA(j, j), ldda, queues[0] );
            }
        }
    }

    magma_free_host( work );

    magma_queue_destroy(queues[0]);
    magma_queue_destroy(queues[1]);

    return *info;
}
extern "C" magma_int_t
magma_ztrtri(char uplo, char diag, magma_int_t n,
             magmaDoubleComplex *A, magma_int_t lda, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZTRTRI computes the inverse of a complex upper or lower triangular
    matrix A.

    This is the Level 3 BLAS version of the algorithm.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            = 'U':  A is upper triangular;
            = 'L':  A is lower triangular.

    DIAG    (input) CHARACTER*1
            = 'N':  A is non-unit triangular;
            = 'U':  A is unit triangular.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the triangular matrix A.  If UPLO = 'U', the
            leading N-by-N upper triangular part of the array A contains
            the upper triangular matrix, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of the array A contains
            the lower triangular matrix, and the strictly upper
            triangular part of A is not referenced.  If DIAG = 'U', the
            diagonal elements of A are also not referenced and are
            assumed to be 1.
            On exit, the (triangular) inverse of the original matrix, in
            the same storage format.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
            > 0: if INFO = i, A(i,i) is exactly zero.  The triangular
                 matrix is singular and its inverse cannot be computed.
                 MAGMA_ERR_DEVICE_ALLOC is stored when the device
                 workspace cannot be allocated.

    The value stored in INFO is also the return value.
    ===================================================================== */

    /* Column-major addressing of the host matrix and of its device copy. */
    #define A(i, j)  ( A + (i) + (j)*lda )
    #define dA(i, j) (dA + (i) + (j)*ldda)

    /* Local variables */
    char uplo_[2] = {uplo, 0};
    char diag_[2] = {diag, 0};
    magma_int_t ldda, nb, nn, j, jb;
    magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *dA;

    int upper  = lapackf77_lsame(uplo_, "U");
    int nounit = lapackf77_lsame(diag_, "N");

    /* Check arguments */
    *info = 0;
    if ((! upper) && (! lapackf77_lsame(uplo_, "L")))
        *info = -1;
    else if ((! nounit) && (! lapackf77_lsame(diag_, "U")))
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (lda < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ( n == 0 )
        return *info;

    /* Check for singularity if non-unit */
    if (nounit) {
        for ( j=0; j<n; ++j ) {
            if ( MAGMA_Z_EQUAL( *A(j,j), c_zero )) {
                *info = j+1;  /* Fortran (1-based) index */
                return *info;
            }
        }
    }

    /* Determine the block size for this environment */
    nb = magma_get_zpotrf_nb(n);

    /* Unblocked case: handled entirely on the host.  It is decided before
       any device resource is requested, so a small problem cannot fail
       with MAGMA_ERR_DEVICE_ALLOC for memory it never uses. */
    if (nb <= 1 || nb >= n) {
        lapackf77_ztrtri(uplo_, diag_, &n, A, &lda, info);
        return *info;
    }

    ldda = ((n+31)/32)*32;
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if (upper) {
        /* Compute inverse of upper triangular matrix */
        for (j=0; j<n; j=j+nb) {
            jb = min(nb, (n-j));

            /* Upload block row j (from the diagonal to the last column). */
            magma_zsetmatrix( jb, (n-j),
                              A(j, j),  lda,
                              dA(j, j), ldda );

            /* Compute rows 1:j-1 of current block column.
               The diagonal kind follows DIAG, as in reference ZTRTRI:
               for a unit triangular matrix the stored diagonal must not
               be read. */
            magma_ztrmm( MagmaLeft, MagmaUpper, MagmaNoTrans,
                         nounit ? MagmaNonUnit : MagmaUnit,
                         j, jb, c_one,
                         dA(0, 0), ldda,
                         dA(0, j), ldda );

            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans,
                         nounit ? MagmaNonUnit : MagmaUnit,
                         j, jb, c_neg_one,
                         dA(j, j), ldda,
                         dA(0, j), ldda );

            /* Diagonal block goes back on stream[1] (needed right away);
               the finished rows above it go back on stream[0]. */
            magma_zgetmatrix_async( jb, jb,
                                    dA(j, j), ldda,
                                    A(j, j),  lda, stream[1] );

            magma_zgetmatrix_async( j, jb,
                                    dA(0, j), ldda,
                                    A(0, j),  lda, stream[0] );

            magma_queue_sync( stream[1] );

            /* Compute inverse of current diagonal block */
            lapackf77_ztrtri(MagmaUpperStr, diag_, &jb, A(j,j), &lda, info);

            magma_zsetmatrix( jb, jb,
                              A(j, j),  lda,
                              dA(j, j), ldda );
        }
    }
    else {
        /* Compute inverse of lower triangular matrix */
        nn = ((n-1)/nb)*nb + 1;

        for (j=nn-1; j>=0; j=j-nb) {
            jb = min(nb, (n-j));

            if ((j+jb) < n) {
                /* Upload block column j (from the diagonal downwards). */
                magma_zsetmatrix( (n-j), jb,
                                  A(j, j),  lda,
                                  dA(j, j), ldda );

                /* Compute rows j+jb:n of current block column.
                   The diagonal kind follows DIAG (see the upper case). */
                magma_ztrmm( MagmaLeft, MagmaLower, MagmaNoTrans,
                             nounit ? MagmaNonUnit : MagmaUnit,
                             (n-j-jb), jb, c_one,
                             dA(j+jb, j+jb), ldda,
                             dA(j+jb, j),    ldda );

                magma_ztrsm( MagmaRight, MagmaLower, MagmaNoTrans,
                             nounit ? MagmaNonUnit : MagmaUnit,
                             (n-j-jb), jb, c_neg_one,
                             dA(j, j),    ldda,
                             dA(j+jb, j), ldda );

                magma_zgetmatrix_async( n-j-jb, jb,
                                        dA(j+jb, j), ldda,
                                        A(j+jb, j),  lda, stream[1] );

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        A(j, j),  lda, stream[0] );

                magma_queue_sync( stream[0] );
            }

            /* Compute inverse of current diagonal block */
            lapackf77_ztrtri(MagmaLowerStr, diag_, &jb, A(j,j), &lda, info);

            magma_zsetmatrix( jb, jb,
                              A(j, j),  lda,
                              dA(j, j), ldda );
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( dA );

    return *info;
}
extern "C" magma_int_t
magma_zpotrf_mc(magma_context *cntxt, char *uplo, magma_int_t *n,
                cuDoubleComplex *a, magma_int_t *lda, magma_int_t *info)
{
/*  -- MAGMA (version 1.5.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date September 2014

    Purpose
    =======
    ZPOTRF computes the Cholesky factorization of a Hermitian positive
    definite matrix A.

    The factorization has the form
       A = U**T * U,  if UPLO = 'U', or
       A = L  * L**T, if UPLO = 'L',
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    The work is expressed as QUARK tasks (SYRK, GEMM, POTRF, TRSM) that
    are inserted block by block; the routine waits for all of them with
    QUARK_Barrier before returning.

    Arguments
    =========
    CNTXT   (input) MAGMA_CONTEXT
            CNTXT specifies the MAGMA hardware context for this routine.
            Its `quark` scheduler and `nb` block size are used; nb == -1
            selects magma_get_zpotrf_nb(N).

    UPLO    (input) CHARACTER*1
            = 'U':  Upper triangle of A is stored;
            = 'L':  Lower triangle of A is stored.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            N-by-N upper triangular part of A contains the upper
            triangular part of the matrix A, and the strictly lower
            triangular part of A is not referenced.  If UPLO = 'L', the
            leading N-by-N lower triangular part of A contains the lower
            triangular part of the matrix A, and the strictly upper
            triangular part of A is not referenced.

            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization A = U**T*U or A = L*L**T.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
            > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    Returns MAGMA_ERR_ILLEGAL_VALUE when an argument is rejected,
    otherwise the value stored in INFO.

    NOTE(review): `iinfo` is handed to every POTRF task as an OUTPUT
    argument but is never read back here, so a failure reported by a task
    through it is not propagated to INFO -- confirm against SCHED_zpotrf.

    A(i,j) is an addressing macro defined outside this function.
    ===================================================================== */

    /* Check arguments */
    magma_int_t upper = (magma_int_t) lsame_(uplo, "U");
    *info = 0;
    if (! upper && ! lsame_(uplo, "L")) {
        *info = -1;
    } else if (*n < 0) {
        *info = -2;
    } else if (*lda < max(1,*n)) {
        *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

    Quark* quark = cntxt->quark;

    /* Get block size */
    magma_int_t nb = (cntxt->nb == -1) ? magma_get_zpotrf_nb(*n) : cntxt->nb;

    magma_int_t i, j, k;          /* element offsets of the current blocks  */
    magma_int_t ii, jj, kk;       /* matching block counters (labels only)  */
    magma_int_t temp, temp2;      /* sizes of possibly partial blocks       */
    char label[64];               /* task label shown by the QUARK tracer   */
    magma_int_t iinfo[2];
    iinfo[1] = 0;

    ii = -1;

    /* Traverse diagonal blocks */
    for (i = 0; i < (*n); i += nb) {
        ii++;
        temp2 = min(nb, (*n)-i);

        /* If not first block */
        if (i > 0) {

            /* First do large syrk, then split */
            if (i < (*n)/2) {
                snprintf(label, sizeof(label), "SYRK %d", (int) ii);

                if (upper) {
                    QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
                        sizeof(magma_int_t),               &upper,    VALUE,
                        sizeof(magma_int_t),               &temp2,    VALUE,
                        sizeof(magma_int_t),               &i,        VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(0,i),    INPUT,
                        sizeof(magma_int_t),               lda,       VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i-nb,i), INPUT,
                        strlen(label)+1,                   label,     VALUE | TASKLABEL,
                        6,                                 "green",   VALUE | TASKCOLOR,
                        0);
                } else {
                    QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
                        sizeof(magma_int_t),               &upper,    VALUE,
                        sizeof(magma_int_t),               &temp2,    VALUE,
                        sizeof(magma_int_t),               &i,        VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,0),    INPUT,
                        sizeof(magma_int_t),               lda,       VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i-nb), INPUT,
                        strlen(label)+1,                   label,     VALUE | TASKLABEL,
                        6,                                 "green",   VALUE | TASKCOLOR,
                        0);
                }
            } else {
                jj = -1;

                /* Split syrk into tiles */
                for (j = 0; j < i; j += nb) {
                    jj++;
                    snprintf(label, sizeof(label), "SYRK %d %d", (int) ii, (int) jj);

                    if (upper) {
                        QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
                            sizeof(magma_int_t),               &upper,  VALUE,
                            sizeof(magma_int_t),               &temp2,  VALUE,
                            sizeof(magma_int_t),               &nb,     VALUE,
                            sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i),  INPUT,
                            sizeof(magma_int_t),               lda,     VALUE,
                            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),  INOUT,
                            strlen(label)+1,                   label,   VALUE | TASKLABEL,
                            6,                                 "green", VALUE | TASKCOLOR,
                            0);
                    } else {
                        QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
                            sizeof(magma_int_t),               &upper,  VALUE,
                            sizeof(magma_int_t),               &temp2,  VALUE,
                            sizeof(magma_int_t),               &nb,     VALUE,
                            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j),  INPUT,
                            sizeof(magma_int_t),               lda,     VALUE,
                            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),  INOUT,
                            strlen(label)+1,                   label,   VALUE | TASKLABEL,
                            6,                                 "green", VALUE | TASKCOLOR,
                            0);
                    }
                }
            }

            /* If not last block */
            if (i < ((*n)-nb)) {
                jj = -1;

                /* Split gemm into tiles */
                for (j = i+nb; j < (*n); j += nb) {
                    jj++;
                    kk = -1;

                    for (k = 0; k < i; k += nb) {
                        kk++;
                        temp = min(nb, (*n)-j);
                        snprintf(label, sizeof(label), "GEMM %d %d %d",
                                 (int) ii, (int) jj, (int) kk);

                        if (upper) {
                            QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                                sizeof(magma_int_t),               &upper, VALUE,
                                sizeof(magma_int_t),               &nb,    VALUE,
                                sizeof(magma_int_t),               &temp,  VALUE,
                                sizeof(magma_int_t),               &nb,    VALUE,
                                sizeof(cuDoubleComplex)*(*n)*(*n), A(k,i), INPUT,
                                sizeof(magma_int_t),               lda,    VALUE,
                                sizeof(cuDoubleComplex)*(*n)*(*n), A(k,j), INPUT,
                                sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j), INOUT,
                                strlen(label)+1,                   label,  VALUE | TASKLABEL,
                                5,                                 "blue", VALUE | TASKCOLOR,
                                0);
                        } else {
                            QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                                sizeof(magma_int_t),               &upper, VALUE,
                                sizeof(magma_int_t),               &temp,  VALUE,
                                sizeof(magma_int_t),               &nb,    VALUE,
                                sizeof(magma_int_t),               &nb,    VALUE,
                                sizeof(cuDoubleComplex)*(*n)*(*n), A(j,k), INPUT,
                                sizeof(magma_int_t),               lda,    VALUE,
                                sizeof(cuDoubleComplex)*(*n)*(*n), A(i,k), INPUT,
                                sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i), INOUT,
                                strlen(label)+1,                   label,  VALUE | TASKLABEL,
                                5,                                 "blue", VALUE | TASKCOLOR,
                                0);
                        }
                    }
                }
            }
        }

        /* Factor the diagonal block */
        iinfo[0] = i;
        snprintf(label, sizeof(label), "POTRF %d", (int) ii);

        QUARK_Insert_Task(quark, SCHED_zpotrf, 0,
            sizeof(magma_int_t),               &upper, VALUE,
            sizeof(magma_int_t),               &temp2, VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INOUT,
            sizeof(magma_int_t),               lda,    VALUE,
            sizeof(magma_int_t),               iinfo,  OUTPUT,
            strlen(label)+1,                   label,  VALUE | TASKLABEL,
            5,                                 "cyan", VALUE | TASKCOLOR,
            0);

        /* If not last block */
        if (i < ((*n)-nb)) {

            /* Split trsm into tiles */
            for (j = i + nb; j < (*n); j += nb) {
                temp = min(nb, (*n)-j);
                snprintf(label, sizeof(label), "TRSM %d", (int) ii);

                if (upper) {
                    QUARK_Insert_Task(quark, SCHED_ztrsm, 0,
                        sizeof(magma_int_t),               &upper, VALUE,
                        sizeof(magma_int_t),               &nb,    VALUE,
                        sizeof(magma_int_t),               &temp,  VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INPUT,
                        sizeof(magma_int_t),               lda,    VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j), INOUT,
                        strlen(label)+1,                   label,  VALUE | TASKLABEL,
                        4,                                 "red",  VALUE | TASKCOLOR,
                        0);
                } else {
                    QUARK_Insert_Task(quark, SCHED_ztrsm, 0,
                        sizeof(magma_int_t),               &upper, VALUE,
                        sizeof(magma_int_t),               &temp,  VALUE,
                        sizeof(magma_int_t),               &nb,    VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i), INPUT,
                        sizeof(magma_int_t),               lda,    VALUE,
                        sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i), INOUT,
                        strlen(label)+1,                   label,  VALUE | TASKLABEL,
                        4,                                 "red",  VALUE | TASKCOLOR,
                        0);
                }
            }
        }
    }

    /* Wait for every inserted task to finish. */
    QUARK_Barrier(quark);

    /* The function is declared to return a status; it previously fell off
       the end here, which is undefined behaviour for a non-void function. */
    return *info;
}
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_zpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. 
ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_zpotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], magmaDoubleComplex, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { 
h_A(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL(h_A(i,i)) + N, 0 ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_Z_CNJG( h_A(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_zpotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_zpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zgetmatrix( N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
&h_R[j*lda], lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_zgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_zpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_zpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_zlange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf_mgpu */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_setdevice(0); magma_timestr_t start, end; double flops, gpu_perf, cpu_perf; cuDoubleComplex *h_A, *h_R; cuDoubleComplex *d_lA[4]; magma_int_t N = 0, n2, mb, nb, nk, lda, ldda, n_local, ldn_local; //magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t n_sizes = 10, flag = 0; magma_int_t i, j, k, info, num_gpus0 = 1, num_gpus; const char *uplo = MagmaLowerStr; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm; N = size[n_sizes-1]; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { flag = 1; N = atoi(argv[++i]); size[0] = size[n_sizes-1] = N; } if (strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if (strcmp("-UPLO",argv[i])==0) { if (strcmp("L",argv[++i])==0) uplo = MagmaLowerStr; else uplo = MagmaUpperStr; } } if (strcmp(uplo,MagmaLowerStr)==0) printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", (int) N, (int) num_gpus0 ); else printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", (int) N, (int) num_gpus0 ); } else { printf("\nDefault: \n"); printf(" testing_zpotrf_mgpu -N %d:%d -NGPU %d -UPLO L\n\n", (int) size[0], (int) size[n_sizes-1], (int) num_gpus0 ); } if( N <= 0 || num_gpus0 <= 0 ) { printf( " invalid input N=%d NGPU=%d\n", (int) N, (int) num_gpus0 ); exit(1); } /* looking for max. 
ldda */ ldda = 0; n2 = 0; for(i=0; i<n_sizes; i++){ N = size[i]; nb = magma_get_zpotrf_nb(N); mb = nb; if( num_gpus0 > N/nb ) { num_gpus = N/nb; if( N%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus)) * mb*((N+mb-1)/mb); if( n_local > ldda ) ldda = n_local; if( n2 < N*N ) n2 = N*N; if (flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_HOSTALLOC( h_A, cuDoubleComplex, n2); TESTING_HOSTALLOC( h_R, cuDoubleComplex, n2); /* allocate local matrix on GPU */ for(i=0; i<num_gpus0; i++){ magma_setdevice(i); TESTING_DEVALLOC( d_lA[i], cuDoubleComplex, ldda ); } magma_setdevice(0); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for(i=0; i<n_sizes; i++){ N = size[i]; lda = N; n2 = lda*N; flops = FLOPS( (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_Z_SET2REAL( h_A[i*lda+i], ( MAGMA_Z_REAL(h_A[i*lda+i]) + 1.*N ) ); for(j=0; j<i; j++) h_A[i*lda+j] = cuConj(h_A[j*lda+i]); } } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_zpotrf_nb(N); if( num_gpus0 > N/nb ) { num_gpus = N/nb; if( N%nb != 0 ) num_gpus ++; printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus ); } else { num_gpus = num_gpus0; } /* distribute matrix to gpus */ if( lapackf77_lsame(uplo, "U") ) { /* going through each block-column */ ldda = ((N+mb-1)/mb)*mb; for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zsetmatrix( N, nk, h_A+j*lda, lda, d_lA[k]+j/(nb*num_gpus)*nb*ldda, ldda ); } } else { /* going through each block-row */ ldda = (1+N/(nb*num_gpus))*nb; for(j=0; j<N; 
j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zsetmatrix( nk, N, h_A+j, lda, d_lA[k]+j/(nb*num_gpus)*nb, ldda ); } } magma_setdevice(0); /* call magma_zpotrf_mgpu */ start = get_current_time(); magma_zpotrf_mgpu(num_gpus, uplo[0], N, d_lA, ldda, &info); end = get_current_time(); if (info < 0) { printf("Argument %d of magma_zpotrf_mgpu had an illegal value.\n", (int) -info); break; } else if (info != 0) { printf("magma_zpotrf_mgpu returned info=%d\n", (int) info ); break; } gpu_perf = flops / GetTimerValue(start, end); /* gather matrix from gpus */ if( lapackf77_lsame(uplo, "U") ) { for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zgetmatrix( N, nk, d_lA[k]+j/(nb*num_gpus)*nb*ldda, ldda, h_R+j*lda, lda ); } } else { for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zgetmatrix( nk, N, d_lA[k]+j/(nb*num_gpus)*nb, ldda, h_R+j, lda ); } } magma_setdevice(0); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_zpotrf(uplo, &N, h_A, &lda, &info); end = get_current_time(); if (info < 0) { printf("Argument %d of zpotrf had an illegal value.\n", (int) -info); break; } else if (info != 0) { printf("lapackf77_zpotrf returned info=%d\n", (int) info ); break; } cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ matnorm = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %6.2f %6.2f %e\n", (int) size[i], cpu_perf, gpu_perf, lapackf77_zlange("f", &N, &N, h_R, &lda, work) / matnorm); if (flag != 0) break; } /* Memory clean up */ TESTING_HOSTFREE( h_A ); TESTING_HOSTFREE( h_R ); 
for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_DEVFREE( d_lA[i] ); } /* Shutdown */ TESTING_CUDA_FINALIZE(); }
extern "C" magma_int_t
magma_zlauum(char uplo, magma_int_t n,
             cuDoubleComplex *a, magma_int_t lda, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    ZLAUUM computes the product U * U' or L' * L, where the triangular
    factor U or L is stored in the upper or lower triangular part of
    the array A.

    If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
    overwriting the factor U in A.
    If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
    overwriting the factor L in A.
    This is the blocked form of the algorithm, calling Level 3 BLAS.

    Arguments
    =========
    UPLO    (input) CHARACTER*1
            Specifies whether the triangular factor stored in the array A
            is upper or lower triangular:
            = 'U':  Upper triangular
            = 'L':  Lower triangular

    N       (input) INTEGER
            The order of the triangular factor U or L.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the triangular factor U or L.
            On exit, if UPLO = 'U', the upper triangle of A is
            overwritten with the upper triangle of the product U * U';
            if UPLO = 'L', the lower triangle of A is overwritten with
            the lower triangle of the product L' * L.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -k, the k-th argument had an illegal value.
            MAGMA_ERR_DEVICE_ALLOC is stored when the device workspace
            cannot be allocated.

    The value stored in INFO is also the return value.

    A(i,j) and dA(i,j) are addressing macros defined outside this
    function; presumably they index the host array `a` and the device
    workspace `work` in column-major order -- confirm against the
    file-level definitions.
    ===================================================================== */

    /* Local variables */
    char uplo_[2] = {uplo, 0};
    magma_int_t ldda, nb;
    magma_int_t i, ib;
    cuDoubleComplex c_one = MAGMA_Z_ONE;
    double d_one = MAGMA_D_ONE;
    cuDoubleComplex *work;

    int upper = lapackf77_lsame(uplo_, "U");

    /* Check arguments */
    *info = 0;
    if ((! upper) && (! lapackf77_lsame(uplo_, "L")))
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,n))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return */
    if ( n == 0 )
        return *info;

    nb = magma_get_zpotrf_nb(n);

    /* Unblocked case: handled entirely on the host.  It is decided before
       any device resource is requested, so a small problem cannot fail
       with MAGMA_ERR_DEVICE_ALLOC for memory it never uses. */
    if (nb <= 1 || nb >= n) {
        lapackf77_zlauum(uplo_, &n, a, &lda, info);
        return *info;
    }

    ldda = ((n+31)/32)*32;
    if (MAGMA_SUCCESS != magma_zmalloc( &work, (n)*ldda )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    cudaStream_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );

    if (upper) {
        /* Compute the product U * U'. */
        for (i=0; i<n; i=i+nb) {
            ib = min(nb, n-i);

            /* Diagonal block on stream[1] (needed first), the rest of the
               block row on stream[0]. */
            magma_zsetmatrix_async( ib, ib,
                                    A(i,i),   lda,
                                    dA(i, i), ldda, stream[1] );

            magma_zsetmatrix_async( ib, (n-i-ib),
                                    A(i,i+ib),  lda,
                                    dA(i,i+ib), ldda, stream[0] );

            magma_queue_sync( stream[1] );

            magma_ztrmm( MagmaRight, MagmaUpper,
                         MagmaConjTrans, MagmaNonUnit, i, ib,
                         c_one, dA(i,i), ldda, dA(0, i), ldda );

            /* Diagonal block product on the host, then back to the device. */
            lapackf77_zlauum(MagmaUpperStr, &ib, A(i,i), &lda, info);

            magma_zsetmatrix_async( ib, ib,
                                    A(i, i),  lda,
                                    dA(i, i), ldda, stream[0] );

            if (i+ib < n) {
                magma_zgemm( MagmaNoTrans, MagmaConjTrans,
                             i, ib, (n-i-ib), c_one, dA(0,i+ib),
                             ldda, dA(i, i+ib), ldda, c_one,
                             dA(0,i), ldda );

                /* The herk below updates dA(i,i), so the upload of the
                   host result must have landed first. */
                magma_queue_sync( stream[0] );

                magma_zherk( MagmaUpper, MagmaNoTrans, ib, (n-i-ib),
                             d_one, dA(i, i+ib), ldda,
                             d_one, dA(i, i),    ldda );
            }

            magma_zgetmatrix( i+ib, ib,
                              dA(0, i), ldda,
                              A(0, i),  lda );
        }
    }
    else {
        /* Compute the product L' * L. */
        for (i=0; i<n; i=i+nb) {
            ib = min(nb, n-i);

            /* Diagonal block on stream[1] (needed first), the rest of the
               block column on stream[0]. */
            magma_zsetmatrix_async( ib, ib,
                                    A(i,i),   lda,
                                    dA(i, i), ldda, stream[1] );

            magma_zsetmatrix_async( (n-i-ib), ib,
                                    A(i+ib, i),  lda,
                                    dA(i+ib, i), ldda, stream[0] );

            magma_queue_sync( stream[1] );

            magma_ztrmm( MagmaLeft, MagmaLower,
                         MagmaConjTrans, MagmaNonUnit, ib,
                         i, c_one, dA(i,i), ldda,
                         dA(i, 0), ldda );

            /* Diagonal block product on the host, then back to the device. */
            lapackf77_zlauum(MagmaLowerStr, &ib, A(i,i), &lda, info);

            magma_zsetmatrix_async( ib, ib,
                                    A(i, i),  lda,
                                    dA(i, i), ldda, stream[0] );

            if (i+ib < n) {
                magma_zgemm( MagmaConjTrans, MagmaNoTrans,
                             ib, i, (n-i-ib), c_one, dA( i+ib,i),
                             ldda, dA(i+ib, 0), ldda, c_one,
                             dA(i,0), ldda );

                /* The herk below updates dA(i,i), so the upload of the
                   host result must have landed first. */
                magma_queue_sync( stream[0] );

                magma_zherk( MagmaLower, MagmaConjTrans, ib, (n-i-ib),
                             d_one, dA(i+ib, i), ldda,
                             d_one, dA(i, i),    ldda );
            }

            magma_zgetmatrix( ib, i+ib,
                              dA(i, 0), ldda,
                              A(i, 0),  lda );
        }
    }

    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
    magma_free( work );

    return *info;
}
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX_16 array of pointers on the GPU, dimension (num_gpus) On entry, the Hermitian matrix dA distributed over GPUs (dl_A[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. 
@ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_mgpu_right(magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex **d_lA, magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (num_streams > 1 ? 1+((i)/nb)%(num_streams-1) : 0) magmaDoubleComplex z_one = MAGMA_Z_MAKE( 1.0, 0.0 ); magmaDoubleComplex mz_one = MAGMA_Z_MAKE( -1.0, 0.0 ); double one = 1.0; double m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevj, prevtrsmrows, num_streams = 5; magmaDoubleComplex *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; magmaDoubleComplex *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, ngpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t stream[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_zpotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code. 
magma_zgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_zpotrf( uplo_, &n, panel, &ldpanel, info); magma_zsetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < num_gpus; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / num_gpus) * nb; if (d < (n / nb) % num_gpus) n_local[d] += nb; else if (d == (n / nb) % num_gpus) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_zmalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[d] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < num_streams; j++ ) { magma_queue_create( &stream[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. 
if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (num_gpus == 4) crosspoint = n; else if (num_gpus == 3) crosspoint = n; else if (num_gpus == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = get_time(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_zgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, stream[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = get_time()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % num_gpus; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( stream[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = get_time(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ? 
tmppanel0 : tmppanel1; blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = get_time() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = get_time(); #endif lapackf77_zpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = get_time() - tcchol; ttot_cchol += tcchol; tctrsm = get_time(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_ztrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = get_time() - tctrsm; ttot_ctrsm += tctrsm; tctm = get_time(); #endif d = (id + 1) % num_gpus; // send current panel to gpus for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_zsetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, stream[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % num_gpus; for (ngpu = 0; ngpu < num_gpus; ngpu++, d = (d + 1) % num_gpus ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } #if defined (ENABLE_TIMER) tctm = get_time() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % num_gpus; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * num_gpus) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = get_time(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } 
magmablasSetKernelStream(stream[d][STREAM_ID(j_local2)]); #define ZHERK_ON_DIAG #ifdef ZHERK_ON_DIAG magma_zherk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_zgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = get_time() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( stream[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_zgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, stream[d][0] ); tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < num_gpus; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) therk[d] = get_time(); #endif //magmablasSetKernelStream(stream[d]); //magma_zherk(MagmaLower, MagmaNoTrans, n - offset, nb, // m_one, dlpanel, ldda, // one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef ZHERK_ON_DIAG magma_zherk_mgpu #else magma_zherk_mgpu2 #endif (num_gpus, MagmaLower, MagmaNoTrans, nb, n - offset, nb, m_one, dlpanels, ldda, 0, one, d_lA, ldda, offset, num_streams, stream ); #if defined (ENABLE_TIMER) for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = get_time() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; prevj = j; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d 
%10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % num_gpus, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < num_gpus; d++ ) { magma_setdevice(d); for( id=0; id < num_streams; id++ ) { magma_queue_sync( stream[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % num_gpus; // Set the local index where the current panel is j_local = j / (nb * num_gpus) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = get_time(); #endif magma_zgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); 
lapackf77_zpotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info); magma_zsetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = get_time() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < num_gpus; d++ ) { magma_setdevice(d); for( j=0; j < num_streams; j++ ) { magma_queue_destroy( stream[d][j] ); } magma_free( d_lP[d] ); } } // end of not lapack // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_zpotrf_mgpu_right */
extern "C" magma_int_t
magma_ztrtri_gpu(
    magma_uplo_t uplo, magma_diag_t diag, magma_int_t n,
    magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
    magma_queue_t queues[2],
    magma_int_t *info)
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    ZTRTRI computes the inverse of a complex upper or lower triangular
    matrix dA that resides on the GPU.

    This is the Level 3 BLAS version of the algorithm. Off-diagonal blocks
    are updated on the GPU (ztrmm + ztrsm); each diagonal block is copied
    to the host, inverted with LAPACK ztrtri, and copied back.

    Arguments
    =========
    UPLO    (input) magma_uplo_t
            = MagmaUpper:  A is upper triangular;
            = MagmaLower:  A is lower triangular.

    DIAG    (input) magma_diag_t
            = MagmaNonUnit:  A is non-unit triangular;
            = MagmaUnit:     A is unit triangular.

    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    dA      (input/output) COMPLEX_16 array ON THE GPU, dimension (LDDA,N)
            On entry, the triangular matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of the array dA contains
            the upper triangular matrix, and the strictly lower
            triangular part of A is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of the array dA
            contains the lower triangular matrix, and the strictly upper
            triangular part of A is not referenced.  If DIAG = MagmaUnit,
            the diagonal elements of A are also not referenced and are
            assumed to be 1.
            On exit, the (triangular) inverse of the original matrix, in
            the same storage format.

    DA_OFFSET (input) size_t
            Offset of the matrix inside the buffer dA; it is forwarded
            unchanged to the transfer routines.
            NOTE(review): presumably counted in elements -- confirm
            against the dA(i,j) macro, which is defined outside this
            function.

    LDDA    (input) INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).

    QUEUES  (input) array of 2 magma_queue_t
            queues[0] carries the GPU BLAS calls and the uploads of the
            inverted diagonal blocks; queues[1] carries the downloads of
            the diagonal blocks. The queues are owned by the caller and
            are not destroyed here.

    INFO    (output) INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument (LAPACK numbering:
                 1 = uplo, 2 = diag, 3 = n, 5 = ldda) had an illegal value;
                 MAGMA_ERR_HOST_ALLOC if the host workspace could not be
                 allocated.
            > 0: if INFO = i, dA(i,i) is exactly zero.  The triangular
                 matrix is singular and its inverse cannot be computed.
                 There is no up-front singularity check (that would need a
                 GPU kernel); the value comes from LAPACK ztrtri on the
                 diagonal blocks, so other blocks may already have been
                 overwritten when it is reported.
    ===================================================================== */

    /* Local variables */
    magma_int_t nb, nn, j, jb, iinfo, lwork;
    magmaDoubleComplex c_one      = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one  = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *work;

    int upper  = (uplo == MagmaUpper);
    int nounit = (diag == MagmaNonUnit);
    int unblocked;

    *info = 0;

    if (! upper && uplo != MagmaLower)
        *info = -1;
    else if (! nounit && diag != MagmaUnit)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible. Without it the unblocked path would hand
       LAPACK a leading dimension of 0, which LAPACK rejects. */
    if (n == 0)
        return *info;

    /* Determine the block size for this environment */
    nb = magma_get_zpotrf_nb(n);
    unblocked = (nb <= 1 || nb >= n);

    /* The unblocked path stages the whole n-by-n matrix on the host, the
       blocked path only one nb-by-nb diagonal block at a time. */
    lwork = unblocked ? n*n : nb*nb;
    if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork )) {
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    if (unblocked) {
        magma_zgetmatrix( n, n, dA, dA_offset, ldda, work, n, queues[0] );
        lapackf77_ztrtri( lapack_const(uplo), lapack_const(diag), &n, work, &n, info );
        magma_zsetmatrix( n, n, work, n, dA, dA_offset, ldda, queues[0] );
    }
    else {
        if (upper) {
            /* Compute inverse of upper triangular matrix */
            for (j=0; j < n; j += nb) {
                jb = min(nb, (n-j));

                /* Compute rows 1:j-1 of current block column.
                   DIAG is passed through, as in reference LAPACK, so that
                   unit-triangular input ignores the stored diagonal. */
                magma_ztrmm( MagmaLeft, MagmaUpper,
                             MagmaNoTrans, diag, j, jb,
                             c_one, dA(0,0), ldda, dA(0, j), ldda, queues[0] );

                magma_ztrsm( MagmaRight, MagmaUpper,
                             MagmaNoTrans, diag, j, jb,
                             c_neg_one, dA(j,j), ldda, dA(0, j), ldda, queues[0] );

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work, jb, queues[1], NULL );
                magma_queue_sync( queues[1] );

                /* Compute inverse of current diagonal block */
                iinfo = 0;
                lapackf77_ztrtri( MagmaUpperStr, lapack_const(diag), &jb, work, &jb, &iinfo );
                /* Keep the smallest singular index, relative to the full matrix. */
                if (iinfo > 0 && (*info == 0 || j + iinfo < *info))
                    *info = j + iinfo;

                magma_zsetmatrix( jb, jb,
                                  work, jb,
                                  dA(j, j), ldda, queues[0] );
            }
        }
        else {
            /* Compute inverse of lower triangular matrix */
            nn = ((n-1)/nb)*nb+1;

            for (j=nn-1; j >= 0; j -= nb) {
                jb = min(nb,(n-j));

                if ((j+jb) < n) {
                    /* Compute rows j+jb:n of current block column */
                    magma_ztrmm( MagmaLeft, MagmaLower,
                                 MagmaNoTrans, diag, (n-j-jb), jb,
                                 c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda, queues[0] );

                    magma_ztrsm( MagmaRight, MagmaLower,
                                 MagmaNoTrans, diag, (n-j-jb), jb,
                                 c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda, queues[0] );
                }

                magma_zgetmatrix_async( jb, jb,
                                        dA(j, j), ldda,
                                        work, jb, queues[1], NULL );
                magma_queue_sync( queues[1] );

                /* Compute inverse of current diagonal block */
                iinfo = 0;
                lapackf77_ztrtri( MagmaLowerStr, lapack_const(diag), &jb, work, &jb, &iinfo );
                /* Keep the smallest singular index, relative to the full matrix. */
                if (iinfo > 0 && (*info == 0 || j + iinfo < *info))
                    *info = j + iinfo;

                magma_zsetmatrix( jb, jb,
                                  work, jb,
                                  dA(j, j), ldda, queues[0] );
            }
        }
    }

    magma_free_cpu( work );

    return *info;
}