/***************************************************************************//** * **/ void plasma_pdlacpy(plasma_context_t *plasma) { PLASMA_enum uplo; PLASMA_desc A; PLASMA_desc B; PLASMA_sequence *sequence; PLASMA_request *request; int X, Y; int m, n; int next_m; int next_n; int ldam, ldbm; plasma_unpack_args_5(uplo, A, B, sequence, request); if (sequence->status != PLASMA_SUCCESS) return; switch (uplo) { /* * PlasmaUpper */ case PlasmaUpper: m = 0; n = PLASMA_RANK; while (n >= A.nt) { m++; n = n - A.nt + m; } while (m < A.mt) { next_m = m; next_n = n; next_n += PLASMA_SIZE; while (next_n >= A.nt && next_m < A.mt) { next_m++; next_n = next_n - A.nt + next_m; } X = m == A.mt-1 ? A.m-m*A.mb : A.mb; Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); CORE_dlacpy( m == n ? uplo : PlasmaUpperLower, X, Y, A(m, n), ldam, B(m, n), ldbm); n = next_n; m = next_m; } break; /* * PlasmaLower */ case PlasmaLower: n = 0; m = PLASMA_RANK; while (m >= A.mt) { n++; m = m - A.mt + n; } while (n < A.nt) { next_m = m; next_n = n; next_m += PLASMA_SIZE; while (next_m >= A.mt && next_n < A.nt) { next_n++; next_m = next_m - A.mt + next_n; } X = m == A.mt-1 ? A.m-m*A.mb : A.mb; Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); CORE_dlacpy( m == n ? uplo : PlasmaUpperLower, X, Y, A(m, n), ldam, B(m, n), ldbm); n = next_n; m = next_m; } break; /* * PlasmaUpperLower */ case PlasmaUpperLower: default: n = 0; m = PLASMA_RANK; while (m >= A.mt) { n++; m = m - A.mt; } while (n < A.nt) { next_m = m; next_n = n; next_m += PLASMA_SIZE; while (next_m >= A.mt && next_n < A.nt) { next_n++; next_m = next_m - A.mt; } X = m == A.mt-1 ? A.m-m*A.mb : A.mb; Y = n == A.nt-1 ? A.n-n*A.nb : A.nb; ldam = BLKLDD(A, m); ldbm = BLKLDD(B, m); CORE_dlacpy( PlasmaUpperLower, X, Y, A(m, n), ldam, B(m, n), ldbm); n = next_n; m = next_m; } break; } }
/** Purpose ------- DGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] hA DOUBLE_PRECISION array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL DOUBLE_PRECISION array, dimension(LDHL, min(M,N)), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL DOUBLE_PRECISION array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] ipiv INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. @param[out] dWORK DOUBLE_PRECISION array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, double *hA, magma_int_t ldha, double *dA, magma_int_t ldda, double *hL, magma_int_t ldhl, double *dL, magma_int_t lddl, magma_int_t *ipiv, double *dwork, magma_int_t lddwork, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib) #define hA(i,j) (hA + (i)*ib + (j)*ib*ldha) #define hL(j) (hL + (j)*ib*ldhl ) #define hL2(j) (hL2 + (j)*ib*ldhl ) #define dL(j) (dL + (j)*ib*lddl ) #define dL2(j) (dL2 + (j)*ib*lddl ) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t iinfo; magma_int_t maxm, mindim; magma_int_t i, rows, cols, s, ii, sb; double *dAT; #ifndef WITHOUTTRTRI double *dL2 = dL + ib; double *hL2 = hL + ib; #endif /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ lapackf77_dgetrf(&m, &n, hA, &ldha, ipiv, info); #ifndef WITHOUTTRTRI CORE_dlacpy(PlasmaUpperLower, mindim, mindim, (double*)hA, ldha, (double*)hL2, ldhl ); CORE_dtrtri( PlasmaLower, PlasmaUnit, mindim, (double*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_dsetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl ); #endif if ( order == MagmaRowMajor ) { magma_dsetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_dtranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_dsetmatrix( m, n, hA, ldha, dA, ldda ); } } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( order == MagmaColMajor ) { magmablas_dgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for( i=0; i < s; i++ ) { ii = i * ib; sb = min(ib, mindim-ii); cols = maxm - ii; if ( i > 0 ) { // download i-th panel magmablas_dtranspose( sb, m, AT(0,i), ldda, dwork, maxm ); magma_dgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n - (ii+sb), ib, c_one, dL2(i-1), lddl, AT(i-1,i+1), ldda ); #else magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (ii+sb), ib, c_one, AT(i-1,i-1), ldda, AT(i-1,i+1), ldda ); #endif magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m-ii, ib, c_neg_one, AT(i-1,i+1), ldda, AT(i, i-1), ldda, c_one, AT(i, i+1), ldda ); } // do the cpu part rows = m - ii; lapackf77_dgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; { int j; int fin = ii + sb; for (j=ii; j < fin; j++) { ipiv[j] = ii + ipiv[j]; } } magmablas_dlaswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI CORE_dlacpy(PlasmaLower, sb, sb, (double*)hA(i, i), ldha, (double*)hL2(i), ldhl ); CORE_dtrtri( PlasmaLower, PlasmaUnit, sb, (double*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_dsetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl ); #endif // upload i-th panel magma_dsetmatrix( rows, sb, hA(i, i), ldha, dwork, cols ); magmablas_dtranspose( rows, sb, dwork, cols, AT(i,i), ldda ); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, sb, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif magma_dgemm( MagmaNoTrans, MagmaNoTrans, sb, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } else { /* Update of the last panel */ #ifndef WITHOUTTRTRI magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-mindim, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif /* m-(ii+sb) should be always 0 */ magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_dgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }