void GradDpElement :: computeStiffnessMatrix_ku(FloatMatrix &answer, MatResponseMode rMode, TimeStep *tStep) { double dV; NLStructuralElement *elem = this->giveNLStructuralElement(); FloatArray Nk; FloatMatrix B, DkuB, Dku; StructuralCrossSection *cs = elem->giveStructuralCrossSection(); answer.clear(); int nlGeo = elem->giveGeometryMode(); for ( auto &gp: *elem->giveIntegrationRule(0) ) { GradDpMaterialExtensionInterface *dpmat = dynamic_cast< GradDpMaterialExtensionInterface * >( cs->giveMaterialInterface(GradDpMaterialExtensionInterfaceType, gp) ); if ( !dpmat ) { OOFEM_ERROR("Material doesn't implement the required DpGrad interface!"); } elem->computeBmatrixAt(gp, B); if ( nlGeo == 1 ) { if ( elem->domain->giveEngngModel()->giveFormulation() == AL ) { elem->computeBmatrixAt(gp, B); } else { elem->computeBHmatrixAt(gp, B); } } dpmat->givePDGradMatrix_ku(Dku, rMode, gp, tStep); this->computeNkappaMatrixAt(gp, Nk); dV = elem->computeVolumeAround(gp); DkuB.beProductOf(Dku, B); answer.plusProductUnsym(Nk, DkuB, -dV); if ( dpmat->giveAveragingType() == 2 ) { double dl1, dl2, dl3; FloatArray Gk; FloatMatrix D, DB, LDB; FloatMatrix Bk, BktM22, BktM22Gk, BktM12, BktM12Gk, M22(2, 2), M12(2, 2); FloatMatrix dL1(1, 3), dL2(1, 3), result1, result2, dLdS, n(2, 2); this->computeBkappaMatrixAt(gp, Bk); dpmat->givePDGradMatrix_uu(D, rMode, gp, tStep); dpmat->givePDGradMatrix_LD(dLdS, rMode, gp, tStep); this->computeNonlocalGradient(Gk, gp, tStep); dl1 = dLdS.at(3, 3); dl2 = dLdS.at(4, 4); dl3 = dLdS.at(5, 5); n.at(1, 1) = dLdS.at(1, 1); n.at(1, 2) = dLdS.at(1, 2); n.at(2, 1) = dLdS.at(2, 1); n.at(2, 2) = dLdS.at(2, 2); // first term Bk^T M22 G L1 D B // M22 = n2 \otimes n2 M22.at(1, 1) = n.at(1, 2) * n.at(1, 2); M22.at(1, 2) = n.at(1, 2) * n.at(2, 2); M22.at(2, 1) = n.at(2, 2) * n.at(1, 2); M22.at(2, 2) = n.at(2, 2) * n.at(2, 2); // dL1 dL1.at(1, 1) = dl1 * n.at(1, 1) * n.at(1, 1) + dl2 *n.at(1, 2) * n.at(1, 2); dL1.at(1, 2) = dl1 * n.at(2, 1) * n.at(2, 1) + dl2 *n.at(2, 2) * n.at(2, 2); dL1.at(1, 3) = dl1 * n.at(1, 1) * n.at(2, 1) + dl2 *n.at(1, 2) * n.at(2, 2); DB.beProductOf(D, B); LDB.beProductOf(dL1, DB); BktM22.beTProductOf(Bk, M22); ///@todo This can't possibly work if this is uncommented (!) / Mikael //BktM22Gk.beProductOf(BktM22,Gk); result1.beProductOf(BktM22Gk, LDB); answer.add(dV, result1); // This would be slightly shorter and faster; //GkLDB.beProductOf(Gk, LDB); //MGkLDB.beProductOf(M22, GkLDB); //answer.plusProductUnsym(Bk, MGkLDB, dV); // M12 + M21 = n1 \otimes n2 + n2 \otimes n1 M12.at(1, 1) = n.at(1, 1) * n.at(1, 2) + n.at(1, 2) * n.at(1, 1); M12.at(1, 2) = n.at(1, 1) * n.at(2, 2) + n.at(1, 2) * n.at(2, 1); M12.at(2, 1) = n.at(2, 1) * n.at(1, 2) + n.at(2, 2) * n.at(1, 1); M12.at(2, 2) = n.at(2, 1) * n.at(2, 2) + n.at(2, 2) * n.at(2, 1); //dL2 dL2.at(1, 1) = dl3 * ( n.at(1, 1) * n.at(1, 2) + n.at(1, 1) * n.at(1, 2) ); dL2.at(1, 2) = dl3 * ( n.at(2, 1) * n.at(2, 2) + n.at(2, 1) * n.at(2, 2) ); dL2.at(1, 3) = dl3 * ( n.at(1, 2) * n.at(2, 1) + n.at(1, 1) * n.at(2, 2) ); LDB.beProductOf(dL2, DB); BktM12.beTProductOf(Bk, M12); ///@todo This can't possibly work if this is uncommented (!) / Mikael //BktM12Gk.beProductOf(BktM12,Gk); result2.beProductOf(BktM12Gk, LDB); answer.add(dV, result2); // This would be slightly shorter and faster; //GkLDB.beProductOf(Gk, LDB); //MGkLDB.beProductOf(M12, GkLDB); //answer.plusProductUnsym(Bk, MGkLDB, dV); } } }
/** Purpose ------- CGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] hA COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA COMPLEX array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL COMPLEX array, dimension(LDHL, min(M,N)), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] ipiv INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. @param[out] dWORK COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl, magma_int_t *ipiv, magmaFloatComplex *dwork, magma_int_t lddwork, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib) #define hA(i,j) (hA + (i)*ib + (j)*ib*ldha) #define hL(j) (hL + (j)*ib*ldhl ) #define hL2(j) (hL2 + (j)*ib*ldhl ) #define dL(j) (dL + (j)*ib*lddl ) #define dL2(j) (dL2 + (j)*ib*lddl ) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo; magma_int_t maxm, mindim; magma_int_t i, rows, cols, s, ii, sb; magmaFloatComplex *dAT; #ifndef WITHOUTTRTRI magmaFloatComplex *dL2 = dL + ib; magmaFloatComplex *hL2 = hL + ib; #endif /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, hA, &ldha, ipiv, info); #ifndef WITHOUTTRTRI CORE_clacpy(PlasmaUpperLower, mindim, mindim, (PLASMA_Complex32_t*)hA, ldha, (PLASMA_Complex32_t*)hL2, ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex32_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_csetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl ); #endif if ( order == MagmaRowMajor ) { magma_csetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ctranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_csetmatrix( m, n, hA, ldha, dA, ldda ); } } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( order == MagmaColMajor ) { magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for( i=0; i < s; i++ ) { ii = i * ib; sb = min(ib, mindim-ii); cols = maxm - ii; if ( i > 0 ) { // download i-th panel magmablas_ctranspose( sb, m, AT(0,i), ldda, dwork, maxm ); magma_cgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n - (ii+sb), ib, c_one, dL2(i-1), lddl, AT(i-1,i+1), ldda ); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (ii+sb), ib, c_one, AT(i-1,i-1), ldda, AT(i-1,i+1), ldda ); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m-ii, ib, c_neg_one, AT(i-1,i+1), ldda, AT(i, i-1), ldda, c_one, AT(i, i+1), ldda ); } // do the cpu part rows = m - ii; lapackf77_cgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; { int j; int fin = ii + sb; for (j=ii; j < fin; j++) { ipiv[j] = ii + ipiv[j]; } } magmablas_claswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI CORE_clacpy(PlasmaLower, sb, sb, (PLASMA_Complex32_t*)hA(i, i), ldha, (PLASMA_Complex32_t*)hL2(i), ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex32_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_csetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl ); #endif // upload i-th panel magma_csetmatrix( rows, sb, hA(i, i), ldha, dwork, cols ); magmablas_ctranspose( rows, sb, dwork, cols, AT(i,i), ldda ); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, sb, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, sb, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } else { /* Update of the last panel */ #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-mindim, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif /* m-(ii+sb) should be always 0 */ magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
void GradDpElement :: computeStiffnessMatrix(FloatMatrix &answer, MatResponseMode rMode, TimeStep *tStep) { //set displacement and nonlocal location array this->setDisplacementLocationArray(); this->setNonlocalLocationArray(); NLStructuralElement *elem = this->giveNLStructuralElement(); StructuralCrossSection *cs = elem->giveStructuralCrossSection(); FloatMatrix B, D, DB; FloatMatrix DkuB, Dku; FloatArray Nk; FloatMatrix SNk, gPSigma; FloatMatrix lStiff; FloatMatrix Bk, LBk; FloatMatrix answer_uu, answer_ku, answer_uk, answer_kk; int nlGeo = elem->giveGeometryMode(); bool matStiffSymmFlag = elem->giveCrossSection()->isCharacteristicMtrxSymmetric(rMode); for ( auto &gp : *elem->giveIntegrationRule(0) ) { GradDpMaterialExtensionInterface *dpmat = dynamic_cast< GradDpMaterialExtensionInterface * >( cs->giveMaterialInterface(GradDpMaterialExtensionInterfaceType, gp) ); if ( !dpmat ) { OOFEM_ERROR("Material doesn't implement the required DpGrad interface!"); } double dV = elem->computeVolumeAround(gp); if ( nlGeo == 0 ) { elem->computeBmatrixAt(gp, B); } else if ( nlGeo == 1 ) { if ( elem->domain->giveEngngModel()->giveFormulation() == AL ) { elem->computeBmatrixAt(gp, B); } else { elem->computeBHmatrixAt(gp, B); } } this->computeNkappaMatrixAt(gp, Nk); this->computeBkappaMatrixAt(gp, Bk); dpmat->givePDGradMatrix_uu(D, rMode, gp, tStep); dpmat->givePDGradMatrix_ku(Dku, rMode, gp, tStep); dpmat->givePDGradMatrix_uk(gPSigma, rMode, gp, tStep); dpmat->givePDGradMatrix_kk(lStiff, rMode, gp, tStep); /////////////////////////////////////////////////////////////////// uu: DB.beProductOf(D, B); if ( matStiffSymmFlag ) { answer_uu.plusProductSymmUpper(B, DB, dV); } else { answer_uu.plusProductUnsym(B, DB, dV); } //////////////////////////////////////////////////////////////////////// ku: DkuB.beProductOf(Dku, B); answer_ku.plusProductUnsym(Nk, DkuB, -dV); if ( dpmat->giveAveragingType() == 2 ) { double dl1, dl2, dl3; FloatMatrix LDB; FloatMatrix GkLDB, MGkLDB; FloatMatrix M22, M12; FloatMatrix dL1(1, 3), dL2(1, 3), dLdS; FloatArray Gk, n1, n2; dpmat->givePDGradMatrix_LD(dLdS, rMode, gp, tStep); this->computeNonlocalGradient(Gk, gp, tStep); dl1 = dLdS.at(3, 3); dl2 = dLdS.at(4, 4); dl3 = dLdS.at(5, 5); n1 = {dLdS.at(1, 1), dLdS.at(2, 1)}; n2 = {dLdS.at(1, 2), dLdS.at(2, 2)}; // first term Bk^T M22 G L1 D B // M22 = n2 \otimes n2 M22.plusDyadUnsym(n2, n2, 1.); // dL1 dL1.at(1, 1) = dl1 * n1.at(1) * n1.at(1) + dl2 * n2.at(1) * n2.at(1); dL1.at(1, 2) = dl1 * n1.at(2) * n1.at(2) + dl2 * n2.at(2) * n2.at(2); dL1.at(1, 3) = dl1 * n1.at(1) * n1.at(2) + dl2 * n2.at(1) * n2.at(2); LDB.beProductOf(dL1, DB); GkLDB.beProductOf(Gk, LDB); MGkLDB.beProductOf(M22, GkLDB); answer.plusProductUnsym(Bk, MGkLDB, dV); // M12 + M21 = n1 \otimes n2 + n2 \otimes n1 M12.plusDyadUnsym(n1, n2, 1.); M12.plusDyadUnsym(n2, n1, 1.); //dL2 dL2.at(1, 1) = dl3 * ( n1.at(1) * n2.at(1) + n1.at(1) * n2.at(1) ); dL2.at(1, 2) = dl3 * ( n1.at(2) * n2.at(2) + n1.at(2) * n2.at(2) ); dL2.at(1, 3) = dl3 * ( n1.at(2) * n2.at(1) + n1.at(1) * n2.at(2) ); // Bk * ((M12 * L2 + M22 * L1) * DB) LDB.beProductOf(dL2, DB); GkLDB.beProductOf(Gk, LDB); MGkLDB.beProductOf(M12, GkLDB); answer.plusProductUnsym(Bk, MGkLDB, dV); } //////////////////////////////////////////////////////////////////////// uk: SNk.beProductOf(gPSigma, Nk); answer_uk.plusProductUnsym(B, SNk, -dV); // uk /////////////////////////////////////////////////////////////////////// kk: answer_kk.plusProductUnsym(Nk, Nk, dV); if ( dpmat->giveAveragingType() == 0 || dpmat->giveAveragingType() == 1 ) { double l = lStiff.at(1, 1); answer_kk.plusProductUnsym(Bk, Bk, l * l * dV); } else if ( dpmat->giveAveragingType() == 2 ) { LBk.beProductOf(lStiff, Bk); answer_kk.plusProductUnsym(Bk, LBk, dV); } } if ( elem->domain->giveEngngModel()->giveFormulation() == AL ) { FloatMatrix initialStressMatrix; elem->computeInitialStressMatrix(initialStressMatrix, tStep); answer_uu.add(initialStressMatrix); } if ( matStiffSymmFlag ) { answer_uu.symmetrized(); } answer.resize(totalSize, totalSize); answer.zero(); answer.assemble(answer_uu, locU); answer.assemble(answer_uk, locU, locK); answer.assemble(answer_ku, locK, locU); answer.assemble(answer_kk,locK); }
extern "C" magma_int_t magma_sgetrf_incpiv_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, float *hA, magma_int_t ldha, float *dA, magma_int_t ldda, float *hL, magma_int_t ldhl, float *dL, magma_int_t lddl, magma_int_t *ipiv, float *dwork, magma_int_t lddwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. hA (input,output) DOUBLE COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. LDHA (input) INTEGER The leading dimension of the array hA. LDHA >= max(1,M). dA (input,output) DOUBLE COMPLEX array, dimension(LDDA, N) , on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). hL (output) DOUBLE COMPLEX array, dimension(LDHL, min(M,N)), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. LDHL (input) INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). dL (output) DOUBLE COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. LDDL (input) INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). IPIV (output) INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. dWORK (output) DOUBLE COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. LDDWORK (input) INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). INFO (output) INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib) #define hA(i,j) (hA + (i)*ib + (j)*ib*ldha) #define hL(j) (hL + (j)*ib*ldhl ) #define hL2(j) (hL2 + (j)*ib*ldhl ) #define dL(j) (dL + (j)*ib*lddl ) #define dL2(j) (dL2 + (j)*ib*lddl ) float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t iinfo; magma_int_t maxm, mindim; magma_int_t i, rows, cols, s, ii, sb; float *dAT; #ifndef WITHOUTTRTRI float *dL2 = dL + ib; float *hL2 = hL + ib; #endif /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ lapackf77_sgetrf(&m, &n, hA, &ldha, ipiv, info); #ifndef WITHOUTTRTRI CORE_slacpy(PlasmaUpperLower, mindim, mindim, (float*)hA, ldha, (float*)hL2, ldhl ); CORE_strtri( PlasmaLower, PlasmaUnit, mindim, (float*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_ssetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl ); #endif if ( (storev == 'R') || (storev == 'r') ) { magma_ssetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_stranspose( dA, ldda, dwork, lddwork, m, n ); } else { magma_ssetmatrix( m, n, hA, ldha, dA, ldda ); } } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( (storev == 'C') || (storev == 'c') ) { magmablas_sgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for( i=0; i<s; i++ ) { ii = i * ib; sb = min(ib, mindim-ii); cols = maxm - ii; if ( i>0 ){ // download i-th panel magmablas_stranspose( dwork, maxm, AT(0, i), ldda, sb, m ); magma_sgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n - (ii+sb), ib, c_one, dL2(i-1), lddl, AT(i-1,i+1), ldda ); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (ii+sb), ib, c_one, AT(i-1,i-1), ldda, AT(i-1,i+1), ldda ); #endif magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m-ii, ib, c_neg_one, AT(i-1,i+1), ldda, AT(i, i-1), ldda, c_one, AT(i, i+1), ldda ); } // do the cpu part rows = m - ii; lapackf77_sgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; { int j; int fin = ii + sb; for(j=ii ; j <fin; j++) { ipiv[j] = ii + ipiv[j]; } } magmablas_slaswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI CORE_slacpy(PlasmaLower, sb, sb, (float*)hA(i, i), ldha, (float*)hL2(i), ldhl ); CORE_strtri( PlasmaLower, PlasmaUnit, sb, (float*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_ssetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl ); #endif // upload i-th panel magma_ssetmatrix( rows, sb, hA(i, i), ldha, dwork, cols ); magmablas_stranspose( AT(i,i), ldda, dwork, cols, rows, sb); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, sb, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif magma_sgemm( MagmaNoTrans, MagmaNoTrans, sb, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } else { /* Update of the last panel */ #ifndef WITHOUTTRTRI magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-mindim, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif /* m-(ii+sb) should be always 0 */ magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_sgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }