Real SolidWallBC::computeQpOffDiagJacobian(unsigned jvar) { RealVectorValue hU(0., 0., 0.); if (jvar == _hu_var) { switch (_equ_type) { case y_mom: return _eos.dp_dhu(_h[_qp], hU)*_normals[_qp](1)*_test[_i][_qp]; break; default: return 0.; break; } } else if (jvar == _hv_var) { switch (_equ_type) { case x_mom: return _eos.dp_dhv(_h[_qp], hU)*_normals[_qp](0)*_test[_i][_qp]; break; default: return 0.; break; } } else return 0.; }
Real SaintVenantSetWaterVelocity::computeQpJacobian() { RealVectorValue hU(_h[_qp]*_u_bc, 0., 0.); Real dpdhu = _eos.dp_dhu(_h[_qp], hU); switch (_equ_type) { case continuity: return _phi[_j][_qp]*_u_bc*_normals[_qp](0)*_test[_i][_qp]; break; case x_mom: return _phi[_j][_qp]*dpdhu*_normals[_qp](0)*_test[_i][_qp]; break; default: mooseError("'" << this->name() << "' Invalid equation name."); } }
Real SaintVenantSetWaterVelocity::computeQpResidual() { RealVectorValue hU(_h[_qp]*_u_bc, 0., 0.); Real p = _eos.pressure(_h[_qp], hU); switch (_equ_type) { case continuity: return _h[_qp]*_u_bc*_normals[_qp](0)*_test[_i][_qp]; break; case x_mom: return (_u_bc*_u_bc*_h[_qp]+p)*_normals[_qp](0)*_test[_i][_qp]; break; default: mooseError("'" << this->name() << "' Invalid equation name."); } }
Real SolidWallBC::computeQpJacobian() { RealVectorValue hU(0., 0., 0.); switch (_equ_type) { case continuity: return 0.; break; case x_mom: return _eos.dp_dhu(_h[_qp], hU)*_normals[_qp](0)*_test[_i][_qp]; break; case y_mom: return _eos.dp_dhv(_h[_qp], hU)*_normals[_qp](1)*_test[_i][_qp]; break; default: mooseError("'" << this->name() << "' Invalid equation name."); } }
Real SolidWallBC::computeQpResidual() { RealVectorValue hU(0., 0., 0.); Real p = _eos.pressure(_h[_qp], hU); switch (_equ_type) { case continuity: return 0.; break; case x_mom: return p*_normals[_qp](0)*_test[_i][_qp]; case y_mom: return p*_normals[_qp](1)*_test[_i][_qp]; break; default: mooseError("'" << this->name() << "' Invalid equation name."); } }
Real SaintVenantSetWaterVelocity::computeQpOffDiagJacobian(unsigned jvar) { RealVectorValue hU(_h[_qp]*_u_bc, 0., 0.); Real dpdu(0.0); if (jvar == _h_var) { switch (_equ_type) { case x_mom: dpdu = _eos.dp_dh(_h[_qp], hU); return _phi[_j][_qp]*dpdu*_normals[_qp](0)*_test[_i][_qp]; break; default: mooseError("'" << this->name() << "' Invalid equation name."); } } else return 0.; }
// static void GSparseMatrix::test() { // Make the data GSparseMatrix sm(4, 4); sm.set(0, 0, 2.0); sm.set(0, 2, 3.0); sm.set(1, 0, 1.0); sm.set(2, 3, -2.0); sm.set(2, 2, 5.0); sm.set(3, 1, -3.0); sm.set(3, 3, -1.0); GData* fm = sm.toFullMatrix(); Holder<GData> hFM(fm); // Do it with the full matrix GData* pU; double* pDiag; GData* pV; fm->singularValueDecomposition(&pU, &pDiag, &pV); Holder<GData> hU(pU); ArrayHolder<double> hDiag(pDiag); Holder<GData> hV(pV); // Do it with the sparse matrix GSparseMatrix* pSU; double* pSDiag; GSparseMatrix* pSV; sm.singularValueDecomposition(&pSU, &pSDiag, &pSV); Holder<GSparseMatrix> hSU(pSU); ArrayHolder<double> hSDiag(pSDiag); Holder<GSparseMatrix> hSV(pSV); // Check the results GData* pV2 = pSV->toFullMatrix(); Holder<GData> hV2(pV2); double err = pV2->sumSquaredDifference(*pV, false); if(err > 1e-6) ThrowError("Failed"); }
void PressureBasedViscosityCoefficient::computeQpProperties() { // Cell size Real h_cell = _current_elem->hmin(); // Speed of sound RealVectorValue hU(_hu[_qp], 0., 0.); Real c2=_eos.c2(_h[_qp], hU); // Compute normalization parameter Real eps = std::sqrt(std::numeric_limits<Real>::min()); Real norm = 0.; switch (_pbv_type) { case JST: norm = std::fabs(_press[_qp]); break; case HMP: norm = h_cell*_press_grad[_qp].size()+eps; break; case ST: norm = 0.5*(h_cell*_press_grad[_qp].size()+std::fabs(_press[_qp])); break; default: mooseError("'" << this->name() << "': invalid viscosity type. The options are: JST, HMP and ST"); break; } // Pre-compute the viscosity coefficient Real kappa = (std::fabs(_norm_vel[_qp])+std::sqrt(c2))*std::fabs(_press_laplace[_qp])/norm; // Set material viscosity coefficient _kappa_max[_qp] = 0.5*h_cell*(hU.size()/_h[_qp]+std::sqrt(c2)); _kappa[_qp] = std::min(_Ce*h_cell*h_cell*h_cell*kappa, _kappa_max[_qp]); }
/** Purpose ------- ZSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] NB INTEGER The blocking size. NB >= 0. @param[in,out] hU COMPLEX_16 array, dimension(LDHU, N), on cpu. On entry, the NB-by-N upper triangular tile hU. On exit, the content is incomplete. Shouldn't be used. @param[in] ldhu INTEGER The leading dimension of the array hU. LDHU >= max(1,NB). @param[in,out] dU COMPLEX_16 array, dimension(LDDU, N), on gpu. On entry, the NB-by-N upper triangular tile dU identical to hU. On exit, the new factor U from the factorization. @param[in] lddu INTEGER The leading dimension of the array dU. LDDU >= max(1,NB). @param[in,out] hA COMPLEX_16 array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA COMPLEX_16 array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factor L from the factorization @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL COMPLEX_16 array, dimension(LDHL, K), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL COMPLEX_16 array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] hWORK COMPLEX_16 array, dimension(LDHWORK, 2*IB), on cpu. Workspace. @param[in] ldhwork INTEGER The leading dimension of the array hWORK. LDHWORK >= max(NB, 1). @param[out] dWORK COMPLEX_16 array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by ZTSTRF @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_tile ********************************************************************/ extern "C" magma_int_t magma_ztstrf_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, magmaDoubleComplex *hU, magma_int_t ldhu, magmaDoubleComplex_ptr dU, magma_int_t lddu, magmaDoubleComplex *hA, magma_int_t ldha, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *hL, magma_int_t ldhl, magmaDoubleComplex_ptr dL, magma_int_t lddl, magma_int_t *ipiv, magmaDoubleComplex *hwork, magma_int_t ldhwork, magmaDoubleComplex_ptr dwork, magma_int_t lddwork, magma_int_t *info) { #define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib ) #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib ) #define L(i) (dL + (i)*ib*lddl ) #define L2(i) (dL2 + (i)*ib*lddl ) #define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib ) #define hA(i,j) (hA + (j)*ib*ldha + (i)*ib ) #define hL(i) (hL + (i)*ib*ldhl ) #define hL2(i) (hL2 + (i)*ib*ldhl ) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; int iinfo = 0; int maxm, mindim; int i, j, im, s, ip, ii, sb, p = 1; magmaDoubleComplex_ptr dAT, dUT; magmaDoubleComplex_ptr dAp, dUp; #ifndef WITHOUTTRTRI magmaDoubleComplex_ptr dL2 = dL + ib; magmaDoubleComplex *hL2 = hL + ib; p = 2; #endif /* Check input arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ib < 0) { *info = -3; } else if ((lddu < max(1,m)) && (m > 0)) { *info = -6; } else if ((ldda < max(1,m)) && (m > 0)) { *info = -8; } else if ((lddl < max(1,ib)) && (ib > 0)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* quick return */ if ((m == 0) || (n == 0) || (ib == 0)) return *info; ip = 0; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ CORE_ztstrf(m, n, ib, nb, (PLASMA_Complex64_t*)hU, ldhu, (PLASMA_Complex64_t*)hA, ldha, (PLASMA_Complex64_t*)hL, ldhl, ipiv, (PLASMA_Complex64_t*)hwork, ldhwork, info); #ifndef WITHOUTTRTRI CORE_zlacpy( PlasmaUpperLower, mindim, mindim, (PLASMA_Complex64_t*)hL, ldhl, (PLASMA_Complex64_t*)hL2, ldhl ); CORE_ztrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex64_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif if ( order == MagmaRowMajor ) { magma_zsetmatrix( m, n, hU, ldhu, dwork, lddwork ); magmablas_ztranspose( m, n, dwork, lddwork, dU, lddu ); magma_zsetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ztranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_zsetmatrix( m, n, hU, ldhu, dU, lddu ); magma_zsetmatrix( m, n, hA, ldha, dA, ldda ); } magma_zsetmatrix( p*ib, n, hL, ldhl, dL, lddl ); } else { /* Use hybrid blocked code. */ maxm = magma_roundup( m, 32 ); if ( order == MagmaColMajor ) { magmablas_zgetmo_in( dU, dUT, lddu, m, n ); magmablas_zgetmo_in( dA, dAT, ldda, m, n ); } else { dUT = dU; dAT = dA; } dAp = dwork; dUp = dAp + ib*lddwork; ip = 0; for( i=0; i < s; i++ ) { ii = i * ib; sb = min(mindim-ii, ib); if ( i > 0 ) { // download i-th panel magmablas_ztranspose( sb, ii, UT(0,i), lddu, dUp, lddu ); magmablas_ztranspose( sb, m, AT(0,i), ldda, dAp, ldda ); magma_zgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu ); magma_zgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L2(i-1), lddl, UT(i-1, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L(i-1), lddl, UT(i-1, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m, ib, c_neg_one, UT(i-1, i+1), lddu, AT(0, i-1), ldda, c_one, AT(0, i+1), ldda ); } // do the cpu part CORE_ztstrf(m, sb, ib, nb, (PLASMA_Complex64_t*)hU(i, i), ldhu, (PLASMA_Complex64_t*)hA(0, i), ldha, (PLASMA_Complex64_t*)hL(i), ldhl, ipiv+ii, (PLASMA_Complex64_t*)hwork, ldhwork, info); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; // Need to swap betw U and A #ifndef NOSWAPBLK magmablas_zswapblk( MagmaRowMajor, n-(ii+sb), UT(i, i+1), lddu, AT(0, i+1), ldda, 1, sb, ipiv+ii, 1, nb ); for (j=0; j < ib; j++) { im = ipiv[ip]-1; if ( im == j ) { ipiv[ip] += ii; } ip++; } #else for (j=0; j < ib; j++) { im = ipiv[ip]-1; if ( im != (j) ) { im = im - nb; assert( (im >= 0) && (im < m) ); magmablas_zswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 ); } else { ipiv[ip] += ii; } ip++; } #endif #ifndef WITHOUTTRTRI CORE_zlacpy( PlasmaUpperLower, sb, sb, (PLASMA_Complex64_t*)hL(i), ldhl, (PLASMA_Complex64_t*)hL2(i), ldhl ); CORE_ztrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex64_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif // upload i-th panel magma_zsetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu ); magma_zsetmatrix( m, sb, hA(0, i), ldha, dAp, ldda ); magma_zsetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl ); magmablas_ztranspose( sb, sb, dUp, lddu, UT(i,i), lddu ); magmablas_ztranspose( m, sb, dAp, ldda, AT(0,i), ldda ); // make sure that gpu queue is empty //magma_device_sync(); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, sb, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } else { #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_zgetmo_out( dU, dUT, lddu, m, n ); magmablas_zgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
extern "C" magma_int_t magma_ctstrf_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, magmaFloatComplex *hU, magma_int_t ldhu, magmaFloatComplex *dU, magma_int_t lddu, magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl, magma_int_t *ipiv, magmaFloatComplex *hwork, magma_int_t ldhwork, magmaFloatComplex *dwork, magma_int_t lddwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. NB (input) INTEGER The blocking size. NB >= 0. hU (input,output) COMPLEX array, dimension(LDHU, N), on cpu. On entry, the NB-by-N upper triangular tile hU. On exit, the content is incomplete. Shouldn't be used. LDHU (input) INTEGER The leading dimension of the array hU. LDHU >= max(1,NB). dU (input,output) COMPLEX array, dimension(LDDU, N), on gpu. On entry, the NB-by-N upper triangular tile dU identical to hU. On exit, the new factor U from the factorization. LDDU (input) INTEGER The leading dimension of the array dU. LDDU >= max(1,NB). hA (input,output) COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. LDHA (input) INTEGER The leading dimension of the array hA. LDHA >= max(1,M). dA (input,output) COMPLEX array, dimension(LDDA, N) , on gpu. On entry, the M-by-N tile to be factored. On exit, the factor L from the factorization LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). hL (output) COMPLEX array, dimension(LDHL, K), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. LDHL (input) INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). dL (output) COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. LDDL (input) INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). hWORK (output) COMPLEX array, dimension(LDHWORK, 2*IB), on cpu. Workspace. LDHWORK (input) INTEGER The leading dimension of the array hWORK. LDHWORK >= max(NB, 1). dWORK (output) COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. LDDWORK (input) INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). IPIV (output) INTEGER array on the cpu. The pivot indices array of size K as returned by CTSTRF INFO (output) INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib ) #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib ) #define L(i) (dL + (i)*ib*lddl ) #define L2(i) (dL2 + (i)*ib*lddl ) #define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib ) #define hA(i,j) (hA + (j)*ib*ldha + (i)*ib ) #define hL(i) (hL + (i)*ib*ldhl ) #define hL2(i) (hL2 + (i)*ib*ldhl ) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; int iinfo = 0; int maxm, mindim; int i, j, im, s, ip, ii, sb, p = 1; magmaFloatComplex *dAT, *dUT; magmaFloatComplex *dAp, *dUp; #ifndef WITHOUTTRTRI magmaFloatComplex *dL2 = dL + ib; magmaFloatComplex *hL2 = hL + ib; p = 2; #endif /* Check input arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ib < 0) { *info = -3; } else if ((lddu < max(1,m)) && (m > 0)) { *info = -6; } else if ((ldda < max(1,m)) && (m > 0)) { *info = -8; } else if ((lddl < max(1,ib)) && (ib > 0)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* quick return */ if ((m == 0) || (n == 0) || (ib == 0)) return *info; ip = 0; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ CORE_ctstrf(m, n, ib, nb, (PLASMA_Complex32_t*)hU, ldhu, (PLASMA_Complex32_t*)hA, ldha, (PLASMA_Complex32_t*)hL, ldhl, ipiv, (PLASMA_Complex32_t*)hwork, ldhwork, info); #ifndef WITHOUTTRTRI CORE_clacpy( PlasmaUpperLower, mindim, mindim, (PLASMA_Complex32_t*)hL, ldhl, (PLASMA_Complex32_t*)hL2, ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex32_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif if ( (storev == 'R') || (storev == 'r') ) { magma_csetmatrix( m, n, hU, ldhu, dwork, lddwork ); magmablas_ctranspose( dU, lddu, dwork, lddwork, m, n ); magma_csetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ctranspose( dA, ldda, dwork, lddwork, m, n ); } else { magma_csetmatrix( m, n, hU, ldhu, dU, lddu ); magma_csetmatrix( m, n, hA, ldha, dA, ldda ); } magma_csetmatrix( p*ib, n, hL, ldhl, dL, lddl ); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_in( dU, dUT, lddu, m, n ); magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dUT = dU; dAT = dA; } dAp = dwork; dUp = dAp + ib*lddwork; ip = 0; for( i=0; i<s; i++ ) { ii = i * ib; sb = min(mindim-ii, ib); if ( i>0 ){ // download i-th panel magmablas_ctranspose( dUp, lddu, UT(0, i), lddu, sb, ii ); magmablas_ctranspose( dAp, ldda, AT(0, i), ldda, sb, m ); magma_cgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu ); magma_cgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L2(i-1), lddl, UT(i-1, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L(i-1), lddl, UT(i-1, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m, ib, c_neg_one, UT(i-1, i+1), lddu, AT(0, i-1), ldda, c_one, AT(0, i+1), ldda ); } // do the cpu part CORE_ctstrf(m, sb, ib, nb, (PLASMA_Complex32_t*)hU(i, i), ldhu, (PLASMA_Complex32_t*)hA(0, i), ldha, (PLASMA_Complex32_t*)hL(i), ldhl, ipiv+ii, (PLASMA_Complex32_t*)hwork, ldhwork, info); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; // Need to swap betw U and A #ifndef NOSWAPBLK magmablas_cswapblk( 'R', n-(ii+sb), UT(i, i+1), lddu, AT(0, i+1), ldda, 1, sb, ipiv+ii, 1, nb ); for(j=0; j<ib; j++) { im = ipiv[ip]-1; if ( im == j ) { ipiv[ip] += ii; } ip++; } #else for(j=0; j<ib; j++) { im = ipiv[ip]-1; if ( im != (j) ) { im = im - nb; assert( (im>=0) && (im<m) ); magmablas_cswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 ); } else { ipiv[ip] += ii; } ip++; } #endif #ifndef WITHOUTTRTRI CORE_clacpy( PlasmaUpperLower, sb, sb, (PLASMA_Complex32_t*)hL(i), ldhl, (PLASMA_Complex32_t*)hL2(i), ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex32_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif // upload i-th panel magma_csetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu ); magma_csetmatrix( m, sb, hA(0, i), ldha, dAp, ldda ); magma_csetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl ); magmablas_ctranspose( UT(i, i), lddu, dUp, lddu, sb, sb); magmablas_ctranspose( AT(0, i), ldda, dAp, ldda, m, sb); // make sure that gpu queue is empty //magma_device_sync(); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, sb, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } else { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_out( dU, dUT, lddu, m, n ); magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
void GSparseMatrix::singularValueDecompositionHelper(GSparseMatrix** ppU, double** ppDiag, GSparseMatrix** ppV, int maxIters) { int m = rows(); int n = cols(); if(m < n) ThrowError("Expected at least as many rows as columns"); int i, j, k; int l = 0; int q, iter; double c, f, h, s, x, y, z; double norm = 0.0; double g = 0.0; double scale = 0.0; GSparseMatrix* pU = new GSparseMatrix(m, m); Holder<GSparseMatrix> hU(pU); pU->copyFrom(this); double* pSigma = new double[n]; ArrayHolder<double> hSigma(pSigma); GSparseMatrix* pV = new GSparseMatrix(n, n); Holder<GSparseMatrix> hV(pV); GTEMPBUF(double, temp, n + m); double* temp2 = temp + n; // Householder reduction to bidiagonal form for(int i = 0; i < n; i++) { // Left-hand reduction temp[i] = scale * g; l = i + 1; g = 0.0; s = 0.0; scale = 0.0; if(i < m) { Iter kend = pU->colEnd(i); for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)i) scale += ABS(kk->second); } if(scale != 0.0) { for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)i) { double t = kk->second / scale; pU->set(kk->first, i, t); s += (t * t); } } f = pU->get(i, i); g = -GSparseMatrix_takeSign(sqrt(s), f); h = f * g - s; pU->set(i, i, f - g); if(i != n - 1) { for(j = l; j < n; j++) { s = 0.0; for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)i) s += kk->second * pU->get(kk->first, j); } f = s / h; for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)i) pU->set(kk->first, j, pU->get(kk->first, j) + f * kk->second); } } } for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)i) pU->set(kk->first, i, pU->get(kk->first, i) * scale); } } } pSigma[i] = scale * g; // Right-hand reduction g = 0.0; s = 0.0; scale = 0.0; if(i < m && i != n - 1) { Iter kend = pU->rowEnd(i); for(Iter kk = pU->rowBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) scale += ABS(kk->second); } if(scale != 0.0) { for(Iter kk = pU->rowBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) { double t = kk->second / scale; pU->set(i, kk->first, t); s += (t * t); } } f = pU->get(i, l); g = -GSparseMatrix_takeSign(sqrt(s), f); h = f * g - s; pU->set(i, l, f - g); for(k = l; k < n; k++) temp[k] = pU->get(i, k) / h; if(i != m - 1) { for(j = l; j < m; j++) { s = 0.0; for(Iter kk = pU->rowBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) s += pU->get(j, kk->first) * kk->second; } Iter kend2 = pU->rowEnd(j); for(Iter kk = pU->rowBegin(j); kk != kend2; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) pU->set(j, kk->first, pU->get(j, kk->first) + s * temp[kk->first]); } } } for(Iter kk = pU->rowBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) pU->set(i, kk->first, kk->second * scale); } } } norm = MAX(norm, ABS(pSigma[i]) + ABS(temp[i])); } // Accumulate right-hand transform for(int i = n - 1; i >= 0; i--) { if(i < n - 1) { if(g != 0.0) { Iter jend = pU->rowEnd(i); for(Iter jj = pU->rowBegin(i); jj != jend; jj++) { if(jj->first >= (unsigned int)n) break; if(jj->first >= (unsigned int)l) pV->set(i, jj->first, (jj->second / pU->get(i, l)) / g); // (double-division to avoid underflow) } for(j = l; j < n; j++) { s = 0.0; Iter kend = pU->rowEnd(i); for(Iter kk = pU->rowBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) s += kk->second * pV->get(j, kk->first); } kend = pV->rowEnd(i); for(Iter kk = pV->rowBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)n) break; if(kk->first >= (unsigned int)l) pV->set(j, kk->first, pV->get(j, kk->first) + s * kk->second); } } } for(j = l; j < n; j++) { pV->set(i, j, 0.0); pV->set(j, i, 0.0); } } pV->set(i, i, 1.0); g = temp[i]; l = i; } // Accumulate left-hand transform for(i = n - 1; i >= 0; i--) { l = i + 1; g = pSigma[i]; if(i < n - 1) { for(j = l; j < n; j++) pU->set(i, j, 0.0); } if(g != 0.0) { g = 1.0 / g; if(i != n - 1) { for(j = l; j < n; j++) { s = 0.0; Iter kend = pU->colEnd(i); for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)l) s += kk->second * pU->get(kk->first, j); } f = (s / pU->get(i, i)) * g; if(f != 0.0) { for(Iter kk = pU->colBegin(i); kk != kend; kk++) { if(kk->first >= (unsigned int)i) pU->set(kk->first, j, pU->get(kk->first, j) + f * kk->second); } } } } for(j = i; j < m; j++) pU->set(j, i, pU->get(j, i) * g); } else { for(j = i; j < m; j++) pU->set(j, i, 0.0); } pU->set(i, i, pU->get(i, i) + 1.0); } // Diagonalize the bidiagonal matrix for(k = n - 1; k >= 0; k--) // For each singular value { for(iter = 1; iter <= maxIters; iter++) { // Test for splitting bool flag = true; for(l = k; l >= 0; l--) { q = l - 1; if(ABS(temp[l]) + norm == norm) { flag = false; break; } if(ABS(pSigma[q]) + norm == norm) break; } if(flag) { c = 0.0; s = 1.0; for(i = l; i <= k; i++) { f = s * temp[i]; temp[i] *= c; if(ABS(f) + norm == norm) break; g = pSigma[i]; h = GSparseMatrix_pythag(f, g); pSigma[i] = h; h = 1.0 / h; c = g * h; s = -f * h; Iter jendi = pU->colEnd(i); Iter jendq = pU->colEnd(q); Iter jji = pU->colBegin(i); Iter jjq = pU->colBegin(q); int tpos; for(tpos = 0; jji != jendi || jjq != jendq; tpos++) { if(jjq == jendq || (jji != jendi && jji->first < jjq->first)) { temp2[tpos] = jji->first; jji++; } else { temp2[tpos] = jjq->first; if(jji != jendi && jjq->first == jji->first) jji++; jjq++; } } for(int tpos2 = 0; tpos2 < tpos; tpos2++) { y = pU->get((unsigned int)temp2[tpos2], q); z = pU->get((unsigned int)temp2[tpos2], i); pU->set((unsigned int)temp2[tpos2], q, y * c + z * s); pU->set((unsigned int)temp2[tpos2], i, z * c - y * s); } } } z = pSigma[k]; if(l == k) { // Detect convergence if(z < 0.0) { // Singular value should be positive pSigma[k] = -z; for(j = 0; j < n; j++) pV->set(k, j, pV->get(k, j) * -1.0); } break; } if(iter >= maxIters) ThrowError("failed to converge"); // Shift from bottom 2x2 minor x = pSigma[l]; q = k - 1; y = pSigma[q]; g = temp[q]; h = temp[k]; f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); g = GSparseMatrix_pythag(f, 1.0); f = ((x - z) * (x + z) + h * ((y / (f + GSparseMatrix_takeSign(g, f))) - h)) / x; // QR transform c = 1.0; s = 1.0; for(j = l; j <= q; j++) { i = j + 1; g = temp[i]; y = pSigma[i]; h = s * g; g = c * g; z = GSparseMatrix_pythag(f, h); temp[j] = z; c = f / z; s = h / z; f = x * c + g * s; g = g * c - x * s; h = y * s; y = y * c; Iter pendi = pV->rowEnd(i); Iter pendj = pV->rowEnd(j); Iter ppi = pV->rowBegin(i); Iter ppj = pV->rowBegin(j); int tpos; for(tpos = 0; ppi != pendi || ppj != pendj; tpos++) { if(ppj == pendj || (ppi != pendi && ppi->first < ppj->first)) { temp2[tpos] = ppi->first; ppi++; } else { temp2[tpos] = ppj->first; if(ppi != pendi && ppj->first == ppi->first) ppi++; ppj++; } } for(int tpos2 = 0; tpos2 < tpos; tpos2++) { x = pV->get(j, (unsigned int)temp2[tpos2]); z = pV->get(i, (unsigned int)temp2[tpos2]); pV->set(j, (unsigned int)temp2[tpos2], x * c + z * s); pV->set(i, (unsigned int)temp2[tpos2], z * c - x * s); } z = GSparseMatrix_pythag(f, h); pSigma[j] = z; if(z != 0.0) { z = 1.0 / z; c = f * z; s = h * z; } f = c * g + s * y; x = c * y - s * g; pendi = pU->colEnd(i); pendj = pU->colEnd(j); ppi = pU->colBegin(i); ppj = pU->colBegin(j); for(tpos = 0; ppi != pendi || ppj != pendj; tpos++) { if(ppj == pendj || (ppi != pendi && ppi->first < ppj->first)) { temp2[tpos] = ppi->first; ppi++; } else { temp2[tpos] = ppj->first; if(ppi != pendi && ppj->first == ppi->first) ppi++; ppj++; } } for(int tpos2 = 0; tpos2 < tpos; tpos2++) { y = pU->get((unsigned int)temp2[tpos2], j); z = pU->get((unsigned int)temp2[tpos2], i); pU->set((unsigned int)temp2[tpos2], j, y * c + z * s); pU->set((unsigned int)temp2[tpos2], i, z * c - y * s); } } temp[l] = 0.0; temp[k] = f; pSigma[k] = x; } } // Sort the singular values from largest to smallest for(i = 1; i < n; i++) { for(j = i; j > 0; j--) { if(pSigma[j - 1] >= pSigma[j]) break; pU->swapColumns(j - 1, j); pV->swapRows(j - 1, j); std::swap(pSigma[j - 1], pSigma[j]); } } // Return results *ppU = hU.release(); *ppDiag = hSigma.release(); *ppV = hV.release(); }