Foam::lduSolverPerformance Foam::gmresSolver::solve
(
    scalarField& x,
    const scalarField& b,
    const direction cmpt
) const
{
    // Prepare solver performance
    lduSolverPerformance solverPerf(typeName, fieldName());

    scalarField wA(x.size());
    scalarField rA(x.size());

    // Calculate initial residual
    matrix_.Amul(wA, x, coupleBouCoeffs_, interfaces_, cmpt);

    // Use rA as scratch space when calculating the normalisation factor
    scalar normFactor = this->normFactor(x, b, wA, rA, cmpt);

    if (lduMatrix::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // Calculate residual
    forAll (rA, i)
    {
        rA[i] = b[i] - wA[i];
    }
typename Foam::BlockSolverPerformance<Type>
Foam::BlockGaussSeidelSolver<Type>::solve
(
    Field<Type>& x,
    const Field<Type>& b
)
{
    // Create local references to avoid the spread this-> ugliness
    const BlockLduMatrix<Type>& matrix = this->matrix_;

    // Prepare solver performance
    BlockSolverPerformance<Type> solverPerf
    (
        typeName,
        this->fieldName()
    );

    scalar norm = this->normFactor(x, b);

    Field<Type> wA(x.size());

    // Calculate residual.  Note: sign of residual swapped for efficiency
    matrix.Amul(wA, x);
    wA -= b;

    solverPerf.initialResidual() = gSum(cmptMag(wA))/norm;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // Check convergence, solve if not converged
    if (!this->stop(solverPerf))
    {
        // Iteration loop
        do
        {
            for (label i = 0; i < nSweeps_; i++)
            {
                gs_.precondition(x, b);

                solverPerf.nIterations()++;
            }

            // Re-calculate residual.  Note: sign of residual swapped
            // for efficiency
            matrix.Amul(wA, x);
            wA -= b;

            solverPerf.finalResidual() = gSum(cmptMag(wA))/norm;

            solverPerf.nIterations()++;
        } while (!this->stop(solverPerf));
    }

    return solverPerf;
}
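The sweep performed by gs_.precondition(x, b) above is defined elsewhere in the library. As a point of reference, here is a minimal dense-matrix sketch of one textbook Gauss-Seidel sweep; it illustrates the operation under the assumption that the library applies the LDU-storage equivalent, and gaussSeidelSweep is a hypothetical helper, not library code.

#include <vector>

// One in-place Gauss-Seidel sweep on a dense system A x = b.
// Entries x[j] with j < i have already been updated this sweep,
// which is exactly what distinguishes Gauss-Seidel from Jacobi.
void gaussSeidelSweep
(
    const std::vector<std::vector<double>>& A,
    std::vector<double>& x,
    const std::vector<double>& b
)
{
    const std::size_t n = x.size();

    for (std::size_t i = 0; i < n; ++i)
    {
        double sum = b[i];

        for (std::size_t j = 0; j < n; ++j)
        {
            if (j != i)
            {
                sum -= A[i][j]*x[j];
            }
        }

        x[i] = sum/A[i][i];   // assumes a non-zero diagonal
    }
}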
typename Foam::BlockSolverPerformance<Type>
Foam::BlockGMRESSolver<Type>::solve
(
    Field<Type>& x,
    const Field<Type>& b
)
{
    // Create local references to avoid the spread this-> ugliness
    const BlockLduMatrix<Type>& matrix = this->matrix_;

    // Prepare solver performance
    BlockSolverPerformance<Type> solverPerf
    (
        typeName,
        this->fieldName()
    );

    scalar norm = this->normFactor(x, b);

    // Multiplication helper
    typename BlockCoeff<Type>::multiply mult;

    Field<Type> wA(x.size());

    // Calculate initial residual
    matrix.Amul(wA, x);

    Field<Type> rA(b - wA);

    solverPerf.initialResidual() = gSum(cmptMag(rA))/norm;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // Check convergence, solve if not converged
    if (!solverPerf.checkConvergence(this->tolerance(), this->relTolerance()))
    {
        // Create the Hessenberg matrix
        scalarSquareMatrix H(nDirs_, 0);

        // Create y and b for Hessenberg matrix
        scalarField yh(nDirs_, 0);
        scalarField bh(nDirs_ + 1, 0);

        // Givens rotation vectors
        scalarField c(nDirs_, 0);
        scalarField s(nDirs_, 0);

        // Allocate Krylov space vectors
        FieldField<Field, Type> V(nDirs_ + 1);

        forAll (V, i)
        {
            V.set(i, new Field<Type>(x.size(), pTraits<Type>::zero));
        }
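The snippet above is cut off before the Arnoldi loop that consumes H, bh, c and s. As a hedged sketch of the standard GMRES practice those arrays are allocated for (not the library's exact loop): each new Hessenberg column is rotated by the stored Givens pairs, a fresh pair annihilates its sub-diagonal entry, and the rotated right-hand side bh carries the current residual norm in its last entry. givensUpdate is a hypothetical helper.

#include <cmath>

// Apply the j previous rotations to Hessenberg column Hj (entries
// Hj[0..j+1]), then form and apply rotation j. Assumes no breakdown,
// i.e. hypot(Hj[j], Hj[j+1]) > 0.
void givensUpdate(double* Hj, double* c, double* s, double* bh, int j)
{
    for (int k = 0; k < j; ++k)
    {
        const double t = c[k]*Hj[k] + s[k]*Hj[k + 1];
        Hj[k + 1] = -s[k]*Hj[k] + c[k]*Hj[k + 1];
        Hj[k] = t;
    }

    const double r = std::hypot(Hj[j], Hj[j + 1]);
    c[j] = Hj[j]/r;
    s[j] = Hj[j + 1]/r;

    Hj[j] = r;
    Hj[j + 1] = 0.0;

    bh[j + 1] = -s[j]*bh[j];       // |bh[j + 1]| is the new residual norm
    bh[j]     =  c[j]*bh[j];
}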
Foam::scalar Foam::lduMatrix::solver::normFactor
(
    const scalarField& x,
    const scalarField& b,
    const direction cmpt
) const
{
    scalarField wA(x.size());
    scalarField tmpField(x.size());

    matrix_.Amul(wA, x, interfaceBouCoeffs_, interfaces_, cmpt);

    return normFactor(x, b, wA, tmpField);
}
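The four-argument normFactor overload delegated to above is not shown in this section. Assuming it matches the definition used by the block variant further down (normFactor = gSum(|A x - A xRef| + |b - A xRef|) + SMALL, with xRef the global average of x), this is a standalone scalar sketch; normFactorSketch and the Amul callback are hypothetical stand-ins for the OpenFOAM machinery.

#include <cmath>
#include <functional>
#include <numeric>
#include <vector>

double normFactorSketch
(
    const std::vector<double>& x,
    const std::vector<double>& b,
    const std::function<void(const std::vector<double>&, std::vector<double>&)>& Amul,
    double small = 1e-20   // stands in for Foam::SMALL; value assumed
)
{
    const std::size_t n = x.size();
    const double xRef = std::accumulate(x.begin(), x.end(), 0.0)/n;

    std::vector<double> Ax(n), AxRef(n);
    Amul(x, Ax);
    Amul(std::vector<double>(n, xRef), AxRef);

    // Scale residuals by the spread of A.x and b about the reference
    // solution, so convergence checks are problem-scale independent.
    double sum = 0.0;
    for (std::size_t i = 0; i < n; ++i)
    {
        sum += std::fabs(Ax[i] - AxRef[i]) + std::fabs(b[i] - AxRef[i]);
    }

    return sum + small;
}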
Foam::coupledSolverPerformance Foam::coupledBicgSolver::solve
(
    FieldField<Field, scalar>& x,
    const FieldField<Field, scalar>& b,
    const direction cmpt
) const
{
    // Prepare solver performance
    coupledSolverPerformance solverPerf(typeName, fieldName());

    FieldField<Field, scalar> wA(x.size());
    FieldField<Field, scalar> rA(x.size());

    forAll (x, rowI)
    {
        wA.set(rowI, new scalarField(x[rowI].size(), 0));
        rA.set(rowI, new scalarField(x[rowI].size(), 0));
    }
Foam::scalar Foam::BlockIterativeSolver<Type>::normFactor
(
    Field<Type>& x,
    const Field<Type>& b
) const
{
    const BlockLduMatrix<Type>& matrix = this->matrix_;

    // Calculate the normalisation factor
    const label nRows = x.size();

    Field<Type> pA(nRows);
    Field<Type> wA(nRows);

    // Calculate reference value of x
    Type xRef = gAverage(x);

    // Calculate A.x
    matrix.Amul(wA, x);

    // Calculate A.xRef, temporarily using pA for storage
    matrix.Amul
    (
        pA,
        Field<Type>(nRows, xRef)
    );

    scalar normFactor = gSum(mag(wA - pA) + mag(b - pA)) + this->small_;

    if (BlockLduMatrix<Type>::debug >= 2)
    {
        Info<< "Iterative solver normalisation factor = "
            << normFactor << endl;
    }

    return normFactor;
}
/**
    Purpose
    -------
    CUNMQL overwrites the general complex M-by-N matrix C with

    @verbatim
                               SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:      Q * C               C * Q
    TRANS = MagmaConjTrans:    Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(k) . . . H(2) H(1)

    as returned by CGEQLF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = MagmaConjTrans:  Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    dA      COMPLEX array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            CGEQLF in the last k columns of its array argument A.
            The diagonal and the lower part are destroyed, the reflectors
            are not modified.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = MagmaLeft;
            LDDA >= max(1,N) if SIDE = MagmaRight.

    @param[in]
    tau     COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGEQLF.

    @param[in,out]
    dC      COMPLEX array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    @param[in]
    wA      (workspace) COMPLEX array, dimension
                                 (LDWA,M) if SIDE = MagmaLeft
                                 (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by CHETRD_GPU.

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            LDWA >= max(1,M) if SIDE = MagmaLeft;
            LDWA >= max(1,N) if SIDE = MagmaRight.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_cgeqlf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cunmql2_gpu(magma_side_t side, magma_trans_t trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  magmaFloatComplex *dA, magma_int_t ldda,
                  magmaFloatComplex *tau,
                  magmaFloatComplex *dC, magma_int_t lddc,
                  magmaFloatComplex *wA, magma_int_t ldwa,
                  magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    /* Allocate work space on the GPU */
    magmaFloatComplex *dwork;
    magma_cmalloc( &dwork, 2*(m + 64)*64 );

    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magmaFloatComplex c_one  = MAGMA_C_ONE;

    magma_int_t i, i__4;
    magmaFloatComplex T[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, step, ib, nb, mi, ni, nq, nw;
    magma_int_t ldwork;
    int left, notran;

    wA -= 1 + ldwa;
    dC -= 1 + lddc;
    --tau;

    *info = 0;
    left   = (side == MagmaLeft);
    notran = (trans == MagmaNoTrans);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = max(1,n);
    } else {
        nq = n;
        nw = max(1,m);
    }
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    // size of the block
    nb = 64;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0) {
        return *info;
    }

    ldwork = nw;

    /* Use hybrid CPU-GPU code */
    if ((left && notran) || (! left && ! notran)) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = (k - 1) / nb * nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
    } else {
        mi = m;
    }

    // set nb-1 sub-diagonals to 0, and diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the lower triangle parts already set to identity.
    magmablas_claset_band( MagmaLower, k, k, nb, c_zero, c_one, dA, ldda );

    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min(nb, k - i + 1);

        /* Form the triangular factor of the block reflector
           H = H(i+ib-1) . . . H(i+1) H(i) */
        i__4 = nq - k + i + ib - 1;
        lapackf77_clarft("Backward", "Columnwise", &i__4, &ib,
                         wA(1,i), &ldwa, &tau[i], T, &ib);

        if (left) {
            /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */
            mi = m - k + i + ib - 1;
        } else {
            /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */
            ni = n - k + i + ib - 1;
        }

        /* Apply H or H'; First copy T to the GPU */
        magma_csetmatrix( ib, ib, T, ib, dwork+i__4*ib, ib );
        magma_clarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise,
                         mi, ni, ib,
                         dA(0,i-1), ldda, dwork+i__4*ib, ib,  // dA using 0-based indices here
                         dC(1,1), lddc,
                         dwork+i__4*ib + ib*ib, ldwork);
    }

    magma_free( dwork );
    return *info;
} /* magma_cunmql */
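A small self-contained illustration of the block-sweep bookkeeping above; the i1/i2/step arithmetic is copied from the routine, everything else is scaffolding. For this QL variant, (left, no-trans) and (right, conj-trans) walk the reflector blocks forward from H(1); the QR routines later in this section use the opposite condition.

#include <algorithm>
#include <cstdio>

int main()
{
    const int k = 150, nb = 64;

    // forward <=> (left && notran) || (!left && !notran) in the QL code
    const bool forward = true;

    int i1, i2, step;
    if (forward) { i1 = 1;                   i2 = k; step =  nb; }
    else         { i1 = ((k - 1)/nb)*nb + 1; i2 = 1; step = -nb; }

    for (int i = i1; (step < 0 ? i >= i2 : i <= i2); i += step)
    {
        const int ib = std::min(nb, k - i + 1);  // last block may be short
        std::printf("apply block of %d reflectors starting at H(%d)\n", ib, i);
    }
    return 0;
}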
Foam::SolverPerformance<Type> Foam::PBiCCCG<Type, DType, LUType>::solve
(
    Field<Type>& psi
) const
{
    word preconditionerName(this->controlDict_.lookup("preconditioner"));

    // --- Setup class containing solver performance data
    SolverPerformance<Type> solverPerf
    (
        preconditionerName + typeName,
        this->fieldName_
    );

    label nCells = psi.size();

    Type* __restrict__ psiPtr = psi.begin();

    Field<Type> pA(nCells);
    Type* __restrict__ pAPtr = pA.begin();

    Field<Type> pT(nCells, Zero);
    Type* __restrict__ pTPtr = pT.begin();

    Field<Type> wA(nCells);
    Type* __restrict__ wAPtr = wA.begin();

    Field<Type> wT(nCells);
    Type* __restrict__ wTPtr = wT.begin();

    scalar wArT = 1e15;   //this->matrix_.great_;
    scalar wArTold = wArT;

    // --- Calculate A.psi and T.psi
    this->matrix_.Amul(wA, psi);
    this->matrix_.Tmul(wT, psi);

    // --- Calculate initial residual and transpose residual fields
    Field<Type> rA(this->matrix_.source() - wA);
    Field<Type> rT(this->matrix_.source() - wT);
    Type* __restrict__ rAPtr = rA.begin();
    Type* __restrict__ rTPtr = rT.begin();

    // --- Calculate normalisation factor
    Type normFactor = this->normFactor(psi, wA, pA);

    if (LduMatrix<Type, DType, LUType>::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = cmptDivide(gSumCmptMag(rA), normFactor);
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if
    (
        this->minIter_ > 0
     || !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
    )
    {
        // --- Select and construct the preconditioner
        autoPtr<typename LduMatrix<Type, DType, LUType>::preconditioner>
        preconPtr = LduMatrix<Type, DType, LUType>::preconditioner::New
        (
            *this,
            this->controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArT
            wArTold = wArT;

            // --- Precondition residuals
            preconPtr->precondition(wA, rA);
            preconPtr->preconditionT(wT, rT);

            // --- Update search directions:
            wArT = gSumProd(wA, rT);

            if (solverPerf.nIterations() == 0)
            {
                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                    pTPtr[cell] = wTPtr[cell];
                }
            }
            else
            {
                scalar beta = wArT/wArTold;

                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + (beta*pAPtr[cell]);
                    pTPtr[cell] = wTPtr[cell] + (beta*pTPtr[cell]);
                }
            }

            // --- Update preconditioned residuals
            this->matrix_.Amul(wA, pA);
            this->matrix_.Tmul(wT, pT);

            scalar wApT = gSumProd(wA, pT);

            // --- Test for singularity
            if
            (
                solverPerf.checkSingularity
                (
                    cmptDivide(pTraits<Type>::one*mag(wApT), normFactor)
                )
            )
            {
                break;
            }

            // --- Update solution and residual:
            scalar alpha = wArT/wApT;

            for (label cell=0; cell<nCells; cell++)
            {
                psiPtr[cell] += (alpha*pAPtr[cell]);
                rAPtr[cell] -= (alpha*wAPtr[cell]);
                rTPtr[cell] -= (alpha*wTPtr[cell]);
            }

            solverPerf.finalResidual() =
                cmptDivide(gSumCmptMag(rA), normFactor);

        } while
        (
            (
                solverPerf.nIterations()++ < this->maxIter_
             && !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
            )
         || solverPerf.nIterations() < this->minIter_
        );
    }

    return solverPerf;
}
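In standard notation, the loop above implements the preconditioned BiCG recurrences summarised below (a hedged reading of the code; M denotes the preconditioner, and whether preconditionT applies exactly M^{-T} depends on the preconditioner implementation):

\begin{align*}
w_k &= M^{-1} r_k, & \tilde{w}_k &= M^{-T} \tilde{r}_k,\\
\beta_k &= \frac{\langle w_k, \tilde{r}_k \rangle}{\langle w_{k-1}, \tilde{r}_{k-1} \rangle}, &
p_k &= w_k + \beta_k p_{k-1}, \quad \tilde{p}_k = \tilde{w}_k + \beta_k \tilde{p}_{k-1},\\
\alpha_k &= \frac{\langle w_k, \tilde{r}_k \rangle}{\langle A p_k, \tilde{p}_k \rangle}, &
x_{k+1} &= x_k + \alpha_k p_k,\\
r_{k+1} &= r_k - \alpha_k A p_k, & \tilde{r}_{k+1} &= \tilde{r}_k - \alpha_k A^{T} \tilde{p}_k.
\end{align*}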
Foam::lduSolverPerformance Foam::PBiCG::solve
(
    scalarField& x,
    const scalarField& b,
    const direction cmpt
) const
{
    // --- Setup class containing solver performance data
    lduSolverPerformance solverPerf
    (
        lduMatrix::preconditioner::getName(dict()) + typeName,
        fieldName()
    );

    register label nCells = x.size();

    scalar* __restrict__ xPtr = x.begin();

    scalarField pA(nCells);
    scalar* __restrict__ pAPtr = pA.begin();

    scalarField pT(nCells, 0.0);
    scalar* __restrict__ pTPtr = pT.begin();

    scalarField wA(nCells);
    scalar* __restrict__ wAPtr = wA.begin();

    scalarField wT(nCells);
    scalar* __restrict__ wTPtr = wT.begin();

    scalar wArT = matrix_.great_;
    scalar wArTold = wArT;

    // Calculate A.x and T.x
    matrix_.Amul(wA, x, coupleBouCoeffs_, interfaces_, cmpt);
    matrix_.Tmul(wT, x, coupleIntCoeffs_, interfaces_, cmpt);

    // Calculate initial residual and transpose residual fields
    scalarField rA(b - wA);
    scalarField rT(b - wT);
    scalar* __restrict__ rAPtr = rA.begin();
    scalar* __restrict__ rTPtr = rT.begin();

    // Calculate normalisation factor
    scalar normFactor = this->normFactor(x, b, wA, pA, cmpt);

    if (lduMatrix::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // Calculate normalised residual norm
    solverPerf.initialResidual() = gSumMag(rA)/normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // Check convergence, solve if not converged
    if (!stop(solverPerf))
    {
        // Select and construct the preconditioner
        autoPtr<lduPreconditioner> preconPtr;

        preconPtr = lduPreconditioner::New
        (
            matrix_,
            coupleBouCoeffs_,
            coupleIntCoeffs_,
            interfaces_,
            dict()
        );

        // Solver iteration
        do
        {
            // Store previous wArT
            wArTold = wArT;

            // Precondition residuals
            preconPtr->precondition(wA, rA, cmpt);
            preconPtr->preconditionT(wT, rT, cmpt);

            // Update search directions:
            wArT = gSumProd(wA, rT);

            if (solverPerf.nIterations() == 0)
            {
                for (register label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                    pTPtr[cell] = wTPtr[cell];
                }
            }
            else
            {
                scalar beta = wArT/wArTold;

                for (register label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + beta*pAPtr[cell];
                    pTPtr[cell] = wTPtr[cell] + beta*pTPtr[cell];
                }
            }

            // Update preconditioned residuals
            matrix_.Amul(wA, pA, coupleBouCoeffs_, interfaces_, cmpt);
            matrix_.Tmul(wT, pT, coupleIntCoeffs_, interfaces_, cmpt);

            scalar wApT = gSumProd(wA, pT);

            // Test for singularity
            if (solverPerf.checkSingularity(mag(wApT)/normFactor)) break;

            // Update solution and residual:
            scalar alpha = wArT/wApT;

            for (register label cell=0; cell<nCells; cell++)
            {
                xPtr[cell] += alpha*pAPtr[cell];
                rAPtr[cell] -= alpha*wAPtr[cell];
                rTPtr[cell] -= alpha*wTPtr[cell];
            }

            solverPerf.finalResidual() = gSumMag(rA)/normFactor;
            solverPerf.nIterations()++;
        } while (!stop(solverPerf));
    }

    return solverPerf;
}
/**
    Purpose
    -------
    ZUNMQL overwrites the general complex M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = Magma_ConjTrans:    Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(k) . . . H(2) H(1)

    as returned by ZGEQLF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQLF in the last k columns of its array argument dA.
            The diagonal and the lower part are destroyed, the reflectors
            are not modified.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.
            If SIDE = MagmaLeft,  LDDA >= max(1,M);
            if SIDE = MagmaRight, LDDA >= max(1,N).

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQLF.

    @param[in,out]
    dC      COMPLEX_16 array on the GPU, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    @param[in]
    lddc    INTEGER
            The leading dimension of the array dC. LDDC >= max(1,M).

    @param[in]
    wA      COMPLEX_16 array, dimension
                                 (LDWA,M) if SIDE = MagmaLeft
                                 (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by ZHETRD_GPU.
            (A copy of the upper or lower part of dA, on the host.)

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            If SIDE = MagmaLeft,  LDWA >= max(1,M);
            if SIDE = MagmaRight, LDWA >= max(1,N).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqlf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zunmql2_gpu(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex_ptr dC, magma_int_t lddc,
    const magmaDoubleComplex *wA, magma_int_t ldwa,
    magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    /* Constants */
    const magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    const magma_int_t nbmax = 64;

    /* Local variables */
    magmaDoubleComplex_ptr dwork = NULL, dT = NULL;
    magmaDoubleComplex T[ nbmax*nbmax ];
    magma_int_t i, i1, i2, step, ib, lddwork, nb, mi, ni, nq, nq_i, nw;
    magma_queue_t queue = NULL;

    // Parameter adjustments for Fortran indexing
    wA -= 1 + ldwa;
    dC -= 1 + lddc;
    --tau;

    *info = 0;
    bool left   = (side == MagmaLeft);
    bool notran = (trans == MagmaNoTrans);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }

    /* Test the input arguments */
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    // size of the block
    nb = nbmax;
    lddwork = nw;

    /* Use hybrid CPU-GPU code */
    if ( ( left && notran) ||
         (! left && ! notran) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
    } else {
        mi = m;
    }

    // dwork is (n or m) x nb + nb x nb, for left or right respectively
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, lddwork*nb + nb*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        goto cleanup;
    }
    dT = dwork + lddwork*nb;

    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    // in bottom k x k portion of dA,
    // set nb-1 sub-diagonals to 0, and diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the lower triangle parts already set to identity.
    // A is nq x k, either m x k (left) or n x k (right)
    magmablas_zlaset_band( MagmaLower, k, k, nb, c_zero, c_one, dA(nq-k,0), ldda, queue );

    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min( nb, k - i + 1 );

        /* Form the triangular factor of the block reflector
           H = H(i+ib-1) . . . H(i+1) H(i) */
        nq_i = nq - k + i + ib - 1;
        lapackf77_zlarft( "Backward", "Columnwise", &nq_i, &ib,
                          wA(1,i), &ldwa, &tau[i], T, &ib );

        if (left) {
            /* H or H^H is applied to C(1:m-k+i+ib-1,1:n) */
            mi = m - k + i + ib - 1;
        } else {
            /* H or H^H is applied to C(1:m,1:n-k+i+ib-1) */
            ni = n - k + i + ib - 1;
        }

        /* Apply H or H^H; First copy T to the GPU */
        magma_zsetmatrix( ib, ib, T, ib, dT, ib, queue );
        magma_zlarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise,
                          mi, ni, ib,
                          dA(0,i-1), ldda, dT, ib,  // dA using 0-based indices here
                          dC(1,1), lddc,
                          dwork, lddwork, queue );
    }

cleanup:
    magma_queue_destroy( queue );
    magma_free( dwork );

    return *info;
} /* magma_zunmql */
/**
    Purpose
    -------
    DORMQR overwrites the general real M-by-N matrix C with

    @verbatim
                               SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:      Q * C               C * Q
    TRANS = MagmaTrans:        Q**H * C            C * Q**H
    @endverbatim

    where Q is a real unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:   No transpose, apply Q;
      -     = MagmaTrans:     Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    dA      DOUBLE_PRECISION array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            DGEQRF in the first k columns of its array argument A.
            The diagonal and the upper part are destroyed, the reflectors
            are not modified.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = MagmaLeft;
            LDDA >= max(1,N) if SIDE = MagmaRight.

    @param[in]
    tau     DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF.

    @param[in,out]
    dC      DOUBLE_PRECISION array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    @param[in]
    wA      (workspace) DOUBLE_PRECISION array, dimension
                                 (LDWA,M) if SIDE = MagmaLeft
                                 (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by DSYTRD_GPU.

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            LDWA >= max(1,M) if SIDE = MagmaLeft;
            LDWA >= max(1,N) if SIDE = MagmaRight.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dormqr2_gpu(magma_side_t side, magma_trans_t trans,
                  magma_int_t m, magma_int_t n, magma_int_t k,
                  double *dA, magma_int_t ldda,
                  double *tau,
                  double *dC, magma_int_t lddc,
                  double *wA, magma_int_t ldwa,
                  magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    /* Allocate work space on the GPU */
    double *dwork;

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    magma_int_t i, i__4, lddwork;
    double T[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq, nw;
    int left, notran;

    wA -= 1 + ldwa;
    dC -= 1 + lddc;
    --tau;

    *info = 0;
    left   = (side == MagmaLeft);
    notran = (trans == MagmaNoTrans);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
        magma_dmalloc( &dwork, (n + 64)*64 );  // TODO after checking args, else memory leak!
    } else {
        nq = n;
        nw = m;
        magma_dmalloc( &dwork, (m + 64)*64 );  // TODO after checking args, else memory leak!
    }
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != MagmaTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    // size of the block
    nb = 64;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    /* Use hybrid CPU-GPU code */
    if ( ( left && (! notran) ) ||
         ( (! left) && notran ) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
        jc = 1;
    } else {
        mi = m;
        ic = 1;
    }

    // set nb-1 super-diagonals to 0, and diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the upper triangle parts already set to identity.
    magmablas_dlaset_band( MagmaUpper, k, k, nb, c_zero, c_one, dA, ldda );

    // for i=i1 to i2 by step
    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = min(nb, k - i + 1);

        /* Form the triangular factor of the block reflector
           H = H(i) H(i+1) . . . H(i+ib-1) */
        i__4 = nq - i + 1;
        lapackf77_dlarft("Forward", "Columnwise", &i__4, &ib,
                         wA(i,i), &ldwa, &tau[i], T, &ib);

        if (left) {
            /* H or H' is applied to C(i:m,1:n) */
            mi = m - i + 1;
            ic = i;
        } else {
            /* H or H' is applied to C(1:m,i:n) */
            ni = n - i + 1;
            jc = i;
        }

        if (left)
            lddwork = ni;
        else
            lddwork = mi;

        /* Apply H or H'; First copy T to the GPU */
        magma_dsetmatrix( ib, ib, T, ib, dwork, ib );
        magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
                          mi, ni, ib,
                          dA(i-1,i-1), ldda, dwork, ib,  // dA using 0-based indices here
                          dC(ic,jc), lddc,
                          dwork + ib*ib, lddwork);
    }

    magma_free( dwork );
    return *info;
} /* magma_dormqr */
Foam::solverPerformance Foam::paralution_AMG::solve
(
    scalarField& psi,
    const scalarField& source,
    const direction cmpt
) const
{
    word precond_name = lduMatrix::preconditioner::getName(controlDict_);
    double div  = controlDict_.lookupOrDefault<double>("div", 1e+08);
    bool accel  = controlDict_.lookupOrDefault<bool>("useAccelerator", true);
    word mformat = controlDict_.lookupOrDefault<word>("MatrixFormat", "CSR");
    word pformat = controlDict_.lookupOrDefault<word>("PrecondFormat", "CSR");
    word sformat = controlDict_.lookupOrDefault<word>("SmootherFormat", "CSR");
    word solver_name = controlDict_.lookupOrDefault<word>("CoarseGridSolver", "CG");
    word smoother_name =
        controlDict_.lookupOrDefault<word>("smoother", "paralution_MultiColoredGS");
    int MEp = controlDict_.lookupOrDefault<int>("MEp", 1);
    word LBPre =
        controlDict_.lookupOrDefault<word>("LastBlockPrecond", "paralution_Jacobi");
    int iterPreSmooth  = controlDict_.lookupOrDefault<int>("nPreSweeps", 1);
    int iterPostSmooth = controlDict_.lookupOrDefault<int>("nPostSweeps", 2);
    double epsCoupling = controlDict_.lookupOrDefault<double>("couplingStrength", 0.01);
    int coarsestCells  = controlDict_.lookupOrDefault<int>("nCellsInCoarsestLevel", 300);
    int ILUp = controlDict_.lookupOrDefault<int>("ILUp", 0);
    int ILUq = controlDict_.lookupOrDefault<int>("ILUq", 1);
    double relax     = controlDict_.lookupOrDefault<double>("Relaxation", 1.0);
    double aggrrelax = controlDict_.lookupOrDefault<double>("AggrRelax", 2./3.);
    bool scaling = controlDict_.lookupOrDefault<bool>("scaleCorrection", true);
    word interp_name =
        controlDict_.lookupOrDefault<word>("InterpolationType", "SmoothedAggregation");

    solverPerformance solverPerf(typeName + '(' + precond_name + ')', fieldName_);

    register label nCells = psi.size();

    scalarField pA(nCells);
    scalarField wA(nCells);

    // --- Calculate A.psi
    matrix_.Amul(wA, psi, interfaceBouCoeffs_, interfaces_, cmpt);

    // --- Calculate initial residual field
    scalarField rA(source - wA);

    // --- Calculate normalisation factor
    scalar normFactor = this->normFactor(psi, source, wA, pA);

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = gSumMag(rA)/normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    if (!solverPerf.checkConvergence(tolerance_, relTol_))
    {
        paralution::_matrix_format mf = paralution::CSR;
        if      (mformat == "CSR")   mf = paralution::CSR;
        else if (mformat == "DIA")   mf = paralution::DIA;
        else if (mformat == "HYB")   mf = paralution::HYB;
        else if (mformat == "ELL")   mf = paralution::ELL;
        else if (mformat == "MCSR")  mf = paralution::MCSR;
        else if (mformat == "BCSR")  mf = paralution::BCSR;
        else if (mformat == "COO")   mf = paralution::COO;
        else if (mformat == "DENSE") mf = paralution::DENSE;

        paralution::_interp ip = paralution::SmoothedAggregation;
        if      (interp_name == "SmoothedAggregation") ip = paralution::SmoothedAggregation;
        else if (interp_name == "Aggregation")         ip = paralution::Aggregation;

        paralution::LocalVector<double> x;
        paralution::LocalVector<double> rhs;
        paralution::LocalMatrix<double> mat;

        paralution::AMG<paralution::LocalMatrix<double>,
                        paralution::LocalVector<double>,
                        double> ls;

        paralution::import_openfoam_matrix(matrix(), &mat);
        paralution::import_openfoam_vector(source, &rhs);
        paralution::import_openfoam_vector(psi, &x);

        ls.SetOperator(mat);

        // coupling strength
        ls.SetCouplingStrength(epsCoupling);
        // number of unknowns on coarsest level
        ls.SetCoarsestLevel(coarsestCells);
        // interpolation type for grid transfer operators
        ls.SetInterpolation(ip);
        // Relaxation parameter for smoothed interpolation aggregation
        ls.SetInterpRelax(aggrrelax);
        // Manual smoothers
        ls.SetManualSmoothers(true);
        // Manual coarse grid solver
        ls.SetManualSolver(true);
        // grid transfer scaling
        ls.SetScaling(scaling);
        // operator format
        ls.SetOperatorFormat(mf);

        ls.SetSmootherPreIter(iterPreSmooth);
        ls.SetSmootherPostIter(iterPostSmooth);

        ls.BuildHierarchy();

        int levels = ls.GetNumLevels();

        // Smoother via preconditioned FixedPoint iteration
        paralution::IterativeLinearSolver<paralution::LocalMatrix<double>,
                                          paralution::LocalVector<double>,
                                          double > **fp = NULL;
        fp = new paralution::IterativeLinearSolver<paralution::LocalMatrix<double>,
                                                   paralution::LocalVector<double>,
                                                   double >*[levels-1];

        paralution::Preconditioner<paralution::LocalMatrix<double>,
                                   paralution::LocalVector<double>,
                                   double > **sm = NULL;
        sm = new paralution::Preconditioner<paralution::LocalMatrix<double>,
                                            paralution::LocalVector<double>,
                                            double >*[levels-1];

        for (int i=0; i<levels-1; ++i)
        {
            fp[i] = paralution::GetIterativeLinearSolver<double>("paralution_FixedPoint", relax);
            sm[i] = paralution::GetPreconditioner<double>(smoother_name, LBPre, sformat, ILUp, ILUq, MEp);
            fp[i]->SetPreconditioner(*sm[i]);
            fp[i]->Verbose(0);
        }

        // Coarse Grid Solver and its Preconditioner
        paralution::IterativeLinearSolver<paralution::LocalMatrix<double>,
                                          paralution::LocalVector<double>,
                                          double > *cgs = NULL;
        cgs = paralution::GetIterativeLinearSolver<double>(solver_name, relax);
        cgs->Verbose(0);

        paralution::Preconditioner<paralution::LocalMatrix<double>,
                                   paralution::LocalVector<double>,
                                   double > *cgp = NULL;
        cgp = paralution::GetPreconditioner<double>(precond_name, LBPre, pformat, ILUp, ILUq, MEp);
        if (cgp != NULL) cgs->SetPreconditioner(*cgp);

        ls.SetSmoother(fp);
        ls.SetSolver(*cgs);

        // Switch to L1 norm to be consistent with OpenFOAM solvers
        ls.SetResidualNorm(1);

        ls.Init(tolerance_*normFactor,  // abs
                relTol_,                // rel
                div,                    // div
                maxIter_);              // max iter

        ls.Build();

        if (accel)
        {
            mat.MoveToAccelerator();
            rhs.MoveToAccelerator();
            x.MoveToAccelerator();
            ls.MoveToAccelerator();
        }

        switch (mf)
        {
            case paralution::DENSE: mat.ConvertToDENSE(); break;
            case paralution::CSR:   mat.ConvertToCSR();   break;
            case paralution::MCSR:  mat.ConvertToMCSR();  break;
            case paralution::BCSR:  mat.ConvertToBCSR();  break;
            case paralution::COO:   mat.ConvertToCOO();   break;
            case paralution::DIA:   mat.ConvertToDIA();   break;
            case paralution::ELL:   mat.ConvertToELL();   break;
            case paralution::HYB:   mat.ConvertToHYB();   break;
        }

        ls.Verbose(0);

        // Solve linear system
        ls.Solve(rhs, &x);

        paralution::export_openfoam_vector(x, &psi);

        solverPerf.finalResidual() =
            ls.GetCurrentResidual() / normFactor;  // divide by normFactor, see lduMatrixSolver.C
        solverPerf.nIterations() = ls.GetIterationCount();
        solverPerf.checkConvergence(tolerance_, relTol_);

        // Clear MultiGrid object
        ls.Clear();

        // Free all structures
        for (int i=0; i<levels-1; ++i)
        {
            delete fp[i];
            delete sm[i];
        }

        cgs->Clear();
        if (cgp != NULL) delete cgp;

        delete[] fp;
        delete[] sm;
        delete cgs;
    }

    return solverPerf;
}
/**
    Purpose
    -------
    DORMTR overwrites the general real M-by-N matrix C with

                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = MagmaTrans:         Q**H * C            C * Q**H

    where Q is a real unitary matrix of order nq, with nq = m if
    SIDE = MagmaLeft and nq = n if SIDE = MagmaRight. Q is defined as the
    product of nq-1 elementary reflectors, as returned by DSYTRD:

    if UPLO = MagmaUpper, Q = H(nq-1) . . . H(2) H(1);

    if UPLO = MagmaLower, Q = H(1) H(2) . . . H(nq-1).

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper: Upper triangle of A contains elementary reflectors
                          from DSYTRD;
      -     = MagmaLower: Lower triangle of A contains elementary reflectors
                          from DSYTRD.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:   No transpose, apply Q;
      -     = MagmaTrans:     Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    dA      DOUBLE_PRECISION array, dimension
                                 (LDDA,M) if SIDE = MagmaLeft
                                 (LDDA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by DSYTRD_GPU. On output the diagonal, the subdiagonal
            and the upper part (UPLO=MagmaLower) or lower part
            (UPLO=MagmaUpper) are destroyed.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = MagmaLeft;
            LDDA >= max(1,N) if SIDE = MagmaRight.

    @param[in]
    tau     DOUBLE_PRECISION array, dimension
                                 (M-1) if SIDE = MagmaLeft
                                 (N-1) if SIDE = MagmaRight
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DSYTRD.

    @param[in,out]
    dC      DOUBLE_PRECISION array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    @param[in]
    wA      (workspace) DOUBLE_PRECISION array, dimension
                                 (LDWA,M) if SIDE = MagmaLeft
                                 (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by DSYTRD_GPU.

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            LDWA >= max(1,M) if SIDE = MagmaLeft;
            LDWA >= max(1,N) if SIDE = MagmaRight.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_dsyev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dormtr_gpu( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans,
                  magma_int_t m, magma_int_t n,
                  magmaDouble_ptr dA, magma_int_t ldda,
                  double *tau,
                  magmaDouble_ptr dC, magma_int_t lddc,
                  double *wA, magma_int_t ldwa,
                  magma_int_t *info)
{
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    magma_int_t i1, i2, mi, ni, nq;
    int left, upper;
    magma_int_t iinfo;

    *info = 0;
    left  = (side == MagmaLeft);
    upper = (uplo == MagmaUpper);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        //nw = n;
    } else {
        nq = n;
        //nw = m;
    }
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! upper && uplo != MagmaLower) {
        *info = -2;
    } else if (trans != MagmaNoTrans &&
               trans != MagmaTrans) {
        *info = -3;
    } else if (m < 0) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (ldda < max(1,nq)) {
        *info = -7;
    } else if (lddc < max(1,m)) {
        *info = -10;
    } else if (ldwa < max(1,nq)) {
        *info = -12;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || nq == 1) {
        return *info;
    }

    if (left) {
        mi = m - 1;
        ni = n;
    } else {
        mi = m;
        ni = n - 1;
    }

    if (upper) {
        magma_dormql2_gpu(side, trans, mi, ni, nq-1, dA(0,1), ldda, tau,
                          dC, lddc, wA(0,1), ldwa, &iinfo);
    }
    else {
        /* Q was determined by a call to DSYTRD with UPLO = 'L' */
        if (left) {
            i1 = 1;
            i2 = 0;
        } else {
            i1 = 0;
            i2 = 1;
        }
        magma_dormqr2_gpu(side, trans, mi, ni, nq-1, dA(1,0), ldda, tau,
                          dC(i1,i2), lddc, wA(1,0), ldwa, &iinfo);
    }

    return *info;
} /* magma_dormtr */
Foam::lduMatrix::solverPerformance Foam::paralution_PFGMRES::solve
(
    scalarField& psi,
    const scalarField& source,
    const direction cmpt
) const
{
    word precond_name = lduMatrix::preconditioner::getName(controlDict_);
    double div = controlDict_.lookupOrDefault<double>("div", 1e+08);
    int basis  = controlDict_.lookupOrDefault<int>("BasisSize", 30);
    bool accel = controlDict_.lookupOrDefault<bool>("useAccelerator", true);
    word mformat = controlDict_.lookupOrDefault<word>("MatrixFormat", "CSR");
    word pformat = controlDict_.lookupOrDefault<word>("PrecondFormat", "CSR");
    int ILUp = controlDict_.lookupOrDefault<int>("ILUp", 0);
    int ILUq = controlDict_.lookupOrDefault<int>("ILUq", 1);
    int MEp  = controlDict_.lookupOrDefault<int>("MEp", 1);
    word LBPre =
        controlDict_.lookupOrDefault<word>("LastBlockPrecond", "paralution_Jacobi");

    lduMatrix::solverPerformance solverPerf
    (
        typeName + '(' + precond_name + ')',
        fieldName_
    );

    register label nCells = psi.size();

    scalarField pA(nCells);
    scalarField wA(nCells);

    // --- Calculate A.psi
    matrix_.Amul(wA, psi, interfaceBouCoeffs_, interfaces_, cmpt);

    // --- Calculate initial residual field
    scalarField rA(source - wA);

    // --- Calculate normalisation factor
    scalar normFactor = this->normFactor(psi, source, wA, pA);

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = gSumMag(rA)/normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // TODO check why we cannot skip 1 iteration when initial residual < relTol_
    // or why initial residual actually does not drop below relTol_
    if (!solverPerf.checkConvergence(tolerance_, relTol_))
    {
        paralution::_matrix_format mf = paralution::CSR;
        if      (mformat == "CSR")   mf = paralution::CSR;
        else if (mformat == "DIA")   mf = paralution::DIA;
        else if (mformat == "HYB")   mf = paralution::HYB;
        else if (mformat == "ELL")   mf = paralution::ELL;
        else if (mformat == "MCSR")  mf = paralution::MCSR;
        else if (mformat == "BCSR")  mf = paralution::BCSR;
        else if (mformat == "COO")   mf = paralution::COO;
        else if (mformat == "DENSE") mf = paralution::DENSE;

        paralution::init_paralution();

        paralution::LocalVector<double> x;
        paralution::LocalVector<double> rhs;
        paralution::LocalMatrix<double> mat;

        paralution::FGMRES<paralution::LocalMatrix<double>,
                           paralution::LocalVector<double>,
                           double> ls;

        import_openfoam_matrix(matrix(), &mat);
        import_openfoam_vector(source, &rhs);
        import_openfoam_vector(psi, &x);

        ls.Clear();

        if (accel)
        {
            mat.MoveToAccelerator();
            rhs.MoveToAccelerator();
            x.MoveToAccelerator();
        }

        paralution::Preconditioner<paralution::LocalMatrix<double>,
                                   paralution::LocalVector<double>,
                                   double > *precond = NULL;
        precond = GetPreconditioner<double>(precond_name, LBPre, pformat, ILUp, ILUq, MEp);
        if (precond != NULL) ls.SetPreconditioner(*precond);

        ls.SetOperator(mat);
        ls.SetBasisSize(basis);
        ls.Verbose(0);

        ls.Init(tolerance_*normFactor,  // abs
                relTol_,                // rel
                div,                    // div
                maxIter_);              // max iter

        ls.Build();

        switch (mf)
        {
            case paralution::DENSE: mat.ConvertToDENSE(); break;
            case paralution::CSR:   mat.ConvertToCSR();   break;
            case paralution::MCSR:  mat.ConvertToMCSR();  break;
            case paralution::BCSR:  mat.ConvertToBCSR();  break;
            case paralution::COO:   mat.ConvertToCOO();   break;
            case paralution::DIA:   mat.ConvertToDIA();   break;
            case paralution::ELL:   mat.ConvertToELL();   break;
            case paralution::HYB:   mat.ConvertToHYB();   break;
        }

        // mat.info();

        ls.Solve(rhs, &x);

        export_openfoam_vector(x, &psi);

        solverPerf.finalResidual() =
            ls.GetCurrentResidual() / normFactor;  // divide by normFactor, see lduMatrixSolver.C
        solverPerf.nIterations() = ls.GetIterationCount();
        solverPerf.checkConvergence(tolerance_, relTol_);

        ls.Clear();

        if (precond != NULL)
        {
            precond->Clear();
            delete precond;
        }

        paralution::stop_paralution();
    }

    return solverPerf;
}
/**
    Purpose
    -------
    ZUNMQR overwrites the general complex M-by-N matrix C with

    @verbatim
                                SIDE = MagmaLeft    SIDE = MagmaRight
    TRANS = MagmaNoTrans:       Q * C               C * Q
    TRANS = Magma_ConjTrans:    Q**H * C            C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(1) H(2) . . . H(k)

    as returned by ZGEQRF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    dA      COMPLEX_16 array, dimension (LDA,K)
            The i-th column must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            ZGEQRF in the first k columns of its array argument A.
            The diagonal and the upper part are destroyed, the reflectors
            are not modified.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array DA.
            LDDA >= max(1,M) if SIDE = MagmaLeft;
            LDDA >= max(1,N) if SIDE = MagmaRight.

    @param[in]
    tau     COMPLEX_16 array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by ZGEQRF.

    @param[in,out]
    dC      COMPLEX_16 array, dimension (LDDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q).

    @param[in]
    lddc    INTEGER
            The leading dimension of the array C. LDDC >= max(1,M).

    @param[in]
    wA      (workspace) COMPLEX_16 array, dimension
                                 (LDWA,M) if SIDE = MagmaLeft
                                 (LDWA,N) if SIDE = MagmaRight
            The vectors which define the elementary reflectors, as
            returned by ZHETRD_GPU.

    @param[in]
    ldwa    INTEGER
            The leading dimension of the array wA.
            LDWA >= max(1,M) if SIDE = MagmaLeft;
            LDWA >= max(1,N) if SIDE = MagmaRight.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_zgeqrf_comp
    ********************************************************************/
template<typename Ty>
magma_int_t
magma_unmqr2_gpu(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    cl_mem dA, size_t dA_offset, magma_int_t ldda,
    Ty *tau,
    cl_mem dC, size_t dC_offset, magma_int_t lddc,
    Ty *wA, magma_int_t ldwa,
    magma_queue_t queue,
    magma_int_t *info)
{
    #define dA(i_,j_) (dA) , ((i_) + (j_)*ldda) + dA_offset
    #define dC(i_,j_) (dC) , ((i_) + (j_)*lddc) + dC_offset
    #define wA(i_,j_) (wA + (i_) + (j_)*ldwa)

    /* Allocate work space on the GPU */
    cl_mem dwork;

    static const Ty c_zero = magma_zero<Ty>();
    static const Ty c_one  = magma_one<Ty>();

    magma_int_t i, i__4, lddwork;
    Ty T[2*4160]        /* was [65][64] */;
    magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq;
    int left, notran;

    wA -= 1 + ldwa;
    dC_offset -= 1 + lddc;
    --tau;

    *info = 0;
    left   = (side == MagmaLeft);
    notran = (trans == MagmaNoTrans);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        //nw = n;
        magma_malloc<Ty>( &dwork, (n + 64)*64 );  // TODO after checking args, else memory leak!
    } else {
        nq = n;
        //nw = m;
        magma_malloc<Ty>( &dwork, (m + 64)*64 );  // TODO after checking args, else memory leak!
    }
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (ldda < std::max(1,nq)) {
        *info = -7;
    } else if (lddc < std::max(1,m)) {
        *info = -10;
    } else if (ldwa < std::max(1,nq)) {
        *info = -12;
    }

    // size of the block
    nb = 64;

    if (*info != 0) {
        //magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        return *info;
    }

    /* Use hybrid CPU-GPU code */
    if ( ( left && (! notran) ) ||
         ( (! left) && notran ) ) {
        i1 = 1;
        i2 = k;
        step = nb;
    } else {
        i1 = ((k - 1)/nb)*nb + 1;
        i2 = 1;
        step = -nb;
    }

    // silence "uninitialized" warnings
    mi = 0;
    ni = 0;

    if (left) {
        ni = n;
        jc = 1;
    } else {
        mi = m;
        ic = 1;
    }

    cpu_lapack_larft_func<Ty> cpu_lapack_larft;

    // set nb-1 super-diagonals to 0, and diagonal to 1.
    // This way we can copy V directly to the GPU,
    // with the upper triangle parts already set to identity.
    magmablas_laset_band<Ty>( MagmaUpper, k, k, nb, c_zero, c_one, dA, dA_offset, ldda, queue);

    // for i=i1 to i2 by step
    for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) {
        ib = std::min(nb, k - i + 1);

        /* Form the triangular factor of the block reflector
           H = H(i) H(i+1) . . . H(i+ib-1) */
        i__4 = nq - i + 1;
        LAPACKE_CHECK(cpu_lapack_larft( *MagmaForwardStr, *MagmaColumnwiseStr,
                                        i__4, ib,
                                        wA(i,i), ldwa, &tau[i], T, ib));

        if (left) {
            /* H or H' is applied to C(i:m,1:n) */
            mi = m - i + 1;
            ic = i;
        } else {
            /* H or H' is applied to C(1:m,i:n) */
            ni = n - i + 1;
            jc = i;
        }

        if (left)
            lddwork = ni;
        else
            lddwork = mi;

        /* Apply H or H'; First copy T to the GPU */
        magma_setmatrix<Ty>( ib, ib, T, ib, dwork, 0, ib, queue);
        magma_larfb_gpu<Ty>( side, trans, MagmaForward, MagmaColumnwise,
                             mi, ni, ib,
                             dA(i-1,i-1), ldda, dwork, 0, ib,  // dA using 0-based indices here
                             dC(ic,jc), lddc,
                             dwork, ib*ib, lddwork, queue);
    }

    magma_free( dwork );

    return *info;
} /* magma_zunmqr */
typename Foam::SolverPerformance<Type>
Foam::PCICG<Type, DType, LUType>::solve(Field<Type>& psi) const
{
    word preconditionerName(this->controlDict_.lookup("preconditioner"));

    // --- Setup class containing solver performance data
    SolverPerformance<Type> solverPerf
    (
        preconditionerName + typeName,
        this->fieldName_
    );

    label nCells = psi.size();

    Type* __restrict__ psiPtr = psi.begin();

    Field<Type> pA(nCells);
    Type* __restrict__ pAPtr = pA.begin();

    Field<Type> wA(nCells);
    Type* __restrict__ wAPtr = wA.begin();

    Type wArA = solverPerf.great_*pTraits<Type>::one;
    Type wArAold = wArA;

    // --- Calculate A.psi
    this->matrix_.Amul(wA, psi);

    // --- Calculate initial residual field
    Field<Type> rA(this->matrix_.source() - wA);
    Type* __restrict__ rAPtr = rA.begin();

    // --- Calculate normalisation factor
    Type normFactor = this->normFactor(psi, wA, pA);

    if (LduMatrix<Type, DType, LUType>::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = cmptDivide(gSumCmptMag(rA), normFactor);
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if
    (
        this->minIter_ > 0
     || !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
    )
    {
        // --- Select and construct the preconditioner
        autoPtr<typename LduMatrix<Type, DType, LUType>::preconditioner>
        preconPtr = LduMatrix<Type, DType, LUType>::preconditioner::New
        (
            *this,
            this->controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArA
            wArAold = wArA;

            // --- Precondition residual
            preconPtr->precondition(wA, rA);

            // --- Update search directions:
            wArA = gSumCmptProd(wA, rA);

            if (solverPerf.nIterations() == 0)
            {
                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                }
            }
            else
            {
                Type beta = cmptDivide
                (
                    wArA,
                    stabilise(wArAold, solverPerf.vsmall_)
                );

                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + cmptMultiply(beta, pAPtr[cell]);
                }
            }

            // --- Update preconditioned residual
            this->matrix_.Amul(wA, pA);

            Type wApA = gSumCmptProd(wA, pA);

            // --- Test for singularity
            if
            (
                solverPerf.checkSingularity
                (
                    cmptDivide(cmptMag(wApA), normFactor)
                )
            )
            {
                break;
            }

            // --- Update solution and residual:
            Type alpha = cmptDivide
            (
                wArA,
                stabilise(wApA, solverPerf.vsmall_)
            );

            for (label cell=0; cell<nCells; cell++)
            {
                psiPtr[cell] += cmptMultiply(alpha, pAPtr[cell]);
                rAPtr[cell] -= cmptMultiply(alpha, wAPtr[cell]);
            }

            solverPerf.finalResidual() =
                cmptDivide(gSumCmptMag(rA), normFactor);

        } while
        (
            (
                solverPerf.nIterations()++ < this->maxIter_
             && !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
            )
         || solverPerf.nIterations() < this->minIter_
        );
    }

    return solverPerf;
}
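A note on the stabilise() guard used for beta and alpha above: as I understand OpenFOAM's scalar definition (an assumption worth checking against the library source), it pushes a value away from zero by vsmall so the componentwise division cannot blow up on a vanishing denominator. A hypothetical one-component equivalent:

// Assumed semantics of Foam::stabilise for a single scalar component;
// not the library source.
inline double stabiliseSketch(double s, double vsmall)
{
    return (s >= 0) ? s + vsmall : s - vsmall;
}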
Foam::solverPerformance Foam::PBiCG::solve
(
    scalarField& psi,
    const scalarField& source,
    const direction cmpt
) const
{
    // --- Setup class containing solver performance data
    solverPerformance solverPerf
    (
        lduMatrix::preconditioner::getName(controlDict_) + typeName,
        fieldName_
    );

    label nCells = psi.size();

    scalar* __restrict__ psiPtr = psi.begin();

    scalarField pA(nCells);
    scalar* __restrict__ pAPtr = pA.begin();

    scalarField pT(nCells, 0.0);
    scalar* __restrict__ pTPtr = pT.begin();

    scalarField wA(nCells);
    scalar* __restrict__ wAPtr = wA.begin();

    scalarField wT(nCells);
    scalar* __restrict__ wTPtr = wT.begin();

    scalar wArT = solverPerf.great_;
    scalar wArTold = wArT;

    // --- Calculate A.psi and T.psi
    matrix_.Amul(wA, psi, interfaceBouCoeffs_, interfaces_, cmpt);
    matrix_.Tmul(wT, psi, interfaceIntCoeffs_, interfaces_, cmpt);

    // --- Calculate initial residual and transpose residual fields
    scalarField rA(source - wA);
    scalarField rT(source - wT);
    scalar* __restrict__ rAPtr = rA.begin();
    scalar* __restrict__ rTPtr = rT.begin();

    // --- Calculate normalisation factor
    scalar normFactor = this->normFactor(psi, source, wA, pA);

    if (lduMatrix::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() =
        gSumMag(rA, matrix().mesh().comm())
       /normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if
    (
        minIter_ > 0
     || !solverPerf.checkConvergence(tolerance_, relTol_)
    )
    {
        // --- Select and construct the preconditioner
        autoPtr<lduMatrix::preconditioner> preconPtr =
        lduMatrix::preconditioner::New
        (
            *this,
            controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArT
            wArTold = wArT;

            // --- Precondition residuals
            preconPtr->precondition(wA, rA, cmpt);
            preconPtr->preconditionT(wT, rT, cmpt);

            // --- Update search directions:
            wArT = gSumProd(wA, rT, matrix().mesh().comm());

            if (solverPerf.nIterations() == 0)
            {
                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                    pTPtr[cell] = wTPtr[cell];
                }
            }
            else
            {
                scalar beta = wArT/wArTold;

                for (label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + beta*pAPtr[cell];
                    pTPtr[cell] = wTPtr[cell] + beta*pTPtr[cell];
                }
            }

            // --- Update preconditioned residuals
            matrix_.Amul(wA, pA, interfaceBouCoeffs_, interfaces_, cmpt);
            matrix_.Tmul(wT, pT, interfaceIntCoeffs_, interfaces_, cmpt);

            scalar wApT = gSumProd(wA, pT, matrix().mesh().comm());

            // --- Test for singularity
            if (solverPerf.checkSingularity(mag(wApT)/normFactor))
            {
                break;
            }

            // --- Update solution and residual:
            scalar alpha = wArT/wApT;

            for (label cell=0; cell<nCells; cell++)
            {
                psiPtr[cell] += alpha*pAPtr[cell];
                rAPtr[cell] -= alpha*wAPtr[cell];
                rTPtr[cell] -= alpha*wTPtr[cell];
            }

            solverPerf.finalResidual() =
                gSumMag(rA, matrix().mesh().comm())
               /normFactor;

        } while
        (
            (
                solverPerf.nIterations()++ < maxIter_
             && !solverPerf.checkConvergence(tolerance_, relTol_)
            )
         || solverPerf.nIterations() < minIter_
        );
    }

    return solverPerf;
}
Foam::lduMatrix::solverPerformance Foam::PCG::solve
(
    scalarField& psi,
    const scalarField& source,
    const direction cmpt
) const
{
    // --- Setup class containing solver performance data
    lduMatrix::solverPerformance solverPerf
    (
        lduMatrix::preconditioner::getName(controlDict_) + typeName,
        fieldName_
    );

    register label nCells = psi.size();

    scalar* __restrict__ psiPtr = psi.begin();

    scalarField pA(nCells);
    scalar* __restrict__ pAPtr = pA.begin();

    scalarField wA(nCells);
    scalar* __restrict__ wAPtr = wA.begin();

    scalar wArA = matrix_.great_;
    scalar wArAold = wArA;

    // --- Calculate A.psi
    matrix_.Amul(wA, psi, interfaceBouCoeffs_, interfaces_, cmpt);

    // --- Calculate initial residual field
    scalarField rA(source - wA);
    scalar* __restrict__ rAPtr = rA.begin();

    // --- Calculate normalisation factor
    scalar normFactor = this->normFactor(psi, source, wA, pA);

    if (lduMatrix::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = gSumMag(rA)/normFactor;
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if (!solverPerf.checkConvergence(tolerance_, relTol_))
    {
        // --- Select and construct the preconditioner
        autoPtr<lduMatrix::preconditioner> preconPtr =
        lduMatrix::preconditioner::New
        (
            *this,
            controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArA
            wArAold = wArA;

            // --- Precondition residual
            preconPtr->precondition(wA, rA, cmpt);

            // --- Update search directions:
            wArA = gSumProd(wA, rA);

            if (solverPerf.nIterations() == 0)
            {
                for (register label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell];
                }
            }
            else
            {
                scalar beta = wArA/wArAold;

                for (register label cell=0; cell<nCells; cell++)
                {
                    pAPtr[cell] = wAPtr[cell] + beta*pAPtr[cell];
                }
            }

            // --- Update preconditioned residual
            matrix_.Amul(wA, pA, interfaceBouCoeffs_, interfaces_, cmpt);

            scalar wApA = gSumProd(wA, pA);

            // --- Test for singularity
            if (solverPerf.checkSingularity(mag(wApA)/normFactor)) break;

            // --- Update solution and residual:
            scalar alpha = wArA/wApA;

            for (register label cell=0; cell<nCells; cell++)
            {
                psiPtr[cell] += alpha*pAPtr[cell];
                rAPtr[cell] -= alpha*wAPtr[cell];
            }

            solverPerf.finalResidual() = gSumMag(rA)/normFactor;

        } while
        (
            solverPerf.nIterations()++ < maxIter_
         && !(solverPerf.checkConvergence(tolerance_, relTol_))
        );
    }

    return solverPerf;
}
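For reference, the loop above is the textbook preconditioned CG iteration; a hedged summary in standard notation (M is the preconditioner; in the code wA doubles as storage for both M^{-1} r and A p):

\begin{align*}
w_k &= M^{-1} r_k, &
\beta_k &= \frac{\langle w_k, r_k \rangle}{\langle w_{k-1}, r_{k-1} \rangle},\\
p_k &= w_k + \beta_k p_{k-1}, &
\alpha_k &= \frac{\langle w_k, r_k \rangle}{\langle A p_k, p_k \rangle},\\
x_{k+1} &= x_k + \alpha_k p_k, &
r_{k+1} &= r_k - \alpha_k A p_k.
\end{align*}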
Foam::SolverPerformance<Type> Foam::PBiCCCG<Type, DType, LUType>::solve
(
    gpuField<Type>& psi
) const
{
    word preconditionerName(this->controlDict_.lookup("preconditioner"));

    // --- Setup class containing solver performance data
    SolverPerformance<Type> solverPerf
    (
        preconditionerName + typeName,
        this->fieldName_
    );

    register label nCells = psi.size();

    gpuField<Type> pA(nCells);
    gpuField<Type> pT(nCells, pTraits<Type>::zero);
    gpuField<Type> wA(nCells);
    gpuField<Type> wT(nCells);

    scalar wArT = 1e15;   //this->matrix_.great_;
    scalar wArTold = wArT;

    // --- Calculate A.psi and T.psi
    this->matrix_.Amul(wA, psi);
    this->matrix_.Tmul(wT, psi);

    // --- Calculate initial residual and transpose residual fields
    gpuField<Type> rA(this->matrix_.source() - wA);
    gpuField<Type> rT(this->matrix_.source() - wT);

    // --- Calculate normalisation factor
    Type normFactor = this->normFactor(psi, wA, pA);

    if (LduMatrix<Type, DType, LUType>::debug >= 2)
    {
        Info<< " Normalisation factor = " << normFactor << endl;
    }

    // --- Calculate normalised residual norm
    solverPerf.initialResidual() = cmptDivide(gSumCmptMag(rA), normFactor);
    solverPerf.finalResidual() = solverPerf.initialResidual();

    // --- Check convergence, solve if not converged
    if
    (
        this->minIter_ > 0
     || !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
    )
    {
        // --- Select and construct the preconditioner
        autoPtr<typename LduMatrix<Type, DType, LUType>::preconditioner>
        preconPtr = LduMatrix<Type, DType, LUType>::preconditioner::New
        (
            *this,
            this->controlDict_
        );

        // --- Solver iteration
        do
        {
            // --- Store previous wArT
            wArTold = wArT;

            // --- Precondition residuals
            preconPtr->precondition(wA, rA);
            preconPtr->preconditionT(wT, rT);

            // --- Update search directions:
            wArT = gSumProd(wA, rT);

            if (solverPerf.nIterations() == 0)
            {
                thrust::copy(wA.begin(), wA.end(), pA.begin());
                thrust::copy(wT.begin(), wT.end(), pT.begin());
            }
            else
            {
                scalar beta = wArT/wArTold;

                // pA = wA + beta*pA
                thrust::transform
                (
                    wA.begin(),
                    wA.end(),
                    thrust::make_transform_iterator
                    (
                        pA.begin(),
                        multiplyOperatorSFFunctor<scalar,Type,Type>(beta)
                    ),
                    pA.begin(),
                    addOperatorFunctor<Type,Type,Type>()
                );

                // pT = wT + beta*pT
                thrust::transform
                (
                    wT.begin(),
                    wT.end(),
                    thrust::make_transform_iterator
                    (
                        pT.begin(),
                        multiplyOperatorSFFunctor<scalar,Type,Type>(beta)
                    ),
                    pT.begin(),
                    addOperatorFunctor<Type,Type,Type>()
                );
            }

            // --- Update preconditioned residuals
            this->matrix_.Amul(wA, pA);
            this->matrix_.Tmul(wT, pT);

            scalar wApT = gSumProd(wA, pT);

            // --- Test for singularity
            if
            (
                solverPerf.checkSingularity
                (
                    cmptDivide(pTraits<Type>::one*mag(wApT), normFactor)
                )
            )
            {
                break;
            }

            // --- Update solution and residual:
            scalar alpha = wArT/wApT;

            // psi += alpha*pA
            thrust::transform
            (
                psi.begin(),
                psi.end(),
                thrust::make_transform_iterator
                (
                    pA.begin(),
                    multiplyOperatorSFFunctor<scalar,Type,Type>(alpha)
                ),
                psi.begin(),
                addOperatorFunctor<Type,Type,Type>()
            );

            // rA -= alpha*wA
            thrust::transform
            (
                rA.begin(),
                rA.end(),
                thrust::make_transform_iterator
                (
                    wA.begin(),
                    multiplyOperatorSFFunctor<scalar,Type,Type>(alpha)
                ),
                rA.begin(),
                subtractOperatorFunctor<Type,Type,Type>()
            );

            // rT -= alpha*wT
            thrust::transform
            (
                rT.begin(),
                rT.end(),
                thrust::make_transform_iterator
                (
                    wT.begin(),
                    multiplyOperatorSFFunctor<scalar,Type,Type>(alpha)
                ),
                rT.begin(),
                subtractOperatorFunctor<Type,Type,Type>()
            );

            solverPerf.finalResidual() =
                cmptDivide(gSumCmptMag(rA), normFactor);

        } while
        (
            (
                solverPerf.nIterations()++ < this->maxIter_
             && !solverPerf.checkConvergence(this->tolerance_, this->relTol_)
            )
         || solverPerf.nIterations() < this->minIter_
        );
    }

    return solverPerf;
}
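The transform-iterator calls above are generalised axpy updates. A minimal sketch with plain Thrust placeholder expressions; reducing the project-local multiply/add functors to this standard form is an assumption about what they compute, and axpySketch is a hypothetical helper.

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

// y = a*x + y, evaluated on the device in one fused pass.
void axpySketch
(
    double a,
    const thrust::device_vector<double>& x,
    thrust::device_vector<double>& y
)
{
    using namespace thrust::placeholders;
    thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), a*_1 + _2);
}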