// Computes the column-wise scalar products of X and Y under the operator
// held in op_: the operator is first applied (non-transposed) to Y, and the
// result is then passed together with X to dots(), which fills
// scalarProds_out.
void LinearOpScalarProd<Scalar>::scalarProdsImpl(
  const MultiVectorBase<Scalar>& X,
  const MultiVectorBase<Scalar>& Y,
  const ArrayView<Scalar> &scalarProds_out
  ) const
{
  // Temporary multi-vector living in Y's range space with one column per
  // column of Y; it receives op_ * Y.
  const Teuchos::RCP<MultiVectorBase<Scalar> > opTimesY =
    createMembers(Y.range(), Y.domain()->dim());

  Thyra::apply(*op_, NOTRANS, Y, opTimesY.ptr());

  dots(X, *opTimesY, scalarProds_out);
}
// Writes the locally owned rows of the multi-vector mv to the stream out.
//
// The range space of mv must be castable to SpmdVectorSpaceBase<Scalar>;
// otherwise std::logic_error is thrown (before the stream state is touched).
//
// Output layout:
//   - a header line "<localSubDim> <numSubCols>"
//   - binary mode: each column written as raw bytes, one column after another
//   - text mode: one line per local row, " <i> <v(i,0)> <v(i,1)> ..."
//
// The stream's format flags and precision are restored before returning
// normally.
void SpmdMultiVectorSerializer<Scalar>::serialize(
  const MultiVectorBase<Scalar>& mv, std::ostream& out
  ) const
{
  Teuchos::RCP<const SpmdVectorSpaceBase<Scalar> >
    mpi_vec_spc = Teuchos::rcp_dynamic_cast<const SpmdVectorSpaceBase<Scalar> >(mv.range());
  if( !mpi_vec_spc.get() ) {
    // This is a serial (or locally replicated) vector space, which is not
    // supported.  Fail up front so that the caller's stream state is left
    // untouched on this error path.
    TEUCHOS_TEST_FOR_EXCEPTION( true, std::logic_error, "Does not handle non-SPMD spaces yet" );
  }
  // Save the formatting state we are about to change.  Note that flags() does
  // not include the precision, so it has to be saved and restored separately.
  const std::ios::fmtflags savedFlags(out.flags());
  const std::streamsize savedPrecision =
    out.precision(std::numeric_limits<Scalar>::digits10+4);
  // This is a mpi-based vector space so let's just write the local
  // multi-vector elements.
  const Ordinal
    localOffset = mpi_vec_spc->localOffset(),
    localSubDim = mpi_vec_spc->localSubDim();
  const Range1D localRng( localOffset, localOffset+localSubDim-1 );
  ConstDetachedMultiVectorView<Scalar> local_mv(mv,localRng,Range1D());
  out << localSubDim << " " << local_mv.numSubCols() << std::endl;
  if( binaryMode() ) {
    // Write column-wise for better cache performance.  With no local rows
    // there is nothing to write, and skipping the loop avoids forming a
    // reference to element (0,j) of an empty view.
    if( localSubDim > 0 ) {
      for( Ordinal j = 0; j < local_mv.numSubCols(); ++j )
        out.write( reinterpret_cast<const char*>(&local_mv(0,j)), sizeof(Scalar)*localSubDim );
    }
  }
  else {
    // Write row-wise for better readability
    for( Ordinal i = 0; i < localSubDim; ++i ) {
      out << " " << i;
      for( Ordinal j = 0; j < local_mv.numSubCols(); ++j ) {
        out << " " << local_mv(i,j);
      }
      out << std::endl;
    }
  }
  // Hand the stream back in the formatting state we received it in.
  out.flags(savedFlags);
  out.precision(savedPrecision);
}
void doExplicitMultiVectorAdjoint( const MultiVectorBase<Scalar>& mvIn, MultiVectorBase<Scalar>* mvTransOut ) { typedef Teuchos::ScalarTraits<Scalar> ST; #ifdef TEUCHOS_DEBUG TEST_FOR_EXCEPT(0==mvTransOut); THYRA_ASSERT_VEC_SPACES("doExplicitMultiVectorAdjoint(...)", *mvIn.domain(), *mvTransOut->range() ); THYRA_ASSERT_VEC_SPACES("doExplicitMultiVectorAdjoint(...)", *mvIn.range(), *mvTransOut->domain() ); #endif ConstDetachedMultiVectorView<Scalar> dMvIn(mvIn); DetachedMultiVectorView<Scalar> dMvTransOut(*mvTransOut); const int m = dMvIn.subDim(); const int n = dMvIn.numSubCols(); for ( int j = 0; j < n; ++j ) { for ( int i = 0; i < m; ++i ) { dMvTransOut(j,i) = ST::conjugate(dMvIn(i,j)); } } }
// Solves the linear system for the right-hand sides B, writing the solution
// into *X, using the Belos iterative solver held in iterativeSolver_.
//
// Parameters:
//   M_trans        - transpose mode; only passed to assertSolveSupports().
//   B              - right-hand-side multi-vector.
//   X              - solution multi-vector, handed to the Belos problem as
//                    the LHS.
//   solveCriteria  - optional solve criteria; when null, defaultTol_ is used
//                    as the convergence tolerance.
//
// Returns a SolveStatus whose solveStatus, achievedTol (converged case only)
// and message fields are filled in from the Belos result.
//
// Throws CatastrophicSolveFailure if the Belos linear problem cannot be set.
SolveStatus<Scalar>
BelosLinearOpWithSolve<Scalar>::solveImpl(
  const EOpTransp M_trans,
  const MultiVectorBase<Scalar> &B,
  const Ptr<MultiVectorBase<Scalar> > &X,
  const Ptr<const SolveCriteria<Scalar> > solveCriteria
  ) const
{

  TEUCHOS_FUNC_TIME_MONITOR("BelosLOWS");

  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::rcpFromPtr;
  using Teuchos::FancyOStream;
  using Teuchos::OSTab;
  using Teuchos::describe;
  typedef Teuchos::ScalarTraits<Scalar> ST;
  typedef typename ST::magnitudeType ScalarMag;
  Teuchos::Time totalTimer("");
  totalTimer.start(true);

  assertSolveSupports(*this, M_trans, solveCriteria);
  // 2010/08/22: rabartl: Bug 4915 ToDo: Move the above into the NIV function
  // solve(...).

  const int numRhs = B.domain()->dim();
  const int numEquations = B.range()->dim();

  const RCP<FancyOStream> out = this->getOStream();
  const Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel();
  OSTab tab = this->getOSTab();
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)) {
    *out << "\nStarting iterations with Belos:\n";
    OSTab tab2(out);
    *out << "Using forward operator = " << describe(*fwdOpSrc_->getOp(),verbLevel);
    *out << "Using iterative solver = " << describe(*iterativeSolver_,verbLevel);
    *out << "With #Eqns="<<numEquations<<", #RHSs="<<numRhs<<" ...\n";
  }

  //
  // Set RHS and LHS
  //

  bool ret = lp_->setProblem( rcpFromPtr(X), rcpFromRef(B) );
  TEST_FOR_EXCEPTION(
    ret == false, CatastrophicSolveFailure
    ,"Error, the Belos::LinearProblem could not be set for the current solve!"
    );

  //
  // Set the solution criteria
  //

  const RCP<Teuchos::ParameterList> tmpPL = Teuchos::parameterList();

  SolveMeasureType solveMeasureType;
  RCP<GeneralSolveCriteriaBelosStatusTest<Scalar> > generalSolveCriteriaBelosStatusTest;
  if (nonnull(solveCriteria)) {
    solveMeasureType = solveCriteria->solveMeasureType;
    const ScalarMag requestedTol = solveCriteria->requestedTol;
    if (solveMeasureType.useDefault()) {
      tmpPL->set("Convergence Tolerance", defaultTol_);
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_RHS)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      tmpPL->set("Explicit Residual Scaling", "Norm of RHS");
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_INIT_RESIDUAL)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      tmpPL->set("Explicit Residual Scaling", "Norm of Initial Residual");
    }
    else {
      // Set the most generic (and inefficient) solve criteria
      generalSolveCriteriaBelosStatusTest = createGeneralSolveCriteriaBelosStatusTest(
        *solveCriteria, convergenceTestFrequency_);
      // Set the verbosity level (one level down)
      generalSolveCriteriaBelosStatusTest->setOStream(out);
      generalSolveCriteriaBelosStatusTest->setVerbLevel(incrVerbLevel(verbLevel, -1));
      // Set the default convergence tolerance to always converged to allow
      // the above status test to control things.
      tmpPL->set("Convergence Tolerance", 1.0);
    }
  }
  else {
    // No solveCriteria was even passed in!
    tmpPL->set("Convergence Tolerance", defaultTol_);
  }

  //
  // Reset the blocksize if we adding more vectors than half the number of equations,
  // orthogonalization will fail on the first iteration!
  //

  RCP<const Teuchos::ParameterList> solverParams = iterativeSolver_->getCurrentParameters();
  const int currBlockSize = Teuchos::getParameter<int>(*solverParams, "Block Size");
  bool isNumBlocks = false;
  int currNumBlocks = 0;
  if (Teuchos::isParameterType<int>(*solverParams, "Num Blocks")) {
    currNumBlocks = Teuchos::getParameter<int>(*solverParams, "Num Blocks");
    isNumBlocks = true;
  }
  // Never let the block size drop below 1: for numEquations < 2 the value
  // numEquations/2 is zero, which is not a usable block size and would be
  // used as the divisor in the "Num Blocks" computation below.
  const int newBlockSize = std::max(1, TEUCHOS_MIN(currBlockSize,numEquations/2));
  if (nonnull(out)
    && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE)
    && newBlockSize != currBlockSize)
  {
    *out << "\nAdjusted block size = " << newBlockSize << "\n";
  }
  //
  tmpPL->set("Block Size",newBlockSize);

  //
  // Set the number of Krylov blocks if we are using a GMRES solver, or a solver
  // that recognizes "Num Blocks". Otherwise the solver will throw an error!
  //

  if (isNumBlocks) {
    // Keep the total Krylov basis length (blocks * block size) roughly
    // constant under the adjusted block size.
    const int Krylov_length = (currNumBlocks*currBlockSize)/newBlockSize;
    tmpPL->set("Num Blocks",Krylov_length);

    if (newBlockSize != currBlockSize) {
      if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
        *out
          << "\nAdjusted max number of Krylov basis blocks = " << Krylov_length << "\n";
    }
  }

  //
  // Solve the linear system
  //

  Belos::ReturnType belosSolveStatus;
  {
    // Belos output goes to the caller's stream only when verbosity is above
    // VERB_NONE; otherwise it is sent to a black-hole stream.
    RCP<std::ostream>
      outUsed =
      ( static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE)
        ? out
        : rcp(new FancyOStream(rcp(new Teuchos::oblackholestream())))
        );
    Teuchos::OSTab tab(outUsed,1,"BELOS");
    tmpPL->set("Output Stream", outUsed);
    iterativeSolver_->setParameters(tmpPL);
    if (nonnull(generalSolveCriteriaBelosStatusTest)) {
      iterativeSolver_->setUserConvStatusTest(generalSolveCriteriaBelosStatusTest);
    }
    belosSolveStatus = iterativeSolver_->solve();
  }

  //
  // Report the solve status
  //

  totalTimer.stop();

  SolveStatus<Scalar> solveStatus;

  switch (belosSolveStatus) {
    case Belos::Unconverged: {
      solveStatus.solveStatus = SOLVE_STATUS_UNCONVERGED;
      break;
    }
    case Belos::Converged: {
      solveStatus.solveStatus = SOLVE_STATUS_CONVERGED;
      if (nonnull(generalSolveCriteriaBelosStatusTest)) {
        // Report the largest tolerance achieved over all entries returned by
        // the custom status test.
        const ArrayView<const ScalarMag> achievedTol =
          generalSolveCriteriaBelosStatusTest->achievedTol();
        solveStatus.achievedTol = ST::zero();
        for (Ordinal i = 0; i < achievedTol.size(); ++i) {
          solveStatus.achievedTol = std::max(solveStatus.achievedTol, achievedTol[i]);
        }
      }
      else {
        solveStatus.achievedTol = tmpPL->get("Convergence Tolerance", defaultTol_);
      }
      break;
    }
    TEUCHOS_SWITCH_DEFAULT_DEBUG_ASSERT();
  }

  std::ostringstream ossmessage;
  ossmessage
    << "The Belos solver of type \""<<iterativeSolver_->description()
    <<"\" returned a solve status of \""<< toString(solveStatus.solveStatus) << "\""
    << " in " << iterativeSolver_->getNumIters() << " iterations"
    << " with total CPU time of " << totalTimer.totalElapsedTime() << " sec" ;
  if (out.get() && static_cast<int>(verbLevel) >=static_cast<int>(Teuchos::VERB_LOW))
    *out << "\n" << ossmessage.str() << "\n";

  solveStatus.message = ossmessage.str();

  if (out.get() && static_cast<int>(verbLevel) >= static_cast<int>(Teuchos::VERB_LOW))
    *out
      << "\nTotal solve time in Belos = "<<totalTimer.totalElapsedTime()<<" sec\n";

  return solveStatus;

}
// Solves the linear system for the right-hand sides B, writing the solution
// into *X, using the Belos iterative solver held in iterativeSolver_.
//
// Parameters:
//   M_trans        - transpose mode; only passed to assertSolveSupports().
//   B              - right-hand-side multi-vector.
//   X              - solution multi-vector, handed to the Belos problem as
//                    the LHS.
//   solveCriteria  - optional solve criteria; when null, defaultTol_ is used
//                    as the convergence tolerance.  An int
//                    "Maximum Iterations" entry in its extraParameters, if
//                    present, is forwarded to the solver.
//
// Returns a SolveStatus with solveStatus, achievedTol and message filled in,
// and with extraParameters carrying "Belos/Iteration Count",
// "Iteration Count" and "Belos/Achieved Tolerance".
//
// Throws CatastrophicSolveFailure if the Belos linear problem cannot be set.
SolveStatus<Scalar>
BelosLinearOpWithSolve<Scalar>::solveImpl(
  const EOpTransp M_trans,
  const MultiVectorBase<Scalar> &B,
  const Ptr<MultiVectorBase<Scalar> > &X,
  const Ptr<const SolveCriteria<Scalar> > solveCriteria
  ) const
{

  THYRA_FUNC_TIME_MONITOR("Stratimikos: BelosLOWS");

  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::rcpFromPtr;
  using Teuchos::FancyOStream;
  using Teuchos::OSTab;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::describe;
  typedef Teuchos::ScalarTraits<Scalar> ST;
  typedef typename ST::magnitudeType ScalarMag;
  Teuchos::Time totalTimer("");
  totalTimer.start(true);

  assertSolveSupports(*this, M_trans, solveCriteria);
  // 2010/08/22: rabartl: Bug 4915 ToDo: Move the above into the NIV function
  // solve(...).

  const RCP<FancyOStream> out = this->getOStream();
  const Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel();
  OSTab tab = this->getOSTab();
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)) {
    *out << "\nStarting iterations with Belos:\n";
    OSTab tab2(out);
    *out << "Using forward operator = " << describe(*fwdOpSrc_->getOp(),verbLevel);
    *out << "Using iterative solver = " << describe(*iterativeSolver_,verbLevel);
    *out << "With #Eqns="<<B.range()->dim()<<", #RHSs="<<B.domain()->dim()<<" ...\n";
  }

  //
  // Set RHS and LHS
  //

  bool ret = lp_->setProblem( rcpFromPtr(X), rcpFromRef(B) );
  TEUCHOS_TEST_FOR_EXCEPTION(
    ret == false, CatastrophicSolveFailure
    ,"Error, the Belos::LinearProblem could not be set for the current solve!"
    );

  //
  // Set the solution criteria
  //

  // Parameter list for the current solve.
  const RCP<ParameterList> tmpPL = Teuchos::parameterList();

  // The solver's valid parameter list.
  RCP<const ParameterList> validPL = iterativeSolver_->getValidParameters();

  SolveMeasureType solveMeasureType;
  RCP<GeneralSolveCriteriaBelosStatusTest<Scalar> > generalSolveCriteriaBelosStatusTest;
  if (nonnull(solveCriteria)) {
    solveMeasureType = solveCriteria->solveMeasureType;
    const ScalarMag requestedTol = solveCriteria->requestedTol;
    if (solveMeasureType.useDefault()) {
      tmpPL->set("Convergence Tolerance", defaultTol_);
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_RHS)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      setResidualScalingType (tmpPL, validPL, "Norm of RHS");
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_INIT_RESIDUAL)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      setResidualScalingType (tmpPL, validPL, "Norm of Initial Residual");
    }
    else {
      // Set the most generic (and inefficient) solve criteria
      generalSolveCriteriaBelosStatusTest = createGeneralSolveCriteriaBelosStatusTest(
        *solveCriteria, convergenceTestFrequency_);
      // Set the verbosity level (one level down)
      generalSolveCriteriaBelosStatusTest->setOStream(out);
      generalSolveCriteriaBelosStatusTest->setVerbLevel(incrVerbLevel(verbLevel, -1));
      // Set the default convergence tolerance to always converged to allow
      // the above status test to control things.
      tmpPL->set("Convergence Tolerance", 1.0);
    }
    // maximum iterations
    if (nonnull(solveCriteria->extraParameters)) {
      if (Teuchos::isParameterType<int>(*solveCriteria->extraParameters,"Maximum Iterations")) {
        tmpPL->set("Maximum Iterations", Teuchos::get<int>(*solveCriteria->extraParameters,"Maximum Iterations"));
      }
    }
  }
  else {
    // No solveCriteria was even passed in!
    tmpPL->set("Convergence Tolerance", defaultTol_);
  }

  //
  // Solve the linear system
  //

  Belos::ReturnType belosSolveStatus;
  {
    // Belos output goes to the caller's stream only when verbosity is above
    // VERB_LOW; otherwise it is sent to a black-hole stream.
    RCP<std::ostream>
      outUsed =
      ( static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)
        ? out
        : rcp(new FancyOStream(rcp(new Teuchos::oblackholestream())))
        );
    Teuchos::OSTab tab1(outUsed,1,"BELOS");
    tmpPL->set("Output Stream", outUsed);
    iterativeSolver_->setParameters(tmpPL);
    if (nonnull(generalSolveCriteriaBelosStatusTest)) {
      iterativeSolver_->setUserConvStatusTest(generalSolveCriteriaBelosStatusTest);
    }
    belosSolveStatus = iterativeSolver_->solve();
  }

  //
  // Report the solve status
  //

  totalTimer.stop();

  SolveStatus<Scalar> solveStatus;

  switch (belosSolveStatus) {
    case Belos::Unconverged: {
      solveStatus.solveStatus = SOLVE_STATUS_UNCONVERGED;
      // Set achievedTol even if the solver did not converge.  This is
      // helpful for things like nonlinear solvers, which might be
      // able to use a partially converged result, and which would
      // like to know the achieved convergence tolerance for use in
      // computing bounds.  It's also helpful for estimating whether a
      // small increase in the maximum iteration count might be
      // helpful next time.
      //
      // NOTE(review): unlike the converged case below, this path asks the
      // solver even when a custom status test is installed -- confirm that
      // this is intended.
      try {
        // Some solvers might not have implemented achievedTol().
        // The default implementation throws std::runtime_error.
        solveStatus.achievedTol = iterativeSolver_->achievedTol();
      } catch (std::runtime_error&) {
        // Do nothing; use the default value of achievedTol.
      }
      break;
    }
    case Belos::Converged: {
      solveStatus.solveStatus = SOLVE_STATUS_CONVERGED;
      if (nonnull(generalSolveCriteriaBelosStatusTest)) {
        // The user set a custom status test.  This means that we
        // should ask the custom status test itself, rather than the
        // Belos solver, what the final achieved convergence tolerance
        // was.
        const ArrayView<const ScalarMag> achievedTol =
          generalSolveCriteriaBelosStatusTest->achievedTol();
        solveStatus.achievedTol = ST::zero();
        for (Ordinal i = 0; i < achievedTol.size(); ++i) {
          solveStatus.achievedTol = std::max(solveStatus.achievedTol, achievedTol[i]);
        }
      }
      else {
        try {
          // Some solvers might not have implemented achievedTol().
          // The default implementation throws std::runtime_error.
          solveStatus.achievedTol = iterativeSolver_->achievedTol();
        } catch (std::runtime_error&) {
          // Use the default convergence tolerance.  This is a correct
          // upper bound, since we did actually converge.
          solveStatus.achievedTol = tmpPL->get("Convergence Tolerance", defaultTol_);
        }
      }
      break;
    }
    TEUCHOS_SWITCH_DEFAULT_DEBUG_ASSERT();
  }

  std::ostringstream ossmessage;
  ossmessage
    << "The Belos solver of type \""<<iterativeSolver_->description()
    <<"\" returned a solve status of \""<< toString(solveStatus.solveStatus) << "\""
    << " in " << iterativeSolver_->getNumIters() << " iterations"
    << " with total CPU time of " << totalTimer.totalElapsedTime() << " sec" ;
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
    *out << "\n" << ossmessage.str() << "\n";

  solveStatus.message = ossmessage.str();

  // Dump the getNumIters() and the achieved convergence tolerance
  // into solveStatus.extraParameters, as the "Belos/Iteration Count"
  // resp. "Belos/Achieved Tolerance" parameters.
  if (solveStatus.extraParameters.is_null()) {
    solveStatus.extraParameters = parameterList ();
  }
  solveStatus.extraParameters->set ("Belos/Iteration Count",
    iterativeSolver_->getNumIters());
  // package independent version of the same
  solveStatus.extraParameters->set ("Iteration Count",
    iterativeSolver_->getNumIters());
  // NOTE (mfh 13 Dec 2011) Though the most commonly used Belos
  // solvers do implement achievedTol(), some Belos solvers currently
  // do not.  In the latter case, if the solver did not converge, the
  // reported achievedTol() value may just be the default "invalid"
  // value -1, and if the solver did converge, the reported value will
  // just be the convergence tolerance (a correct upper bound).
  solveStatus.extraParameters->set ("Belos/Achieved Tolerance",
    solveStatus.achievedTol);

  // The total solve time is not printed separately here: it is already part
  // of the status message above, which is printed whenever the verbosity is
  // not Teuchos::VERB_NONE.

  return solveStatus;

}
// Reports whether this serializer can handle mv: true exactly when the
// object returned by mv.range() is (dynamically) an
// SpmdVectorSpaceBase<Scalar>.
bool SpmdMultiVectorSerializer<Scalar>::isCompatible(
  const MultiVectorBase<Scalar> &mv
  ) const
{
  // The resulting pointer is only tested against null, never dereferenced.
  const SpmdVectorSpaceBase<Scalar>* const spmdRange =
    dynamic_cast<const SpmdVectorSpaceBase<Scalar>*>(&*mv.range());
  if (spmdRange == 0) {
    return false;
  }
  return true;
}
void SpmdMultiVectorBase<Scalar>::euclideanApply( const EOpTransp M_trans, const MultiVectorBase<Scalar> &X, const Ptr<MultiVectorBase<Scalar> > &Y, const Scalar alpha, const Scalar beta ) const { typedef Teuchos::ScalarTraits<Scalar> ST; using Teuchos::Workspace; Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get(); #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES Teuchos::Time timerTotal("dummy",true); Teuchos::Time timer("dummy"); #endif // // This function performs one of two operations. // // The first operation (M_trans == NOTRANS) is: // // Y = beta * Y + alpha * M * X // // where Y and M have compatible (distributed?) range vector // spaces and X is a locally replicated serial multi-vector. This // operation does not require any global communication. // // The second operation (M_trans == TRANS) is: // // Y = beta * Y + alpha * M' * X // // where M and X have compatible (distributed?) range vector spaces // and Y is a locally replicated serial multi-vector. This operation // requires a local reduction. // // // Get spaces and validate compatibility // // Get the SpmdVectorSpace const SpmdVectorSpaceBase<Scalar> &spmdSpc = *this->spmdSpace(); // Get the Spmd communicator const RCP<const Teuchos::Comm<Ordinal> > comm = spmdSpc.getComm(); #ifdef TEUCHOS_DEBUG const VectorSpaceBase<Scalar> &Y_range = *Y->range(), &X_range = *X.range(); // std::cout << "SpmdMultiVectorBase<Scalar>::apply(...): comm = " << comm << std::endl; TEUCHOS_TEST_FOR_EXCEPTION( ( globalDim_ > localSubDim_ ) && comm.get()==NULL, std::logic_error ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!" ); // ToDo: Write a good general validation function that I can call that will replace // all of these TEUCHOS_TEST_FOR_EXCEPTION(...) uses TEUCHOS_TEST_FOR_EXCEPTION( real_trans(M_trans)==NOTRANS && !spmdSpc.isCompatible(Y_range), Exceptions::IncompatibleVectorSpaces ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!" 
); TEUCHOS_TEST_FOR_EXCEPTION( real_trans(M_trans)==TRANS && !spmdSpc.isCompatible(X_range), Exceptions::IncompatibleVectorSpaces ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!" ); #endif // // Get explicit (local) views of Y, M and X // #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.start(); #endif DetachedMultiVectorView<Scalar> Y_local( *Y, real_trans(M_trans)==NOTRANS ? Range1D(localOffset_,localOffset_+localSubDim_-1) : Range1D(), Range1D() ); ConstDetachedMultiVectorView<Scalar> M_local( *this, Range1D(localOffset_,localOffset_+localSubDim_-1), Range1D() ); ConstDetachedMultiVectorView<Scalar> X_local( X ,real_trans(M_trans)==NOTRANS ? Range1D() : Range1D(localOffset_,localOffset_+localSubDim_-1) ,Range1D() ); #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.stop(); std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for getting view = " << timer.totalElapsedTime() << " seconds\n"; #endif #ifdef TEUCHOS_DEBUG TEUCHOS_TEST_FOR_EXCEPTION( real_trans(M_trans)==NOTRANS && ( M_local.numSubCols() != X_local.subDim() || X_local.numSubCols() != Y_local.numSubCols() ) , Exceptions::IncompatibleVectorSpaces ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!" ); TEUCHOS_TEST_FOR_EXCEPTION( real_trans(M_trans)==TRANS && ( M_local.subDim() != X_local.subDim() || X_local.numSubCols() != Y_local.numSubCols() ) , Exceptions::IncompatibleVectorSpaces ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!" ); #endif // // If nonlocal (i.e. M_trans==TRANS) then create temporary storage // for: // // Y_local_tmp = alpha * M(local) * X(local) : on nonroot processes // // or // // Y_local_tmp = beta*Y_local + alpha * M(local) * X(local) : on root process (localOffset_==0) // // and set // // localBeta = ( localOffset_ == 0 ? beta : 0.0 ) // // Above, we choose localBeta such that we will only perform // Y_local = beta * Y_local + ... 
on one process (the root // process where localOffset_==0x). Then, when we add up Y_local // on all of the processors and we will get the correct result. // // If strictly local (i.e. M_trans == NOTRANS) then set: // // Y_local_tmp = Y_local // localBeta = beta // #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.start(); #endif Workspace<Scalar> Y_local_tmp_store(wss, Y_local.subDim()*Y_local.numSubCols(), false); RTOpPack::SubMultiVectorView<Scalar> Y_local_tmp; Scalar localBeta; if( real_trans(M_trans) == TRANS && globalDim_ > localSubDim_ ) { // Nonlocal Y_local_tmp.initialize( 0, Y_local.subDim(), 0, Y_local.numSubCols(), Teuchos::arcpFromArrayView(Y_local_tmp_store()), Y_local.subDim() // leadingDim == subDim (columns are adjacent) ); if( localOffset_ == 0 ) { // Root process: Must copy Y_local into Y_local_tmp for( int j = 0; j < Y_local.numSubCols(); ++j ) { Scalar *Y_local_j = Y_local.values() + Y_local.leadingDim()*j; std::copy( Y_local_j, Y_local_j + Y_local.subDim(), Y_local_tmp.values() + Y_local_tmp.leadingDim()*j ); } localBeta = beta; } else { // Not the root process localBeta = 0.0; } } else { // Local Y_local_tmp = Y_local.smv(); // Shallow copy only! 
localBeta = beta; } #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.stop(); std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for setting up Y_local_tmp and localBeta = " << timer.totalElapsedTime() << " seconds\n"; #endif // // Perform the local multiplication: // // Y(local) = localBeta * Y(local) + alpha * op(M(local)) * X(local) // // or in BLAS lingo: // // C = beta * C + alpha * op(A) * op(B) // #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.start(); #endif Teuchos::ETransp t_transp; if(ST::isComplex) { switch(M_trans) { case NOTRANS: t_transp = Teuchos::NO_TRANS; break; case TRANS: t_transp = Teuchos::TRANS; break; case CONJTRANS: t_transp = Teuchos::CONJ_TRANS; break; default: TEUCHOS_TEST_FOR_EXCEPT(true); } } else { switch(real_trans(M_trans)) { case NOTRANS: t_transp = Teuchos::NO_TRANS; break; case TRANS: t_transp = Teuchos::TRANS; break; default: TEUCHOS_TEST_FOR_EXCEPT(true); } } if (M_local.numSubCols() > 0) { // AGS: Added std::max on ld? below, following what is done in // Epetra_MultiVector Multiply use of GEMM. Allows for 0 length. blas_.GEMM( t_transp // TRANSA ,Teuchos::NO_TRANS // TRANSB ,Y_local.subDim() // M ,Y_local.numSubCols() // N ,real_trans(M_trans)==NOTRANS ? 
M_local.numSubCols() : M_local.subDim() // K ,alpha // ALPHA ,const_cast<Scalar*>(M_local.values()) // A ,std::max((int) M_local.leadingDim(),1) // LDA ,const_cast<Scalar*>(X_local.values()) // B ,std::max((int) X_local.leadingDim(),1) // LDB ,localBeta // BETA ,Y_local_tmp.values().get() // C ,std::max((int) Y_local_tmp.leadingDim(),1) // LDC ); } else { std::fill( Y_local_tmp.values().begin(), Y_local_tmp.values().end(), ST::zero() ); } #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.stop(); std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for GEMM = " << timer.totalElapsedTime() << " seconds\n"; #endif if( comm.get() ) { // // Perform the global reduction of Y_local_tmp back into Y_local // if( real_trans(M_trans)==TRANS && globalDim_ > localSubDim_ ) { // Contiguous buffer for final reduction Workspace<Scalar> Y_local_final_buff(wss,Y_local.subDim()*Y_local.numSubCols(),false); // Perform the reduction Teuchos::reduceAll<Ordinal,Scalar>( *comm,Teuchos::REDUCE_SUM,Y_local_final_buff.size(),Y_local_tmp.values().get(), &Y_local_final_buff[0] ); // Load Y_local_final_buff back into Y_local const Scalar *Y_local_final_buff_ptr = &Y_local_final_buff[0]; for( int j = 0; j < Y_local.numSubCols(); ++j ) { Scalar *Y_local_ptr = Y_local.values() + Y_local.leadingDim()*j; for( int i = 0; i < Y_local.subDim(); ++i ) { (*Y_local_ptr++) = (*Y_local_final_buff_ptr++); } } } } else { // When you get here the view Y_local will be committed back to Y // in the destructor to Y_local } #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES timer.stop(); std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Total time = " << timerTotal.totalElapsedTime() << " seconds\n"; #endif }