コード例 #1
0
void LinearOpScalarProd<Scalar>::scalarProdsImpl(
  const MultiVectorBase<Scalar>& X, const MultiVectorBase<Scalar>& Y,
  const ArrayView<Scalar> &scalarProds_out
  ) const
{
  // Temporary multi-vector created from Y's range space, with as many
  // columns as Y's domain has dimensions.  It receives op_ applied to Y.
  const Teuchos::RCP<MultiVectorBase<Scalar> > opTimesY =
    createMembers(Y.range(), Y.domain()->dim());

  // opTimesY = op_ * Y  (no transpose)
  Thyra::apply(*op_, NOTRANS, Y, opTimesY.ptr());

  // Hand X and op_*Y to dots(), which fills scalarProds_out.
  dots(X, *opTimesY, scalarProds_out);
}
コード例 #2
0
// Writes the locally owned rows of the multi-vector mv to the stream out.
//
// The range space of mv must be an SpmdVectorSpaceBase; otherwise a
// std::logic_error is thrown.  The output starts with a header line
// "<localSubDim> <numSubCols>".  In binary mode each column is then written
// as raw bytes; in text mode each local row is written on its own line,
// prefixed by its local row index.
//
// The stream's formatting flags and precision are restored before returning
// normally, and the stream state is not touched at all on the throwing
// (non-SPMD) path.
void SpmdMultiVectorSerializer<Scalar>::serialize(
  const MultiVectorBase<Scalar>& mv, std::ostream& out
  ) const
{
  Teuchos::RCP<const SpmdVectorSpaceBase<Scalar> >
    mpi_vec_spc
    = Teuchos::rcp_dynamic_cast<const SpmdVectorSpaceBase<Scalar> >(mv.range());
  if( mpi_vec_spc.get() ) {
    // Remember the caller's formatting state.  The precision is not part of
    // the fmtflags, so it has to be saved and restored separately.
    const std::ios::fmtflags fmt(out.flags());
    const std::streamsize prec =
      out.precision(std::numeric_limits<Scalar>::digits10+4);
    // This is a mpi-based vector space so let's just write the local
    // multi-vector elements (row-by-row).
    const Ordinal
      localOffset = mpi_vec_spc->localOffset(),
      localSubDim = mpi_vec_spc->localSubDim();
    const Range1D localRng( localOffset, localOffset+localSubDim-1 );
    ConstDetachedMultiVectorView<Scalar> local_mv(mv,localRng,Range1D());
    out << localSubDim << " " << local_mv.numSubCols() << std::endl;
    if( binaryMode() ) {
      // Write column-wise for better cache performance.  Skip the writes
      // entirely when there are no local rows so that we never form a
      // reference to the non-existent element (0,j); zero bytes would be
      // written in that case anyway.
      if( localSubDim > 0 ) {
        for( Ordinal j = 0; j < local_mv.numSubCols(); ++j )
          out.write( reinterpret_cast<const char*>(&local_mv(0,j)), sizeof(Scalar)*localSubDim );
      }
    }
    else {
      // Write row-wise for better readability
      for( Ordinal i = 0; i < localSubDim; ++i ) {
        out << " " << i;
        for( Ordinal j = 0; j < local_mv.numSubCols(); ++j ) {
          out << " " << local_mv(i,j);
        }
        out << std::endl;
      }
    }
    // Give the stream back to the caller in the state we found it.
    out.flags(fmt);
    out.precision(prec);
  }
  else {
    //  This is a serial (or locally replicated) vector space so
    // just write all of the multi-vector elements here.
    TEUCHOS_TEST_FOR_EXCEPTION( true, std::logic_error, "Does not handle non-SPMD spaces yet" );
  }
}
コード例 #3
0
void doExplicitMultiVectorAdjoint(
  const MultiVectorBase<Scalar>& mvIn, MultiVectorBase<Scalar>* mvTransOut
  )
{
  typedef Teuchos::ScalarTraits<Scalar> ST;
#ifdef TEUCHOS_DEBUG
  TEST_FOR_EXCEPT(0==mvTransOut);
  THYRA_ASSERT_VEC_SPACES("doExplicitMultiVectorAdjoint(...)",
    *mvIn.domain(), *mvTransOut->range()
    );
  THYRA_ASSERT_VEC_SPACES("doExplicitMultiVectorAdjoint(...)",
    *mvIn.range(), *mvTransOut->domain()
    );
#endif
  ConstDetachedMultiVectorView<Scalar> dMvIn(mvIn);
  DetachedMultiVectorView<Scalar> dMvTransOut(*mvTransOut);
  const int m = dMvIn.subDim();
  const int n = dMvIn.numSubCols();
  for ( int j = 0; j < n; ++j ) {
    for ( int i = 0; i < m; ++i ) {
      dMvTransOut(j,i) = ST::conjugate(dMvIn(i,j));
    }
  }
}
コード例 #4
0
// Solves the linear system for the right-hand sides B, writing the solution
// into X, using the wrapped Belos iterative solver.
//
// Parameters:
//   M_trans       - requested transpose mode; only validated here through
//                   assertSolveSupports().
//   B             - right-hand-side multi-vector handed to the Belos problem.
//   X             - left-hand-side (solution) multi-vector handed to the
//                   Belos problem.
//   solveCriteria - optional solve criteria; when null, defaultTol_ is used
//                   as the convergence tolerance.
//
// Returns a SolveStatus holding the converged/unconverged state, the achieved
// tolerance (set only when the solver reports convergence) and a message
// summarising the solve.
//
// Throws CatastrophicSolveFailure if the Belos linear problem cannot be set.
SolveStatus<Scalar>
BelosLinearOpWithSolve<Scalar>::solveImpl(
  const EOpTransp M_trans,
  const MultiVectorBase<Scalar> &B,
  const Ptr<MultiVectorBase<Scalar> > &X,
  const Ptr<const SolveCriteria<Scalar> > solveCriteria
  ) const
{

  TEUCHOS_FUNC_TIME_MONITOR("BelosLOWS");

  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::rcpFromPtr;
  using Teuchos::FancyOStream;
  using Teuchos::OSTab;
  using Teuchos::describe;
  typedef Teuchos::ScalarTraits<Scalar> ST;
  typedef typename ST::magnitudeType ScalarMag;
  Teuchos::Time totalTimer("");
  totalTimer.start(true);

  assertSolveSupports(*this, M_trans, solveCriteria);
  // 2010/08/22: rabartl: Bug 4915 ToDo: Move the above into the NIV function
  // solve(...).

  const int numRhs = B.domain()->dim();
  const int numEquations = B.range()->dim();

  const RCP<FancyOStream> out = this->getOStream();
  const Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel();
  OSTab tab = this->getOSTab();
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)) {
    *out << "\nStarting iterations with Belos:\n";
    OSTab tab2(out);
    *out << "Using forward operator = " << describe(*fwdOpSrc_->getOp(),verbLevel);
    *out << "Using iterative solver = " << describe(*iterativeSolver_,verbLevel);
    *out << "With #Eqns="<<numEquations<<", #RHSs="<<numRhs<<" ...\n";
  }

  //
  // Set RHS and LHS
  //

  bool ret = lp_->setProblem( rcpFromPtr(X), rcpFromRef(B) );
  TEST_FOR_EXCEPTION(
    ret == false, CatastrophicSolveFailure
    ,"Error, the Belos::LinearProblem could not be set for the current solve!"
    );

  //
  // Set the solution criteria
  //

  // Parameters that apply to this solve only.
  const RCP<Teuchos::ParameterList> tmpPL = Teuchos::parameterList();

  SolveMeasureType solveMeasureType;
  RCP<GeneralSolveCriteriaBelosStatusTest<Scalar> > generalSolveCriteriaBelosStatusTest;
  if (nonnull(solveCriteria)) {
    solveMeasureType = solveCriteria->solveMeasureType;
    const ScalarMag requestedTol = solveCriteria->requestedTol;
    if (solveMeasureType.useDefault()) {
      tmpPL->set("Convergence Tolerance", defaultTol_);
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_RHS)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      tmpPL->set("Explicit Residual Scaling", "Norm of RHS");
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_INIT_RESIDUAL)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      tmpPL->set("Explicit Residual Scaling", "Norm of Initial Residual");
    }
    else {
      // Set the most generic (and inefficient) solve criteria
      generalSolveCriteriaBelosStatusTest = createGeneralSolveCriteriaBelosStatusTest(
        *solveCriteria, convergenceTestFrequency_);
      // Set the verbosity level (one level down)
      generalSolveCriteriaBelosStatusTest->setOStream(out);
      generalSolveCriteriaBelosStatusTest->setVerbLevel(incrVerbLevel(verbLevel, -1));
      // Set the default convergence tolerance to always converged to allow
      // the above status test to control things.
      tmpPL->set("Convergence Tolerance", 1.0);
    }
  }
  else {
    // No solveCriteria was even passed in!
    tmpPL->set("Convergence Tolerance", defaultTol_);
  }

  //
  // Reset the blocksize if we adding more vectors than half the number of equations,
  // orthogonalization will fail on the first iteration!
  //

  RCP<const Teuchos::ParameterList> solverParams = iterativeSolver_->getCurrentParameters();
  const int currBlockSize = Teuchos::getParameter<int>(*solverParams, "Block Size");
  bool isNumBlocks = false;
  int currNumBlocks = 0;
  if (Teuchos::isParameterType<int>(*solverParams, "Num Blocks")) {
    currNumBlocks = Teuchos::getParameter<int>(*solverParams, "Num Blocks");
    isNumBlocks = true;
  }
  // numEquations/2 is zero for systems with fewer than two equations.  Never
  // let the block size drop below one: a zero block size is meaningless and
  // would cause a division by zero when the number of Krylov blocks is
  // rescaled below.
  const int newBlockSize =
    std::max(1, TEUCHOS_MIN(currBlockSize,numEquations/2));
  if (nonnull(out)
    && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE)
    && newBlockSize != currBlockSize)
  {
    *out << "\nAdjusted block size = " << newBlockSize << "\n";
  }
  //
  tmpPL->set("Block Size",newBlockSize);

  //
  // Set the number of Krylov blocks if we are using a GMRES solver, or a solver
  // that recognizes "Num Blocks". Otherwise the solver will throw an error!
  //

  if (isNumBlocks) {
    // Rescale the number of blocks by the ratio of old to new block size.
    const int Krylov_length = (currNumBlocks*currBlockSize)/newBlockSize;
    tmpPL->set("Num Blocks",Krylov_length);

    if (newBlockSize != currBlockSize) {
      if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
        *out
          << "\nAdjusted max number of Krylov basis blocks = " << Krylov_length << "\n";
    }
  }

  //
  // Solve the linear system
  //

  Belos::ReturnType belosSolveStatus;
  {
    // Send the Belos output to a black-hole stream when verbosity is off.
    RCP<std::ostream>
      outUsed =
      ( static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE)
        ? out
        : rcp(new FancyOStream(rcp(new Teuchos::oblackholestream())))
        );
    Teuchos::OSTab tab(outUsed,1,"BELOS");
    tmpPL->set("Output Stream", outUsed);
    iterativeSolver_->setParameters(tmpPL);
    if (nonnull(generalSolveCriteriaBelosStatusTest)) {
      iterativeSolver_->setUserConvStatusTest(generalSolveCriteriaBelosStatusTest);
    }
    belosSolveStatus = iterativeSolver_->solve();
  }

  //
  // Report the solve status
  //

  totalTimer.stop();

  SolveStatus<Scalar> solveStatus;

  switch (belosSolveStatus) {
    case Belos::Unconverged: {
      solveStatus.solveStatus = SOLVE_STATUS_UNCONVERGED;
      break;
    }
    case Belos::Converged: {
      solveStatus.solveStatus = SOLVE_STATUS_CONVERGED;
      if (nonnull(generalSolveCriteriaBelosStatusTest)) {
        // Report the largest tolerance recorded by the general status test.
        const ArrayView<const ScalarMag> achievedTol = 
          generalSolveCriteriaBelosStatusTest->achievedTol();
        solveStatus.achievedTol = ST::zero();
        for (Ordinal i = 0; i < achievedTol.size(); ++i) {
          solveStatus.achievedTol = std::max(solveStatus.achievedTol, achievedTol[i]);
        }
      }
      else {
        // Report the convergence tolerance that was set for this solve.
        solveStatus.achievedTol = tmpPL->get("Convergence Tolerance", defaultTol_);
      }
      break;
    }
    TEUCHOS_SWITCH_DEFAULT_DEBUG_ASSERT();
  }

  std::ostringstream ossmessage;
  ossmessage
    << "The Belos solver of type \""<<iterativeSolver_->description()
    <<"\" returned a solve status of \""<< toString(solveStatus.solveStatus) << "\""
    << " in " << iterativeSolver_->getNumIters() << " iterations"
    << " with total CPU time of " << totalTimer.totalElapsedTime() << " sec" ;
  if (out.get() && static_cast<int>(verbLevel) >=static_cast<int>(Teuchos::VERB_LOW))
    *out << "\n" << ossmessage.str() << "\n";

  solveStatus.message = ossmessage.str();

  if (out.get() && static_cast<int>(verbLevel) >= static_cast<int>(Teuchos::VERB_LOW))
    *out << "\nTotal solve time in Belos = "<<totalTimer.totalElapsedTime()<<" sec\n";

  return solveStatus;

}
コード例 #5
0
// Solves the linear system for the right-hand sides B, writing the solution
// into X, using the wrapped Belos iterative solver.
//
// Parameters:
//   M_trans       - requested transpose mode; only validated here through
//                   assertSolveSupports().
//   B             - right-hand-side multi-vector handed to the Belos problem.
//   X             - left-hand-side (solution) multi-vector handed to the
//                   Belos problem.
//   solveCriteria - optional solve criteria; when null, defaultTol_ is used
//                   as the convergence tolerance.  An int "Maximum Iterations"
//                   entry in its extraParameters is forwarded to the solver.
//
// Returns a SolveStatus holding the converged/unconverged state, the achieved
// tolerance, a summary message, and an extraParameters list with the entries
// "Belos/Iteration Count", "Iteration Count" and "Belos/Achieved Tolerance".
//
// Throws CatastrophicSolveFailure if the Belos linear problem cannot be set.
SolveStatus<Scalar>
BelosLinearOpWithSolve<Scalar>::solveImpl(
  const EOpTransp M_trans,
  const MultiVectorBase<Scalar> &B,
  const Ptr<MultiVectorBase<Scalar> > &X,
  const Ptr<const SolveCriteria<Scalar> > solveCriteria
  ) const
{

  THYRA_FUNC_TIME_MONITOR("Stratimikos: BelosLOWS");

  using Teuchos::rcp;
  using Teuchos::rcpFromRef;
  using Teuchos::rcpFromPtr;
  using Teuchos::FancyOStream;
  using Teuchos::OSTab;
  using Teuchos::ParameterList;
  using Teuchos::parameterList;
  using Teuchos::describe;
  typedef Teuchos::ScalarTraits<Scalar> ST;
  typedef typename ST::magnitudeType ScalarMag;
  Teuchos::Time totalTimer("");
  totalTimer.start(true);

  assertSolveSupports(*this, M_trans, solveCriteria);
  // 2010/08/22: rabartl: Bug 4915 ToDo: Move the above into the NIV function
  // solve(...).

  const RCP<FancyOStream> out = this->getOStream();
  const Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel();
  OSTab tab = this->getOSTab();
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)) {
    *out << "\nStarting iterations with Belos:\n";
    OSTab tab2(out);
    *out << "Using forward operator = " << describe(*fwdOpSrc_->getOp(),verbLevel);
    *out << "Using iterative solver = " << describe(*iterativeSolver_,verbLevel);
    *out << "With #Eqns="<<B.range()->dim()<<", #RHSs="<<B.domain()->dim()<<" ...\n";
  }

  //
  // Set RHS and LHS
  //

  bool ret = lp_->setProblem( rcpFromPtr(X), rcpFromRef(B) );
  TEUCHOS_TEST_FOR_EXCEPTION(
    ret == false, CatastrophicSolveFailure
    ,"Error, the Belos::LinearProblem could not be set for the current solve!"
    );

  //
  // Set the solution criteria
  //

  // Parameter list for the current solve.
  const RCP<ParameterList> tmpPL = Teuchos::parameterList();

  // The solver's valid parameter list.
  RCP<const ParameterList> validPL = iterativeSolver_->getValidParameters();

  SolveMeasureType solveMeasureType;
  RCP<GeneralSolveCriteriaBelosStatusTest<Scalar> > generalSolveCriteriaBelosStatusTest;
  if (nonnull(solveCriteria)) {
    solveMeasureType = solveCriteria->solveMeasureType;
    const ScalarMag requestedTol = solveCriteria->requestedTol;
    if (solveMeasureType.useDefault()) {
      tmpPL->set("Convergence Tolerance", defaultTol_);
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_RHS)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      setResidualScalingType (tmpPL, validPL, "Norm of RHS");
    }
    else if (solveMeasureType(SOLVE_MEASURE_NORM_RESIDUAL, SOLVE_MEASURE_NORM_INIT_RESIDUAL)) {
      if (requestedTol != SolveCriteria<Scalar>::unspecifiedTolerance()) {
        tmpPL->set("Convergence Tolerance", requestedTol);
      }
      else {
        tmpPL->set("Convergence Tolerance", defaultTol_);
      }
      setResidualScalingType (tmpPL, validPL, "Norm of Initial Residual");
    }
    else {
      // Set the most generic (and inefficient) solve criteria
      generalSolveCriteriaBelosStatusTest = createGeneralSolveCriteriaBelosStatusTest(
        *solveCriteria, convergenceTestFrequency_);
      // Set the verbosity level (one level down)
      generalSolveCriteriaBelosStatusTest->setOStream(out);
      generalSolveCriteriaBelosStatusTest->setVerbLevel(incrVerbLevel(verbLevel, -1));
      // Set the default convergence tolerance to always converged to allow
      // the above status test to control things.
      tmpPL->set("Convergence Tolerance", 1.0);
    }
    // Forward a caller-supplied iteration limit, if one was given.
    if (nonnull(solveCriteria->extraParameters)) {
      if (Teuchos::isParameterType<int>(*solveCriteria->extraParameters,"Maximum Iterations")) {
        tmpPL->set("Maximum Iterations", Teuchos::get<int>(*solveCriteria->extraParameters,"Maximum Iterations"));
      }
    }
  }
  else {
    // No solveCriteria was even passed in!
    tmpPL->set("Convergence Tolerance", defaultTol_);
  }

  //
  // Solve the linear system
  //

  Belos::ReturnType belosSolveStatus;
  {
    // Send the Belos output to a black-hole stream unless the verbosity is
    // above VERB_LOW.
    RCP<std::ostream>
      outUsed =
      ( static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_LOW)
        ? out
        : rcp(new FancyOStream(rcp(new Teuchos::oblackholestream())))
        );
    Teuchos::OSTab tab1(outUsed,1,"BELOS");
    tmpPL->set("Output Stream", outUsed);
    iterativeSolver_->setParameters(tmpPL);
    if (nonnull(generalSolveCriteriaBelosStatusTest)) {
      iterativeSolver_->setUserConvStatusTest(generalSolveCriteriaBelosStatusTest);
    }
    belosSolveStatus = iterativeSolver_->solve();
  }

  //
  // Report the solve status
  //

  totalTimer.stop();

  SolveStatus<Scalar> solveStatus;

  switch (belosSolveStatus) {
    case Belos::Unconverged: {
      solveStatus.solveStatus = SOLVE_STATUS_UNCONVERGED;
      // Set achievedTol even if the solver did not converge.  This is
      // helpful for things like nonlinear solvers, which might be
      // able to use a partially converged result, and which would
      // like to know the achieved convergence tolerance for use in
      // computing bounds.  It's also helpful for estimating whether a
      // small increase in the maximum iteration count might be
      // helpful next time.
      try {
        // Some solvers might not have implemented achievedTol().
        // The default implementation throws std::runtime_error.
        solveStatus.achievedTol = iterativeSolver_->achievedTol();
      } catch (const std::runtime_error&) {
        // Do nothing; use the default value of achievedTol.
      }
      break;
    }
    case Belos::Converged: {
      solveStatus.solveStatus = SOLVE_STATUS_CONVERGED;
      if (nonnull(generalSolveCriteriaBelosStatusTest)) {
        // The user set a custom status test.  This means that we
        // should ask the custom status test itself, rather than the
        // Belos solver, what the final achieved convergence tolerance
        // was.
        const ArrayView<const ScalarMag> achievedTol = 
          generalSolveCriteriaBelosStatusTest->achievedTol();
        solveStatus.achievedTol = ST::zero();
        for (Ordinal i = 0; i < achievedTol.size(); ++i) {
          solveStatus.achievedTol = std::max(solveStatus.achievedTol, achievedTol[i]);
        }
      }
      else {
        try {
          // Some solvers might not have implemented achievedTol().
          // The default implementation throws std::runtime_error.
          solveStatus.achievedTol = iterativeSolver_->achievedTol();
        } catch (const std::runtime_error&) {
          // Use the default convergence tolerance.  This is a correct
          // upper bound, since we did actually converge.
          solveStatus.achievedTol = tmpPL->get("Convergence Tolerance", defaultTol_);
        }
      }
      break;
    }
    TEUCHOS_SWITCH_DEFAULT_DEBUG_ASSERT();
  }

  std::ostringstream ossmessage;
  ossmessage
    << "The Belos solver of type \""<<iterativeSolver_->description()
    <<"\" returned a solve status of \""<< toString(solveStatus.solveStatus) << "\""
    << " in " << iterativeSolver_->getNumIters() << " iterations"
    << " with total CPU time of " << totalTimer.totalElapsedTime() << " sec" ;
  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
    *out << "\n" << ossmessage.str() << "\n";

  solveStatus.message = ossmessage.str();

  // Dump the getNumIters() and the achieved convergence tolerance
  // into solveStatus.extraParameters, as the "Belos/Iteration Count"
  // resp. "Belos/Achieved Tolerance" parameters.
  if (solveStatus.extraParameters.is_null()) {
    solveStatus.extraParameters = parameterList ();
  }
  solveStatus.extraParameters->set ("Belos/Iteration Count",
                                    iterativeSolver_->getNumIters());
  // package independent version of the same
  solveStatus.extraParameters->set ("Iteration Count",
                                    iterativeSolver_->getNumIters());
  // NOTE (mfh 13 Dec 2011) Though the most commonly used Belos
  // solvers do implement achievedTol(), some Belos solvers currently
  // do not.  In the latter case, if the solver did not converge, the
  // reported achievedTol() value may just be the default "invalid"
  // value -1, and if the solver did converge, the reported value will
  // just be the convergence tolerance (a correct upper bound).
  solveStatus.extraParameters->set ("Belos/Achieved Tolerance",
                                    solveStatus.achievedTol);

//  This information is in the previous line, which is printed anytime the verbosity
//  is not set to Teuchos::VERB_NONE, so I'm commenting this out for now.
//  if (out.get() && static_cast<int>(verbLevel) > static_cast<int>(Teuchos::VERB_NONE))
//    *out << "\nTotal solve time in Belos = "<<totalTimer.totalElapsedTime()<<" sec\n";

  return solveStatus;

}
コード例 #6
0
bool SpmdMultiVectorSerializer<Scalar>::isCompatible(
  const MultiVectorBase<Scalar> &mv
  ) const
{
  // The multi-vector is compatible exactly when its range space can be
  // down-cast to an SPMD vector space.
  const SpmdVectorSpaceBase<Scalar>* const spmdRange =
    dynamic_cast<const SpmdVectorSpaceBase<Scalar>*>(&*mv.range());
  return spmdRange != 0;
}
コード例 #7
0
// Computes Y = beta*Y + alpha*op(M)*X, where M is this multi-vector and
// op(M) is selected by M_trans, using a local GEMM on explicit views followed
// (for the transposed, distributed case) by a sum-reduction over the
// communicator.
//
// Parameters:
//   M_trans - which form of M to apply (see the operation description below).
//   X       - input multi-vector.
//   Y       - in/out multi-vector that is scaled by beta and updated.
//   alpha   - scalar multiplying op(M)*X.
//   beta    - scalar multiplying the incoming Y.
//
// In TEUCHOS_DEBUG builds incompatible spaces raise
// Exceptions::IncompatibleVectorSpaces, and a distributed vector without a
// communicator raises std::logic_error.
void SpmdMultiVectorBase<Scalar>::euclideanApply(
  const EOpTransp M_trans,
  const MultiVectorBase<Scalar> &X,
  const Ptr<MultiVectorBase<Scalar> > &Y,
  const Scalar alpha,
  const Scalar beta
  ) const
{
  typedef Teuchos::ScalarTraits<Scalar> ST;
  using Teuchos::Workspace;
  Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get();

#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  Teuchos::Time timerTotal("dummy",true);
  Teuchos::Time timer("dummy");
#endif

  //
  // This function performs one of two operations.
  //
  // The first operation (M_trans == NOTRANS) is:
  //
  // Y = beta * Y + alpha * M * X
  //
  // where Y and M have compatible (distributed?) range vector
  // spaces and X is a locally replicated serial multi-vector. This
  // operation does not require any global communication.
  //
  // The second operation (M_trans == TRANS) is:
  //
  // Y = beta * Y + alpha * M' * X
  //
  // where M and X have compatible (distributed?) range vector spaces
  // and Y is a locally replicated serial multi-vector. This operation
  // requires a global sum-reduction over the communicator (see the
  // reduceAll call below).
  //

  //
  // Get spaces and validate compatibility
  //

  // Get the SpmdVectorSpace
  const SpmdVectorSpaceBase<Scalar> &spmdSpc = *this->spmdSpace();

  // Get the Spmd communicator
  const RCP<const Teuchos::Comm<Ordinal> >
    comm = spmdSpc.getComm();
#ifdef TEUCHOS_DEBUG
  const VectorSpaceBase<Scalar>
    &Y_range = *Y->range(),
    &X_range = *X.range();
//	std::cout << "SpmdMultiVectorBase<Scalar>::apply(...): comm = " << comm << std::endl;
  // A vector with more global than local elements needs a communicator.
  TEUCHOS_TEST_FOR_EXCEPTION(
    ( globalDim_ > localSubDim_ ) && comm.get()==NULL, std::logic_error
    ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
    );
  // ToDo: Write a good general validation function that I can call that will replace
  // all of these TEUCHOS_TEST_FOR_EXCEPTION(...) uses

  // Non-transposed: the range of Y must be compatible with this space.
  TEUCHOS_TEST_FOR_EXCEPTION(
    real_trans(M_trans)==NOTRANS && !spmdSpc.isCompatible(Y_range), Exceptions::IncompatibleVectorSpaces
    ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
    );
  // Transposed: the range of X must be compatible with this space.
  TEUCHOS_TEST_FOR_EXCEPTION(
    real_trans(M_trans)==TRANS && !spmdSpc.isCompatible(X_range), Exceptions::IncompatibleVectorSpaces
    ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
    );
#endif

  //
  // Get explicit (local) views of Y, M and X
  //

#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.start();
#endif
 
  // Y: only the locally owned rows when not transposed, all rows otherwise.
  DetachedMultiVectorView<Scalar>
    Y_local(
      *Y,
      real_trans(M_trans)==NOTRANS ? Range1D(localOffset_,localOffset_+localSubDim_-1) : Range1D(),
      Range1D()
      );
  // M: always the locally owned rows of this multi-vector.
  ConstDetachedMultiVectorView<Scalar>
    M_local(
      *this,
      Range1D(localOffset_,localOffset_+localSubDim_-1),
      Range1D()
      );
  // X: all rows when not transposed, only the locally owned rows otherwise.
  ConstDetachedMultiVectorView<Scalar>
    X_local(
      X
      ,real_trans(M_trans)==NOTRANS ? Range1D() : Range1D(localOffset_,localOffset_+localSubDim_-1)
      ,Range1D()
      );
#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.stop();
  std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for getting view = " << timer.totalElapsedTime() << " seconds\n";
#endif
#ifdef TEUCHOS_DEBUG
  // Check that the dimensions of the local views fit the requested product.
  TEUCHOS_TEST_FOR_EXCEPTION(
    real_trans(M_trans)==NOTRANS && ( M_local.numSubCols() != X_local.subDim() || X_local.numSubCols() != Y_local.numSubCols() )
    , Exceptions::IncompatibleVectorSpaces
    ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
    );
  TEUCHOS_TEST_FOR_EXCEPTION(
    real_trans(M_trans)==TRANS && ( M_local.subDim() != X_local.subDim() || X_local.numSubCols() != Y_local.numSubCols() )
    , Exceptions::IncompatibleVectorSpaces
    ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
    );
#endif

  //
  // If nonlocal (i.e. M_trans==TRANS) then create temporary storage
  // for:
  //
  // Y_local_tmp = alpha * M(local) * X(local) : on nonroot processes
  //
  // or
  //
  // Y_local_tmp = beta*Y_local + alpha * M(local) * X(local) : on root process (localOffset_==0)
  // 
  // and set
  //
  // localBeta = ( localOffset_ == 0 ? beta : 0.0 )
  //
  // Above, we choose localBeta such that we will only perform
  // Y_local = beta * Y_local + ... on one process (the root
  // process where localOffset_==0). Then, when we add up Y_local_tmp
  // over all of the processes, we will get the correct result.
  //
  // If strictly local (i.e. M_trans == NOTRANS) then set:
  //
  // Y_local_tmp = Y_local
  // localBeta = beta
  //

#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.start();
#endif
 
  // Scratch buffer sized to hold a full copy of Y_local; only used in the
  // nonlocal case below.
  Workspace<Scalar> Y_local_tmp_store(wss, Y_local.subDim()*Y_local.numSubCols(), false);
  RTOpPack::SubMultiVectorView<Scalar> Y_local_tmp;
  Scalar localBeta;
  if( real_trans(M_trans) == TRANS && globalDim_ > localSubDim_ ) {
    // Nonlocal
    Y_local_tmp.initialize(
      0, Y_local.subDim(),
      0, Y_local.numSubCols(),
      Teuchos::arcpFromArrayView(Y_local_tmp_store()),
      Y_local.subDim() // leadingDim == subDim (columns are adjacent)
      );
    if( localOffset_ == 0 ) {
      // Root process: Must copy Y_local into Y_local_tmp
      for( int j = 0; j < Y_local.numSubCols(); ++j ) {
        Scalar *Y_local_j = Y_local.values() + Y_local.leadingDim()*j;
        std::copy( Y_local_j, Y_local_j + Y_local.subDim(), Y_local_tmp.values() + Y_local_tmp.leadingDim()*j );
      }
      localBeta = beta;
    }
    else {
      // Not the root process
      localBeta = 0.0;
    }
  }
  else {
    // Local
    Y_local_tmp = Y_local.smv(); // Shallow copy only!
    localBeta = beta;
  }

#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.stop();
  std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for setting up Y_local_tmp and localBeta = " << timer.totalElapsedTime() << " seconds\n";
#endif
 
  //
  // Perform the local multiplication:
  //
  // Y(local) = localBeta * Y(local) + alpha * op(M(local)) * X(local)
  //
  // or in BLAS lingo:
  //
  // C = beta * C + alpha * op(A) * op(B)
  //

#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.start();
#endif
  // Translate the Thyra transpose enum into the Teuchos BLAS enum.  For real
  // scalars the enum is first passed through real_trans().
  Teuchos::ETransp t_transp;
  if(ST::isComplex) {
    switch(M_trans) {
      case NOTRANS: t_transp = Teuchos::NO_TRANS; break;
      case TRANS: t_transp = Teuchos::TRANS; break;
      case CONJTRANS: t_transp = Teuchos::CONJ_TRANS; break;
      default: TEUCHOS_TEST_FOR_EXCEPT(true);
    }
  }
  else {
    switch(real_trans(M_trans)) {
      case NOTRANS: t_transp = Teuchos::NO_TRANS; break;
      case TRANS: t_transp = Teuchos::TRANS; break;
      default: TEUCHOS_TEST_FOR_EXCEPT(true);
    }
  }
  if (M_local.numSubCols() > 0) {
    // AGS: Added std::max on ld? below, following what is done in
    // Epetra_MultiVector Multiply use of GEMM. Allows for 0 length.
    blas_.GEMM(
      t_transp // TRANSA
      ,Teuchos::NO_TRANS // TRANSB
      ,Y_local.subDim() // M
      ,Y_local.numSubCols() // N
      ,real_trans(M_trans)==NOTRANS ? M_local.numSubCols() : M_local.subDim() // K
      ,alpha // ALPHA
      ,const_cast<Scalar*>(M_local.values()) // A
      ,std::max((int) M_local.leadingDim(),1) // LDA
      ,const_cast<Scalar*>(X_local.values()) // B
      ,std::max((int) X_local.leadingDim(),1) // LDB
      ,localBeta // BETA
      ,Y_local_tmp.values().get() // C
      ,std::max((int) Y_local_tmp.leadingDim(),1) // LDC
      );
  }
  else {
    // M has no columns, so GEMM is skipped and the result buffer is zeroed.
    // NOTE(review): this overwrites Y_local_tmp with zeros regardless of
    // localBeta -- confirm that dropping the beta*Y term is intended here.
    std::fill( Y_local_tmp.values().begin(), Y_local_tmp.values().end(),
      ST::zero() );
  }
#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.stop();
  std::cout
    << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for GEMM = "
    << timer.totalElapsedTime() << " seconds\n";
#endif

  if( comm.get() ) {
 
    //
    // Perform the global reduction of Y_local_tmp back into Y_local
    //
 
    if( real_trans(M_trans)==TRANS && globalDim_ > localSubDim_ ) {
      // Contiguous buffer for final reduction
      Workspace<Scalar> Y_local_final_buff(wss,Y_local.subDim()*Y_local.numSubCols(),false);
      // Perform the reduction
      Teuchos::reduceAll<Ordinal,Scalar>(
        *comm,Teuchos::REDUCE_SUM,Y_local_final_buff.size(),Y_local_tmp.values().get(),
        &Y_local_final_buff[0]
        );
      // Load Y_local_final_buff back into Y_local, column by column, since
      // Y_local may have a leading dimension larger than its sub-dimension.
      const Scalar *Y_local_final_buff_ptr = &Y_local_final_buff[0];
      for( int j = 0; j < Y_local.numSubCols(); ++j ) {
        Scalar *Y_local_ptr = Y_local.values() + Y_local.leadingDim()*j;
        for( int i = 0; i < Y_local.subDim(); ++i ) {
          (*Y_local_ptr++) = (*Y_local_final_buff_ptr++);
        }
      }
    }
  }
  else {

    // When you get here the view Y_local will be committed back to Y
    // in the destructor to Y_local

  }

#ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
  timer.stop();
  std::cout 
    << "\nSpmdMultiVectorBase<Scalar>::apply(...): Total time = "
    << timerTotal.totalElapsedTime() << " seconds\n";
#endif

}