void
LocalSparseTriangularSolver<MatrixType>::
localApply (const MV& X,
            MV& Y,
            const Teuchos::ETransp mode,
            const scalar_type& alpha,
            const scalar_type& beta) const
{
  using Teuchos::RCP;
  typedef scalar_type ST;
  typedef Teuchos::ScalarTraits<ST> STS;

  if (beta == STS::zero ()) {
    if (alpha == STS::zero ()) {
      Y.putScalar (STS::zero ()); // Y := 0 * Y (ignore contents of Y)
    }
    else { // alpha != 0
      A_crs_->template localSolve<ST, ST> (X, Y, mode);
      if (alpha != STS::one ()) {
        Y.scale (alpha);
      }
    }
  }
  else { // beta != 0
    if (alpha == STS::zero ()) {
      Y.scale (beta); // Y := beta * Y
    }
    else { // alpha != 0
      MV Y_tmp (Y, Teuchos::Copy);
      A_crs_->template localSolve<ST, ST> (X, Y_tmp, mode); // Y_tmp := M * X
      Y.update (alpha, Y_tmp, beta); // Y := beta * Y + alpha * Y_tmp
    }
  }
}
    //! Do the transpose or conjugate transpose solve.
    void applyTranspose (const MV& X_in, MV& Y_in, const Teuchos::ETransp mode) const
    {
      typedef Teuchos::ScalarTraits<Scalar> ST;
      using Teuchos::null;

      TEUCHOS_TEST_FOR_EXCEPTION
        (mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS, std::logic_error,
         "Tpetra::CrsMatrixSolveOp::applyTranspose: mode is neither TRANS nor "
         "CONJ_TRANS.  Should never get here!  Please report this bug to the "
         "Tpetra developers.");

      const size_t numVectors = X_in.getNumVectors();
      Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer =
        matrix_->getGraph ()->getImporter ();
      Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter =
        matrix_->getGraph ()->getExporter ();
      Teuchos::RCP<const MV> X;

      // it is okay if X and Y reference the same data, because we can
      // perform a triangular solve in-situ.  however, we require that
      // column access to each is strided.

      // set up import/export temporary multivectors
      if (importer != null) {
        if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
          importMV_ = null;
        }
        if (importMV_ == null) {
          importMV_ = Teuchos::rcp( new MV(matrix_->getColMap(),numVectors) );
        }
      }
      if (exporter != null) {
        if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
          exportMV_ = null;
        }
        if (exportMV_ == null) {
          exportMV_ = Teuchos::rcp( new MV(matrix_->getRowMap(),numVectors) );
        }
      }

      // solve(TRANS): DomainMap -> RangeMap
      // lclMatSolve_(TRANS): ColMap -> RowMap
      // importer: DomainMap -> ColMap
      // exporter: RowMap -> RangeMap
      //
      // solve = importer o   lclMatSolve_  o  exporter
      //         Domainmap -> ColMap     ->      RowMap -> RangeMap
      //
      // If we have a non-trivial importer, we must import elements that
      // are permuted or are on other processes.
      if (importer != null) {
        importMV_->doImport(X_in,*importer,INSERT);
        X = importMV_;
      }
      else if (X_in.isConstantStride() == false) {
        // cannot handle non-constant stride right now
        // generate a copy of X_in
        X = Teuchos::rcp(new MV(X_in));
      }
      else {
        // just temporary, so this non-owning RCP is okay
        X = Teuchos::rcpFromRef (X_in);
      }


      // If we have a non-trivial exporter, we must export elements that
      // are permuted or belong to other processes.  We will compute
      // solution into the to-be-exported MV; get a view.
      if (exporter != null) {
        matrix_->template localSolve<Scalar, Scalar> (*X, *exportMV_,
                                                      Teuchos::CONJ_TRANS);
        // Make sure target is zero: necessary because we are adding
        Y_in.putScalar(ST::zero());
        Y_in.doExport(*importMV_, *importer, ADD);
      }
      // otherwise, solve into Y
      else {
        if (Y_in.isConstantStride() == false) {
          // generate a strided copy of Y
          MV Y(Y_in);
          matrix_->template localSolve<Scalar, Scalar> (*X, Y, Teuchos::CONJ_TRANS);
          Y_in = Y;
        }
        else {
          matrix_->template localSolve<Scalar, Scalar> (*X, Y_in, Teuchos::CONJ_TRANS);
        }
      }
    }
    //! Do the non-transpose solve.
    void applyNonTranspose (const MV& X_in, MV& Y_in) const
    {
      using Teuchos::NO_TRANS;
      using Teuchos::null;
      typedef Teuchos::ScalarTraits<Scalar> ST;

      // Solve U X = Y  or  L X = Y
      // X belongs to domain map, while Y belongs to range map

      const size_t numVectors = X_in.getNumVectors();
      Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer =
        matrix_->getGraph ()->getImporter ();
      Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter =
        matrix_->getGraph ()->getExporter ();
      Teuchos::RCP<const MV> X;

      // it is okay if X and Y reference the same data, because we can
      // perform a triangular solve in-situ.  however, we require that
      // column access to each is strided.

      // set up import/export temporary multivectors
      if (importer != null) {
        if (importMV_ != null && importMV_->getNumVectors () != numVectors) {
          importMV_ = null;
        }
        if (importMV_ == null) {
          importMV_ = Teuchos::rcp (new MV (matrix_->getColMap (), numVectors));
        }
      }
      if (exporter != null) {
        if (exportMV_ != null && exportMV_->getNumVectors () != numVectors) {
          exportMV_ = null;
        }
        if (exportMV_ == null) {
          exportMV_ = Teuchos::rcp (new MV (matrix_->getRowMap (), numVectors));
        }
      }

      // solve(NO_TRANS): RangeMap -> DomainMap
      // lclMatSolve_: RowMap -> ColMap
      // importer: DomainMap -> ColMap
      // exporter: RowMap -> RangeMap
      //
      // solve = reverse(exporter)  o   lclMatSolve_  o reverse(importer)
      //         RangeMap   ->    RowMap     ->     ColMap         ->    DomainMap
      //
      // If we have a non-trivial exporter, we must import elements that
      // are permuted or are on other processors
      if (exporter != null) {
        exportMV_->doImport (X_in, *exporter, INSERT);
        X = exportMV_;
      }
      else if (! X_in.isConstantStride ()) {
        // cannot handle non-constant stride right now
        // generate a copy of X_in
        X = Teuchos::rcp (new MV (X_in));
      }
      else {
        // just temporary, so this non-owning RCP is okay
        X = Teuchos::rcpFromRef (X_in);
      }

      // If we have a non-trivial importer, we must export elements that
      // are permuted or belong to other processes.  We will compute
      // solution into the to-be-exported MV.
      if (importer != null) {
        matrix_->template localSolve<Scalar, Scalar> (*X, *importMV_, NO_TRANS);
        // Make sure target is zero: necessary because we are adding.
        Y_in.putScalar (ST::zero ());
        Y_in.doExport (*importMV_, *importer, ADD);
      }
      // otherwise, solve into Y
      else {
        // can't solve into non-strided multivector
        if (! Y_in.isConstantStride ()) {
          // generate a strided copy of Y
          MV Y (Y_in);
          matrix_->template localSolve<Scalar, Scalar> (*X, Y, NO_TRANS);
          Tpetra::deep_copy (Y_in, Y);
        }
        else {
          matrix_->template localSolve<Scalar, Scalar> (*X, Y_in, NO_TRANS);
        }
      }
    }
void 
Chebyshev<MatrixType>::
applyImpl (const MV& X,
	   MV& Y,
	   Teuchos::ETransp mode,
	   scalar_type alpha,
	   scalar_type beta) const 
{
  using Teuchos::ArrayRCP;
  using Teuchos::as;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const scalar_type zero = STS::zero();
  const scalar_type one = STS::one();

  // Y = beta*Y + alpha*M*X.

  // If alpha == 0, then we don't need to do Chebyshev at all.
  if (alpha == zero) {
    if (beta == zero) { // Obey Sparse BLAS rules; avoid 0*NaN.
      Y.putScalar (zero);
    }
    else {
      Y.scale (beta);
    }
    return;
  }

  // If beta != 0, then we need to keep a copy of the initial value of
  // Y, so that we can add beta*it to the Chebyshev result at the end.
  // Usually this method is called with beta == 0, so we don't have to 
  // worry about caching Y_org.
  RCP<MV> Y_orig;
  if (beta != zero) {
    Y_orig = rcp (new MV (Y));
  }

  // If X and Y point to the same memory location, we need to use a
  // copy of X (X_copy) as the input MV.  Otherwise, just let X_copy
  // point to X.
  //
  // This is hopefully an uncommon use case, so we don't bother to
  // optimize for it by caching X_copy.
  RCP<const MV> X_copy;
  bool copiedInput = false;
  if (X.getLocalMV().getValues() == Y.getLocalMV().getValues()) {
    X_copy = rcp (new MV (X));
    copiedInput = true;
  }
  else {
    X_copy = rcpFromRef (X);
  }
  
  // If alpha != 1, fold alpha into (a copy of) X.
  //
  // This is an uncommon use case, so we don't bother to optimize for
  // it by caching X_copy.  However, we do check whether we've already
  // copied X above, to avoid a second copy.
  if (alpha != one) {
    RCP<MV> X_copy_nonConst = rcp_const_cast<MV> (X_copy);
    if (! copiedInput) {
      X_copy_nonConst = rcp (new MV (X));
      copiedInput = true;
    }
    X_copy_nonConst->scale (alpha);
    X_copy = rcp_const_cast<const MV> (X_copy_nonConst);
  }

  impl_.apply (*X_copy, Y);

  if (beta != zero) {
    Y.update (beta, *Y_orig, one); // Y = beta * Y_orig + 1 * Y
  }
}