 /// \brief Extract A's underlying KokkosClassic::MultiVector instance.
 ///
 /// TSQR represents the local (to each MPI process) part of a
 /// multivector as a KokkosClassic::MultiVector (KMV), which gives a
 /// nonconstant view of the original multivector's data.  This
 /// static method tells TSQR how to get the KMV from the input
 /// multivector.  The KMV is not a persistent view of the data; it
 /// must not outlive the multivector whose data it views.
 ///
 /// \warning TSQR does not currently support multivectors with
 ///   nonconstant stride.  If A has nonconstant stride, this
 ///   method throws std::invalid_argument.
 static KokkosClassic::MultiVector<scalar_type, node_type>
 getNonConstView (MV& A)
 {
   // FIXME (mfh 25 Oct 2010) We should be able to run TSQR even if
   // the storage of A uses nonconstant stride internally.  We would
   // have to copy and pack into a matrix with constant stride, and
   // then unpack on exit; see the sketch after this method.  For
   // now, we just raise an exception.
   TEUCHOS_TEST_FOR_EXCEPTION(
     ! A.isConstantStride(), std::invalid_argument,
     "Tpetra::TsqrAdaptor::getNonConstView: TSQR does not currently "
     "support Tpetra::MultiVector inputs that do not have constant "
     "stride.");
   return A.getLocalMV ();
 }
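
 /// \brief Copy-and-pack sketch for nonconstant-stride input.
 ///
 /// A minimal sketch, not part of the current interface, of the
 /// copy-and-pack approach mentioned in the FIXME above.  It copies
 /// a possibly nonconstant-stride multivector into a caller-owned,
 /// constant-stride (column-major) buffer, using only the public
 /// Tpetra::MultiVector method get1dCopy().  The helper name
 /// copyToConstantStride is hypothetical, and unpacking the buffer
 /// back into A on exit is omitted.
 static Teuchos::Array<scalar_type>
 copyToConstantStride (const MV& A)
 {
   const size_t numRows = A.getLocalLength ();
   const size_t numCols = A.getNumVectors ();
   // Column-major buffer; its stride is numRows, constant by construction.
   Teuchos::Array<scalar_type> buf (numRows * numCols);
   A.get1dCopy (buf (), numRows); // pack A into the constant-stride buffer
   return buf;
 }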

/// \brief Y := beta*Y + alpha*M*X, where M is the Chebyshev
///   polynomial approximation of the matrix's inverse.
template<class MatrixType>
void
Chebyshev<MatrixType>::
applyImpl (const MV& X,
           MV& Y,
           Teuchos::ETransp mode,
           scalar_type alpha,
           scalar_type beta) const
{
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const scalar_type zero = STS::zero();
  const scalar_type one = STS::one();

  // Y = beta*Y + alpha*M*X.

  // If alpha == 0, then we don't need to do Chebyshev at all.
  if (alpha == zero) {
    if (beta == zero) { // Obey Sparse BLAS rules; avoid 0*NaN.
      Y.putScalar (zero);
    }
    else {
      Y.scale (beta);
    }
    return;
  }

  // If beta != 0, then we need to keep a copy of the initial value
  // of Y, so that we can add beta times it to the Chebyshev result
  // at the end.  Usually this method is called with beta == 0, so we
  // don't have to worry about caching Y_orig.
  RCP<MV> Y_orig;
  if (beta != zero) {
    Y_orig = rcp (new MV (Y));
  }

  // If X and Y point to the same memory location, we need to use a
  // copy of X (X_copy) as the input MV.  Otherwise, just let X_copy
  // point to X.
  //
  // This is hopefully an uncommon use case, so we don't bother to
  // optimize for it by caching X_copy.
  RCP<const MV> X_copy;
  bool copiedInput = false;
  if (X.getLocalMV().getValues() == Y.getLocalMV().getValues()) {
    X_copy = rcp (new MV (X));
    copiedInput = true;
  }
  else {
    X_copy = rcpFromRef (X);
  }
  
  // If alpha != 1, fold alpha into (a copy of) X.
  //
  // This is an uncommon use case, so we don't bother to optimize for
  // it by caching X_copy.  However, we do check whether we've already
  // copied X above, to avoid a second copy.
  if (alpha != one) {
    RCP<MV> X_copy_nonConst = rcp_const_cast<MV> (X_copy);
    if (! copiedInput) {
      X_copy_nonConst = rcp (new MV (X));
      copiedInput = true;
    }
    X_copy_nonConst->scale (alpha);
    X_copy = rcp_const_cast<const MV> (X_copy_nonConst);
  }

  impl_.apply (*X_copy, Y);

  if (beta != zero) {
    Y.update (beta, *Y_orig, one); // Y = beta * Y_orig + 1 * Y
  }
}
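
// A minimal usage sketch, with hypothetical names, of the alpha/beta
// semantics that applyImpl implements: Y := beta*Y + alpha*M*X.
// Calling apply() with beta != 0 exercises the Y_orig save/restore
// branch above; aliasing Y as the input X would exercise the X_copy
// branch.  The free function exampleScaledApply is an assumption for
// illustration; prec is any Ifpack2::Chebyshev on which initialize()
// and compute() have already been called, and MV is the matching
// Tpetra::MultiVector specialization.
template<class MatrixType, class MV>
void
exampleScaledApply (const Chebyshev<MatrixType>& prec,
                    const MV& X,
                    MV& Y)
{
  typedef typename MatrixType::scalar_type scalar_type;
  typedef Teuchos::ScalarTraits<scalar_type> STS;
  const scalar_type two = STS::one () + STS::one ();
  // Y := 1*Y + 2*(M*X); since beta != 0, applyImpl saves a copy of Y.
  prec.apply (X, Y, Teuchos::NO_TRANS, two, STS::one ());
}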