//! Do the transpose or conjugate transpose solve.
    void applyTranspose (const MV& X_in, MV& Y_in, const Teuchos::ETransp mode) const
    {
      typedef Teuchos::ScalarTraits<Scalar> ST;
      using Teuchos::null;

      TEUCHOS_TEST_FOR_EXCEPTION
        (mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS, std::logic_error,
         "Tpetra::CrsMatrixSolveOp::applyTranspose: mode is neither TRANS nor "
         "CONJ_TRANS.  Should never get here!  Please report this bug to the "
         "Tpetra developers.");

      const size_t numVectors = X_in.getNumVectors();
      Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer =
        matrix_->getGraph ()->getImporter ();
      Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter =
        matrix_->getGraph ()->getExporter ();
      Teuchos::RCP<const MV> X;

      // it is okay if X and Y reference the same data, because we can
      // perform a triangular solve in-situ.  however, we require that
      // column access to each is strided.

      // set up import/export temporary multivectors
      if (importer != null) {
        if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
          importMV_ = null;
        }
        if (importMV_ == null) {
          importMV_ = Teuchos::rcp( new MV(matrix_->getColMap(),numVectors) );
        }
      }
      if (exporter != null) {
        if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
          exportMV_ = null;
        }
        if (exportMV_ == null) {
          exportMV_ = Teuchos::rcp( new MV(matrix_->getRowMap(),numVectors) );
        }
      }

      // solve(TRANS): DomainMap -> RangeMap
      // lclMatSolve_(TRANS): ColMap -> RowMap
      // importer: DomainMap -> ColMap
      // exporter: RowMap -> RangeMap
      //
      // solve = importer o   lclMatSolve_  o  exporter
      //         Domainmap -> ColMap     ->      RowMap -> RangeMap
      //
      // If we have a non-trivial importer, we must import elements that
      // are permuted or are on other processes.
      if (importer != null) {
        importMV_->doImport(X_in,*importer,INSERT);
        X = importMV_;
      }
      else if (X_in.isConstantStride() == false) {
        // cannot handle non-constant stride right now
        // generate a copy of X_in
        X = Teuchos::rcp(new MV(X_in));
      }
      else {
        // just temporary, so this non-owning RCP is okay
        X = Teuchos::rcpFromRef (X_in);
      }


      // If we have a non-trivial exporter, we must export elements that
      // are permuted or belong to other processes.  We will compute
      // solution into the to-be-exported MV; get a view.
      if (exporter != null) {
        matrix_->template localSolve<Scalar, Scalar> (*X, *exportMV_,
                                                      Teuchos::CONJ_TRANS);
        // Make sure target is zero: necessary because we are adding
        Y_in.putScalar(ST::zero());
        Y_in.doExport(*importMV_, *importer, ADD);
      }
      // otherwise, solve into Y
      else {
        if (Y_in.isConstantStride() == false) {
          // generate a strided copy of Y
          MV Y(Y_in);
          matrix_->template localSolve<Scalar, Scalar> (*X, Y, Teuchos::CONJ_TRANS);
          Y_in = Y;
        }
        else {
          matrix_->template localSolve<Scalar, Scalar> (*X, Y_in, Teuchos::CONJ_TRANS);
        }
      }
    }
    //! Do the non-transpose solve.
    void applyNonTranspose (const MV& X_in, MV& Y_in) const
    {
      using Teuchos::NO_TRANS;
      using Teuchos::null;
      typedef Teuchos::ScalarTraits<Scalar> ST;

      // Solve U X = Y  or  L X = Y
      // X belongs to domain map, while Y belongs to range map

      const size_t numVectors = X_in.getNumVectors();
      Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer =
        matrix_->getGraph ()->getImporter ();
      Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter =
        matrix_->getGraph ()->getExporter ();
      Teuchos::RCP<const MV> X;

      // it is okay if X and Y reference the same data, because we can
      // perform a triangular solve in-situ.  however, we require that
      // column access to each is strided.

      // set up import/export temporary multivectors
      if (importer != null) {
        if (importMV_ != null && importMV_->getNumVectors () != numVectors) {
          importMV_ = null;
        }
        if (importMV_ == null) {
          importMV_ = Teuchos::rcp (new MV (matrix_->getColMap (), numVectors));
        }
      }
      if (exporter != null) {
        if (exportMV_ != null && exportMV_->getNumVectors () != numVectors) {
          exportMV_ = null;
        }
        if (exportMV_ == null) {
          exportMV_ = Teuchos::rcp (new MV (matrix_->getRowMap (), numVectors));
        }
      }

      // solve(NO_TRANS): RangeMap -> DomainMap
      // lclMatSolve_: RowMap -> ColMap
      // importer: DomainMap -> ColMap
      // exporter: RowMap -> RangeMap
      //
      // solve = reverse(exporter)  o   lclMatSolve_  o reverse(importer)
      //         RangeMap   ->    RowMap     ->     ColMap         ->    DomainMap
      //
      // If we have a non-trivial exporter, we must import elements that
      // are permuted or are on other processors
      if (exporter != null) {
        exportMV_->doImport (X_in, *exporter, INSERT);
        X = exportMV_;
      }
      else if (! X_in.isConstantStride ()) {
        // cannot handle non-constant stride right now
        // generate a copy of X_in
        X = Teuchos::rcp (new MV (X_in));
      }
      else {
        // just temporary, so this non-owning RCP is okay
        X = Teuchos::rcpFromRef (X_in);
      }

      // If we have a non-trivial importer, we must export elements that
      // are permuted or belong to other processes.  We will compute
      // solution into the to-be-exported MV.
      if (importer != null) {
        matrix_->template localSolve<Scalar, Scalar> (*X, *importMV_, NO_TRANS);
        // Make sure target is zero: necessary because we are adding.
        Y_in.putScalar (ST::zero ());
        Y_in.doExport (*importMV_, *importer, ADD);
      }
      // otherwise, solve into Y
      else {
        // can't solve into non-strided multivector
        if (! Y_in.isConstantStride ()) {
          // generate a strided copy of Y
          MV Y (Y_in);
          matrix_->template localSolve<Scalar, Scalar> (*X, Y, NO_TRANS);
          Tpetra::deep_copy (Y_in, Y);
        }
        else {
          matrix_->template localSolve<Scalar, Scalar> (*X, Y_in, NO_TRANS);
        }
      }
    }