/// \brief Compute QR factorization [Q,R] = qr(A,0).
    ///
    /// Both A and Q are synced to host memory and their host copies
    /// are marked as modified; their raw host data pointers and
    /// strides are then handed to tsqr_->factorExplicitRaw(),
    /// together with R's values and stride.
    ///
    /// \param A [in/out] On input: the multivector to factor.
    ///   Overwritten with garbage on output.
    /// \param Q [out] On output: the (explicitly stored) Q factor in
    ///   the QR factorization of the (input) multivector A.
    /// \param R [out] On output: the R factor in the QR factorization
    ///   of the (input) multivector A.
    /// \param forceNonnegativeDiagonal [in] If true, then (if
    ///   necessary) do extra work (modifying both the Q and R
    ///   factors) in order to force the R factor to have a
    ///   nonnegative diagonal.
    /// \warning Currently, this method only works if A and Q have the
    ///   same communicator and row distribution ("Map," in Petra
    ///   terms) as those of the multivector given to this adapter
    ///   instance's constructor.  Otherwise, the result of this
    ///   method is undefined.
    /// \note The constant-stride checks on A and Q name
    ///   std::invalid_argument; presumably that is the exception
    ///   raised when a check fails -- confirm, since the name of the
    ///   throwing macro is not visible here.
    factorExplicit (MV& A,
                    MV& Q,
                    dense_matrix_type& R,
                    const bool forceNonnegativeDiagonal=false)
        // NOTE(review): the two statements below look like the
        // argument lists of an exception-test macro (compare the
        // TEUCHOS_TEST_FOR_EXCEPTION calls in getNonConstView) whose
        // name is not visible in this view -- confirm against the
        // full file.
        (! A.isConstantStride (), std::invalid_argument, "TsqrAdaptor::"
         "factorExplicit: Input MultiVector A must have constant stride.");
        (! Q.isConstantStride (), std::invalid_argument, "TsqrAdaptor::"
         "factorExplicit: Input MultiVector Q must have constant stride.");
      prepareTsqr (Q); // Finish initializing TSQR.

      // FIXME (mfh 16 Jan 2016) Currently, TSQR is a host-only
      // implementation.
      // Bring A and Q up to date on the host, and flag the host
      // copies as modified because the raw pointers taken below are
      // given to TSQR as non-const data.
      A.template sync<Kokkos::HostSpace> ();
      A.template modify<Kokkos::HostSpace> ();
      Q.template sync<Kokkos::HostSpace> ();
      Q.template modify<Kokkos::HostSpace> ();
      // Get the host views and reinterpret their data as raw
      // scalar_type arrays for the raw TSQR interface.
      auto A_view = A.template getLocalView<Kokkos::HostSpace> ();
      auto Q_view = Q.template getLocalView<Kokkos::HostSpace> ();
      scalar_type* const A_ptr =
        reinterpret_cast<scalar_type*> (A_view.ptr_on_device ());
      scalar_type* const Q_ptr =
        reinterpret_cast<scalar_type*> (Q_view.ptr_on_device ());
      // NOTE(review): neither contiguousCacheBlocks nor
      // forceNonnegativeDiagonal appears in the visible part of the
      // call below; its argument list is cut off after R's stride in
      // this view -- confirm the remaining arguments against the
      // full file.
      const bool contiguousCacheBlocks = false;
      // The local dimensions passed are those of A's host view; Q's
      // dimensions are not passed separately.
      tsqr_->factorExplicitRaw (A_view.dimension_0 (),
                                A_view.dimension_1 (),
                                A_ptr, A.getStride (),
                                Q_ptr, Q.getStride (),
                                R.values (), R.stride (),
    /// \brief Rank-revealing decomposition
    /// Using the R factor and explicit Q factor from
    /// factorExplicit(), compute the singular value decomposition
    /// (SVD) of R: \f$R = U \Sigma V^*\f$.  If R is full rank (with
    /// respect to the given relative tolerance \c tol), do not modify
    /// Q or R.  Otherwise, compute \f$Q := Q \cdot U\f$ and \f$R :=
    /// \Sigma V^*\f$ in place.  If R was modified, then it may not
    /// necessarily be upper triangular on output.
    ///
    /// Q is synced to host memory and its host copy is marked as
    /// modified before its raw host pointer is passed to
    /// tsqr_->revealRankRaw(); this happens regardless of whether Q
    /// ends up being changed.
    ///
    /// \param Q [in/out] On input: explicit Q factor computed by
    ///   factorExplicit().  (Must be an orthogonal resp. unitary
    ///   matrix.)  On output: If R is of full numerical rank with
    ///   respect to the tolerance tol, Q is unmodified.  Otherwise, Q
    ///   is updated so that the first \c rank columns of Q are a
    ///   basis for the column space of A (the original matrix whose
    ///   QR factorization was computed by factorExplicit()).  The
    ///   remaining columns of Q are a basis for the null space of A.
    /// \param R [in/out] On input: N by N upper triangular matrix
    ///   with leading dimension LDR >= N.  On output: if input is
    ///   full rank, R is unchanged on output.  Otherwise, if \f$R = U
    ///   \Sigma V^*\f$ is the SVD of R, on output R is overwritten
    ///   with \f$\Sigma \cdot V^*\f$.  This is also an N by N matrix,
    ///   but it may not necessarily be upper triangular.
    /// \param tol [in] Relative tolerance for computing the numerical
    ///   rank of the matrix R.
    /// \return Rank \f$r\f$ of R: \f$ 0 \leq r \leq N\f$.
    /// \note The constant-stride check on Q names
    ///   std::invalid_argument; presumably that is the exception
    ///   raised when the check fails -- confirm, since the name of
    ///   the throwing macro is not visible here.
    // NOTE(review): the return type of this method is not visible in
    // this view; the value returned is whatever revealRankRaw()
    // returns.
    revealRank (MV& Q,
                dense_matrix_type& R,
                const magnitude_type& tol)
        // NOTE(review): this looks like the argument list of an
        // exception-test macro (compare the TEUCHOS_TEST_FOR_EXCEPTION
        // calls in getNonConstView) whose name is not visible in this
        // view -- confirm against the full file.
        (! Q.isConstantStride (), std::invalid_argument, "TsqrAdaptor::"
         "revealRank: Input MultiVector Q must have constant stride.");
      prepareTsqr (Q); // Finish initializing TSQR.
      // FIXME (mfh 18 Oct 2010) Check Teuchos::Comm<int> object in Q
      // to make sure it is the same communicator as the one we are
      // using in our dist_tsqr_type implementation.

      // Bring Q up to date on the host and flag the host copy as
      // modified, since the raw pointer below is passed as non-const.
      Q.template sync<Kokkos::HostSpace> ();
      Q.template modify<Kokkos::HostSpace> ();
      auto Q_view = Q.template getLocalView<Kokkos::HostSpace> ();
      scalar_type* const Q_ptr =
        reinterpret_cast<scalar_type*> (Q_view.ptr_on_device ());
      // This code always tells TSQR that cache blocks are not stored
      // contiguously.
      const bool contiguousCacheBlocks = false;
      return tsqr_->revealRankRaw (Q_view.dimension_0 (),
                                   Q_view.dimension_1 (),
                                   Q_ptr, Q.getStride (),
                                   R.values (), R.stride (),
                                   tol, contiguousCacheBlocks);
 /// \brief Extract A's underlying KokkosClassic::MultiVector instance.
 /// TSQR represents the local (to each MPI process) part of a
 /// multivector as a KokkosClassic::MultiVector (KMV), which gives a
 /// nonconstant view of the original multivector's data.  This
 /// class method tells TSQR how to get the KMV from the input
 /// multivector.  The KMV is not a persistent view of the data;
 /// its scope is contained within the scope of the multivector.
 /// \param A [in/out] The multivector whose local data are exposed.
 ///   Must have constant stride.
 /// \return The result of A.getLocalMVNonConst().
 /// \throw std::invalid_argument if A does not have constant stride.
 /// \warning TSQR does not currently support multivectors with
 ///   nonconstant stride.  If A has nonconstant stride, this
 ///   method will throw an exception.
 static KokkosClassic::MultiVector<scalar_type, node_type>
 getNonConstView (MV& A)
   // FIXME (mfh 25 Oct 2010) We should be able to run TSQR even if
   // storage of A uses nonconstant stride internally.  We would
   // have to copy and pack into a matrix with constant stride, and
   // then unpack on exit.  For now we choose just to raise an
   // exception.
   TEUCHOS_TEST_FOR_EXCEPTION(! A.isConstantStride(), std::invalid_argument,
                              "TSQR does not currently support Tpetra::MultiVector "
                              "inputs that do not have constant stride.");
   // Constant stride has been verified above; hand back A's local
   // multivector directly.
   return A.getLocalMVNonConst();
    /// \brief Extract A's underlying KokkosClassic::MultiVector instance.
    /// TSQR represents the local (to each MPI process) part of a
    /// multivector as a KokkosClassic::MultiVector (KMV), which gives a
    /// nonconstant view of the original multivector's data.  This
    /// class method tells TSQR how to get the KMV from the input
    /// multivector.  The KMV is not a persistent view of the data;
    /// its scope is contained within the scope of the multivector.
    ///
    /// This variant builds the KMV by hand: it takes A's dual view,
    /// converts the device-side view to its flat array type (the data
    /// reinterpreted as a longer array of the base scalar type), wraps
    /// the flat view's pointer in an ArrayRCP, and initializes a new
    /// KMV from that array using the flat view's dimensions.
    ///
    /// \param A [in/out] The multivector whose local data are exposed.
    ///   Must have constant stride.
    /// \return A KMV created on A's Map's node and initialized with
    ///   the flat view's dimensions and data pointer.
    /// \throw std::invalid_argument if A does not have constant stride.
    /// \warning TSQR does not currently support multivectors with
    ///   nonconstant stride.  If A has nonconstant stride, this
    ///   method will throw an exception.
    static KokkosClassic::MultiVector<scalar_type, node_type>
    getNonConstView (MV& A)
        // FIXME (mfh 25 Oct 2010) We should be able to run TSQR even if
        // storage of A uses nonconstant stride internally.  We would
        // have to copy and pack into a matrix with constant stride, and
        // then unpack on exit.  For now we choose just to raise an
        // exception.
        TEUCHOS_TEST_FOR_EXCEPTION(! A.isConstantStride(), std::invalid_argument,
                                   "TSQR does not currently support Tpetra::MultiVector "
                                   "inputs that do not have constant stride.");

        // size_type is the index type of an ArrayRCP of mp_scalar_type;
        // flat_array_type is the array_type of the dual view's device
        // view type.
        typedef typename Teuchos::ArrayRCP<mp_scalar_type>::size_type size_type;
        typedef typename MV::dual_view_type view_type;
        typedef typename view_type::t_dev::array_type flat_array_type;

        // Create new Kokkos::MultiVector reinterpreting the data as a longer
        // array of the base scalar type

        // Create new ArrayRCP holding data
        view_type pce_mv = A.getDualView();
        flat_array_type flat_mv = pce_mv.d_view;
        // All sizes below come from the flat view, not from A itself.
        const size_t num_rows = flat_mv.dimension_0();
        const size_t num_cols = flat_mv.dimension_1();
        const size_t size = num_rows * num_cols;
        // NOTE(review): the trailing `false` is presumably the
        // owns-memory flag, making this a non-owning wrapper around
        // the view's data -- confirm against Teuchos::arcp.
        ArrayRCP<scalar_type> vals =
            Teuchos::arcp(flat_mv.ptr_on_device(), size_type(0), size, false);

        // Create new MultiVector
        // Owing to the above comment, we don't need to worry about
        // non-constant stride
        // NOTE(review): strides[] is read below without any visible
        // statement that fills it in, so `stride` would be
        // indeterminate as written.  Presumably a call that queries
        // flat_mv's strides belongs between these two lines --
        // confirm against the full file.
        size_t strides[2];
        const size_t stride = strides[0];
        KokkosClassic::MultiVector<scalar_type, node_type> mv(A.getMap()->getNode());
        mv.initializeValues(num_rows, num_cols, vals, stride);

        return mv;
    //! Do the transpose or conjugate transpose solve.
    //!
    //! Picks (or builds) a constant-stride source multivector X, then
    //! calls matrix_->localSolve either into a cached temporary (when
    //! the graph has a non-null exporter) or into Y_in (directly, or
    //! through a constant-stride copy).
    //!
    //! \param X_in [in] Input multivector.
    //! \param Y_in [out] Multivector that receives the result.
    //! \param mode [in] Checked to be Teuchos::TRANS or
    //!   Teuchos::CONJ_TRANS.  NOTE(review): the visible localSolve
    //!   calls pass Teuchos::CONJ_TRANS regardless of \c mode --
    //!   confirm that this is intended.
    void applyTranspose (const MV& X_in, MV& Y_in, const Teuchos::ETransp mode) const
      typedef Teuchos::ScalarTraits<Scalar> ST;
      using Teuchos::null;

        // NOTE(review): this looks like the argument list of an
        // exception-test macro (std::logic_error on an unexpected
        // mode) whose name is not visible in this view -- confirm
        // against the full file.
        (mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS, std::logic_error,
         "Tpetra::CrsMatrixSolveOp::applyTranspose: mode is neither TRANS nor "
         "CONJ_TRANS.  Should never get here!  Please report this bug to the "
         "Tpetra developers.");

      const size_t numVectors = X_in.getNumVectors();
      // A null importer / exporter means no communication step of
      // that kind is set up below.
      Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer =
        matrix_->getGraph ()->getImporter ();
      Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter =
        matrix_->getGraph ()->getExporter ();
      Teuchos::RCP<const MV> X;

      // it is okay if X and Y reference the same data, because we can
      // perform a triangular solve in-situ.  however, we require that
      // column access to each is strided.

      // set up import/export temporary multivectors
      // The cached temporaries are dropped when their number of
      // vectors no longer matches, then (re)allocated on the column
      // Map (import) resp. row Map (export).  The closing braces of
      // these nested ifs are not visible in this view.
      if (importer != null) {
        if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
          importMV_ = null;
        if (importMV_ == null) {
          importMV_ = Teuchos::rcp( new MV(matrix_->getColMap(),numVectors) );
      if (exporter != null) {
        if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
          exportMV_ = null;
        if (exportMV_ == null) {
          exportMV_ = Teuchos::rcp( new MV(matrix_->getRowMap(),numVectors) );

      // solve(TRANS): DomainMap -> RangeMap
      // lclMatSolve_(TRANS): ColMap -> RowMap
      // importer: DomainMap -> ColMap
      // exporter: RowMap -> RangeMap
      // solve = importer o   lclMatSolve_  o  exporter
      //         Domainmap -> ColMap     ->      RowMap -> RangeMap
      // If we have a non-trivial importer, we must import elements that
      // are permuted or are on other processes.
      // NOTE(review): no statement that fills importMV_ from X_in is
      // visible before it is used as X here (applyNonTranspose does
      // a doImport at the corresponding point) -- confirm against the
      // full file.
      if (importer != null) {
        X = importMV_;
      else if (X_in.isConstantStride() == false) {
        // cannot handle non-constant stride right now
        // generate a copy of X_in
        X = Teuchos::rcp(new MV(X_in));
      else {
        // just temporary, so this non-owning RCP is okay
        X = Teuchos::rcpFromRef (X_in);

      // If we have a non-trivial exporter, we must export elements that
      // are permuted or belong to other processes.  We will compute
      // solution into the to-be-exported MV; get a view.
      // NOTE(review): in this view the localSolve call below is cut
      // off after its second argument, no statement zeroing Y_in is
      // visible after the "Make sure target is zero" comment, and the
      // export uses importMV_ / importer although this branch tests
      // `exporter` and solves into exportMV_ -- confirm all three
      // against the full file.
      if (exporter != null) {
        matrix_->template localSolve<Scalar, Scalar> (*X, *exportMV_,
        // Make sure target is zero: necessary because we are adding
        Y_in.doExport(*importMV_, *importer, ADD);
      // otherwise, solve into Y
      else {
        if (Y_in.isConstantStride() == false) {
          // generate a strided copy of Y
          MV Y(Y_in);
          matrix_->template localSolve<Scalar, Scalar> (*X, Y, Teuchos::CONJ_TRANS);
          // NOTE(review): the result is handed back with plain
          // assignment here, whereas applyNonTranspose uses
          // Tpetra::deep_copy; whether this copies the values into
          // Y_in's storage depends on MV's assignment semantics --
          // confirm.
          Y_in = Y;
        else {
          matrix_->template localSolve<Scalar, Scalar> (*X, Y_in, Teuchos::CONJ_TRANS);
    //! Do the non-transpose solve.
    //!
    //! Picks (or builds) a constant-stride source multivector X --
    //! importing X_in into a cached temporary when the graph has a
    //! non-null exporter -- then calls matrix_->localSolve with
    //! NO_TRANS.  When the graph has a non-null importer, the solve
    //! writes into a cached temporary that is then exported (with
    //! ADD) into a zeroed Y_in; otherwise it writes into Y_in
    //! (directly, or through a constant-stride copy).
    //!
    //! \param X_in [in] Input multivector.
    //! \param Y_in [out] Multivector that receives the result.
    void applyNonTranspose (const MV& X_in, MV& Y_in) const
      using Teuchos::NO_TRANS;
      using Teuchos::null;
      typedef Teuchos::ScalarTraits<Scalar> ST;

      // Solve U X = Y  or  L X = Y
      // X belongs to domain map, while Y belongs to range map

      const size_t numVectors = X_in.getNumVectors();
      // A null importer / exporter means no communication step of
      // that kind is performed below.
      Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > importer =
        matrix_->getGraph ()->getImporter ();
      Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Node> > exporter =
        matrix_->getGraph ()->getExporter ();
      Teuchos::RCP<const MV> X;

      // it is okay if X and Y reference the same data, because we can
      // perform a triangular solve in-situ.  however, we require that
      // column access to each is strided.

      // set up import/export temporary multivectors
      // The cached temporaries are dropped when their number of
      // vectors no longer matches, then (re)allocated on the column
      // Map (import) resp. row Map (export).  The closing braces of
      // these nested ifs are not visible in this view.
      if (importer != null) {
        if (importMV_ != null && importMV_->getNumVectors () != numVectors) {
          importMV_ = null;
        if (importMV_ == null) {
          importMV_ = Teuchos::rcp (new MV (matrix_->getColMap (), numVectors));
      if (exporter != null) {
        if (exportMV_ != null && exportMV_->getNumVectors () != numVectors) {
          exportMV_ = null;
        if (exportMV_ == null) {
          exportMV_ = Teuchos::rcp (new MV (matrix_->getRowMap (), numVectors));

      // solve(NO_TRANS): RangeMap -> DomainMap
      // lclMatSolve_: RowMap -> ColMap
      // importer: DomainMap -> ColMap
      // exporter: RowMap -> RangeMap
      // solve = reverse(exporter)  o   lclMatSolve_  o reverse(importer)
      //         RangeMap   ->    RowMap     ->     ColMap         ->    DomainMap
      // If we have a non-trivial exporter, we must import elements that
      // are permuted or are on other processors
      if (exporter != null) {
        // Using the Export object with doImport applies it in the
        // reverse direction, per the map diagram above.
        exportMV_->doImport (X_in, *exporter, INSERT);
        X = exportMV_;
      else if (! X_in.isConstantStride ()) {
        // cannot handle non-constant stride right now
        // generate a copy of X_in
        X = Teuchos::rcp (new MV (X_in));
      else {
        // just temporary, so this non-owning RCP is okay
        X = Teuchos::rcpFromRef (X_in);

      // If we have a non-trivial importer, we must export elements that
      // are permuted or belong to other processes.  We will compute
      // solution into the to-be-exported MV.
      if (importer != null) {
        matrix_->template localSolve<Scalar, Scalar> (*X, *importMV_, NO_TRANS);
        // Make sure target is zero: necessary because we are adding.
        Y_in.putScalar (ST::zero ());
        // Using the Import object with doExport applies it in the
        // reverse direction, per the map diagram above.
        Y_in.doExport (*importMV_, *importer, ADD);
      // otherwise, solve into Y
      else {
        // can't solve into non-strided multivector
        if (! Y_in.isConstantStride ()) {
          // generate a strided copy of Y
          MV Y (Y_in);
          matrix_->template localSolve<Scalar, Scalar> (*X, Y, NO_TRANS);
          // Copy the solution values back into the caller's
          // multivector.
          Tpetra::deep_copy (Y_in, Y);
        else {
          matrix_->template localSolve<Scalar, Scalar> (*X, Y_in, NO_TRANS);