// Apply the indefinite blocked-diagonal smoother to the blocked system A_ X = B.
//
// Each sweep
//   1) computes the residual r = B - A_ X and splits it into block rows 0 and 1,
//   2) calls velPredictSmoo_ on block 0 and schurCompSmoo_ on block 1, each
//      starting from a zero increment,
//   3) adds both increments, scaled by the "Damping factor" parameter, to the
//      corresponding parts of X.
// The number of sweeps is read from the "Sweeps" parameter.
//
// X                   in/out: current iterate, updated in place.
// B                   right-hand side.
// InitialGuessIsZero  not used by this implementation.
//
// Throws Exceptions::RuntimeError if Setup() has not been called.
void IndefBlockedDiagonalSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(MultiVector& X, const MultiVector& B, bool InitialGuessIsZero) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError, "MueLu::IndefBlockedDiagonalSmoother::Apply(): Setup() has not been called");

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    // extract parameters from internal parameter list
    const ParameterList & pL = Factory::GetParameterList();
    LocalOrdinal nSweeps = pL.get<LocalOrdinal>("Sweeps");
    Scalar omega = pL.get<Scalar>("Damping factor");

    // wrap current solution vector in RCP (non-owning) for the map extractor calls
    RCP<MultiVector> rcpX = Teuchos::rcpFromRef(X);

    // residual of the current solution X with respect to the rhs B
    RCP<MultiVector> residual = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());

    // incrementally improve solution vector X
    for (LocalOrdinal run = 0; run < nSweeps; ++run) {
      // 1) calculate current residual: residual = B - A_ * X
      residual->update(one,B,zero); // residual = B
      A_->apply(*rcpX, *residual, Teuchos::NO_TRANS, -one, one);

      // split residual vector into its two block rows
      Teuchos::RCP<MultiVector> r1 = rangeMapExtractor_->ExtractVector(residual, 0);
      Teuchos::RCP<MultiVector> r2 = rangeMapExtractor_->ExtractVector(residual, 1);

      // 2) solve F * \Delta \tilde{x}_1 = r_1
      //    start with zero guess \Delta \tilde{x}_1
      //    The increment needs as many columns as X; a single hard-coded
      //    column does not match r1 when several vectors are smoothed at once.
      RCP<MultiVector> xtilde1 = MultiVectorFactory::Build(F_->getRowMap(), X.getNumVectors());
      xtilde1->putScalar(zero);
      velPredictSmoo_->Apply(*xtilde1,*r1);

      // 3) solve SchurComp equation
      //    start with zero guess \Delta \tilde{x}_2
      RCP<MultiVector> xtilde2 = MultiVectorFactory::Build(Z_->getRowMap(), X.getNumVectors());
      xtilde2->putScalar(zero);
      schurCompSmoo_->Apply(*xtilde2,*r2);

      // 4) extract parts of solution vector X
      Teuchos::RCP<MultiVector> x1 = domainMapExtractor_->ExtractVector(rcpX, 0);
      Teuchos::RCP<MultiVector> x2 = domainMapExtractor_->ExtractVector(rcpX, 1);

      // 5) update solution vector; both increments are damped with omega
      x1->update(omega,*xtilde1,one); // x1 = x1_old + omega xtilde1
      x2->update(omega,*xtilde2,one); // x2 = x2_old + omega xtilde2

      // write back solution in global vector X
      domainMapExtractor_->InsertVector(x1, 0, rcpX);
      domainMapExtractor_->InsertVector(x2, 1, rcpX);

    }

  }
Esempio n. 2
0
  // Apply the wrapped smoother s_ to the permuted and scaled linear system.
  //
  // The rhs B is first passed through permP_ and then through diagScalingOp_;
  // s_ is applied to a copy of X with that transformed rhs, and the result is
  // mapped back into X through permQT_.
  //
  // X                   in/out: initial guess on entry, smoothed solution on exit.
  // B                   right-hand side.
  // InitialGuessIsZero  forwarded unchanged to the wrapped smoother.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called or s_ is null.
  void PermutingSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(MultiVector &X, MultiVector const &B, bool const &InitialGuessIsZero) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError, "MueLu::PermutingSmoother::Apply(): Setup() has not been called");
    TEUCHOS_TEST_FOR_EXCEPTION(s_ == Teuchos::null, Exceptions::RuntimeError, "IsSetup() == true but s_ == Teuchos::null. This does not make sense");

    typedef Teuchos::ScalarTraits<Scalar> STS;

    // Work on a copy of X. The copy needs as many columns as X itself;
    // a hard-coded single column does not match X when it has several vectors.
    Teuchos::RCP<MultiVector> Xtemp = MultiVectorFactory::Build(X.getMap(), X.getNumVectors(), true);
    Xtemp->update(STS::one(), X, STS::zero());

    // TODO: unify scaling and left permutation operator
    Teuchos::RCP<MultiVector> Btemp  = MultiVectorFactory::Build(B.getMap(), B.getNumVectors(), true);
    Teuchos::RCP<MultiVector> Btemp2 = MultiVectorFactory::Build(B.getMap(), B.getNumVectors(), true);
    permP_->apply(B, *Btemp, Teuchos::NO_TRANS);                // apply permutation operator to rhs
    diagScalingOp_->apply(*Btemp, *Btemp2, Teuchos::NO_TRANS);  // apply scaling operator to rhs

    // apply smoother to permuted linear system
    s_->Apply(*Xtemp, *Btemp2, InitialGuessIsZero);

    // retransform smooth solution
    permQT_->apply(*Xtemp, X, Teuchos::NO_TRANS);
  }
  // Apply the wrapped smoother s_ to the permuted and scaled linear system.
  //
  // The rhs B is first passed through permP_ and then through diagScalingOp_;
  // s_ is applied to a copy of X with that transformed rhs, and the result is
  // mapped back into X through permQT_.
  //
  // X                   in/out: initial guess on entry, smoothed solution on exit.
  // B                   right-hand side.
  // InitialGuessIsZero  forwarded unchanged to the wrapped smoother.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called.
  void PermutingSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(MultiVector& X, const MultiVector& B, bool InitialGuessIsZero) const {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError, "MueLu::PermutingSmoother::Apply(): Setup() has not been called");

    typedef Teuchos::ScalarTraits<Scalar> STS;

    // Size the temporaries from the inputs: a hard-coded single column does
    // not match X or B when they carry several vectors.
    const size_t numSolVecs = X.getNumVectors();
    const size_t numRhsVecs = B.getNumVectors();

    // working copy of the current iterate
    Teuchos::RCP<MultiVector> Xtemp = MultiVectorFactory::Build(X.getMap(), numSolVecs, true);
    Xtemp->update(STS::one(), X, STS::zero());

    // TODO: unify scaling and left permutation operator
    Teuchos::RCP<MultiVector> Btemp  = MultiVectorFactory::Build(B.getMap(), numRhsVecs, true);
    Teuchos::RCP<MultiVector> Btemp2 = MultiVectorFactory::Build(B.getMap(), numRhsVecs, true);
    permP_->apply(B, *Btemp, Teuchos::NO_TRANS);                // apply permutation operator to rhs
    diagScalingOp_->apply(*Btemp, *Btemp2, Teuchos::NO_TRANS);  // apply scaling operator to rhs

    // apply smoother to permuted linear system
    s_->Apply(*Xtemp, *Btemp2, InitialGuessIsZero);

    // retransform smooth solution
    permQT_->apply(*Xtemp, X, Teuchos::NO_TRANS);
  }
  // Apply the Braess-Sarazin smoother to the 2x2 blocked system A_ X = B.
  //
  // Per sweep (R is the current residual, split into R0/R1):
  //   deltaX0 = D_ .* R0
  //   Rtmp    = R1 - A10_ * deltaX0
  //   deltaX1 = result of smoo_ applied to Rtmp, starting from zero
  //   deltaX0 = D_ .* (R0 - A01_ * deltaX1)
  //   X0 += deltaX0,  X1 += deltaX1
  // The number of sweeps is read from the "Sweeps" parameter.
  //
  // X                   in/out: current iterate, updated in place.
  // B                   right-hand side.
  // InitialGuessIsZero  if true, B itself is used as the initial residual
  //                     instead of computing B - A_ X.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called.
  void BraessSarazinSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(MultiVector& X, const MultiVector& B, bool InitialGuessIsZero) const {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError,
                               "MueLu::BraessSarazinSmoother::Apply(): Setup() has not been called");

    RCP<MultiVector> rcpX    = rcpFromRef(X);

    // Work vectors carry as many columns as the input vectors; a hard-coded
    // single column does not match R0/R1/X0/X1 for multi-column X and B.
    RCP<MultiVector> deltaX0 = MultiVectorFactory::Build(A00_->getRowMap(), X.getNumVectors());
    RCP<MultiVector> deltaX1 = MultiVectorFactory::Build(A10_->getRowMap(), X.getNumVectors());
    RCP<MultiVector> Rtmp    = MultiVectorFactory::Build(A10_->getRowMap(), B.getNumVectors());

    typedef Teuchos::ScalarTraits<SC> STS;
    SC one = STS::one(), zero = STS::zero();

    // extract parameters from internal parameter list
    const ParameterList& pL = Factory::GetParameterList();
    LO nSweeps = pL.get<LO>("Sweeps");

    // initial residual
    RCP<MultiVector> R;
    if (InitialGuessIsZero)  {
      // the caller states X == 0, so the residual is simply a copy of B
      R = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());
      R->update(one, B, zero);
    } else {
      R = Utilities::Residual(*A_, X, B);
    }

    for (LO run = 0; run < nSweeps; ++run) {
      // Extract corresponding subvectors from X and R
      RCP<MultiVector> R0 = rangeMapExtractor_ ->ExtractVector(R, 0);
      RCP<MultiVector> R1 = rangeMapExtractor_ ->ExtractVector(R, 1);

      RCP<MultiVector> X0 = domainMapExtractor_->ExtractVector(rcpX, 0);
      RCP<MultiVector> X1 = domainMapExtractor_->ExtractVector(rcpX, 1);

      // Calculate Rtmp = R1 - A10 * (D * R0) (equation 8.14)
      deltaX0->putScalar(zero);
      deltaX0->elementWiseMultiply(one, *D_, *R0, zero);    // deltaX0 = D * R0 (equation 8.13)
      A10_->apply(*deltaX0, *Rtmp);                         // Rtmp    = A10*deltaX0 (intermediate step)
      Rtmp->update(one, *R1, -one);                         // Rtmp    = R1 - A10*deltaX0

      // Compute deltaX1 (pressure correction)
      // We use user provided preconditioner
      deltaX1->putScalar(zero);                             // just for safety
      smoo_->Apply(*deltaX1, *Rtmp);

      // Compute deltaX0
      deltaX0->putScalar(zero);                             // just for safety
      A01_->apply(*deltaX1, *deltaX0);                      // deltaX0 = A01*deltaX1
      deltaX0->update(one, *R0, -one);                      // deltaX0 = R0 - A01*deltaX1
      // Exchange the two handles: R0 now refers to (R0 - A01*deltaX1) and
      // deltaX0 refers to the storage extracted as R0, which is overwritten next.
      R0.swap(deltaX0);
      deltaX0->elementWiseMultiply(one, *D_, *R0, zero);    // deltaX0 = D*(R0 - A01*deltaX1)

      // Update solution
      X0->update(one, *deltaX0, one);
      X1->update(one, *deltaX1, one);

      domainMapExtractor_->InsertVector(X0, 0, rcpX);
      domainMapExtractor_->InsertVector(X1, 1, rcpX);

      // the residual is only needed again if another sweep follows
      if (run < nSweeps-1)
        R = Utilities::Residual(*A_, X, B);
    }
  }
  // Apply the blocked Gauss-Seidel smoother to the blocked system A_ X = B.
  //
  // For every sweep and every entry i of Inverse_, the full residual
  // r = B - A_ X is recomputed, the block row selected by
  // bgsOrderingIndex2blockRowIndex_ is extracted, Inverse_[i] is applied to
  // that block residual, and the resulting increment, scaled by the
  // "Damping factor" parameter, is added to the matching part of X.
  // The number of sweeps is read from the "Sweeps" parameter.
  //
  // X                   in/out: current iterate, updated in place.
  // B                   right-hand side.
  // InitialGuessIsZero  not used by this implementation.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called.
  void BlockedGaussSeidelSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(MultiVector &X, const MultiVector& B, bool InitialGuessIsZero) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError, "MueLu::BlockedGaussSeidelSmoother::Apply(): Setup() has not been called");

    typedef Teuchos::ScalarTraits<Scalar> STS;
    const Scalar one = STS::one(), zero = STS::zero();

    RCP<Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > residual = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());
    RCP<MultiVector> rcpX = Teuchos::rcpFromRef(X);

    // extract parameters from internal parameter list
    const ParameterList & pL = Factory::GetParameterList();
    LocalOrdinal nSweeps = pL.get<LocalOrdinal>("Sweeps");
    Scalar omega = pL.get<Scalar>("Damping factor");

    // outer Richardson loop
    for (LocalOrdinal run = 0; run < nSweeps; ++run) {
      // one BGS sweep
      // loop over all block rows
      for(size_t i = 0; i<Inverse_.size(); i++) {

        // calculate block residual r = B-A*X
        // note: A_ is the full blocked operator
        residual->update(one,B,zero); // r = B
        A_->apply(X, *residual, Teuchos::NO_TRANS, -one, one);

        // extract corresponding subvectors from X and residual
        size_t blockRowIndex = at(bgsOrderingIndex2blockRowIndex_, i); // == bgsOrderingIndex2blockRowIndex_.at(i) (only available since C++11)
        Teuchos::RCP<MultiVector> Xi = domainMapExtractor_->ExtractVector(rcpX, blockRowIndex);
        Teuchos::RCP<MultiVector> ri = rangeMapExtractor_->ExtractVector(residual, blockRowIndex);

        // fresh vector for the increment of this block
        Teuchos::RCP<MultiVector> tXi = domainMapExtractor_->getVector(blockRowIndex, X.getNumVectors());

        // apply solver/smoother
        Inverse_.at(i)->Apply(*tXi, *ri, false);

        // update vector
        Xi->update(omega,*tXi,one);  // X_{i+1} = X_i + omega \Delta X_i

        // update corresponding part of rhs and lhs
        domainMapExtractor_->InsertVector(Xi, blockRowIndex, rcpX); // TODO wrong! fix me
      }
    }

  }
  // Apply the indefinite blocked-diagonal smoother to the blocked system A_ X = B.
  //
  // Each sweep
  //   1) computes the residual r = B - A_ X and splits it into block rows 0 and 1,
  //   2) calls velPredictSmoo_ on block 0 and schurCompSmoo_ on block 1, each
  //      starting from a zero increment,
  //   3) adds both increments, scaled by the "Damping factor" parameter, to the
  //      corresponding parts of X.
  // If F_ (resp. Z_) is a 1x1 BlockedCrsMatrix and the range map extractor is
  // in Thyra mode, the sub-smoother is fed with vectors obtained from the map
  // extractors with the Thyra flag set, and the local data is copied in and out
  // entry by entry.
  // The number of sweeps is read from the "Sweeps" parameter.
  //
  // X                   in/out: current iterate, updated in place.
  // B                   right-hand side.
  // InitialGuessIsZero  not used by this implementation.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called.
  void IndefBlockedDiagonalSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(MultiVector& X, const MultiVector& B, bool InitialGuessIsZero) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError, "MueLu::IndefBlockedDiagonalSmoother::Apply(): Setup() has not been called");

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    // The following boolean flags catch the case where we need special transformation
    // for the GIDs when calling the subsmoothers.
    RCP<BlockedCrsMatrix> bA00 = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(F_);
    RCP<BlockedCrsMatrix> bA11 = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(Z_);
    bool bA00ThyraSpecialTreatment = false;
    bool bA11ThyraSpecialTreatment = false;
    if (bA00 != Teuchos::null) {
      if(bA00->Rows() == 1 && bA00->Cols() == 1 && rangeMapExtractor_->getThyraMode() == true) bA00ThyraSpecialTreatment = true;
    }
    if (bA11 != Teuchos::null) {
      if(bA11->Rows() == 1 && bA11->Cols() == 1 && rangeMapExtractor_->getThyraMode() == true) bA11ThyraSpecialTreatment = true;
    }

    // extract parameters from internal parameter list
    const ParameterList & pL = Factory::GetParameterList();
    LocalOrdinal nSweeps = pL.get<LocalOrdinal>("Sweeps");
    Scalar omega = pL.get<Scalar>("Damping factor");

    // wrap current solution vector in RCP (non-owning) for the map extractor calls
    RCP<MultiVector> rcpX = Teuchos::rcpFromRef(X);

    // residual of the current solution X with respect to the rhs B
    RCP<MultiVector> residual = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());

    // incrementally improve solution vector X
    for (LocalOrdinal run = 0; run < nSweeps; ++run) {
      // 1) calculate current residual: residual = B - A_ * X
      residual->update(one,B,zero); // residual = B
      A_->apply(*rcpX, *residual, Teuchos::NO_TRANS, -one, one);

      // split residual vector into its two block rows
      Teuchos::RCP<MultiVector> r1 = rangeMapExtractor_->ExtractVector(residual, 0);
      Teuchos::RCP<MultiVector> r2 = rangeMapExtractor_->ExtractVector(residual, 1);

      // 2) solve F * \Delta \tilde{x}_1 = r_1
      //    start with zero guess \Delta \tilde{x}_1
      RCP<MultiVector> xtilde1 = MultiVectorFactory::Build(F_->getRowMap(),X.getNumVectors(),true);

      // Special handling if the F block was a 1x1 blocked operator in Thyra mode
      // Then, we have to translate the Xpetra offset GIDs to plain Thyra GIDs and vice versa
      if(bA00ThyraSpecialTreatment == true) {
        RCP<MultiVector> xtilde1_thyra = domainMapExtractor_->getVector(0, X.getNumVectors(), true);
        RCP<MultiVector> r1_thyra = rangeMapExtractor_->getVector(0, B.getNumVectors(), true);
        // copy the local entries of r1 into the Thyra-style rhs
        for(size_t k=0; k < r1->getNumVectors(); k++) {
          Teuchos::ArrayRCP<const Scalar> xpetraVecData  = r1->getData(k);
          Teuchos::ArrayRCP<Scalar> thyraVecData = r1_thyra->getDataNonConst(k);
          for(size_t i=0; i < r1->getLocalLength(); i++) {
            thyraVecData[i] = xpetraVecData[i];
          }
        }

        velPredictSmoo_->Apply(*xtilde1_thyra,*r1_thyra);

        // copy the local entries of the Thyra-style increment back into xtilde1
        for(size_t k=0; k < xtilde1_thyra->getNumVectors(); k++) {
          Teuchos::ArrayRCP<Scalar> xpetraVecData  = xtilde1->getDataNonConst(k);
          Teuchos::ArrayRCP<const Scalar> thyraVecData = xtilde1_thyra->getData(k);
          for(size_t i=0; i < xtilde1_thyra->getLocalLength(); i++) {
            xpetraVecData[i] = thyraVecData[i];
          }
        }
      } else {
        velPredictSmoo_->Apply(*xtilde1,*r1);
      }

      // 3) solve SchurComp equation
      //    start with zero guess \Delta \tilde{x}_2
      RCP<MultiVector> xtilde2 = MultiVectorFactory::Build(Z_->getRowMap(),X.getNumVectors(),true);

      // Special handling if SchurComplement operator was a 1x1 blocked operator in Thyra mode
      // Then, we have to translate the Xpetra offset GIDs to plain Thyra GIDs and vice versa
      if(bA11ThyraSpecialTreatment == true) {
        RCP<MultiVector> xtilde2_thyra = domainMapExtractor_->getVector(1, X.getNumVectors(), true);
        RCP<MultiVector> r2_thyra = rangeMapExtractor_->getVector(1, B.getNumVectors(), true);
        // copy the local entries of r2 into the Thyra-style rhs
        for(size_t k=0; k < r2->getNumVectors(); k++) {
          Teuchos::ArrayRCP<const Scalar> xpetraVecData  = r2->getData(k);
          Teuchos::ArrayRCP<Scalar> thyraVecData = r2_thyra->getDataNonConst(k);
          for(size_t i=0; i < r2->getLocalLength(); i++) {
            thyraVecData[i] = xpetraVecData[i];
          }
        }

        schurCompSmoo_->Apply(*xtilde2_thyra,*r2_thyra);

        // copy the local entries of the Thyra-style increment back into xtilde2
        for(size_t k=0; k < xtilde2_thyra->getNumVectors(); k++) {
          Teuchos::ArrayRCP<Scalar> xpetraVecData  = xtilde2->getDataNonConst(k);
          Teuchos::ArrayRCP<const Scalar> thyraVecData = xtilde2_thyra->getData(k);
          for(size_t i=0; i < xtilde2_thyra->getLocalLength(); i++) {
            xpetraVecData[i] = thyraVecData[i];
          }
        }
      } else {
        schurCompSmoo_->Apply(*xtilde2,*r2);
      }

      // 4) extract parts of solution vector X
      Teuchos::RCP<MultiVector> x1 = domainMapExtractor_->ExtractVector(rcpX, 0);
      Teuchos::RCP<MultiVector> x2 = domainMapExtractor_->ExtractVector(rcpX, 1);

      // 5) update solution vector; both increments are damped with omega
      x1->update(omega,*xtilde1,one); // x1 = x1_old + omega xtilde1
      x2->update(omega,*xtilde2,one); // x2 = x2_old + omega xtilde2

      // write back solution in global vector X
      domainMapExtractor_->InsertVector(x1, 0, rcpX);
      domainMapExtractor_->InsertVector(x2, 1, rcpX);
    }
  }
  // Apply the SIMPLE smoother to the 2x2 blocked system A_ X = B.
  //
  // Per sweep (r1/r2 are the two block rows of the residual B - A_ X):
  //   xtilde1 = result of velPredictSmoo_ applied to r1, starting from zero
  //   xtilde2 = result of schurCompSmoo_ applied to (r2 - D_ * xtilde1), starting from zero
  //   xhat2   = omega * xtilde2
  //   xhat1   = xtilde1 - diagFinv_ .* (G_ * xhat2)
  //   x1 += xhat1,  x2 += xhat2
  // The number of sweeps and omega are read from the "Sweeps" and
  // "Damping factor" parameters.
  //
  // Two implementations are present; the preprocessor switch below selects the
  // one based on BlockedMultiVector ("new implementation"). The #else branch is
  // the older variant working directly on X and is currently compiled out.
  //
  // X                   in/out: current iterate, updated in place.
  // B                   right-hand side.
  // InitialGuessIsZero  not used by this implementation.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called and, when
  // HAVE_MUELU_DEBUG is defined, if the maps of B or X do not match the range
  // or domain map of A_.
  void SimpleSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(MultiVector& X, const MultiVector& B, bool InitialGuessIsZero) const
  {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError, "MueLu::SimpleSmoother::Apply(): Setup() has not been called");
#ifdef HAVE_MUELU_DEBUG
    TEUCHOS_TEST_FOR_EXCEPTION(A_->getRangeMap()->isSameAs(*(B.getMap())) == false, Exceptions::RuntimeError, "MueLu::SimpleSmoother::Apply(): The map of RHS vector B is not the same as range map of the blocked operator A. Please check the map of B and A.");
    TEUCHOS_TEST_FOR_EXCEPTION(A_->getDomainMap()->isSameAs(*(X.getMap())) == false, Exceptions::RuntimeError, "MueLu::SimpleSmoother::Apply(): The map of the solution vector X is not the same as domain map of the blocked operator A. Please check the map of X and A.");
#endif

    // NOTE(review): fos is not referenced anywhere else in this function.
    Teuchos::RCP<Teuchos::FancyOStream> fos = Teuchos::getFancyOStream(Teuchos::rcpFromRef(std::cout));

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    // extract parameters from internal parameter list
    const ParameterList & pL = Factory::GetParameterList();
    LocalOrdinal nSweeps = pL.get<LocalOrdinal>("Sweeps");
    Scalar omega = pL.get<Scalar>("Damping factor");

    // The boolean flags check whether we use Thyra or Xpetra style GIDs
    // However, assuming that SIMPLE always only works for 2x2 blocked operators, we
    // most often have to use the ReorderedBlockedCrsOperator as input. If either the
    // F or Z (or SchurComplement block S) are 1x1 blocked operators with Thyra style
    // GIDs we need an extra transformation of vectors
    // In this case, we use the Xpetra (offset) GIDs for all operations and only transform
    // the input/output vectors before and after the subsolver calls!
    // Each flag is true only if the respective map extractor is in Thyra mode
    // AND the block (F_ or Z_) is not itself a BlockedCrsMatrix.
    bool bRangeThyraModePredict  = rangeMapExtractor_->getThyraMode()  && (Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(F_) == Teuchos::null);
    bool bDomainThyraModePredict = domainMapExtractor_->getThyraMode() && (Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(F_) == Teuchos::null);
    bool bRangeThyraModeSchur    = rangeMapExtractor_->getThyraMode()  && (Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(Z_) == Teuchos::null);
    bool bDomainThyraModeSchur   = domainMapExtractor_->getThyraMode() && (Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(Z_) == Teuchos::null);

    // The following boolean flags catch the case where we need special transformation
    // for the GIDs when calling the subsmoothers.
    // They are set when F_ (resp. Z_) is a 1x1 BlockedCrsMatrix and the range
    // map extractor is in Thyra mode.
    RCP<BlockedCrsMatrix> bF = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(F_);
    RCP<BlockedCrsMatrix> bZ = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(Z_);
    bool bFThyraSpecialTreatment = false;
    bool bZThyraSpecialTreatment = false;
    if (bF != Teuchos::null) {
      if(bF->Rows() == 1 && bF->Cols() == 1 && rangeMapExtractor_->getThyraMode() == true) bFThyraSpecialTreatment = true;
    }
    if (bZ != Teuchos::null) {
      if(bZ->Rows() == 1 && bZ->Cols() == 1 && rangeMapExtractor_->getThyraMode() == true) bZThyraSpecialTreatment = true;
    }

#if 1// new implementation

    // create a new vector for storing the current residual in a blocked multi vector
    RCP<MultiVector> res = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());
    RCP<BlockedMultiVector> residual = Teuchos::rcp(new BlockedMultiVector(rangeMapExtractor_,res));

    // create a new solution vector as a blocked multi vector
    RCP<MultiVector> rcpX = Teuchos::rcpFromRef(X);
    RCP<BlockedMultiVector> bX = Teuchos::rcp(new BlockedMultiVector(domainMapExtractor_,rcpX));

    // create a blocked rhs vector
    RCP<const MultiVector> rcpB = Teuchos::rcpFromRef(B);
    RCP<const BlockedMultiVector> bB = Teuchos::rcp(new const BlockedMultiVector(rangeMapExtractor_,rcpB));


    // incrementally improve solution vector X
    for (LocalOrdinal run = 0; run < nSweeps; ++run) {
      // 1) calculate current residual: residual = B - A_ * X
      residual->update(one,*bB,zero); // r = B
      A_->apply(*bX, *residual, Teuchos::NO_TRANS, -one, one);

      // split residual vector
      Teuchos::RCP<MultiVector> r1 = rangeMapExtractor_->ExtractVector(residual, 0, bRangeThyraModePredict);
      Teuchos::RCP<MultiVector> r2 = rangeMapExtractor_->ExtractVector(residual, 1, bRangeThyraModeSchur);

      // 2) solve F * \Delta \tilde{x}_1 = r_1
      //    start with zero guess \Delta \tilde{x}_1
      RCP<MultiVector> xtilde1 = domainMapExtractor_->getVector(0, X.getNumVectors(), bDomainThyraModePredict);
      xtilde1->putScalar(zero);

      // Special handling if the F block is a 1x1 blocked operator in Thyra mode:
      // swap the maps of the vectors for the sub-smoother call and restore the
      // map of xtilde1 afterwards (the map of r1 is not restored).
      if(bFThyraSpecialTreatment == true) {
        xtilde1->replaceMap(domainMapExtractor_->getMap(0,true));
        r1->replaceMap(rangeMapExtractor_->getMap(0,true));
        velPredictSmoo_->Apply(*xtilde1,*r1);
        xtilde1->replaceMap(domainMapExtractor_->getMap(0,false));
      } else {
        velPredictSmoo_->Apply(*xtilde1,*r1);
      }

      // 3) calculate rhs for SchurComp equation
      //    r_2 - D \Delta \tilde{x}_1
      RCP<MultiVector> schurCompRHS = rangeMapExtractor_->getVector(1, B.getNumVectors(), bRangeThyraModeSchur);
      D_->apply(*xtilde1,*schurCompRHS);
      schurCompRHS->update(one,*r2,-one);

      // 4) solve SchurComp equation
      //    start with zero guess \Delta \tilde{x}_2
      RCP<MultiVector> xtilde2 = domainMapExtractor_->getVector(1, X.getNumVectors(), bDomainThyraModeSchur);
      xtilde2->putScalar(zero);

      // Special handling if SchurComplement operator was a 1x1 blocked operator in Thyra mode
      // Then, we have to translate the Xpetra offset GIDs to plain Thyra GIDs and vice versa
      if(bZThyraSpecialTreatment == true) {
        xtilde2->replaceMap(domainMapExtractor_->getMap(1,true));
        schurCompRHS->replaceMap(rangeMapExtractor_->getMap(1,true));
        schurCompSmoo_->Apply(*xtilde2,*schurCompRHS);
        xtilde2->replaceMap(domainMapExtractor_->getMap(1,false));
      } else {
        schurCompSmoo_->Apply(*xtilde2,*schurCompRHS);
      }

      // 5) scale xtilde2 with omega
      //    store this in xhat2
      RCP<MultiVector> xhat2 = domainMapExtractor_->getVector(1, X.getNumVectors(), bDomainThyraModeSchur);
      xhat2->update(omega,*xtilde2,zero);

      // 6) calculate xhat1 = xtilde1 - diagFinv .* (G * xhat2)
      RCP<MultiVector> xhat1      = domainMapExtractor_->getVector(0, X.getNumVectors(), bDomainThyraModePredict);
      RCP<MultiVector> xhat1_temp = domainMapExtractor_->getVector(0, X.getNumVectors(), bDomainThyraModePredict);
      G_->apply(*xhat2,*xhat1_temp); // store result temporarily in xhat1_temp
      xhat1->elementWiseMultiply(one/*/omega*/,*diagFinv_,*xhat1_temp,zero);
      xhat1->update(one,*xtilde1,-one);

      // 7) extract parts of solution vector X
      Teuchos::RCP<MultiVector> x1 = domainMapExtractor_->ExtractVector(bX, 0, bDomainThyraModePredict);
      Teuchos::RCP<MultiVector> x2 = domainMapExtractor_->ExtractVector(bX, 1, bDomainThyraModeSchur);

      // 8) update solution vector with increments xhat1 and xhat2
      //    (the damping with omega has already been applied to xhat2 in step 5)
      x1->update(one,*xhat1,one);    // x1 = x1_old + xhat1
      x2->update(/*omega*/ one,*xhat2,one); // x2 = x2_old + xhat2
      // write back solution in global vector X
      domainMapExtractor_->InsertVector(x1, 0, bX, bDomainThyraModePredict);
      domainMapExtractor_->InsertVector(x2, 1, bX, bDomainThyraModeSchur);
    }

    // write back solution: copy both blocks of bX into the caller's vector X
    domainMapExtractor_->InsertVector(bX->getMultiVector(0,bDomainThyraModePredict), 0, rcpX, bDomainThyraModePredict);
    domainMapExtractor_->InsertVector(bX->getMultiVector(1,bDomainThyraModeSchur), 1, rcpX, bDomainThyraModeSchur);
#else
    // Older implementation (currently compiled out): same algorithm, but
    // operating directly on X instead of a BlockedMultiVector wrapper.

    // wrap current solution vector in RCP
    RCP<MultiVector> rcpX = Teuchos::rcpFromRef(X);

    // create residual vector
    // contains current residual of current solution X with rhs B
    RCP<MultiVector> residual = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());

    // incrementally improve solution vector X
    for (LocalOrdinal run = 0; run < nSweeps; ++run) {
      // 1) calculate current residual
      residual->update(one,B,zero); // residual = B
      A_->apply(*rcpX, *residual, Teuchos::NO_TRANS, -one, one);
      // split residual vector
      Teuchos::RCP<MultiVector> r1 = rangeMapExtractor_->ExtractVector(residual, 0, bRangeThyraModePredict);
      Teuchos::RCP<MultiVector> r2 = rangeMapExtractor_->ExtractVector(residual, 1, bRangeThyraModeSchur);

      // 2) solve F * \Delta \tilde{x}_1 = r_1
      //    start with zero guess \Delta \tilde{x}_1
      RCP<MultiVector> xtilde1 = domainMapExtractor_->getVector(0, X.getNumVectors(), bDomainThyraModePredict);
      xtilde1->putScalar(zero);

      // Special handling in case that F block is a 1x1 blocked operator in Thyra mode
      // Then we have to feed the smoother with real Thyra-based vectors
      if(bFThyraSpecialTreatment == true) {
        // create empty solution vector based on Thyra GIDs
        RCP<MultiVector> xtilde1_thyra = domainMapExtractor_->getVector(0, X.getNumVectors(), true);
        // create new RHS vector based on Thyra GIDs
        Teuchos::RCP<MultiVector> r1_thyra = rangeMapExtractor_->ExtractVector(residual, 0, true);
        velPredictSmoo_->Apply(*xtilde1_thyra,*r1_thyra);
        // copy the local entries of the Thyra-based result back into xtilde1
        for(size_t k=0; k < xtilde1_thyra->getNumVectors(); k++) {
          Teuchos::ArrayRCP<Scalar> xpetraVecData  = xtilde1->getDataNonConst(k);
          Teuchos::ArrayRCP<const Scalar> thyraVecData = xtilde1_thyra->getData(k);
          for(size_t i=0; i < xtilde1_thyra->getLocalLength(); i++) {
            xpetraVecData[i] = thyraVecData[i];
          }
        }
      } else {
        velPredictSmoo_->Apply(*xtilde1,*r1);
      }

      // 3) calculate rhs for SchurComp equation
      //    r_2 - D \Delta \tilde{x}_1
      RCP<MultiVector> schurCompRHS = rangeMapExtractor_->getVector(1, B.getNumVectors(), bRangeThyraModeSchur);
      D_->apply(*xtilde1,*schurCompRHS);
      schurCompRHS->update(one,*r2,-one);

      // 4) solve SchurComp equation
      //    start with zero guess \Delta \tilde{x}_2
      RCP<MultiVector> xtilde2 = domainMapExtractor_->getVector(1, X.getNumVectors(), bDomainThyraModeSchur);
      xtilde2->putScalar(zero);

      // Special handling if SchurComplement operator was a 1x1 blocked operator in Thyra mode
      // Then, we have to translate the Xpetra offset GIDs to plain Thyra GIDs and vice versa
      if(bZThyraSpecialTreatment == true) {
        // create empty solution vector based on Thyra GIDs
        RCP<MultiVector> xtilde2_thyra = domainMapExtractor_->getVector(1, X.getNumVectors(), true);
        // create new RHS vector based on Thyra GIDs
        RCP<MultiVector> schurCompRHS_thyra = rangeMapExtractor_->getVector(1, B.getNumVectors(), true);
        // transform vector
        for(size_t k=0; k < schurCompRHS->getNumVectors(); k++) {
          Teuchos::ArrayRCP<const Scalar> xpetraVecData  = schurCompRHS->getData(k);
          Teuchos::ArrayRCP<Scalar> thyraVecData = schurCompRHS_thyra->getDataNonConst(k);
          for(size_t i=0; i < schurCompRHS->getLocalLength(); i++) {
            thyraVecData[i] = xpetraVecData[i];
          }
        }

        schurCompSmoo_->Apply(*xtilde2_thyra,*schurCompRHS_thyra);

        // copy the local entries of the Thyra-based result back into xtilde2
        for(size_t k=0; k < xtilde2_thyra->getNumVectors(); k++) {
          Teuchos::ArrayRCP<Scalar> xpetraVecData  = xtilde2->getDataNonConst(k);
          Teuchos::ArrayRCP<const Scalar> thyraVecData = xtilde2_thyra->getData(k);
          for(size_t i=0; i < xtilde2_thyra->getLocalLength(); i++) {
            xpetraVecData[i] = thyraVecData[i];
          }
        }
      } else {
        schurCompSmoo_->Apply(*xtilde2,*schurCompRHS);
      }

      // 5) scale xtilde2 with omega
      //    store this in xhat2
      RCP<MultiVector> xhat2 = domainMapExtractor_->getVector(1, X.getNumVectors(), bDomainThyraModeSchur);
      xhat2->update(omega,*xtilde2,zero);

      // 6) calculate xhat1 = xtilde1 - diagFinv .* (G * xhat2)
      RCP<MultiVector> xhat1      = domainMapExtractor_->getVector(0, X.getNumVectors(), bDomainThyraModePredict);
      RCP<MultiVector> xhat1_temp = domainMapExtractor_->getVector(0, X.getNumVectors(), bDomainThyraModePredict);
      G_->apply(*xhat2,*xhat1_temp); // store result temporarily in xhat1_temp
      xhat1->elementWiseMultiply(one/*/omega*/,*diagFinv_,*xhat1_temp,zero);
      xhat1->update(one,*xtilde1,-one);

      // 7) extract parts of solution vector X
      Teuchos::RCP<MultiVector> x1 = domainMapExtractor_->ExtractVector(rcpX, 0, bDomainThyraModePredict);
      Teuchos::RCP<MultiVector> x2 = domainMapExtractor_->ExtractVector(rcpX, 1, bDomainThyraModeSchur);

      // 8) update solution vector with increments xhat1 and xhat2
      //    (the damping with omega has already been applied to xhat2 in step 5)
      x1->update(one,*xhat1,one);    // x1 = x1_old + xhat1
      x2->update(/*omega*/ one,*xhat2,one); // x2 = x2_old + xhat2
      // write back solution in global vector X
      domainMapExtractor_->InsertVector(x1, 0, rcpX, bDomainThyraModePredict);
      domainMapExtractor_->InsertVector(x2, 1, rcpX, bDomainThyraModeSchur);
    }
#endif
  }
  // Apply the Braess-Sarazin smoother to the 2x2 blocked system A_ X = B.
  //
  // Per sweep (R is the current residual, split into R0/R1):
  //   deltaX0 = D_ .* R0
  //   Rtmp    = R1 - A10_ * deltaX0
  //   deltaX1 = result of smoo_ applied to Rtmp; the starting guess is zero, or
  //             1.1 * Rtmp / diag(S_) when the "q2q1 mode" parameter is set
  //   deltaX0 = D_ .* (R0 - A01_ * deltaX1)
  //   X0 += deltaX0,  X1 += deltaX1
  // If A11_ is a 1x1 BlockedCrsMatrix and the range map extractor is in Thyra
  // mode, smoo_ is fed with vectors obtained from the map extractors with the
  // Thyra flag set, and the local data is copied in and out entry by entry.
  // The number of sweeps is read from the "Sweeps" parameter.
  //
  // X                   in/out: current iterate, updated in place.
  // B                   right-hand side.
  // InitialGuessIsZero  if true, B itself is used as the initial residual
  //                     instead of computing B - A_ X.
  //
  // Throws Exceptions::RuntimeError if Setup() has not been called and, when
  // HAVE_MUELU_DEBUG is defined, if the maps of B or X do not match the range
  // or domain map of A_.
  void BraessSarazinSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(MultiVector& X, const MultiVector& B, bool InitialGuessIsZero) const {
    TEUCHOS_TEST_FOR_EXCEPTION(SmootherPrototype::IsSetup() == false, Exceptions::RuntimeError,
                               "MueLu::BraessSarazinSmoother::Apply(): Setup() has not been called");

#ifdef HAVE_MUELU_DEBUG
    // The messages name this class; they previously reported BlockedGaussSeidelSmoother.
    TEUCHOS_TEST_FOR_EXCEPTION(A_->getRangeMap()->isSameAs(*(B.getMap())) == false, Exceptions::RuntimeError, "MueLu::BraessSarazinSmoother::Apply(): The map of RHS vector B is not the same as range map of the blocked operator A. Please check the map of B and A.");
    TEUCHOS_TEST_FOR_EXCEPTION(A_->getDomainMap()->isSameAs(*(X.getMap())) == false, Exceptions::RuntimeError, "MueLu::BraessSarazinSmoother::Apply(): The map of the solution vector X is not the same as domain map of the blocked operator A. Please check the map of X and A.");
#endif


    // The following boolean flags catch the case where we need special transformation
    // for the GIDs when calling the subsmoothers.
    RCP<BlockedCrsMatrix> bA11 = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(A11_);
    bool bA11ThyraSpecialTreatment = false;
    if (bA11 != Teuchos::null) {
      if(bA11->Rows() == 1 && bA11->Cols() == 1 && rangeMapExtractor_->getThyraMode() == true) bA11ThyraSpecialTreatment = true;
    }

    RCP<MultiVector> rcpX    = rcpFromRef(X);

    // use the GIDs of the sub blocks
    // This is valid as the subblocks actually represent the GIDs (either Thyra, Xpetra or pseudo Xpetra)
    RCP<MultiVector> deltaX0 = MultiVectorFactory::Build(A00_->getRowMap(), X.getNumVectors());
    RCP<MultiVector> deltaX1 = MultiVectorFactory::Build(A10_->getRowMap(), X.getNumVectors());
    RCP<MultiVector> Rtmp    = MultiVectorFactory::Build(A10_->getRowMap(), B.getNumVectors());

    typedef Teuchos::ScalarTraits<SC> STS;
    SC one = STS::one(), zero = STS::zero();

    // extract parameters from internal parameter list
    const ParameterList& pL = Factory::GetParameterList();
    LO nSweeps = pL.get<LO>("Sweeps");

    // initial residual
    RCP<MultiVector> R;
    if (InitialGuessIsZero)  {
      // the caller states X == 0, so the residual is simply a copy of B
      R = MultiVectorFactory::Build(B.getMap(), B.getNumVectors());
      R->update(one, B, zero);
    } else {
      R = Utilities::Residual(*A_, X, B);
    }

    // extract diagonal of Schur complement operator
    RCP<Vector> diagSVector = VectorFactory::Build(S_->getRowMap());
    S_->getLocalDiagCopy(*diagSVector);
    ArrayRCP<SC> Sdiag = diagSVector->getDataNonConst(0);

    for (LO run = 0; run < nSweeps; ++run) {
      // Extract corresponding subvectors from X and R
      // Automatically detect whether we use Thyra or Xpetra GIDs
      // The GIDs should be always compatible with the GIDs of A00, A01, etc...
      RCP<MultiVector> R0 = rangeMapExtractor_ ->ExtractVector(R, 0);
      RCP<MultiVector> R1 = rangeMapExtractor_ ->ExtractVector(R, 1);

      RCP<MultiVector> X0 = domainMapExtractor_->ExtractVector(rcpX, 0);
      RCP<MultiVector> X1 = domainMapExtractor_->ExtractVector(rcpX, 1);

      // Calculate Rtmp = R1 - A10 * (D * R0) (equation 8.14)
      deltaX0->putScalar(zero);
      deltaX0->elementWiseMultiply(one, *D_, *R0, zero);    // deltaX0 = D * R0 (equation 8.13)
      A10_->apply(*deltaX0, *Rtmp);                         // Rtmp    = A10*deltaX0 (intermediate step)
      Rtmp->update(one, *R1, -one);                         // Rtmp    = R1 - A10*deltaX0

      // Starting guess for the pressure correction
      if (!pL.get<bool>("q2q1 mode")) {
        deltaX1->putScalar(zero);
      } else {
        // NOTE(review): only column 0 of deltaX1 is initialised here; further
        // columns keep whatever they held before - confirm for multi-column use.
        ArrayRCP<SC> deltaX1data = deltaX1->getDataNonConst(0);
        ArrayRCP<SC> Rtmpdata    = Rtmp->getDataNonConst(0);
        for (GO row = 0; row < deltaX1data.size(); row++)
          deltaX1data[row] = 1.1*Rtmpdata[row] / Sdiag[row];
      }

      // Special handling if SchurComplement operator was a 1x1 blocked operator in Thyra mode
      // Then, we have to translate the Xpetra offset GIDs to plain Thyra GIDs and vice versa
      if(bA11ThyraSpecialTreatment == true) {
        RCP<MultiVector> deltaX1_thyra = domainMapExtractor_->getVector(1, X.getNumVectors(), true);
        RCP<MultiVector> Rtmp_thyra = rangeMapExtractor_->getVector(1, B.getNumVectors(), true);
        // copy the local entries of Rtmp into the Thyra-style rhs
        for(size_t k=0; k < Rtmp->getNumVectors(); k++) {
          Teuchos::ArrayRCP<const Scalar> xpetraVecData  = Rtmp->getData(k);
          Teuchos::ArrayRCP<Scalar> thyraVecData = Rtmp_thyra->getDataNonConst(k);
          for(size_t i=0; i < Rtmp->getLocalLength(); i++) {
            thyraVecData[i] = xpetraVecData[i];
          }
        }

        smoo_->Apply(*deltaX1_thyra,*Rtmp_thyra);

        // copy the local entries of the Thyra-style result back into deltaX1
        for(size_t k=0; k < deltaX1_thyra->getNumVectors(); k++) {
          Teuchos::ArrayRCP<Scalar> xpetraVecData  = deltaX1->getDataNonConst(k);
          Teuchos::ArrayRCP<const Scalar> thyraVecData = deltaX1_thyra->getData(k);
          for(size_t i=0; i < deltaX1_thyra->getLocalLength(); i++) {
            xpetraVecData[i] = thyraVecData[i];
          }
        }
      } else {
        // Compute deltaX1 (pressure correction)
        // We use user provided preconditioner
        smoo_->Apply(*deltaX1,*Rtmp);
      }

      // Compute deltaX0
      deltaX0->putScalar(zero);                             // just for safety
      A01_->apply(*deltaX1, *deltaX0);                      // deltaX0 = A01*deltaX1
      deltaX0->update(one, *R0, -one);                      // deltaX0 = R0 - A01*deltaX1
      // Exchange the two handles: R0 now refers to (R0 - A01*deltaX1) and
      // deltaX0 refers to the storage extracted as R0, which is overwritten next.
      R0.swap(deltaX0);
      deltaX0->elementWiseMultiply(one, *D_, *R0, zero);    // deltaX0 = D*(R0 - A01*deltaX1)

      // Update solution
      X0->update(one, *deltaX0, one);
      X1->update(one, *deltaX1, one);

      domainMapExtractor_->InsertVector(X0, 0, rcpX);
      domainMapExtractor_->InsertVector(X1, 1, rcpX);

      // the residual is only needed again if another sweep follows
      if (run < nSweeps-1)
        R = Utilities::Residual(*A_, X, B);
    }
  }