void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
    {
      FactoryMonitor m(*this, "Computing Ac", coarseLevel);
      std::ostringstream levelstr;
      levelstr << coarseLevel.GetLevelID();

      TEUCHOS_TEST_FOR_EXCEPTION(hasDeclaredInput_==false, Exceptions::RuntimeError,
                                 "MueLu::RAPFactory::Build(): CallDeclareInput has not been called before Build!");

      // Set "Keeps" from params
      const Teuchos::ParameterList& pL = GetParameterList();
      if (pL.get<bool>("Keep AP Pattern"))
        coarseLevel.Keep("AP Pattern",  this);
      if (pL.get<bool>("Keep RAP Pattern"))
        coarseLevel.Keep("RAP Pattern", this);

      RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
      RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P"), AP, Ac;

      // Reuse pattern if available (multiple solve)
      if (coarseLevel.IsAvailable("AP Pattern", this)) {
        GetOStream(Runtime0) << "Ac: Using previous AP pattern" << std::endl;

        AP = Get< RCP<Matrix> >(coarseLevel, "AP Pattern");
      }

      {
        SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);

        AP = Utils::Multiply(*A, false, *P, false, AP, GetOStream(Statistics2),true,true,std::string("MueLu::A*P-")+levelstr.str());
      }
      if (pL.get<bool>("Keep AP Pattern"))
        Set(coarseLevel, "AP Pattern", AP);

      // Reuse coarse matrix memory if available (multiple solve)
      if (coarseLevel.IsAvailable("RAP Pattern", this)) {
        GetOStream(Runtime0) << "Ac: Using previous RAP pattern" << std::endl;

        Ac = Get< RCP<Matrix> >(coarseLevel, "RAP Pattern");
        // Some eigenvalue may have been cached with the matrix in the previous run.
        // As the matrix values will be updated, we need to reset the eigenvalue.
        Ac->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
      }

      // If we do not modify matrix later, allow optimization of storage.
      // This is necessary for new faster Epetra MM kernels.
      bool doOptimizeStorage = !pL.get<bool>("RepairMainDiagonal");

      const bool doTranspose    = true;
      const bool doFillComplete = true;
      if (pL.get<bool>("transpose: use implicit") == true) {
        SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);
        Ac = Utils::Multiply(*P,  doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-implicit-")+levelstr.str());

      } else {
        RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");

        SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);
        Ac = Utils::Multiply(*R, !doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-explicit-")+levelstr.str());
      }

      CheckRepairMainDiagonal(Ac);

      if (IsPrint(Statistics1)) {
        RCP<ParameterList> params = rcp(new ParameterList());;
        params->set("printLoadBalancingInfo", true);
        params->set("printCommInfo",          true);
        GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ac, "Ac", params);
      }

      Set(coarseLevel, "A",           Ac);
      if (pL.get<bool>("Keep RAP Pattern"))
        Set(coarseLevel, "RAP Pattern", Ac);
    }

    if (transferFacts_.begin() != transferFacts_.end()) {
      SubFactoryMonitor m(*this, "Projections", coarseLevel);

      // call Build of all user-given transfer factories
      for (std::vector<RCP<const FactoryBase> >::const_iterator it = transferFacts_.begin(); it != transferFacts_.end(); ++it) {
        RCP<const FactoryBase> fac = *it;
        GetOStream(Runtime0) << "RAPFactory: call transfer factory: " << fac->description() << std::endl;
        fac->CallBuild(coarseLevel);
        // Coordinates transfer is marginally different from all other operations
        // because it is *optional*, and not required. For instance, we may need
        // coordinates only on level 4 if we start repartitioning from that level,
        // but we don't need them on level 1,2,3. As our current Hierarchy setup
        // assumes propagation of dependencies only through three levels, this
        // means that we need to rely on other methods to propagate optional data.
        //
        // The method currently used is through RAP transfer factories, which are
        // simply factories which are called at the end of RAP with a single goal:
        // transfer some fine data to coarser level. Because these factories are
        // kind of outside of the mainline factories, they behave different. In
        // particular, we call their Build method explicitly, rather than through
        // Get calls. This difference is significant, as the Get call is smart
        // enough to know when to release all factory dependencies, and Build is
        // dumb. This led to the following CoordinatesTransferFactory sequence:
        // 1. Request level 0
        // 2. Request level 1
        // 3. Request level 0
        // 4. Release level 0
        // 5. Release level 1
        //
        // The problem is missing "6. Release level 0". Because it was missing,
        // we had outstanding request on "Coordinates", "Aggregates" and
        // "CoarseMap" on level 0.
        //
        // This was fixed by explicitly calling Release on transfer factories in
        // RAPFactory. I am still unsure how exactly it works, but now we have
        // clear data requests for all levels.
        coarseLevel.Release(*fac);
      }
    }

  }
  void BlockedRAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level &fineLevel, Level &coarseLevel) const {  //FIXME make fineLevel const!!
    FactoryMonitor m(*this, "Computing Ac (block)", coarseLevel);

    const Teuchos::ParameterList& pL = GetParameterList();

    RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
    RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P");

    RCP<BlockedCrsMatrix> bA = rcp_dynamic_cast<BlockedCrsMatrix>(A);
    RCP<BlockedCrsMatrix> bP = rcp_dynamic_cast<BlockedCrsMatrix>(P);
    TEUCHOS_TEST_FOR_EXCEPTION(bA.is_null() || bP.is_null(), Exceptions::BadCast, "Matrices R, A and P must be of type BlockedCrsMatrix.");

    RCP<BlockedCrsMatrix> bAP;
    RCP<BlockedCrsMatrix> bAc;
    {
      SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);

      // Triple matrix product for BlockedCrsMatrixClass
      TEUCHOS_TEST_FOR_EXCEPTION((bA->Cols() != bP->Rows()), Exceptions::BadCast,
                               "Block matrix dimensions do not match: "
                               "A is " << bA->Rows() << "x" << bA->Cols() <<
                               "P is " << bP->Rows() << "x" << bP->Cols());

      bAP = Utils::TwoMatrixMultiplyBlock(*bA, false, *bP,  false, GetOStream(Statistics2), true, true);
    }


    // If we do not modify matrix later, allow optimization of storage.
    // This is necessary for new faster Epetra MM kernels.
    bool doOptimizeStorage = !checkAc_;

    const bool doTranspose    = true;
    const bool doFillComplete = true;
    if (pL.get<bool>("transpose: use implicit") == true) {
      SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);
      bAc = Utils::TwoMatrixMultiplyBlock(*bP,  doTranspose, *bAP, !doTranspose, GetOStream(Statistics2), doFillComplete, doOptimizeStorage);

    } else {
      RCP<Matrix>           R  = Get< RCP<Matrix> >(coarseLevel, "R");
      RCP<BlockedCrsMatrix> bR = rcp_dynamic_cast<BlockedCrsMatrix>(R);
      TEUCHOS_TEST_FOR_EXCEPTION(bR.is_null(), Exceptions::BadCast, "Matrix R must be of type BlockedCrsMatrix.");

      TEUCHOS_TEST_FOR_EXCEPTION(bA->Rows() != bR->Cols(), Exceptions::BadCast,
                                 "Block matrix dimensions do not match: "
                                 "R is " << bR->Rows() << "x" << bR->Cols() <<
                                 "A is " << bA->Rows() << "x" << bA->Cols());

      SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);
      bAc = Utils::TwoMatrixMultiplyBlock(*bR, !doTranspose, *bAP, !doTranspose, GetOStream(Statistics2), doFillComplete, doOptimizeStorage);
    }


    if (checkAc_)
      CheckMainDiagonal(bAc);

    GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*bAc, "Ac (blocked)");

    // static int run = 1;
    // RCP<CrsMatrixWrap> A11 = rcp(new CrsMatrixWrap(bAc->getMatrix(0,0)));
    // Utils::Write(toString(run) + "_A_11.mm", *A11);
    // if (!bAc->getMatrix(1,1).is_null()) {
      // RCP<CrsMatrixWrap> A22 = rcp(new CrsMatrixWrap(bAc->getMatrix(1,1)));
      // Utils::Write(toString(run) + "_A_22.mm", *A22);
    // }
    // RCP<CrsMatrixWrap> Am = rcp(new CrsMatrixWrap(bAc->Merge()));
    // Utils::Write(toString(run) + "_A.mm", *Am);
    // run++;

    Set<RCP <Matrix> >(coarseLevel, "A", bAc);

    if (transferFacts_.begin() != transferFacts_.end()) {
      SubFactoryMonitor m1(*this, "Projections", coarseLevel);

      // call Build of all user-given transfer factories
      for (std::vector<RCP<const FactoryBase> >::const_iterator it = transferFacts_.begin(); it != transferFacts_.end(); ++it) {
        RCP<const FactoryBase> fac = *it;
        GetOStream(Runtime0) << "BlockRAPFactory: call transfer factory: " << fac->description() << std::endl;
        fac->CallBuild(coarseLevel);
        // AP (11/11/13): I am not sure exactly why we need to call Release, but we do need it to get rid
        // of dangling data for CoordinatesTransferFactory
        coarseLevel.Release(*fac);
      }
    }
  }
void BlockedRAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &fineLevel, Level &coarseLevel) const {  //FIXME make fineLevel const!!
    typedef Xpetra::BlockedCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> BlockedCrsMatrixClass; // TODO move me

    FactoryMonitor m(*this, "Computing Ac (block)", coarseLevel);

    //
    // Inputs: R, A, P
    //

    RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");
    RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
    RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P");

    //
    // Dynamic casts
    //

    RCP<BlockedCrsMatrixClass> bR, bA, bP;

    try {
        /* using rcp_dynamic_cast with throw_on_fail = true */
        bR = Teuchos::rcp_dynamic_cast<BlockedCrsMatrixClass>(R, true);
        bA = Teuchos::rcp_dynamic_cast<BlockedCrsMatrixClass>(A, true);
        bP = Teuchos::rcp_dynamic_cast<BlockedCrsMatrixClass>(P, true);
    } catch(std::bad_cast e) {
        TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::BadCast, "MueLu::BlockedRAPFactory::Build(): matrices R, A and P must be of type BlockedCrsMatrix. " << e.what());
    }

    /*Utils::Write( "A00.m", CrsMatrixWrap(bA->getMatrix(0,0)) );
    Utils::Write( "A11.m", CrsMatrixWrap(bA->getMatrix(1,1)) );
    Utils::Write( "A01.m", CrsMatrixWrap(bA->getMatrix(0,1)) );
    Utils::Write( "A10.m", CrsMatrixWrap(bA->getMatrix(1,0)) );

    Utils::Write( "P00.m", CrsMatrixWrap(bP->getMatrix(0,0)) );
    Utils::Write( "P11.m", CrsMatrixWrap(bP->getMatrix(1,1)) );*/

    //
    // Build Ac = RAP
    //

    // Triple matrix product for BlockedCrsMatrixClass
    TEUCHOS_TEST_FOR_EXCEPTION((bA->Cols() != bP->Rows()) || (bA->Rows() != bR->Cols()), Exceptions::BadCast, "MueLu::BlockedRAPFactory::Build(): block matrix dimensions do not match.");
    RCP<BlockedCrsMatrixClass> bAP = Utils::TwoMatrixMultiplyBlock(*bA, false, *bP,  false, true, true);
    RCP<BlockedCrsMatrixClass> bAc = Utils::TwoMatrixMultiplyBlock(*bR, false, *bAP, false, true, true);

    if (checkAc_)
        CheckMainDiagonal(bAc);

    GetOStream(Statistics1, 0) << Utils::PrintMatrixInfo(*bAc, "Ac (blocked)");

    Set<RCP <Matrix> >(coarseLevel, "A", bAc);

    if (transferFacts_.begin() != transferFacts_.end()) {
        SubFactoryMonitor m1(*this, "Projections", coarseLevel);

        // call Build of all user-given transfer factories
        for (std::vector<RCP<const FactoryBase> >::const_iterator it = transferFacts_.begin(); it != transferFacts_.end(); ++it) {
            RCP<const FactoryBase> fac = *it;
            GetOStream(Runtime0, 0) << "BlockRAPFactory: call transfer factory: " << fac->description() << std::endl;
            fac->CallBuild(coarseLevel);
            // AP (11/11/13): I am not sure exactly why we need to call Release, but we do need it to get rid
            // of dangling data for CoordinatesTransferFactory
            coarseLevel.Release(*fac);
        }
    }
}