void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::CheckRepairMainDiagonal(RCP<Matrix>& Ac) const {
    const Teuchos::ParameterList& pL = GetParameterList();
    bool repairZeroDiagonals = pL.get<bool>("RepairMainDiagonal");
    bool checkAc             = pL.get<bool>("CheckMainDiagonal");

    if (!checkAc && !repairZeroDiagonals)
      return;

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::rcp(new Teuchos::ParameterList());
    p->set("DoOptimizeStorage", true);

    RCP<const Map> rowMap = Ac->getRowMap();
    RCP<Vector> diagVec = VectorFactory::Build(rowMap);
    Ac->getLocalDiagCopy(*diagVec);

    LO lZeroDiags = 0;
    Teuchos::ArrayRCP< Scalar > diagVal = diagVec->getDataNonConst(0);

    for (size_t i = 0; i < rowMap->getNodeNumElements(); i++) {
      if (diagVal[i] == zero) {
        lZeroDiags++;
      }
    }
    GO gZeroDiags;
    MueLu_sumAll(rowMap->getComm(), Teuchos::as<GO>(lZeroDiags), gZeroDiags);

    if (repairZeroDiagonals && gZeroDiags > 0) {
      // TAW: If Ac has empty rows, put a 1 on the diagonal of Ac. Be aware that Ac might have empty rows AND columns.
      // The columns might not exist in the column map at all.
      //
      // It would be nice to add the entries to the original matrix Ac. But then we would have to use
      // insertLocalValues. However we cannot add new entries for local column indices that do not exist in the column map
      // of Ac (at least Epetra is not able to do this).
      //
      // Here we build a diagonal matrix with zeros on the diagonal and ones on the diagonal for the rows where Ac has empty rows
      // We have to build a new matrix to be able to use insertGlobalValues. Then we add the original matrix Ac to our new block
      // diagonal matrix and use the result as new (non-singular) matrix Ac.
      // This is very inefficient.
      //
      // If you know something better, please let me know.
      RCP<Matrix> fixDiagMatrix = Teuchos::null;
      fixDiagMatrix = MatrixFactory::Build(rowMap, 1);
      for (size_t r = 0; r < rowMap->getNodeNumElements(); r++) {
        if (diagVal[r] == zero) {
          GO grid = rowMap->getGlobalElement(r);
          Teuchos::ArrayRCP<GO> indout(1,grid);
          Teuchos::ArrayRCP<SC> valout(1, one);
          fixDiagMatrix->insertGlobalValues(grid,indout.view(0, 1), valout.view(0, 1));
        }
      }
      {
        Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete1"));
        Ac->fillComplete(p);
      }

      MueLu::Utils2<Scalar, LocalOrdinal, GlobalOrdinal, Node>::TwoMatrixAdd(*Ac, false, 1.0, *fixDiagMatrix, 1.0);
      if (Ac->IsView("stridedMaps"))
        fixDiagMatrix->CreateView("stridedMaps", Ac);

      Ac = Teuchos::null;     // free singular coarse level matrix
      Ac = fixDiagMatrix;     // set fixed non-singular coarse level matrix
    }

    // call fillComplete with optimized storage option set to true
    // This is necessary for new faster Epetra MM kernels.
    {
      Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete2"));
      Ac->fillComplete(p);
    }

    // print some output
    if (IsPrint(Warnings0))
      GetOStream(Warnings0) << "RAPFactory (WARNING): " << (repairZeroDiagonals ? "repaired " : "found ")
          << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;

#ifdef HAVE_MUELU_DEBUG // only for debugging
    // check whether Ac has been repaired...
    Ac->getLocalDiagCopy(*diagVec);
    Teuchos::ArrayRCP< Scalar > diagVal2 = diagVec->getDataNonConst(0);
    for (size_t r = 0; r < Ac->getRowMap()->getNodeNumElements(); r++) {
      if (diagVal2[r] == zero) {
        GetOStream(Errors,-1) << "Error: there are zeros left on diagonal after repair..." << std::endl;
        break;
      }
    }
#endif
  }
  void SubBlockAFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const ParameterList& pL = GetParameterList();
    size_t row = Teuchos::as<size_t>(pL.get<int>("block row"));
    size_t col = Teuchos::as<size_t>(pL.get<int>("block col"));

    RCP<Matrix>           Ain = currentLevel.Get< RCP<Matrix> >("A",this->GetFactory("A").get());
    RCP<BlockedCrsMatrix> A   = rcp_dynamic_cast<BlockedCrsMatrix>(Ain);

    TEUCHOS_TEST_FOR_EXCEPTION(A.is_null(),     Exceptions::BadCast,      "Input matrix A is not a BlockedCrsMatrix.");
    TEUCHOS_TEST_FOR_EXCEPTION(row > A->Rows(), Exceptions::RuntimeError, "row [" << row << "] > A.Rows() [" << A->Rows() << "].");
    TEUCHOS_TEST_FOR_EXCEPTION(col > A->Cols(), Exceptions::RuntimeError, "col [" << col << "] > A.Cols() [" << A->Cols() << "].");

    // get sub-matrix
    RCP<Matrix> Op = A->getMatrix(row, col);

    // Check if it is a BlockedCrsMatrix object
    // If it is a BlockedCrsMatrix object (most likely a ReorderedBlockedCrsMatrix)
    // we have to distinguish whether it is a 1x1 leaf block in the ReorderedBlockedCrsMatrix
    // or a nxm block. If it is a 1x1 leaf block, we "unpack" it and return the underlying
    // CrsMatrixWrap object.
    RCP<BlockedCrsMatrix> bOp = Teuchos::rcp_dynamic_cast<BlockedCrsMatrix>(Op);
    if (bOp != Teuchos::null) {
      // check if it is a 1x1 leaf block
      if (bOp->Rows() == 1 && bOp->Cols() == 1) {
        // return the unwrapped CrsMatrixWrap object underneath
        Op = bOp->getCrsMatrix();
        TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::rcp_dynamic_cast<CrsMatrixWrap>(Op) == Teuchos::null, Exceptions::BadCast, "SubBlockAFactory::Build: sub block A[" << row << "," << col << "] must be a single block CrsMatrixWrap object!");
      } else {
        // If it is a regular nxm blocked operator, just return it.
        // We do not set any kind of striding or blocking information as this
        // usually would not make sense for general blocked operators
        GetOStream(Statistics1) << "A(" << row << "," << col << ") is a " << bOp->Rows() << "x" << bOp->Cols() << " block matrix" << std::endl;
        GetOStream(Statistics2) << "with altogether " << bOp->getGlobalNumRows() << "x" << bOp->getGlobalNumCols() << " rows and columns." << std::endl;
        currentLevel.Set("A", Op, this);
        return;
      }
    }

    // The sub-block is not a BlockedCrsMatrix object, that is, we expect
    // it to be of type CrsMatrixWrap allowing direct access to the corresponding
    // data. For a single block CrsMatrixWrap type matrix we can/should set the
    // corresponding striding/blocking information for the algorithms to work
    // properly
    //
    // TAW: In fact, a 1x1 BlockedCrsMatrix object also allows to access the data
    //      directly, but this feature is nowhere really used in the algorithms.
    //      So let's keep checking for the CrsMatrixWrap class to avoid skrewing
    //      things up
    //
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::rcp_dynamic_cast<CrsMatrixWrap>(Op) == Teuchos::null, Exceptions::BadCast, "SubBlockAFactory::Build: sub block A[" << row << "," << col << "] is NOT a BlockedCrsMatrix but also NOT a CrsMatrixWrap object? This cannot be.");

    // strided maps for range and domain map of sub matrix
    RCP<const StridedMap> srangeMap  = Teuchos::null;
    RCP<const StridedMap> sdomainMap = Teuchos::null;

    // check for user-specified striding information from XML file

    std::vector<size_t> rangeStridingInfo;
    std::vector<size_t> domainStridingInfo;
    LocalOrdinal rangeStridedBlockId = 0;
    LocalOrdinal domainStridedBlockId = 0;
    bool bRangeUserSpecified  = CheckForUserSpecifiedBlockInfo(true,  rangeStridingInfo,  rangeStridedBlockId);
    bool bDomainUserSpecified = CheckForUserSpecifiedBlockInfo(false, domainStridingInfo, domainStridedBlockId);
    TEUCHOS_TEST_FOR_EXCEPTION(bRangeUserSpecified != bDomainUserSpecified, Exceptions::RuntimeError, "MueLu::SubBlockAFactory[" << row << "," << col << "]: the user has to specify either both domain and range map or none.");

    // extract map information from MapExtractor
    RCP<const MapExtractor> rangeMapExtractor  = A->getRangeMapExtractor();
    RCP<const MapExtractor> domainMapExtractor = A->getDomainMapExtractor();

    RCP<const Map> rangeMap  = rangeMapExtractor ->getMap(row);
    RCP<const Map> domainMap = domainMapExtractor->getMap(col);

    // use user-specified striding information if available. Otherwise try to use internal striding information from the submaps!
    if(bRangeUserSpecified) srangeMap = Teuchos::rcp(new StridedMap(rangeMap,rangeStridingInfo,rangeMap->getIndexBase(),rangeStridedBlockId,0));
    else srangeMap = rcp_dynamic_cast<const StridedMap>(rangeMap);

    if(bDomainUserSpecified) sdomainMap = Teuchos::rcp(new StridedMap(domainMap,domainStridingInfo,domainMap->getIndexBase(),domainStridedBlockId,0));
    else sdomainMap = rcp_dynamic_cast<const StridedMap>(domainMap);

    // In case that both user-specified and internal striding information from the submaps
    // does not contain valid striding information, try to extract it from the global maps
    // in the map extractor.
    if (srangeMap.is_null()) {
      RCP<const Map>         fullRangeMap = rangeMapExtractor->getFullMap();
      RCP<const StridedMap> sFullRangeMap = rcp_dynamic_cast<const StridedMap>(fullRangeMap);
      TEUCHOS_TEST_FOR_EXCEPTION(sFullRangeMap.is_null(), Exceptions::BadCast, "Full rangeMap is not a strided map.");

      std::vector<size_t> stridedData = sFullRangeMap->getStridingData();
      if (stridedData.size() == 1 && row > 0) {
        // We have block matrices. use striding block information 0
        srangeMap = StridedMapFactory::Build(rangeMap, stridedData,   0, sFullRangeMap->getOffset());

      } else {
        // We have strided matrices. use striding information of the corresponding block
        srangeMap = StridedMapFactory::Build(rangeMap, stridedData, row, sFullRangeMap->getOffset());
      }
    }

    if (sdomainMap.is_null()) {
      RCP<const Map>         fullDomainMap = domainMapExtractor->getFullMap();
      RCP<const StridedMap> sFullDomainMap = rcp_dynamic_cast<const StridedMap>(fullDomainMap);
      TEUCHOS_TEST_FOR_EXCEPTION(sFullDomainMap.is_null(), Exceptions::BadCast, "Full domainMap is not a strided map");

      std::vector<size_t> stridedData = sFullDomainMap->getStridingData();
      if (stridedData.size() == 1 && col > 0) {
        // We have block matrices. use striding block information 0
        sdomainMap = StridedMapFactory::Build(domainMap, stridedData,   0, sFullDomainMap->getOffset());

      } else {
        // We have strided matrices. use striding information of the corresponding block
        sdomainMap = StridedMapFactory::Build(domainMap, stridedData, col, sFullDomainMap->getOffset());
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(srangeMap.is_null(),  Exceptions::BadCast, "rangeMap "  << row << " is not a strided map.");
    TEUCHOS_TEST_FOR_EXCEPTION(sdomainMap.is_null(), Exceptions::BadCast, "domainMap " << col << " is not a strided map.");

    GetOStream(Statistics1) << "A(" << row << "," << col << ") is a single block and has strided maps:"
        << "\n  range  map fixed block size = " << srangeMap ->getFixedBlockSize() << ", strided block id = " << srangeMap ->getStridedBlockId()
        << "\n  domain map fixed block size = " << sdomainMap->getFixedBlockSize() << ", strided block id = " << sdomainMap->getStridedBlockId() << std::endl;
    GetOStream(Statistics2) << "A(" << row << "," << col << ") has " << Op->getGlobalNumRows() << "x" << Op->getGlobalNumCols() << " rows and columns." << std::endl;

    // TODO do we really need that? we moved the code to getMatrix...
    if (Op->IsView("stridedMaps") == true)
      Op->RemoveView("stridedMaps");
    Op->CreateView("stridedMaps", srangeMap, sdomainMap);

    TEUCHOS_TEST_FOR_EXCEPTION(Op->IsView("stridedMaps") == false, Exceptions::RuntimeError, "Failed to set \"stridedMaps\" view.");

    currentLevel.Set("A", Op, this);
  }
  void SaPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildP(Level &fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);
    std::ostringstream levelstr;
    levelstr << coarseLevel.GetLevelID();

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    // Get default tentative prolongator factory
    // Getting it that way ensure that the same factory instance will be used for both SaPFactory and NullspaceFactory.
    // -- Warning: Do not use directly initialPFact_. Use initialPFact instead everywhere!
    RCP<const FactoryBase> initialPFact = GetFactory("P");
    if (initialPFact == Teuchos::null) { initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }

    // Level Get
    RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
    RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

    if(restrictionMode_) {
      SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);
      A = Utils2::Transpose(*A, true); // build transpose of A explicitely
    }

    //Build final prolongator
    RCP<Matrix> finalP; // output

    const ParameterList & pL = GetParameterList();
    Scalar dampingFactor = as<Scalar>(pL.get<double>("sa: damping factor"));
    LO maxEigenIterations = as<LO>(pL.get<int>("sa: eigenvalue estimate num iterations"));
    bool estimateMaxEigen = pL.get<bool>("sa: calculate eigenvalue estimate");
    if (dampingFactor != Teuchos::ScalarTraits<Scalar>::zero()) {

      Scalar lambdaMax;
      {
        SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);
        lambdaMax = A->GetMaxEigenvalueEstimate();
        if (lambdaMax == -Teuchos::ScalarTraits<SC>::one() || estimateMaxEigen) {
          GetOStream(Statistics1) << "Calculating max eigenvalue estimate now (max iters = "<< maxEigenIterations << ")" << std::endl;
          Magnitude stopTol = 1e-4;
          lambdaMax = Utils::PowerMethod(*A, true, maxEigenIterations, stopTol);
          A->SetMaxEigenvalueEstimate(lambdaMax);
        } else {
          GetOStream(Statistics1) << "Using cached max eigenvalue estimate" << std::endl;
        }
        GetOStream(Statistics0) << "Prolongator damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
      }

      {
        SubFactoryMonitor m2(*this, "Fused (I-omega*D^{-1} A)*Ptent", coarseLevel);
        Teuchos::RCP<Vector> invDiag = Utils::GetMatrixDiagonalInverse(*A);

        SC omega = dampingFactor / lambdaMax;

        // finalP = Ptent + (I - \omega D^{-1}A) Ptent
        finalP = Utils::Jacobi(omega, *invDiag, *A, *Ptent, finalP, GetOStream(Statistics2),std::string("MueLu::SaP-")+levelstr.str());
      }

    } else {
      finalP = Ptent;
    }

    // Level Set
    if (!restrictionMode_) {
      // prolongation factory is in prolongation mode
      Set(coarseLevel, "P", finalP);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        finalP->CreateView("stridedMaps", Ptent);

    } else {
      // prolongation factory is in restriction mode
      RCP<Matrix> R = Utils2::Transpose(*finalP, true); // use Utils2 -> specialization for double
      Set(coarseLevel, "R", R);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        R->CreateView("stridedMaps", Ptent, true);
    }

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      params->set("printCommInfo",          true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), params);
    }

  } //Build()
Beispiel #4
0
  void SaPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level &fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    // Get default tentative prolongator factory
    // Getting it that way ensure that the same factory instance will be used for both SaPFactory and NullspaceFactory.
    // -- Warning: Do not use directly initialPFact_. Use initialPFact instead everywhere!
    RCP<const FactoryBase> initialPFact = GetFactory("P");
    if (initialPFact == Teuchos::null) { initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }

    // Level Get
    RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
    RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

    if(restrictionMode_) {
      SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);
      A = Utils2::Transpose(A, true); // build transpose of A explicitely
    }

    //Build final prolongator
    RCP<Matrix> finalP; // output

    //FIXME Xpetra::Matrix should calculate/stash max eigenvalue
    //FIXME SC lambdaMax = A->GetDinvALambda();

    const ParameterList & pL = GetParameterList();
    Scalar dampingFactor = pL.get<Scalar>("Damping factor");
    if (dampingFactor != Teuchos::ScalarTraits<Scalar>::zero()) {

      //Teuchos::ParameterList matrixList;
      //RCP<Matrix> I = MueLu::Gallery::CreateCrsMatrix<SC, LO, GO, Map, CrsMatrixWrap>("Identity", Get< RCP<Matrix> >(fineLevel, "A")->getRowMap(), matrixList);
      //RCP<Matrix> newPtent = Utils::TwoMatrixMultiply(I, false, Ptent, false);
      //Ptent = newPtent; //I tried a checkout of the original Ptent, and it seems to be gone now (which is good)

      RCP<Matrix> AP;
      {
        SubFactoryMonitor m2(*this, "MxM: A x Ptentative", coarseLevel);
        //JJH -- If I switch doFillComplete to false, the resulting matrix seems weird when printed with describe.
        //JJH -- The final prolongator is wrong, to boot.  So right now, I fillComplete AP, but avoid fillComplete
        //JJH -- in the scaling.  Long story short, we're doing 2 fillCompletes, where ideally we'd do just one.
        bool doFillComplete=true;

        bool optimizeStorage=true;

        // FIXME: ADD() need B.getProfileType()==DynamicProfile
        if (A->getRowMap()->lib() == Xpetra::UseTpetra) {
          optimizeStorage=false;
        }

        bool allowMLMultiply = true;
#ifdef HAVE_MUELU_EXPERIMENTAL
        // Energy minimization uses AP pattern for restriction. The problem with ML multiply is that it automatically
        // removes zero valued entries in the matrix product, resulting in incorrect pattern for the minimization.
        // One could try to mitigate that by multiply matrices with all entries equal to zero, which would produce the
        // correct graph. However, that is one extra MxM we don't need.
        // Instead, I disable ML multiply when experimental option is specified.
        // NOTE: Thanks to C.Siefert, native EPetra MxM multiply version should actually be comparable with ML in time
        allowMLMultiply      = false;
#endif

        AP = Utils::Multiply(*A, false, *Ptent, false, doFillComplete, optimizeStorage, allowMLMultiply);
      }

      {
        SubFactoryMonitor m2(*this, "Scaling (A x Ptentative) by D^{-1}", coarseLevel);
        bool doFillComplete=true;
        bool optimizeStorage=false;
        Teuchos::ArrayRCP<SC> diag = Utils::GetMatrixDiagonal(*A);
        Utils::MyOldScaleMatrix(AP, diag, true, doFillComplete, optimizeStorage); //scale matrix with reciprocal of diag
      }

      Scalar lambdaMax;
      {
        SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);
        lambdaMax = A->GetMaxEigenvalueEstimate();
        if (lambdaMax == -Teuchos::ScalarTraits<SC>::one()) {
          GetOStream(Statistics1, 0) << "Calculating max eigenvalue estimate now" << std::endl;
          Magnitude stopTol = 1e-4;
          lambdaMax = Utils::PowerMethod(*A, true, (LO) 10, stopTol);
          A->SetMaxEigenvalueEstimate(lambdaMax);
        } else {
          GetOStream(Statistics1, 0) << "Using cached max eigenvalue estimate" << std::endl;
        }
        GetOStream(Statistics1, 0) << "Damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
      }

      {
        SubFactoryMonitor m2(*this, "M+M: P = (Ptentative) + (D^{-1} x A x Ptentative)", coarseLevel);

        bool doTranspose=false;
        bool PtentHasFixedNnzPerRow=true;
        Utils2::TwoMatrixAdd(Ptent, doTranspose, Teuchos::ScalarTraits<Scalar>::one(), AP, doTranspose, -dampingFactor/lambdaMax, finalP, PtentHasFixedNnzPerRow);
      }

      {
        SubFactoryMonitor m2(*this, "FillComplete() of P", coarseLevel);
        finalP->fillComplete( Ptent->getDomainMap(), Ptent->getRangeMap() );
      }

    } else {
      finalP = Ptent;
    }

    // Level Set
    if (!restrictionMode_) {
      // prolongation factory is in prolongation mode
      Set(coarseLevel, "P", finalP);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        finalP->CreateView("stridedMaps", Ptent);

    } else {
      // prolongation factory is in restriction mode
      RCP<Matrix> R = Utils2::Transpose(finalP, true); // use Utils2 -> specialization for double
      Set(coarseLevel, "R", R);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        R->CreateView("stridedMaps", Ptent, true);
    }

    RCP<ParameterList> params = rcp(new ParameterList());
    params->set("printLoadBalancingInfo", true);
    GetOStream(Statistics0,0) << Utils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), params);

  } //Build()
  void SaPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level &fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    // Get default tentative prolongator factory
    // Getting it that way ensure that the same factory instance will be used for both SaPFactory and NullspaceFactory.
    // -- Warning: Do not use directly initialPFact_. Use initialPFact instead everywhere!
    RCP<const FactoryBase> initialPFact = GetFactory("P");
    if (initialPFact == Teuchos::null) { initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }

    // Level Get
    RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
    RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

    if(restrictionMode_) {
      SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);
      A = Utils2::Transpose(*A, true); // build transpose of A explicitely
    }

    //Build final prolongator
    RCP<Matrix> finalP; // output

    //FIXME Xpetra::Matrix should calculate/stash max eigenvalue
    //FIXME SC lambdaMax = A->GetDinvALambda();

    const ParameterList & pL = GetParameterList();
    Scalar dampingFactor = pL.get<Scalar>("Damping factor");
    if (dampingFactor != Teuchos::ScalarTraits<Scalar>::zero()) {

      //Teuchos::ParameterList matrixList;
      //RCP<Matrix> I = MueLu::Gallery::CreateCrsMatrix<SC, LO, GO, Map, CrsMatrixWrap>("Identity", Get< RCP<Matrix> >(fineLevel, "A")->getRowMap(), matrixList);
      //RCP<Matrix> newPtent = Utils::TwoMatrixMultiply(I, false, Ptent, false);
      //Ptent = newPtent; //I tried a checkout of the original Ptent, and it seems to be gone now (which is good)

      Scalar lambdaMax;
      {
        SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);
        lambdaMax = A->GetMaxEigenvalueEstimate();
        if (lambdaMax == -Teuchos::ScalarTraits<SC>::one()) {
          GetOStream(Statistics1, 0) << "Calculating max eigenvalue estimate now" << std::endl;
          Magnitude stopTol = 1e-4;
          lambdaMax = Utils::PowerMethod(*A, true, (LO) 10, stopTol);
          A->SetMaxEigenvalueEstimate(lambdaMax);
        } else {
          GetOStream(Statistics1, 0) << "Using cached max eigenvalue estimate" << std::endl;
        }
        GetOStream(Statistics0, 0) << "Prolongator damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
      }

      {
        SubFactoryMonitor m2(*this, "Fused (I-omega*D^{-1} A)*Ptent", coarseLevel);
        Teuchos::RCP<Vector> invDiag = Utils::GetMatrixDiagonalInverse(*A);

	SC omega = dampingFactor / lambdaMax;
	finalP=Utils::Jacobi(omega,*invDiag,*A, *Ptent, finalP,GetOStream(Statistics2,0));
      }

    } else {
      finalP = Ptent;
    }

    // Level Set
    if (!restrictionMode_) {
      // prolongation factory is in prolongation mode
      Set(coarseLevel, "P", finalP);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        finalP->CreateView("stridedMaps", Ptent);

    } else {
      // prolongation factory is in restriction mode
      RCP<Matrix> R = Utils2::Transpose(*finalP, true); // use Utils2 -> specialization for double
      Set(coarseLevel, "R", R);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        R->CreateView("stridedMaps", Ptent, true);
    }

    RCP<ParameterList> params = rcp(new ParameterList());
    params->set("printLoadBalancingInfo", true);
    GetOStream(Statistics1,0) << Utils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), params);

  } //Build()
  TEUCHOS_UNIT_TEST(CoalesceDropFactory, AmalgamationStridedLW)
  {
#   include "MueLu_UseShortNames.hpp"
    MUELU_TESTING_SET_OSTREAM;
    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,NO);
    out << "version: " << MueLu::Version() << std::endl;

    // unit test for block size 3 using a strided map
    // lightweight wrap = true

    RCP<const Teuchos::Comm<int> > comm = Parameters::getDefaultComm();

    Level fineLevel;
    TestHelpers::TestFactory<SC,LO,GO,NO>::createSingleLevelHierarchy(fineLevel);

    int blockSize=3;

    GO nx = blockSize*comm->getSize();
    RCP<Matrix> A = TestHelpers::TestFactory<SC,LO,GO,NO>::Build1DPoisson(nx);

    std::vector<size_t> stridingInfo;
    stridingInfo.push_back(as<size_t>(blockSize));
    LocalOrdinal stridedBlockId = -1;

    RCP<const Xpetra::StridedMap<LocalOrdinal, GlobalOrdinal, Node> > stridedRangeMap = Xpetra::StridedMapFactory<LocalOrdinal, GlobalOrdinal, Node>::Build(
                                                  A->getRangeMap(),
                                                  stridingInfo,
                                                  stridedBlockId,
                                                  0 /*offset*/
                                                  );
    RCP<const Map> stridedDomainMap = Xpetra::StridedMapFactory<LocalOrdinal, GlobalOrdinal, Node>::Build(
                                            A->getDomainMap(),
                                            stridingInfo,
                                            stridedBlockId,
                                            0 /*offset*/
                                            );

    if(A->IsView("stridedMaps") == true) A->RemoveView("stridedMaps");
    A->CreateView("stridedMaps", stridedRangeMap, stridedDomainMap);

    fineLevel.Set("A", A);
    CoalesceDropFactory dropFact = CoalesceDropFactory();
    dropFact.SetParameter("lightweight wrap",Teuchos::ParameterEntry(true));
    fineLevel.Request("Graph", &dropFact);
    fineLevel.Request("DofsPerNode", &dropFact);

    dropFact.Build(fineLevel);

    fineLevel.print(out);
    RCP<GraphBase> graph = fineLevel.Get<RCP<GraphBase> >("Graph", &dropFact);
    LO myDofsPerNode = fineLevel.Get<LO>("DofsPerNode", &dropFact);
    TEST_EQUALITY(as<int>(graph->GetDomainMap()->getGlobalNumElements()) == comm->getSize(), true);
    TEST_EQUALITY(as<int>(myDofsPerNode) == blockSize, true);
    bool bCorrectGraph = false;
    if (comm->getSize() == 1 && graph->getNeighborVertices(0).size() == 1) {
      bCorrectGraph = true;
    } else {
      if (comm->getRank() == 0 || comm->getRank() == comm->getSize()-1) {
        if (graph->getNeighborVertices(0).size() == 2) bCorrectGraph = true;
      }
      else {
        if (graph->getNeighborVertices(0).size() == blockSize) bCorrectGraph = true;
      }
    }
    TEST_EQUALITY(bCorrectGraph, true);

    const RCP<const Map> myImportMap = graph->GetImportMap(); // < note that the ImportMap is built from the column map of the matrix A WITHOUT dropping!
    const RCP<const Map> myDomainMap = graph->GetDomainMap();

    TEST_EQUALITY(myImportMap->getMaxAllGlobalIndex(), comm->getSize()-1);
    TEST_EQUALITY(myImportMap->getMinAllGlobalIndex(), 0);
    TEST_EQUALITY(myImportMap->getMinLocalIndex(),0);
    TEST_EQUALITY(myImportMap->getGlobalNumElements(),as<size_t>(comm->getSize()+2*(comm->getSize()-1)));
    if (comm->getSize()>1) {
      size_t numLocalRowMapElts = graph->GetNodeNumVertices();
      size_t numLocalImportElts = myImportMap->getNodeNumElements();
      if (comm->getRank() == 0 || comm->getRank() == comm->getSize()-1) {
        TEST_EQUALITY(as<bool>(numLocalImportElts==numLocalRowMapElts+1), true);
      } else {
        TEST_EQUALITY(as<bool>(numLocalImportElts==numLocalRowMapElts+2), true);
      }
    }
    if (comm->getSize()>1) {
      size_t numLocalRowMapElts = graph->GetNodeNumVertices();
      size_t maxLocalIndex = myImportMap->getMaxLocalIndex();
      if (comm->getRank() == 0 || comm->getRank() == comm->getSize()-1) {
        TEST_EQUALITY(as<bool>(maxLocalIndex==numLocalRowMapElts*blockSize-2), true);
      } else {
        TEST_EQUALITY(as<bool>(maxLocalIndex==numLocalRowMapElts*blockSize-1), true);
      }
    }

    TEST_EQUALITY(myDomainMap->getMaxAllGlobalIndex(), comm->getSize()-1);
    TEST_EQUALITY(myDomainMap->getMinAllGlobalIndex(), 0);
    TEST_EQUALITY(myDomainMap->getMinLocalIndex(),0);
    TEST_EQUALITY(myDomainMap->getMaxLocalIndex(),0);
    TEST_EQUALITY(myDomainMap->getGlobalNumElements(),as<size_t>(comm->getSize()));
    TEST_EQUALITY(as<bool>(myDomainMap->getNodeNumElements()==1), true);
  } // AmalgamationStridedLW
void SubBlockAFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    const ParameterList& pL = GetParameterList();
    size_t row = Teuchos::as<size_t>(pL.get<int>("block row"));
    size_t col = Teuchos::as<size_t>(pL.get<int>("block col"));

    RCP<Matrix>           Ain = Get<RCP<Matrix> >(currentLevel, "A");
    RCP<BlockedCrsMatrix> A   = rcp_dynamic_cast<BlockedCrsMatrix>(Ain);

    TEUCHOS_TEST_FOR_EXCEPTION(A.is_null(),     Exceptions::BadCast,      "Input matrix A is not a BlockedCrsMatrix.");
    TEUCHOS_TEST_FOR_EXCEPTION(row > A->Rows(), Exceptions::RuntimeError, "row [" << row << "] > A.Rows() [" << A->Rows() << "].");
    TEUCHOS_TEST_FOR_EXCEPTION(col > A->Cols(), Exceptions::RuntimeError, "col [" << col << "] > A.Cols() [" << A->Cols() << "].");

    RCP<CrsMatrixWrap> Op = Teuchos::rcp(new CrsMatrixWrap(A->getMatrix(row, col)));

    //////////////// EXPERIMENTAL
    // extract striding information from RangeMapExtractor

    RCP<const MapExtractor> rangeMapExtractor  = A->getRangeMapExtractor();
    RCP<const MapExtractor> domainMapExtractor = A->getDomainMapExtractor();

    RCP<const Map> rangeMap  = rangeMapExtractor ->getMap(row);
    RCP<const Map> domainMap = domainMapExtractor->getMap(col);

    RCP<const StridedMap> srangeMap  = rcp_dynamic_cast<const StridedMap>(rangeMap);
    RCP<const StridedMap> sdomainMap = rcp_dynamic_cast<const StridedMap>(domainMap);

    if (srangeMap.is_null()) {
        RCP<const Map>         fullRangeMap = rangeMapExtractor->getFullMap();
        RCP<const StridedMap> sFullRangeMap = rcp_dynamic_cast<const StridedMap>(fullRangeMap);
        TEUCHOS_TEST_FOR_EXCEPTION(sFullRangeMap.is_null(), Exceptions::BadCast, "Full rangeMap is not a strided map.");

        std::vector<size_t> stridedData = sFullRangeMap->getStridingData();
        if (stridedData.size() == 1 && row > 0) {
            // We have block matrices. use striding block information 0
            srangeMap = StridedMapFactory::Build(rangeMap, stridedData,   0, sFullRangeMap->getOffset());

        } else {
            // We have strided matrices. use striding information of the corresponding block
            srangeMap = StridedMapFactory::Build(rangeMap, stridedData, row, sFullRangeMap->getOffset());
        }
    }

    if (sdomainMap.is_null()) {
        RCP<const Map>         fullDomainMap = domainMapExtractor->getFullMap();
        RCP<const StridedMap> sFullDomainMap = rcp_dynamic_cast<const StridedMap>(fullDomainMap);
        TEUCHOS_TEST_FOR_EXCEPTION(sFullDomainMap.is_null(), Exceptions::BadCast, "Full domainMap is not a strided map");

        std::vector<size_t> stridedData = sFullDomainMap->getStridingData();
        if (stridedData.size() == 1 && col > 0) {
            // We have block matrices. use striding block information 0
            sdomainMap = StridedMapFactory::Build(domainMap, stridedData,   0, sFullDomainMap->getOffset());

        } else {
            // We have strided matrices. use striding information of the corresponding block
            sdomainMap = StridedMapFactory::Build(domainMap, stridedData, col, sFullDomainMap->getOffset());
        }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(srangeMap.is_null(),  Exceptions::BadCast, "rangeMap "  << row << " is not a strided map.");
    TEUCHOS_TEST_FOR_EXCEPTION(sdomainMap.is_null(), Exceptions::BadCast, "domainMap " << col << " is not a strided map.");

    GetOStream(Statistics1) << "A(" << row << "," << col << ") has strided maps:"
                            << "\n  range  map fixed block size = " << srangeMap ->getFixedBlockSize() << ", strided block id = " << srangeMap ->getStridedBlockId()
                            << "\n  domain map fixed block size = " << sdomainMap->getFixedBlockSize() << ", strided block id = " << sdomainMap->getStridedBlockId() << std::endl;

    if (Op->IsView("stridedMaps") == true)
        Op->RemoveView("stridedMaps");
    Op->CreateView("stridedMaps", srangeMap, sdomainMap);

    TEUCHOS_TEST_FOR_EXCEPTION(Op->IsView("stridedMaps") == false, Exceptions::RuntimeError, "Failed to set \"stridedMaps\" view.");

    //////////////// EXPERIMENTAL

    currentLevel.Set("A", rcp_dynamic_cast<Matrix>(Op), this);
}
  void SaPFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);

    // Add debugging information
    DeviceType::execution_space::print_configuration(GetOStream(Runtime1));

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    // Get default tentative prolongator factory
    // Getting it that way ensure that the same factory instance will be used for both SaPFactory_kokkos and NullspaceFactory.
    // -- Warning: Do not use directly initialPFact_. Use initialPFact instead everywhere!
    RCP<const FactoryBase> initialPFact = GetFactory("P");
    if (initialPFact == Teuchos::null) { initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }

    // Level Get
    RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
    RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

    if(restrictionMode_) {
      SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);

      A = Utilities_kokkos::Transpose(*A, true); // build transpose of A explicitly
    }

    //Build final prolongator
    RCP<Matrix> finalP; // output

    // Reuse pattern if available
    RCP<ParameterList> APparams = rcp(new ParameterList);
    if (coarseLevel.IsAvailable("AP reuse data", this)) {
      GetOStream(static_cast<MsgType>(Runtime0 | Test)) << "Reusing previous AP data" << std::endl;

      APparams = coarseLevel.Get< RCP<ParameterList> >("AP reuse data", this);

      if (APparams->isParameter("graph"))
        finalP = APparams->get< RCP<Matrix> >("graph");
    }

    const ParameterList& pL = GetParameterList();
    SC dampingFactor      = as<SC>(pL.get<double>("sa: damping factor"));
    LO maxEigenIterations = as<LO>(pL.get<int>("sa: eigenvalue estimate num iterations"));
    bool estimateMaxEigen = pL.get<bool>("sa: calculate eigenvalue estimate");
    if (dampingFactor != Teuchos::ScalarTraits<SC>::zero()) {

      SC lambdaMax;
      {
        SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);
        lambdaMax = A->GetMaxEigenvalueEstimate();
        if (lambdaMax == -Teuchos::ScalarTraits<SC>::one() || estimateMaxEigen) {
          GetOStream(Statistics1) << "Calculating max eigenvalue estimate now (max iters = "<< maxEigenIterations << ")" << std::endl;
          Magnitude stopTol = 1e-4;
          lambdaMax = Utilities_kokkos::PowerMethod(*A, true, maxEigenIterations, stopTol);
          A->SetMaxEigenvalueEstimate(lambdaMax);
        } else {
          GetOStream(Statistics1) << "Using cached max eigenvalue estimate" << std::endl;
        }
        GetOStream(Statistics0) << "Prolongator damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
      }

      {
        SubFactoryMonitor m2(*this, "Fused (I-omega*D^{-1} A)*Ptent", coarseLevel);
        RCP<Vector> invDiag = Utilities_kokkos::GetMatrixDiagonalInverse(*A);

        SC omega = dampingFactor / lambdaMax;

        // finalP = Ptent + (I - \omega D^{-1}A) Ptent
        finalP = Xpetra::IteratorOps<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Jacobi(omega, *invDiag, *A, *Ptent, finalP, GetOStream(Statistics2), std::string("MueLu::SaP-") + toString(coarseLevel.GetLevelID()), APparams);
      }

    } else {
      finalP = Ptent;
    }

    // Level Set
    if (!restrictionMode_) {
      // prolongation factory is in prolongation mode
      Set(coarseLevel, "P", finalP);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        finalP->CreateView("stridedMaps", Ptent);

    } else {
      // prolongation factory is in restriction mode
      RCP<Matrix> R = Utilities_kokkos::Transpose(*finalP, true);
      Set(coarseLevel, "R", R);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        R->CreateView("stridedMaps", Ptent, true);
    }

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      params->set("printCommInfo",          true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), params);
    }

  } //Build()