void CoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const
  {
    FactoryMonitor m(*this, "Build", currentLevel);

    RCP<Aggregates> aggregates;
    {
      //TODO check for reuse of aggregates here

      // Level Get
      RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

      // Build
      aggregates = rcp(new Aggregates(*graph));
      aggregates->setObjectLabel("UC");

      algo1_.CoarsenUncoupled(*graph, *aggregates);
      algo2_.AggregateLeftovers(*graph, *aggregates);

    }

    aggregates->AggregatesCrossProcessors(true);

    // Level Set
    Set(currentLevel, "Aggregates", aggregates);

    if (IsPrint(Statistics0)) {
      aggregates->describe(GetOStream(Statistics0, 0), getVerbLevel());
    }

  }
Example #2
0
void UserAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);

  const ParameterList & pL = GetParameterList();

  RCP< const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();
  const int myRank = comm->getRank();

  std::string fileName = pL.get<std::string>("filePrefix") + toString(currentLevel.GetLevelID()) + "_" + toString(myRank) + "." + pL.get<std::string>("fileExt");
  std::ifstream ifs(fileName.c_str());
  if (!ifs.good())
    throw Exceptions::RuntimeError("Cannot read data from \"" + fileName + "\"");

  LO numVertices, numAggregates;
  ifs >> numVertices >> numAggregates;

  // FIXME: what is the map?
  Xpetra::UnderlyingLib  lib       = Xpetra::UseEpetra;
  const int              indexBase = 0;
  RCP<Map> map = MapFactory::Build(lib, numVertices, indexBase, comm);

  RCP<Aggregates> aggregates = rcp(new Aggregates(map));
  aggregates->setObjectLabel("User");

  aggregates->SetNumAggregates(numAggregates);

  Teuchos::ArrayRCP<LO> vertex2AggId = aggregates->GetVertex2AggId()->getDataNonConst(0);
  Teuchos::ArrayRCP<LO> procWinner   = aggregates->GetProcWinner()  ->getDataNonConst(0);

  for (LO i = 0; i < numAggregates; i++) {
    int aggSize = 0;
    ifs >> aggSize;

    std::vector<LO> list(aggSize);
    for (int k = 0; k < aggSize; k++) {
      // FIXME: File contains GIDs, we need LIDs
      // for now, works on a single processor
      ifs >> list[k];
    }

    // Mark first node as root node for the aggregate
    aggregates->SetIsRoot(list[0]);

    // Fill vertex2AggId and procWinner structure with information
    for (int k = 0; k < aggSize; k++) {
      vertex2AggId[list[k]] = i;
      procWinner  [list[k]] = myRank;
    }
  }

  // FIXME: do the proper check whether aggregates cross interprocessor boundary
  aggregates->AggregatesCrossProcessors(false);

  Set(currentLevel, "Aggregates", aggregates);

  GetOStream(Statistics0, 0) << aggregates->description() << std::endl;
}
  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Aggregates_kokkos, JustUncoupledAggregation, Scalar, LocalOrdinal, GlobalOrdinal, Node)
  {
#   include "MueLu_UseShortNames.hpp"
    MUELU_TESTING_SET_OSTREAM;
    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,NO);
    out << "version: " << MueLu::Version() << std::endl;

    RCP<Matrix> A = TestHelpers_kokkos::TestFactory<SC, LO, GO, NO>::Build1DPoisson(15);

    RCP<AmalgamationInfo> amalgInfo;
    RCP<Aggregates_kokkos> aggregates = gimmeUncoupledAggregates(A, amalgInfo);

    TEST_EQUALITY(aggregates != Teuchos::null,              true);
    TEST_EQUALITY(aggregates->AggregatesCrossProcessors(),  false);
  }
  TEUCHOS_UNIT_TEST(Aggregates, UncoupledPhase3)
  {
    out << "version: " << MueLu::Version() << std::endl;

    RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(36);
    RCP<const Map> rowmap = A->getRowMap();
    RCP<AmalgamationInfo> amalgInfo;
    RCP<Aggregates> aggregates = gimmeUncoupledAggregates(A, amalgInfo,false,false,false,true);
    GO numAggs = aggregates->GetNumAggregates();
    RCP<const Teuchos::Comm<int> > comm = TestHelpers::Parameters::getDefaultComm();

    TEST_EQUALITY(aggregates->AggregatesCrossProcessors(),false);

    ArrayRCP<LO> aggSizes = Teuchos::ArrayRCP<LO>(numAggs);
    ArrayRCP<LO> aggStart;
    ArrayRCP<GO> aggToRowMap;
    amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap);
    for (LO i = 0; i < numAggs; ++i)
      aggSizes[i] = aggStart[i+1] - aggStart[i];

    bool foundAggNotSize2=false;
    for (int i=0; i<aggSizes.size(); ++i)
      if (aggSizes[i] != 2) {
        foundAggNotSize2=true;
        break;
      }

    switch (comm->getSize()) {

      case 1 :
        TEST_EQUALITY(numAggs, 18);
        TEST_EQUALITY(foundAggNotSize2, false);
        break;

      case 2:
        TEST_EQUALITY(numAggs, 9);
        TEST_EQUALITY(foundAggNotSize2, false);
        break;

      case 3:
        TEST_EQUALITY(numAggs, 6);
        TEST_EQUALITY(foundAggNotSize2, false);
        break;

      case 4:
        TEST_EQUALITY(numAggs, 4);
        TEST_EQUALITY(foundAggNotSize2, true);
        break;

      default:
        std::string msg = "Only 1-4 MPI processes are supported.";
        //throw(MueLu::Exceptions::NotImplemented(msg));
        out << msg << std::endl;
        break;
    }

    //ArrayRCP< ArrayRCP<GO> > aggToRowMap(numAggs);
    int root = out.getOutputToRootOnly();
    out.setOutputToRootOnly(-1);
    for (int j=0; j<comm->getSize(); ++j) {
      if (comm->getRank() == j) {
        out << "++ pid " << j << " ++" << std::endl;
        out << "   num local DOFs = " << rowmap->getNodeNumElements() << std::endl;
        for (int i=0; i< numAggs; ++i) {
          out << "   aggregate " << i << ": ";
          for (int k=aggStart[i]; k< aggStart[i+1]; ++k)
            out << aggToRowMap[k] << " ";
          out << std::endl;
        }
      }
      comm->barrier();
    }
    out.setOutputToRootOnly(root);

  } //UncoupledPhase3
  void BrickAggregationFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    typedef Xpetra::MultiVector<double,LO,GO,NO> MultiVector_d;

    const ParameterList& pL = GetParameterList();

    RCP<MultiVector_d> coords = Get<RCP<MultiVector_d> >(currentLevel, "Coordinates");
    RCP<Matrix>        A      = Get< RCP<Matrix> >      (currentLevel, "A");
    RCP<const Map>     rowMap = A->getRowMap();
    RCP<const Map>     colMap = A->getColMap();

    RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
    int numProcs = comm->getSize();
    int myRank   = comm->getRank();

    int numPoints = colMap->getNodeNumElements();

    bx_ = pL.get<int>("aggregation: brick x size");
    by_ = pL.get<int>("aggregation: brick y size");
    bz_ = pL.get<int>("aggregation: brick z size");

    if (numProcs > 1) {
      // TODO: deal with block size > 1  (see comments above)
      TEUCHOS_TEST_FOR_EXCEPTION(bx_ > 3 || by_ > 3 || bz_ > 3, Exceptions::RuntimeError, "Currently cannot deal with brick size > 3");
    }

    RCP<MultiVector_d> overlappedCoords = coords;
    RCP<const Import> importer = ImportFactory::Build(coords->getMap(), colMap);
    if (!importer.is_null()) {
      overlappedCoords = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(colMap, coords->getNumVectors());
      overlappedCoords->doImport(*coords, *importer, Xpetra::INSERT);
    }

    // Setup misc structures
    // Logically, we construct enough data to query topological information of a rectangular grid
    Setup(comm, overlappedCoords, colMap);

    GetOStream(Runtime0) << "Using brick size: " << bx_
                                                 << (nDim_ > 1 ? "x " + toString(by_) : "")
                                                 << (nDim_ > 2 ? "x " + toString(bz_) : "") << std::endl;

    // Construct aggregates
    RCP<Aggregates> aggregates = rcp(new Aggregates(colMap));
    aggregates->setObjectLabel("Brick");

    ArrayRCP<LO> vertex2AggId = aggregates->GetVertex2AggId()->getDataNonConst(0);
    ArrayRCP<LO> procWinner   = aggregates->GetProcWinner()  ->getDataNonConst(0);

    // In the first pass, we set a mapping from a vertex to aggregate global id. We deal with a structured
    // rectangular mesh, therefore we know the structure of aggregates. For each vertex we can tell exactly
    // which aggregate it belongs to.
    // If we determine that the aggregate does not belong to us (i.e. the root vertex does not belong to this
    // processor, or is outside and we lost "" arbitration), we record the global aggregate id in order to
    // fetch the local info from the processor owning the aggregate. This is required for aggregates, as it
    // uses the local aggregate ids of the owning processor.
    std::set<GO> myAggGIDs, remoteAggGIDs;
    for (LO LID = 0; LID < numPoints; LID++) {
      GO aggGID = getAggGID(LID);

      if ((revMap_.find(getRoot(LID)) != revMap_.end()) && rowMap->isNodeGlobalElement(colMap->getGlobalElement(revMap_[getRoot(LID)]))) {
        // Root of the brick aggregate containing GID (<- LID) belongs to us
        vertex2AggId[LID] = aggGID;
        myAggGIDs.insert(aggGID);

        if (isRoot(LID))
          aggregates->SetIsRoot(LID);

      } else {
        remoteAggGIDs.insert(aggGID);
      }
    }
    size_t numAggregates = myAggGIDs    .size();
    size_t numRemote     = remoteAggGIDs.size();
    aggregates->SetNumAggregates(numAggregates);

    std::map<GO,LO>  AggG2L;  // Map: Agg GID -> Agg LID (possibly on a different processor)
    std::map<GO,int> AggG2R;  // Map: Agg GID -> processor rank owning aggregate

    Array<GO> myAggGIDsArray(numAggregates), remoteAggGIDsArray(numRemote);

    // Fill in the maps for aggregates that we own
    size_t ind = 0;
    for (typename std::set<GO>::const_iterator it = myAggGIDs.begin(); it != myAggGIDs.end(); it++) {
      AggG2L[*it] = ind;
      AggG2R[*it] = myRank;

      myAggGIDsArray[ind++] = *it;
    }

    // The map is a convenient way to fetch remote local indices from global indices.
    RCP<Map> aggMap = MapFactory::Build(rowMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
                                        myAggGIDsArray, 0, comm);

    ind = 0;
    for (typename std::set<GO>::const_iterator it = remoteAggGIDs.begin(); it != remoteAggGIDs.end(); it++)
      remoteAggGIDsArray[ind++] = *it;

    // Fetch the required aggregate local ids and ranks
    Array<int> remoteProcIDs(numRemote);
    Array<LO>  remoteLIDs   (numRemote);
    aggMap->getRemoteIndexList(remoteAggGIDsArray, remoteProcIDs, remoteLIDs);

    // Fill in the maps for aggregates that we don't own but which have some of our vertices
    for (size_t i = 0; i < numRemote; i++) {
      AggG2L[remoteAggGIDsArray[i]] = remoteLIDs   [i];
      AggG2R[remoteAggGIDsArray[i]] = remoteProcIDs[i];
    }

    // Remap aggregate GIDs to LIDs and set up owning processors
    for (LO LID = 0; LID < numPoints; LID++) {
      if (revMap_.find(getRoot(LID)) != revMap_.end() && rowMap->isNodeGlobalElement(colMap->getGlobalElement(revMap_[getRoot(LID)]))) {
        GO aggGID = vertex2AggId[LID];

        vertex2AggId[LID] = AggG2L[aggGID];
        procWinner  [LID] = AggG2R[aggGID];
      }
    }

    GO numGlobalRemote;
    MueLu_sumAll(comm, as<GO>(numRemote), numGlobalRemote);
    aggregates->AggregatesCrossProcessors(numGlobalRemote);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
  void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    ParameterList pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    if (pL.get<std::string>("aggregation: mode") == "old") {
      if (pL.get<bool>("UseOnePtAggregationAlgorithm")             == true)   algos_.push_back(rcp(new OnePtAggregationAlgorithm             (graphFact)));
      if (pL.get<bool>("UsePreserveDirichletAggregationAlgorithm") == true)   algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
      if (pL.get<bool>("UseUncoupledAggregationAlgorithm")         == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm            (graphFact)));
      if (pL.get<bool>("UseMaxLinkAggregationAlgorithm")           == true)   algos_.push_back(rcp(new MaxLinkAggregationAlgorithm           (graphFact)));
      if (pL.get<bool>("UseEmergencyAggregationAlgorithm")         == true)   algos_.push_back(rcp(new EmergencyAggregationAlgorithm         (graphFact)));
                                                                              algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));

    } else {
      if (pL.get<bool>("aggregation: preserve Dirichlet points")   == true)   algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 1" )             == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm            (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 2a")             == true)   algos_.push_back(rcp(new AggregationPhase2aAlgorithm           (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 2b")             == true)   algos_.push_back(rcp(new AggregationPhase2bAlgorithm           (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 3" )             == true)   algos_.push_back(rcp(new AggregationPhase3Algorithm            (graphFact)));
                                                                              algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));
    }

    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name");
    RCP<const Map> OnePtMap;
    if (mapOnePtName.length()) {
      RCP<const FactoryBase> mapOnePtFact = GetFactory("OnePt aggregate map factory");
      OnePtMap = currentLevel.Get<RCP<const Map> >(mapOnePtName, mapOnePtFact.get());
    }

    RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
    aggregates->setObjectLabel("UC");

    const LO numRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(numRows, READY);

    ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    if (dirichletBoundaryMap != Teuchos::null)
      for (LO i = 0; i < numRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = BOUNDARY;

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (OnePtMap != Teuchos::null) {
      for (LO i = 0; i < numRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

        for (LO kr = 0; kr < nDofsPerNode; kr++)
          if (OnePtMap->isNodeGlobalElement(grid + kr))
            aggStat[i] = ONEPT;
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      sumAll(comm, as<GO>(numRows), numGlobalRows);

    LO numNonAggregatedNodes = numRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);

      if (IsPrint(Statistics1)) {

        GO numLocalAggregated = numRows - numNonAggregatedNodes, numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(),  numGlobalAggs = 0;
        sumAll(comm, numLocalAggregated, numGlobalAggregated);
        sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        if (aggPercent > 99.99 && aggPercent < 100.00) {
          // Due to round off (for instance, for 140465733/140466897), we could
          // get 100.00% display even if there are some remaining nodes. This
          // is bad from the users point of view. It is much better to change
          // it to display 99.99%.
          aggPercent = 99.99;
        }
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
  void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const ParameterList& pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    bool bUseOnePtAggregationAlgorithm             = pL.get<bool>("UseOnePtAggregationAlgorithm");
    bool bUseSmallAggregationAlgorithm             = pL.get<bool>("UseSmallAggregatesAggregationAlgorithm");
    bool bUsePreserveDirichletAggregationAlgorithm = pL.get<bool>("UsePreserveDirichletAggregationAlgorithm");
    bool bUseUncoupledAggregationAglorithm         = pL.get<bool>("UseUncoupledAggregationAlgorithm");
    bool bUseMaxLinkAggregationAlgorithm           = pL.get<bool>("UseMaxLinkAggregationAlgorithm");
    bool bUseIsolatedNodeAggregationAglorithm      = pL.get<bool>("UseIsolatedNodeAggregationAlgorithm");
    bool bUseEmergencyAggregationAlgorithm         = pL.get<bool>("UseEmergencyAggregationAlgorithm");

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    if (bUseOnePtAggregationAlgorithm)             algos_.push_back(rcp(new OnePtAggregationAlgorithm             (graphFact)));
    if (bUseSmallAggregationAlgorithm)             algos_.push_back(rcp(new SmallAggregationAlgorithm             (graphFact)));
    if (bUseUncoupledAggregationAglorithm)         algos_.push_back(rcp(new UncoupledAggregationAlgorithm         (graphFact)));
    if (bUseMaxLinkAggregationAlgorithm)           algos_.push_back(rcp(new MaxLinkAggregationAlgorithm           (graphFact)));
    if (bUsePreserveDirichletAggregationAlgorithm) algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
    if (bUseIsolatedNodeAggregationAglorithm)      algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));
    if (bUseEmergencyAggregationAlgorithm)         algos_.push_back(rcp(new EmergencyAggregationAlgorithm         (graphFact)));


    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name"), mapSmallAggName = pL.get<std::string>("SmallAgg aggregate map name");
    RCP<const Map> OnePtMap, SmallAggMap;
    if (mapOnePtName.length()) {
      RCP<const FactoryBase> mapOnePtFact = GetFactory("OnePt aggregate map factory");
      OnePtMap = currentLevel.Get<RCP<const Map> >(mapOnePtName, mapOnePtFact.get());
    }
    if (mapSmallAggName.length()) {
      RCP<const FactoryBase> mapSmallAggFact = GetFactory("SmallAgg aggregate map factory");
      SmallAggMap = currentLevel.Get<RCP<const Map> >(mapSmallAggName, mapSmallAggFact.get());
    }

    RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
    aggregates->setObjectLabel("UC");

    const LO nRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(nRows, NodeStats::READY);

    ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    if (dirichletBoundaryMap != Teuchos::null) {
      for (LO i = 0; i < nRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = NodeStats::BOUNDARY;
    }

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (SmallAggMap != Teuchos::null || OnePtMap != Teuchos::null) {
      for (LO i = 0; i < nRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

        if (SmallAggMap != null) {
          for (LO kr = 0; kr < nDofsPerNode; kr++) {
            if (SmallAggMap->isNodeGlobalElement(grid + kr))
              aggStat[i] = MueLu::NodeStats::SMALLAGG;
          }
        }

        if (OnePtMap != null) {
          for (LO kr = 0; kr < nDofsPerNode; kr++) {
            if (OnePtMap->isNodeGlobalElement(grid + kr))
              aggStat[i] = MueLu::NodeStats::ONEPT;
          }
        }
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      sumAll(comm, as<GO>(nRows), numGlobalRows);

    LO numNonAggregatedNodes = nRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);

      if (IsPrint(Statistics1)) {

        GO numLocalAggregated = nRows - numNonAggregatedNodes,  numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(), numGlobalAggs = 0;
        sumAll(comm, numLocalAggregated, numGlobalAggregated);
        sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
  void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level & fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    typedef Xpetra::MultiVector<double,LO,GO,NO> xdMV;

    GetOStream(Runtime0) << "Transferring coordinates" << std::endl;

    if (coarseLevel.IsAvailable("Coordinates", this)) {
      GetOStream(Runtime0) << "Reusing coordinates" << std::endl;
      return;
    }

    RCP<Aggregates>     aggregates = Get< RCP<Aggregates> > (fineLevel, "Aggregates");
    RCP<xdMV>    fineCoords = Get< RCP<xdMV> >(fineLevel, "Coordinates");
    RCP<const Map>      coarseMap  = Get< RCP<const Map> >  (fineLevel, "CoarseMap");

    // coarseMap is being used to set up the domain map of tentative P, and therefore, the row map of Ac
    // Therefore, if we amalgamate coarseMap, logical nodes in the coordinates vector would correspond to
    // logical blocks in the matrix

    ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
    LO                  blkSize      = 1;
    if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
      blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();

    GO                  indexBase    = coarseMap->getIndexBase();
    size_t              numElements  = elementAList.size() / blkSize;
    Array<GO>           elementList(numElements);

    // Amalgamate the map
    for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
      elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;

    RCP<const Map>   uniqueMap      = fineCoords->getMap();
    RCP<const Map>   coarseCoordMap = MapFactory        ::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), elementList, indexBase, coarseMap->getComm());
    RCP<xdMV> coarseCoords   = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(coarseCoordMap, fineCoords->getNumVectors());

    // Create overlapped fine coordinates to reduce global communication
    RCP<xdMV> ghostedCoords = fineCoords;
    if (aggregates->AggregatesCrossProcessors()) {
      RCP<const Map>    nonUniqueMap = aggregates->GetMap();
      RCP<const Import> importer     = ImportFactory::Build(uniqueMap, nonUniqueMap);

      ghostedCoords = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(nonUniqueMap, fineCoords->getNumVectors());
      ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
    }

    // Get some info about aggregates
    int                         myPID        = uniqueMap->getComm()->getRank();
    LO                          numAggs      = aggregates->GetNumAggregates();
    ArrayRCP<LO>                aggSizes     = aggregates->ComputeAggregateSizes(true,true);
    const ArrayRCP<const LO>    vertex2AggID = aggregates->GetVertex2AggId()->getData(0);
    const ArrayRCP<const LO>    procWinner   = aggregates->GetProcWinner()->getData(0);

    // Fill in coarse coordinates
    for (size_t j = 0; j < fineCoords->getNumVectors(); j++) {
      ArrayRCP<const double> fineCoordsData = ghostedCoords->getData(j);
      ArrayRCP<double>     coarseCoordsData = coarseCoords->getDataNonConst(j);

      for (LO lnode = 0; lnode < vertex2AggID.size(); lnode++)
        if (procWinner[lnode] == myPID)
          coarseCoordsData[vertex2AggID[lnode]] += fineCoordsData[lnode];

      for (LO agg = 0; agg < numAggs; agg++)
        coarseCoordsData[agg] /= aggSizes[agg];
    }

    Set<RCP<xdMV> >(coarseLevel, "Coordinates", coarseCoords);

    const ParameterList& pL = GetParameterList();
    int writeStart = pL.get<int>("write start"), writeEnd = pL.get<int>("write end");
    if (writeStart == 0 && fineLevel.GetLevelID() == 0 && writeStart <= writeEnd) {
      std::ostringstream buf;
      buf << fineLevel.GetLevelID();
      std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m";
      Xpetra::IO<double,LO,GO,NO>::Write(fileName,*fineCoords);
    }
    if (writeStart <= coarseLevel.GetLevelID() && coarseLevel.GetLevelID() <= writeEnd) {
      std::ostringstream buf;
      buf << coarseLevel.GetLevelID();
      std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m";
      Xpetra::IO<double,LO,GO,NO>::Write(fileName,*coarseCoords);
    }
  }