void calculateStats(Type& minVal, Type& maxVal, double& avgVal, double& devVal, const RCP<const Teuchos::Comm<int> >& comm, const Type& v) {
    int numProcs = comm->getSize();

    Type sumVal, sum2Val;

    MueLu_sumAll(comm,   v, sumVal);
    MueLu_sumAll(comm, v*v, sum2Val);
    MueLu_minAll(comm,   v, minVal);
    MueLu_maxAll(comm,   v, maxVal);

    avgVal = as<double>(sumVal) / numProcs;
    devVal = (numProcs != 1 ? sqrt((sum2Val - sumVal*avgVal)/(numProcs-1)) : 0);
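    // devVal above is the sample standard deviation of v across processes,
    //   sqrt((sum(v^2) - sum(v)*avg) / (numProcs - 1)),
    // and is defined to be 0 when running on a single process.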
  }
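  // Returns a multi-line string summarizing how Ac is distributed over processes. For illustration only
  // (hypothetical values), with msgTag == "A(2)" the returned string would look like:
  //   A(2) # processes with rows = 4
  //   A(2) min # rows per proc = 950, max # rows per proc = 1100, avg # rows per proc = 1025
  //   A(2) nonzero imbalance = 1.21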
  std::string MHDRAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PrintLoadBalancingInfo(const Matrix & Ac, const std::string & msgTag) {
    std::stringstream ss(std::stringstream::out);

    // TODO: provide an option to skip this (to avoid global communication)
    // TODO: skip if nproc == 1

    //nonzero imbalance
    size_t numMyNnz  = Ac.getNodeNumEntries();
    GO maxNnz, minNnz;
    RCP<const Teuchos::Comm<int> > comm = Ac.getRowMap()->getComm();
    MueLu_maxAll(comm,(GO)numMyNnz,maxNnz);
    // min nnz over all procs (processors with 0 nonzeros are excluded from the minimum)
    MueLu_minAll(comm, (GO)((numMyNnz > 0) ? numMyNnz : maxNnz), minNnz);
    double imbalance = ((double) maxNnz) / minNnz;

    size_t numMyRows = Ac.getNodeNumRows();
    //Check whether Ac is spread over more than one process.
    GO numActiveProcesses=0;
    MueLu_sumAll(comm, (GO)((numMyRows > 0) ? 1 : 0), numActiveProcesses);

    //min, max, and avg # rows per proc
    GO minNumRows, maxNumRows;
    double avgNumRows;
    MueLu_maxAll(comm, (GO)numMyRows, maxNumRows);
    MueLu_minAll(comm, (GO)((numMyRows > 0) ? numMyRows : maxNumRows), minNumRows);
    assert(numActiveProcesses > 0);
    avgNumRows = Teuchos::as<double>(Ac.getGlobalNumRows()) / numActiveProcesses;

    ss << msgTag << " # processes with rows = " << numActiveProcesses << std::endl;
    ss << msgTag << " min # rows per proc = " << minNumRows << ", max # rows per proc = " << maxNumRows << ", avg # rows per proc = " << avgNumRows << std::endl;
    ss << msgTag << " nonzero imbalance = " << imbalance << std::endl;

    return ss.str();
  }
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::CheckRepairMainDiagonal(RCP<Matrix>& Ac) const {
    const Teuchos::ParameterList& pL = GetParameterList();
    bool repairZeroDiagonals = pL.get<bool>("RepairMainDiagonal");
    bool checkAc             = pL.get<bool>("CheckMainDiagonal");

    if (!checkAc && !repairZeroDiagonals)
      return;

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::rcp(new Teuchos::ParameterList());
    p->set("DoOptimizeStorage", true);

    RCP<const Map> rowMap = Ac->getRowMap();
    RCP<Vector> diagVec = VectorFactory::Build(rowMap);
    Ac->getLocalDiagCopy(*diagVec);

    LO lZeroDiags = 0;
    Teuchos::ArrayRCP< Scalar > diagVal = diagVec->getDataNonConst(0);

    for (size_t i = 0; i < rowMap->getNodeNumElements(); i++) {
      if (diagVal[i] == zero) {
        lZeroDiags++;
      }
    }
    GO gZeroDiags;
    MueLu_sumAll(rowMap->getComm(), Teuchos::as<GO>(lZeroDiags), gZeroDiags);

    if (repairZeroDiagonals && gZeroDiags > 0) {
      // TAW: If Ac has empty rows, put a 1 on the diagonal of Ac. Be aware that Ac might have empty rows AND columns.
      // The columns might not exist in the column map at all.
      //
      // It would be nice to add the entries to the original matrix Ac. But then we would have to use
      // insertLocalValues. However we cannot add new entries for local column indices that do not exist in the column map
      // of Ac (at least Epetra is not able to do this).
      //
      // Here we build a diagonal matrix that has a one on the diagonal for every row where Ac has a zero diagonal entry,
      // and no entries elsewhere. We have to build a new matrix to be able to use insertGlobalValues. Then we add the
      // original matrix Ac to this diagonal matrix and use the result as the new (non-singular) matrix Ac.
      // This is very inefficient.
      //
      // If you know something better, please let me know.
      RCP<Matrix> fixDiagMatrix = MatrixFactory::Build(rowMap, 1);
      for (size_t r = 0; r < rowMap->getNodeNumElements(); r++) {
        if (diagVal[r] == zero) {
          GO grid = rowMap->getGlobalElement(r);
          Teuchos::ArrayRCP<GO> indout(1,grid);
          Teuchos::ArrayRCP<SC> valout(1, one);
          fixDiagMatrix->insertGlobalValues(grid,indout.view(0, 1), valout.view(0, 1));
        }
      }
      {
        Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete1"));
        Ac->fillComplete(p);
      }

      MueLu::Utils2<Scalar, LocalOrdinal, GlobalOrdinal, Node>::TwoMatrixAdd(*Ac, false, 1.0, *fixDiagMatrix, 1.0);
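      // Note: Utils2::TwoMatrixAdd is expected to compute B = alpha*op(A) + beta*B, so at this point fixDiagMatrix
      // holds the sum of the original Ac and the identity entries inserted above for the zero-diagonal rows.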
      if (Ac->IsView("stridedMaps"))
        fixDiagMatrix->CreateView("stridedMaps", Ac);

      Ac = Teuchos::null;     // free singular coarse level matrix
      Ac = fixDiagMatrix;     // set fixed non-singular coarse level matrix
    }

    // call fillComplete with optimized storage option set to true
    // This is necessary for new faster Epetra MM kernels.
    {
      Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete2"));
      Ac->fillComplete(p);
    }

    // print some output
    if (IsPrint(Warnings0))
      GetOStream(Warnings0) << "RAPFactory (WARNING): " << (repairZeroDiagonals ? "repaired " : "found ")
          << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;

#ifdef HAVE_MUELU_DEBUG // only for debugging
    // check whether Ac has been repaired...
    Ac->getLocalDiagCopy(*diagVec);
    Teuchos::ArrayRCP< Scalar > diagVal2 = diagVec->getDataNonConst(0);
    for (size_t r = 0; r < Ac->getRowMap()->getNodeNumElements(); r++) {
      if (diagVal2[r] == zero) {
        GetOStream(Errors,-1) << "Error: there are zeros left on diagonal after repair..." << std::endl;
        break;
      }
    }
#endif
  }
  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const Teuchos::ParameterList & pL = GetParameterList();
    // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
    // TODO (JG): I don't really know if we want to do this.
    const int    startLevel          = pL.get<int>   ("repartition: start level");
    const LO     minRowsPerProcessor = pL.get<LO>    ("repartition: min rows per proc");
    const double nonzeroImbalance    = pL.get<double>("repartition: max imbalance");
    const bool   remapPartitions     = pL.get<bool>  ("repartition: remap parts");

    // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
    RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");

    // ======================================================================================================
    // Determine whether partitioning is needed
    // ======================================================================================================
    // NOTE: most tests involve some global communication, which is why we currently run only as many tests as are
    // needed to decide whether to repartition. However, there is value in knowing how "close" we are to having to
    // rebalance an operator. So it would probably be beneficial to run and report *all* tests.

    // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
    if (currentLevel.GetLevelID() < startLevel) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" <<
          "\n  current level = " << Teuchos::toString(currentLevel.GetLevelID()) <<
          ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    RCP<const Map> rowMap = A->getRowMap();

    // NOTE: Teuchos::MpiComm::duplicate() calls MPI_Bcast inside, so this is
    // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it
    // does not matter.
    RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
    RCP<const Teuchos::Comm<int> > comm     = origComm->duplicate();

    // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
    // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
    // TODO: further improvements could be achieved if we used a subcommunicator for the active set; then we would only need to check its size
    {
      int numActiveProcesses = 0;
      MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses);

      if (numActiveProcesses == 1) {
        GetOStream(Statistics0) << "Repartitioning?  NO:" <<
            "\n  # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    bool test3 = false, test4 = false;
    std::string msg3, msg4;

    // Test3: check whether the number of rows on some processor falls below the required minimum
    // NOTE: Test2 ensures that repartitioning is not done when there is only one active processor (it may or may not satisfy Test3)
    if (minRowsPerProcessor > 0) {
      LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
      LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0;
      MueLu_sumAll(comm, haveFewRows, numWithFewRows);
      MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

      // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
      // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
      // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
      if (numWithFewRows > 0)
        test3 = true;

      msg3 = "\n  min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor);
    }

    // Test4: check whether the imbalance in the number of nonzeros per processor exceeds the threshold
    if (!test3) {
      GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
      MueLu_maxAll(comm, numMyNnz,                           maxNnz);
      MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
      double imbalance = Teuchos::as<double>(maxNnz)/minNnz;
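      // Example (hypothetical numbers): maxNnz = 1.2e6 and minNnz = 8.0e5 give imbalance = 1.5, so Test4 fires
      // whenever the configured "repartition: max imbalance" is below 1.5.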

      if (imbalance > nonzeroImbalance)
        test4 = true;

      msg4 = "\n  nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
    }

    if (!test3 && !test4) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" << msg3 + msg4 << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

    GO                     indexBase = rowMap->getIndexBase();
    Xpetra::UnderlyingLib  lib       = rowMap->lib();
    int myRank   = comm->getRank();
    int numProcs = comm->getSize();

    RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
    TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

    // ======================================================================================================
    // Calculate number of partitions
    // ======================================================================================================
    // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
    // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
    GO numPartitions;
    if (currentLevel.IsAvailable("number of partitions")) {
      numPartitions = currentLevel.Get<GO>("number of partitions");
      GetOStream(Warnings0) << "Using user-provided \"number of partitions\"; the resulting performance is unknown" << std::endl;

    } else {
      if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) {
        // System is too small, migrate it to a single processor
        numPartitions = 1;

      } else {
        // Make sure that each processor has approximately minRowsPerProcessor rows
        numPartitions = A->getGlobalNumRows() / minRowsPerProcessor;
      }
      numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs));
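      // Illustrative arithmetic (hypothetical sizes): 1,000,000 global rows with minRowsPerProcessor = 2000
      // gives 500 partitions, which is then capped by the number of processes (e.g. min(500, 256) = 256).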

      currentLevel.Set("number of partitions", numPartitions, NoFactory::get());
    }
    GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl;

    // ======================================================================================================
    // Construct decomposition vector
    // ======================================================================================================
    RCP<GOVector> decomposition;
    if (numPartitions == 1) {
      // Trivial case: the decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
      // (this is mostly done to avoid extra output messages; even if we did not skip it, there is a shortcut
      // in Zoltan[12]Interface).
      // TODO: We can probably skip more work in this case (like building all extra data structures)
      GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl;
      decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true);

    } else {
      decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

      if (decomposition.is_null()) {
        GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    // ======================================================================================================
    // Remap if necessary
    // ======================================================================================================
    // From a user perspective, we want the user not to care about remapping, thinking of it as a performance feature only.
    // There are two problems, however.
    // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
    //     This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
    //     each partition is the same.
    // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
    //     Let us consider the following example. Assume that without remapping processor 0 would own
    //     GIDs {0,1,2} and processor 1 GIDs {3,4,5}, while with remapping processor 0 would own {3,4,5} and
    //     processor 1 {0,1,2}. Now, when we run the repartitioning algorithm on the next level (say Zoltan1 RCB), it may
    //     depend on whether the input is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
    //     algorithm can resolve these differently. For instance, running
    //         mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
    //     with
    //         <ParameterList name="MueLu">
    //           <Parameter name="coarse: max size"                type="int"      value="1"/>
    //           <Parameter name="repartition: enable"             type="bool"     value="true"/>
    //           <Parameter name="repartition: min rows per proc"  type="int"      value="2"/>
    //           <ParameterList name="level 1">
    //             <Parameter name="repartition: remap parts"      type="bool"     value="false/true"/>
    //           </ParameterList>
    //         </ParameterList>
    //     produces different repartitioning for level 2.
    //     This different repartitioning may then escalate into different aggregation for the next level.
    //
    // We fix (1) by sorting the resulting vector, which makes the order of GIDs deterministic.
    // Fixing (2) is more complicated.
    // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
    // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
    // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
    // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
    // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
    // This rank permutation requires help from Epetra/Tpetra, neither of which has such an API in place.
    // One should also be concerned that if we had such an API in place, rank 0 in the subcommunicator may no longer be rank 0 in
    // MPI_COMM_WORLD, which may lead to issues for logging.
    if (remapPartitions) {
      SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);

      DeterminePartitionPlacement(*A, *decomposition, numPartitions);
    }

    // ======================================================================================================
    // Construct importer
    // ======================================================================================================
    // At this point, the following is true:
    //  * Each processor owns 0 or 1 partitions
    //  * If a processor owns a partition, that partition number is equal to the processor rank
    //  * The decomposition vector contains the partition id that each corresponding GID belongs to

    ArrayRCP<const GO> decompEntries;
    if (decomposition->getLocalLength() > 0)
      decompEntries = decomposition->getData(0);

#ifdef HAVE_MUELU_DEBUG
    // Test range of partition ids
    int incorrectRank = -1;
    for (int i = 0; i < decompEntries.size(); i++)
      if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
        incorrectRank = myRank;
        break;
      }

    int incorrectGlobalRank = -1;
    MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
    TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number that is out of range");
#endif

    Array<GO> myGIDs;
    myGIDs.reserve(decomposition->getLocalLength());

    // Step 0: Construct mapping
    //    part number -> GIDs I own which belong to this part
    // NOTE: my own part GIDs are not part of the map
    typedef std::map<GO, Array<GO> > map_type;
    map_type sendMap;
    for (LO i = 0; i < decompEntries.size(); i++) {
      GO id  = decompEntries[i];
      GO GID = rowMap->getGlobalElement(i);

      if (id == myRank)
        myGIDs     .push_back(GID);
      else
        sendMap[id].push_back(GID);
    }
    decompEntries = Teuchos::null;

    if (IsPrint(Statistics2)) {
      GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
      MueLu_sumAll(comm,numLocalKept, numGlobalKept);
      GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
    }

    int numSend = sendMap.size(), numRecv;

    // Arrayify map keys
    Array<GO> myParts(numSend), myPart(1);
    int cnt = 0;
    myPart[0] = myRank;
    for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
      myParts[cnt++] = it->first;

    // Step 1: Find out how many processors send me data
    // partsIndexBase starts from zero, as the processor ids start from zero
    GO partsIndexBase = 0;
    RCP<Map>    partsIHave  = MapFactory   ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
    RCP<Map>    partsIOwn   = MapFactory   ::Build(lib,                                                 numProcs,  myPart(), partsIndexBase, comm);
    RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);
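    // The counting works as follows: every process puts a 1 into partsISend for each part it will send GIDs to;
    // exporting into partsIOwn with ADD sums those ones on the rank owning that part. Since each rank owns exactly
    // the part equal to its own rank (myPart[0] = myRank), the single local entry of numPartsIRecv ends up being
    // the number of processes that will send data to this rank.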

    RCP<GOVector> partsISend    = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
    RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
    if (numSend) {
      ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
      for (int i = 0; i < numSend; i++)
        partsISendData[i] = 1;
    }
    (numPartsIRecv->getDataNonConst(0))[0] = 0;

    numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
    numRecv = (numPartsIRecv->getData(0))[0];

    // Step 2: Get my GIDs from everybody else
    MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
    int msgTag = 12345;  // TODO: use Comm::dup for all internal messaging
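    // Communication pattern: post a nonblocking send of the GID list for every destination part, then, numRecv times,
    // probe for an incoming message to learn its source and length (MPI_Probe + MPI_Get_count) before posting the
    // matching receive.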

    // Post sends
    Array<MPI_Request> sendReqs(numSend);
    cnt = 0;
    for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
      MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<int>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);

    map_type recvMap;
    size_t totalGIDs = myGIDs.size();
    for (int i = 0; i < numRecv; i++) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);

      // Get rank and number of elements from status
      int fromRank = status.MPI_SOURCE, count;
      MPI_Get_count(&status, MpiType, &count);

      recvMap[fromRank].resize(count);
      MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);

      totalGIDs += count;
    }

    // Do waits on send requests
    if (numSend) {
      Array<MPI_Status> sendStatuses(numSend);
      MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
    }

    // Merge GIDs
    myGIDs.reserve(totalGIDs);
    for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
      int offset = myGIDs.size(), len = it->second.size();
      if (len) {
        myGIDs.resize(offset + len);
        memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
      }
    }
    // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks
    // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
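    // A possible sketch of that speedup (untested, relies on the sortedness noted above): append each received chunk
    // and merge it in place instead of doing one final full sort, e.g.
    //   size_t mid = myGIDs.size();
    //   /* append it->second to myGIDs */
    //   std::inplace_merge(myGIDs.begin(), myGIDs.begin() + mid, myGIDs.end());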
    std::sort(myGIDs.begin(), myGIDs.end());

    // Step 3: Construct importer
    RCP<Map>          newRowMap      = MapFactory   ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
    RCP<const Import> rowMapImporter;
    {
      SubFactoryMonitor m1(*this, "Import construction", currentLevel);
      rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
    }

    Set(currentLevel, "Importer", rowMapImporter);

    // ======================================================================================================
    // Print some data
    // ======================================================================================================
    if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
      // Print the grid of processors
      GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;
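      // Illustrative output for a hypothetical 6-process run with rowWidth = 3, where ranks 1 and 3 own no rows:
      //   +.+      0:2
      //   .++      3:5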

      char amActive = (myGIDs.size() ? 1 : 0);
      std::vector<char> areActive(numProcs, 0);
      MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);

      int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
      for (int proc = 0; proc < numProcs; proc += rowWidth) {
        for (int j = 0; j < rowWidth; j++)
          if (proc + j < numProcs)
            GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
          else
            GetOStream(Statistics2) << " ";

        GetOStream(Statistics2) << "      " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
      }
    }

  } // Build
  void BrickAggregationFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    typedef Xpetra::MultiVector<double,LO,GO,NO> MultiVector_d;

    const ParameterList& pL = GetParameterList();

    RCP<MultiVector_d> coords = Get<RCP<MultiVector_d> >(currentLevel, "Coordinates");
    RCP<Matrix>        A      = Get< RCP<Matrix> >      (currentLevel, "A");
    RCP<const Map>     rowMap = A->getRowMap();
    RCP<const Map>     colMap = A->getColMap();

    RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
    int numProcs = comm->getSize();
    int myRank   = comm->getRank();

    int numPoints = colMap->getNodeNumElements();

    bx_ = pL.get<int>("aggregation: brick x size");
    by_ = pL.get<int>("aggregation: brick y size");
    bz_ = pL.get<int>("aggregation: brick z size");

    if (numProcs > 1) {
      // TODO: deal with block size > 1  (see comments above)
      TEUCHOS_TEST_FOR_EXCEPTION(bx_ > 3 || by_ > 3 || bz_ > 3, Exceptions::RuntimeError, "Currently cannot deal with brick size > 3");
    }

    RCP<MultiVector_d> overlappedCoords = coords;
    RCP<const Import> importer = ImportFactory::Build(coords->getMap(), colMap);
    if (!importer.is_null()) {
      overlappedCoords = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(colMap, coords->getNumVectors());
      overlappedCoords->doImport(*coords, *importer, Xpetra::INSERT);
    }

    // Setup misc structures
    // Logically, we construct enough data to query topological information of a rectangular grid
    Setup(comm, overlappedCoords, colMap);

    GetOStream(Runtime0) << "Using brick size: " << bx_
                                                 << (nDim_ > 1 ? "x " + toString(by_) : "")
                                                 << (nDim_ > 2 ? "x " + toString(bz_) : "") << std::endl;

    // Construct aggregates
    RCP<Aggregates> aggregates = rcp(new Aggregates(colMap));
    aggregates->setObjectLabel("Brick");

    ArrayRCP<LO> vertex2AggId = aggregates->GetVertex2AggId()->getDataNonConst(0);
    ArrayRCP<LO> procWinner   = aggregates->GetProcWinner()  ->getDataNonConst(0);

    // In the first pass, we set a mapping from a vertex to aggregate global id. We deal with a structured
    // rectangular mesh, therefore we know the structure of aggregates. For each vertex we can tell exactly
    // which aggregate it belongs to.
    // If we determine that the aggregate does not belong to us (i.e. the root vertex does not belong to this
    // processor, or lies outside and we lost the ownership arbitration), we record the global aggregate id so that
    // we can fetch the local info from the processor owning the aggregate. This is required because Aggregates
    // stores the local aggregate ids of the owning processor.
    std::set<GO> myAggGIDs, remoteAggGIDs;
    for (LO LID = 0; LID < numPoints; LID++) {
      GO aggGID = getAggGID(LID);

      if ((revMap_.find(getRoot(LID)) != revMap_.end()) && rowMap->isNodeGlobalElement(colMap->getGlobalElement(revMap_[getRoot(LID)]))) {
        // Root of the brick aggregate containing GID (<- LID) belongs to us
        vertex2AggId[LID] = aggGID;
        myAggGIDs.insert(aggGID);

        if (isRoot(LID))
          aggregates->SetIsRoot(LID);

      } else {
        remoteAggGIDs.insert(aggGID);
      }
    }
    size_t numAggregates = myAggGIDs    .size();
    size_t numRemote     = remoteAggGIDs.size();
    aggregates->SetNumAggregates(numAggregates);

    std::map<GO,LO>  AggG2L;  // Map: Agg GID -> Agg LID (possibly on a different processor)
    std::map<GO,int> AggG2R;  // Map: Agg GID -> processor rank owning aggregate

    Array<GO> myAggGIDsArray(numAggregates), remoteAggGIDsArray(numRemote);

    // Fill in the maps for aggregates that we own
    size_t ind = 0;
    for (typename std::set<GO>::const_iterator it = myAggGIDs.begin(); it != myAggGIDs.end(); it++) {
      AggG2L[*it] = ind;
      AggG2R[*it] = myRank;

      myAggGIDsArray[ind++] = *it;
    }

    // The map is a convenient way to fetch remote local indices from global indices.
    RCP<Map> aggMap = MapFactory::Build(rowMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
                                        myAggGIDsArray, 0, comm);

    ind = 0;
    for (typename std::set<GO>::const_iterator it = remoteAggGIDs.begin(); it != remoteAggGIDs.end(); it++)
      remoteAggGIDsArray[ind++] = *it;

    // Fetch the required aggregate local ids and ranks
    Array<int> remoteProcIDs(numRemote);
    Array<LO>  remoteLIDs   (numRemote);
    aggMap->getRemoteIndexList(remoteAggGIDsArray, remoteProcIDs, remoteLIDs);
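    // After this call each remote aggregate GID is resolved to the rank that owns it in aggMap and to its local
    // index on that rank, which is exactly what procWinner and vertex2AggId need below.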

    // Fill in the maps for aggregates that we don't own but which have some of our vertices
    for (size_t i = 0; i < numRemote; i++) {
      AggG2L[remoteAggGIDsArray[i]] = remoteLIDs   [i];
      AggG2R[remoteAggGIDsArray[i]] = remoteProcIDs[i];
    }

    // Remap aggregate GIDs to LIDs and set up owning processors
    for (LO LID = 0; LID < numPoints; LID++) {
      if (revMap_.find(getRoot(LID)) != revMap_.end() && rowMap->isNodeGlobalElement(colMap->getGlobalElement(revMap_[getRoot(LID)]))) {
        GO aggGID = vertex2AggId[LID];

        vertex2AggId[LID] = AggG2L[aggGID];
        procWinner  [LID] = AggG2R[aggGID];
      }
    }

    GO numGlobalRemote;
    MueLu_sumAll(comm, as<GO>(numRemote), numGlobalRemote);
    aggregates->AggregatesCrossProcessors(numGlobalRemote);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
  void UncoupledAggregationFactory_kokkos<LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    ParameterList pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    if (pL.get<int>("aggregation: max agg size") == -1)
      pL.set("aggregation: max agg size", INT_MAX);

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm_kokkos(graphFact)));
    if (pL.get<bool>("aggregation: allow user-specified singletons") == true)   algos_.push_back(rcp(new OnePtAggregationAlgorithm_kokkos             (graphFact)));
    if (pL.get<bool>("aggregation: enable phase 1" )                 == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm_kokkos            (graphFact)));
    if (pL.get<bool>("aggregation: enable phase 2a")                 == true)   algos_.push_back(rcp(new AggregationPhase2aAlgorithm_kokkos           (graphFact)));
    if (pL.get<bool>("aggregation: enable phase 2b")                 == true)   algos_.push_back(rcp(new AggregationPhase2bAlgorithm_kokkos           (graphFact)));
    if (pL.get<bool>("aggregation: enable phase 3" )                 == true)   algos_.push_back(rcp(new AggregationPhase3Algorithm_kokkos            (graphFact)));

    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name");
    RCP<Map> OnePtMap = Teuchos::null;
    if (mapOnePtName.length()) {
      std::string mapOnePtFactName = pL.get<std::string>("OnePt aggregate map factory");
      if (mapOnePtFactName == "" || mapOnePtFactName == "NoFactory") {
        OnePtMap = currentLevel.Get<RCP<Map> >(mapOnePtName, NoFactory::get());
      } else {
        RCP<const FactoryBase> mapOnePtFact = GetFactory(mapOnePtFactName);
        OnePtMap = currentLevel.Get<RCP<Map> >(mapOnePtName, mapOnePtFact.get());
      }
    }

    RCP<const LWGraph_kokkos> graph = Get< RCP<LWGraph_kokkos> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates_kokkos> aggregates = rcp(new Aggregates_kokkos(*graph));
    aggregates->setObjectLabel("UC");

    const LO numRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(numRows, READY);

    // TODO
    //ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    ArrayRCP<const bool> dirichletBoundaryMap;

    if (dirichletBoundaryMap != Teuchos::null)
      for (LO i = 0; i < numRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = BOUNDARY;

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (OnePtMap != Teuchos::null) {
      for (LO i = 0; i < numRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;
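        // Worked example (hypothetical, contiguous map): with indexBase = 0, nDofsPerNode = 3 and global node id 4,
        // grid = 12, so DOF GIDs 12, 13 and 14 are checked against OnePtMap below.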

        for (LO kr = 0; kr < nDofsPerNode; kr++)
          if (OnePtMap->isNodeGlobalElement(grid + kr))
            aggStat[i] = ONEPT;
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      MueLu_sumAll(comm, as<GO>(numRows), numGlobalRows);

    LO numNonAggregatedNodes = numRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      int oldRank = algos_[a]->SetProcRankVerbose(this->GetProcRankVerbose());
      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);
      algos_[a]->SetProcRankVerbose(oldRank);

      if (IsPrint(Statistics1)) {
        GO numLocalAggregated = numRows - numNonAggregatedNodes, numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(),  numGlobalAggs = 0;
        MueLu_sumAll(comm, numLocalAggregated, numGlobalAggregated);
        MueLu_sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        if (aggPercent > 99.99 && aggPercent < 100.00) {
          // Due to round off (for instance, for 140465733/140466897), we could
          // get 100.00% display even if there are some remaining nodes. This
          // is bad from the users point of view. It is much better to change
          // it to display 99.99%.
          aggPercent = 99.99;
        }
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics1) << aggregates->description() << std::endl;
  }