Beispiel #1
  // Eliminates aggregates that are smaller than min_size and renumbers the
  // remaining ones.
  //
  // Visible steps:
  //   1. AggInfo starts as the result of aggregates.ComputeAggregateSizes()
  //      and is overwritten in place: MUELU_UNAGGREGATED for entries below
  //      min_size, otherwise the next consecutive new aggregate id.
  //   2. Aggregated vertices whose procWinner is this rank get their id
  //      remapped through AggInfo and their weight set to 1.
  //   3. myWidget.ArbitrateAndCommunicate() is called on the weights.
  //   4. procWinner is set to MUELU_UNASSIGNED for unaggregated vertices.
  //
  // Always returns 0 (see the TODO on the return statement).
  //
  // NOTE(review): this excerpt appears to be missing lines -- several opening
  // braces have no visible closing brace, and the `if` directly before
  // `nAggregates = NewNAggs;` has no visible body of its own. Confirm against
  // the upstream file before relying on the control flow shown here.
  int LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::RemoveSmallAggs(Aggregates& aggregates, int min_size,
                      RCP<Xpetra::Vector<double,LO,GO,NO> > & distWeights, const MueLu::CoupledAggregationCommHelper<LO,GO,NO,LMO> & myWidget) const {
    int myPid = aggregates.GetMap()->getComm()->getRank();

    LO nAggregates = aggregates.GetNumAggregates();

    // Writable views of the first column of the two aggregate vectors
    ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()->getDataNonConst(0);
    ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    LO size = procWinner.size();

    //ArrayRCP<int> AggInfo = Teuchos::arcp<int>(nAggregates+1);
    ArrayRCP<LO> AggInfo = aggregates.ComputeAggregateSizes();

    ArrayRCP<double> weights = distWeights->getDataNonConst(0);

    // Make a list of all aggregates indicating New AggId
    // Use AggInfo array for this.

    LO NewNAggs = 0;
    for (LO i = 0; i < nAggregates; i++) {
      if ( AggInfo[i] < min_size) {
        // Too small: mark the aggregate as removed
        AggInfo[i] =  MUELU_UNAGGREGATED;
      // Large enough: assign the next consecutive id
      else AggInfo[i] = NewNAggs++;

    // Apply the old-id -> new-id map to the vertices won by this rank
    for (LO k = 0; k < size; k++ ) {
      if (procWinner[k] == myPid) {
        if (vertex2AggId[k] !=  MUELU_UNAGGREGATED) {
          vertex2AggId[k] = AggInfo[vertex2AggId[k]];
          weights[k] = 1.;
        if (vertex2AggId[k] ==  MUELU_UNAGGREGATED)
    nAggregates = NewNAggs;

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // procWinner is not set correctly for aggregates which have
    // been eliminated
    for (LO i = 0; i < size; i++) {
      if (vertex2AggId[i] == MUELU_UNAGGREGATED)
        procWinner[i] = MUELU_UNASSIGNED;

    return 0; //TODO
  } //RemoveSmallAggs
// Marks boundary nodes and isolated nodes as AGGREGATED.
//
// A node is handled here when its status is BOUNDARY, or when it is not yet
// AGGREGATED and graph.getNeighborVertices() returns exactly one entry for it.
// NOTE(review): presumably that single entry is the node itself -- confirm.
//
// NOTE(review): the excerpt ends inside the loop body; the remainder of the
// function (including any use of vertex2AggId, procWinner and
// numNonAggregatedNodes) is not visible here.
void IsolatedNodeAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const {
  Monitor m(*this, "BuildAggregates");

  // Writable views of the first column of the two aggregate vectors
  Teuchos::ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
  Teuchos::ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);

  const LO nRows = graph.GetNodeNumVertices();

  for (LO iNode=0; iNode<nRows; iNode++) {
    if (aggStat[iNode] == NodeStats::BOUNDARY ||
        (aggStat[iNode] != NodeStats::AGGREGATED && graph.getNeighborVertices(iNode).size() == 1)) {
      // This is a boundary or an isolated node
      aggStat[iNode] = NodeStats::AGGREGATED;
  // Puts every node that is not yet AGGREGATED into a new aggregate.
  //
  // For each such node a new aggregate id is taken from nLocalAggregates, and
  // the node plus its local, not-yet-AGGREGATED neighbours are collected in
  // aggList. Every collected node is then marked AGGREGATED, assigned the new
  // id in vertex2AggId and assigned to this rank in procWinner.
  // numNonAggregatedNodes is reduced by the size of each aggregate formed.
  //
  // NOTE(review): this excerpt appears to be missing lines (no closing braces
  // are visible, and nLocalAggregates is never written back to `aggregates`
  // in the visible code). Confirm against the upstream file.
  void EmergencyAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const {
    Monitor m(*this, "BuildAggregates");

    // vertex ids for output
    ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);

    const LO  nRows  = graph.GetNodeNumVertices();
    const int myRank = graph.GetComm()->getRank();

    int              aggIndex = -1;
    size_t           aggSize  =  0;
    // Scratch list of the nodes in the aggregate being built
    std::vector<int> aggList(graph.getNodeMaxNumRowEntries());

    LO nLocalAggregates = aggregates.GetNumAggregates();
    for (LO iNode = 0; iNode < nRows; iNode++) {
      if (aggStat[iNode] != NodeStats::AGGREGATED) {
        aggSize = 0;

        // The node itself is the first member of the new aggregate
        aggList[aggSize++] = iNode;
        aggIndex = nLocalAggregates++;

        ArrayView<const LO> neighOfINode = graph.getNeighborVertices(iNode);

        for (LO j = 0; j < neighOfINode.size(); j++) {
          LO neigh = neighOfINode[j];

          // Only local neighbours that are not aggregated yet are absorbed
          if (neigh != iNode && graph.isLocalNeighborVertex(neigh) && aggStat[neigh] != NodeStats::AGGREGATED)
            aggList[aggSize++] = neigh;

        // finalize aggregate
        for (size_t k = 0; k < aggSize; k++) {
          aggStat     [aggList[k]] = NodeStats::AGGREGATED;
          vertex2AggId[aggList[k]] = aggIndex;
          procWinner  [aggList[k]] = myRank;

        numNonAggregatedNodes -= aggSize;

// Creates a new aggregate for every local node whose status is ONEPT.
//
// For each such node the node is flagged as a root via aggregates.SetIsRoot(),
// a fresh aggregate index is taken from nLocalAggregates, and every entry of
// ag.list is marked IGNORED and assigned the new index and this rank.
// numNonAggregatedNodes is reduced by ag.list.size().
//
// NOTE(review): this excerpt appears to be missing lines -- nothing visible
// adds entries to ag.list or advances iNode1, the closing braces are absent,
// and the statement following the final "update aggregate object" comment is
// not shown. Confirm against the upstream file.
void OnePtAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::BuildAggregates(Teuchos::ParameterList const & params, GraphBase const & graph, Aggregates & aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const {
  Monitor m(*this, "BuildAggregates");

  const LocalOrdinal nRows = graph.GetNodeNumVertices();
  const int myRank = graph.GetComm()->getRank();

  // vertex ids for output
  Teuchos::ArrayRCP<LocalOrdinal> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
  Teuchos::ArrayRCP<LocalOrdinal> procWinner   = aggregates.GetProcWinner()->getDataNonConst(0);

  // some internal variables
  LocalOrdinal nLocalAggregates = aggregates.GetNumAggregates();    // number of local aggregates on current proc
  LocalOrdinal iNode1  = 0;        // current node

  // main loop over all local rows of graph(A)
  while (iNode1 < nRows) {

    if (aggStat[iNode1] == ONEPT) {

      aggregates.SetIsRoot(iNode1);    // mark iNode1 as root node for new aggregate 'ag'
      Aggregate ag;
      ag.index = nLocalAggregates++;

      // finalize aggregate
      for(size_t k=0; k<ag.list.size(); k++) {
        aggStat[ag.list[k]] = IGNORED;
        vertex2AggId[ag.list[k]] = ag.index;
        procWinner[ag.list[k]] = myRank;
      numNonAggregatedNodes -= ag.list.size();

  } // end while

  // update aggregate object
  // Builds aggregates from READY nodes, visiting the local nodes in the order
  // selected by the "Ordering" parameter (NATURAL, RANDOM or GRAPH).
  //
  // Parameters read from `params`: "Ordering", "MaxNeighAlreadySelected",
  // "MinNodesPerAggregate" and "MaxNodesPerAggregate".
  //
  // For each READY node a tentative aggregate is collected in aggList: the
  // node itself plus its local READY/NOTSEL neighbours, capped at
  // MaxNodesPerAggregate. The aggregate is accepted when
  // numAggregatedNeighbours <= MaxNeighAlreadySelected and its size is at
  // least MinNodesPerAggregate; its members are then marked AGGREGATED and
  // assigned the new aggregate id and this rank. Otherwise the node is marked
  // NOTSEL.
  //
  // Throws Exceptions::RuntimeError when MaxNodesPerAggregate is smaller than
  // MinNodesPerAggregate or when the ordering is not one of the three
  // supported values.
  //
  // NOTE(review): this excerpt appears to be missing lines -- closing braces
  // are absent and several `if`/`else` bodies are not shown (e.g. the
  // shuffling for RANDOM, the queue pushes for GRAPH, and the update of
  // numAggregatedNeighbours). Confirm against the upstream file.
  void UncoupledAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::
  BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat,
                  LO& numNonAggregatedNodes) const {
    Monitor m(*this, "BuildAggregates");

    AggOptions::Ordering ordering    = params.get<AggOptions::Ordering>("Ordering");
    LO MaxNeighAlreadySelected       = params.get<LO>                  ("MaxNeighAlreadySelected");
    LO MinNodesPerAggregate          = params.get<LO>                  ("MinNodesPerAggregate");
    LO MaxNodesPerAggregate          = params.get<LO>                  ("MaxNodesPerAggregate");

    TEUCHOS_TEST_FOR_EXCEPTION(MaxNodesPerAggregate < MinNodesPerAggregate, Exceptions::RuntimeError, "MueLu::UncoupledAggregationAlgorithm::BuildAggregates: MinNodesPerAggregate must be smaller or equal to MaxNodePerAggregate!");

    if (ordering != NATURAL && ordering != RANDOM && ordering != GRAPH)
      throw Exceptions::RuntimeError("UncoupledAggregation::BuildAggregates : bad aggregation ordering option");

    const LO  nRows  = graph.GetNodeNumVertices();
    const int myRank = graph.GetComm()->getRank();

    // vertex ids for output
    Teuchos::ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    Teuchos::ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);

    // some internal variables
    LO nLocalAggregates = aggregates.GetNumAggregates();    // number of local aggregates on current proc
    std::queue<LO> graph_ordering_inodes; // inodes for graph ordering

    // For RANDOM ordering: starts out as the identity permutation 0..nRows-1
    ArrayRCP<LO> randomVector;
    if (ordering == RANDOM) {
      randomVector = arcp<LO>(nRows);
      for (LO i = 0; i < nRows; i++)
        randomVector[i] = i;

    int              aggIndex = -1;
    size_t           aggSize  =  0;
    // Scratch list of the nodes in the tentative aggregate
    std::vector<int> aggList(graph.getNodeMaxNumRowEntries());

    // Main loop over all local rows of graph(A)
    for (LO iNode2 = 0; iNode2 < nRows; iNode2++) {
      // Step 1: pick the next node to aggregate
      LO iNode1 = 0;
      if      (ordering == NATURAL) iNode1 = iNode2;
      else if (ordering == RANDOM)  iNode1 = randomVector[iNode2];
      else if (ordering == GRAPH) {

        if (graph_ordering_inodes.size() == 0) {
          // There are no nodes for graph ordering scheme,
          // add exactly one ready node for graph ordering aggregates
          for (LO jnode = 0; jnode < nRows; jnode++)
            if (aggStat[jnode] == NodeStats::READY) {
        if (graph_ordering_inodes.size() == 0) {
          // There are no more ready nodes, end the phase
        iNode1 = graph_ordering_inodes.front();   // take next node from graph ordering queue
        graph_ordering_inodes.pop();              // delete this node in list

      if (aggStat[iNode1] == NodeStats::READY) {
        // Step 2: build tentative aggregate
        aggSize = 0;
        aggList[aggSize++] = iNode1;

        ArrayView<const LO> neighOfINode = graph.getNeighborVertices(iNode1);

        LO numAggregatedNeighbours = 0;

        // NOTE: if neighOfINode.size() < MinNodesPerAggregate, we could skip this loop,
        // but only for NATURAL and RANDOM (for GRAPH we still need the list of local neighbors)
        for (LO j = 0; j < neighOfINode.size(); j++) {
          LO neigh = neighOfINode[j];

          if (neigh != iNode1 && graph.isLocalNeighborVertex(neigh)) {

            if (aggStat[neigh] == NodeStats::READY || aggStat[neigh] == NodeStats::NOTSEL) {
              // Add neighbor node to tentative aggregate
              // but only if aggregate size is not exceeding maximum size
              // NOTE: We do not exit the loop over all neighbours since we have still
              //       to count all aggregated neighbour nodes for the aggregation criteria
              // NOTE: We check here for the maximum aggregation size. If we would do it below
              //       with all the other check too big aggregates would not be accepted at all.
              if (aggSize < as<size_t>(MaxNodesPerAggregate))
                aggList[aggSize++] = neigh;

            } else {

        // Step 3: check if tentative aggregate is acceptable
        if ((numAggregatedNeighbours <= MaxNeighAlreadySelected) &&   // too many connections to other aggregates
            (as<LO>(aggSize)         >= MinNodesPerAggregate)) {      // too few nodes in the tentative aggregate
          // Accept new aggregate
          // iNode1 becomes the root of the newly formed aggregate
          aggIndex = nLocalAggregates++;

          for (size_t k = 0; k < aggSize; k++) {
            aggStat     [aggList[k]] = NodeStats::AGGREGATED;
            vertex2AggId[aggList[k]] = aggIndex;
            procWinner  [aggList[k]] = myRank;

            if (ordering == GRAPH) {
              // Examine the local READY neighbours of each member of the new aggregate
              Teuchos::ArrayView<const LO> neighOfJNode = graph.getNeighborVertices(aggList[k]);
              for (int j = 0; j < neighOfJNode.size(); j++) {
                LO neigh = neighOfJNode[j];

                if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == NodeStats::READY)

          numNonAggregatedNodes -= aggSize;

        } else {
          // Aggregate is not accepted
          aggStat[iNode1] = NodeStats::NOTSEL;

          if (ordering == GRAPH) {
            // Even though the aggregate around iNode1 is not perfect, we want to try
            // the neighbor nodes of iNode1
            for (int j = 0; j < neighOfINode.size(); j++) {
              LO neigh = neighOfINode[j];

              if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == NodeStats::READY)

    // update aggregate object
  // Expands node-based aggregates into per-aggregate lists of global DOF ids.
  //
  // Outputs:
  //   aggStart    - numAggregates+1 offsets; aggStart[i+1] - aggStart[i] is
  //                 the number of entries counted for aggregate i.
  //   aggToRowMap - aggStart[numAggregates] entries holding global DOF ids
  //                 obtained from ComputeGlobalDOF().
  //
  // Only nodes whose procWinner equals this rank are considered. When
  // stridedblocksize_ is 1 each node contributes one entry; otherwise each of
  // its stridedblocksize_ DOFs contributes if columnMap_ reports it via
  // isNodeGlobalElement().
  //
  // NOTE(review): this excerpt appears to be missing lines -- closing braces
  // are absent and no visible statement increments numDofs[myAgg] after an
  // entry is written. Confirm against the upstream file.
  void AmalgamationInfo<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::UnamalgamateAggregates(const Aggregates& aggregates,
        Teuchos::ArrayRCP<LocalOrdinal>& aggStart, Teuchos::ArrayRCP<GlobalOrdinal>& aggToRowMap) const {
    int myPid = aggregates.GetMap()->getComm()->getRank();
    Teuchos::ArrayView<const GO> nodeGlobalElts = aggregates.GetMap()->getNodeElementList();
    Teuchos::ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()->getDataNonConst(0);
    Teuchos::ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    LO size = procWinner.size();
    GO numAggregates = aggregates.GetNumAggregates();

    // First pass: count the entries each aggregate will receive
    std::vector<LO> sizes(numAggregates);
    if (stridedblocksize_ == 1) {
      for (LO lnode = 0; lnode < size; ++lnode) {
        LO myAgg = vertex2AggId[lnode];
        if (procWinner[lnode] == myPid)
          sizes[myAgg] += 1;
    } else {
      for (LO lnode = 0; lnode < size; ++lnode) {
        LO myAgg = vertex2AggId[lnode];
        if (procWinner[lnode] == myPid) {
          GO gnodeid = nodeGlobalElts[lnode];
          for (LocalOrdinal k = 0; k < stridedblocksize_; k++) {
            GlobalOrdinal gDofIndex = ComputeGlobalDOF(gnodeid,k);
            if (columnMap_->isNodeGlobalElement(gDofIndex))
              sizes[myAgg] += 1;
    // Running sum of the counts gives the start offset of each aggregate
    aggStart = ArrayRCP<LO>(numAggregates+1,0);
    for (GO i=0; i<numAggregates; ++i) {
      aggStart[i+1] = aggStart[i] + sizes[i];
    aggToRowMap = ArrayRCP<GO>(aggStart[numAggregates],0);

    // count, how many dofs have been recorded for each aggregate so far
    Array<LO> numDofs(numAggregates, 0); // empty array with number of Dofs for each aggregate

    // Second pass: write the global DOF ids into each aggregate's slot range
    if (stridedblocksize_ == 1) {
      for (LO lnode = 0; lnode < size; ++lnode) {
        LO myAgg = vertex2AggId[lnode];
        if (procWinner[lnode] == myPid) {
          aggToRowMap[ aggStart[myAgg] + numDofs[myAgg] ] = ComputeGlobalDOF(nodeGlobalElts[lnode]);
    } else {
      for (LO lnode = 0; lnode < size; ++lnode) {
        LO myAgg = vertex2AggId[lnode];

        if (procWinner[lnode] == myPid) {
          GO gnodeid = nodeGlobalElts[lnode];
          for (LocalOrdinal k = 0; k < stridedblocksize_; k++) {
            GlobalOrdinal gDofIndex = ComputeGlobalDOF(gnodeid,k);
            if (columnMap_->isNodeGlobalElement(gDofIndex)) {
              aggToRowMap[ aggStart[myAgg] + numDofs[myAgg] ] = gDofIndex;
    // todo plausibility check: entry numDofs[k] == aggToRowMap[k].size()

  } //UnamalgamateAggregates
  // Expands node-based aggregates into per-aggregate lists of local DOF ids.
  //
  // Outputs:
  //   aggStart    - numAggregates+1 offsets built as a running sum of `sizes`.
  //   aggToRowMap - aggStart[numAggregates] entries; for block size 1 the
  //                 local node id, otherwise lnode*stridedblocksize_ + k.
  //
  // Only nodes whose procWinner equals this rank are considered. For block
  // sizes other than 1 a DOF is recorded only if columnMap_ reports its
  // global id via isNodeGlobalElement().
  //
  // NOTE(review): this excerpt appears to be missing lines -- closing braces
  // are absent, the counting `if` statements have no visible bodies, and no
  // visible statement increments numDofs[myAgg]. Confirm against the upstream
  // file.
  void AmalgamationInfo<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::UnamalgamateAggregatesLO(const Aggregates& aggregates,
        Teuchos::ArrayRCP<LO>& aggStart, Teuchos::ArrayRCP<LO>& aggToRowMap) const {

    int myPid = aggregates.GetMap()->getComm()->getRank();
    Teuchos::ArrayView<const GO> nodeGlobalElts = aggregates.GetMap()->getNodeElementList();

    Teuchos::ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);
    Teuchos::ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    const GO numAggregates             = aggregates.GetNumAggregates();

    // FIXME: Do we need to compute size here? Or can we use existing?
    LO size = procWinner.size();

    // First pass: per-aggregate entry counts
    std::vector<LO> sizes(numAggregates);
    if (stridedblocksize_ == 1) {
      for (LO lnode = 0; lnode < size; lnode++)
        if (procWinner[lnode] == myPid)
    } else {
      for (LO lnode = 0; lnode < size; lnode++)
        if (procWinner[lnode] == myPid) {
          GO nodeGID = nodeGlobalElts[lnode];

          for (LO k = 0; k < stridedblocksize_; k++) {
            GO GID = ComputeGlobalDOF(nodeGID, k);
            if (columnMap_->isNodeGlobalElement(GID))

    aggStart = ArrayRCP<LO>(numAggregates+1); // FIXME: useless initialization with zeros
    aggStart[0] = 0;
    for (GO i = 0; i < numAggregates; i++)
      aggStart[i+1] = aggStart[i] + sizes[i];

    aggToRowMap = ArrayRCP<LO>(aggStart[numAggregates], 0);

    // count, how many dofs have been recorded for each aggregate so far
    Array<LO> numDofs(numAggregates, 0); // empty array with number of DOFs for each aggregate
    // Second pass: write local ids into each aggregate's slot range
    if (stridedblocksize_ == 1) {
      for (LO lnode = 0; lnode < size; ++lnode)
        if (procWinner[lnode] == myPid) {
          LO myAgg = vertex2AggId[lnode];
          aggToRowMap[aggStart[myAgg] + numDofs[myAgg]] = lnode;
    } else {
      for (LO lnode = 0; lnode < size; ++lnode)
        if (procWinner[lnode] == myPid) {
          LO myAgg = vertex2AggId[lnode];
          GO nodeGID = nodeGlobalElts[lnode];

          for (LO k = 0; k < stridedblocksize_; k++) {
            GO GID = ComputeGlobalDOF(nodeGID, k);
            if (columnMap_->isNodeGlobalElement(GID)) {
              aggToRowMap[aggStart[myAgg] + numDofs[myAgg]] = lnode*stridedblocksize_ + k;
    // todo plausibility check: entry numDofs[k] == aggToRowMap[k].size()

  } //UnamalgamateAggregatesLO
  // Attaches nodes to an existing neighbouring aggregate chosen by connection
  // count.
  //
  // For a node, the aggregate ids of its local AGGREGATED neighbours are
  // collected in aggList and sorted so equal ids are adjacent. The scan over
  // the sorted list selects an aggregate when curNumConnections exceeds the
  // best count so far and the aggregate's size (from ComputeAggregateSizes())
  // is below "MaxNodesPerAggregate". If one is selected, the node is marked
  // AGGREGATED and assigned that aggregate id and this rank.
  //
  // NOTE(review): this excerpt appears to be missing lines -- closing braces
  // are absent, the `if` on an already AGGREGATED node has no visible body,
  // and nothing visible increments curNumConnections or updates aggSizes /
  // numNonAggregatedNodes. Confirm against the upstream file.
  void MaxLinkAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::
  BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const {
    Monitor m(*this, "BuildAggregates");

    LO MaxNodesPerAggregate = params.get<LO>("MaxNodesPerAggregate");

    // vertex ids for output
    ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);
    ArrayRCP<LO> aggSizes     = aggregates.ComputeAggregateSizes(); // contains number of nodes in aggregate with given aggId

    const LO  nRows  = graph.GetNodeNumVertices();
    const int myRank = graph.GetComm()->getRank();

    size_t           aggSize = 0;
    // Scratch list; here it holds aggregate ids of neighbours, not node ids
    std::vector<int> aggList(graph.getNodeMaxNumRowEntries());

    //bool recomputeAggregateSizes=false; // variable not used TODO remove it

    for (LO iNode = 0; iNode < nRows; iNode++) {
      if (aggStat[iNode] == NodeStats::AGGREGATED)

      ArrayView<const LocalOrdinal> neighOfINode = graph.getNeighborVertices(iNode);

      aggSize = 0;
      for (LO j = 0; j < neighOfINode.size(); j++) {
        LO neigh = neighOfINode[j];

        // NOTE: we don't need the check (neigh != iNode), as we work only
        // if aggStat[neigh] == AGGREGATED, which we know is different from aggStat[iNode]
        if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == NodeStats::AGGREGATED)
          aggList[aggSize++] = vertex2AggId[neigh];

      // Ideally, we would have a _fast_ hash table here.
      // But for the absence of that, sorting works just fine.
      std::sort(aggList.begin(), aggList.begin() + aggSize);

      // terminator
      // NOTE(review): aggList has getNodeMaxNumRowEntries() entries, so this
      // write is in bounds only if fewer ids than that were collected above --
      // confirm that aggSize can never reach aggList.size().
      aggList[aggSize] = -1;

      // Find an aggregate id with most connections to
      LO maxNumConnections =  0, curNumConnections = 0;
      LO selectedAggregate = -1;
      for (size_t i = 0; i < aggSize; i++) {
        // A change of value marks the end of a run of equal aggregate ids
        if (aggList[i+1] != aggList[i]) {
          if (curNumConnections > maxNumConnections &&         // only select aggregate if it has more connections
              aggSizes[aggList[i]] < MaxNodesPerAggregate) {   // and if it is not too big (i.e. can have one more node)
            maxNumConnections = curNumConnections;
            selectedAggregate = aggList[i];
          curNumConnections = 0;

      // Add node iNode to aggregate
      if (selectedAggregate != -1) {
        aggStat[iNode]      = NodeStats::AGGREGATED;
        vertex2AggId[iNode] = selectedAggregate;
        procWinner[iNode]   = myRank;

  // Forms new aggregates around READY root candidates.
  //
  // Parameters read from `params`: "aggregation: min agg size" and
  // "aggregation: max agg size".
  //
  // For a root candidate, its local READY neighbours (excluding itself) are
  // collected in aggList, capped at maxNodesPerAggregate. The tentative
  // aggregate is accepted when aggSize > minNodesPerAggregate and
  // aggSize > factor*numNeighbors, where
  //   factor = (numLocalAggregated / (numLocalNodes + 1)) ^ aggFactor
  // with aggFactor = 0.5. Accepted members are marked AGGREGATED and assigned
  // the new aggregate id and this rank; numNonAggregatedNodes is reduced by
  // aggSize.
  //
  // NOTE(review): this excerpt appears to be missing lines -- closing braces
  // are absent, the `if` on a non-READY candidate has no visible body,
  // nothing visible increments numNeighbors or adds rootCandidate itself to
  // aggList, and the statement following the final "update aggregate object"
  // comment is not shown. Confirm against the upstream file.
  void AggregationPhase2aAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const {
    Monitor m(*this, "BuildAggregates");

    LO minNodesPerAggregate = params.get<LO>("aggregation: min agg size");
    LO maxNodesPerAggregate = params.get<LO>("aggregation: max agg size");

    const LO  numRows = graph.GetNodeNumVertices();
    const int myRank  = graph.GetComm()->getRank();

    // Writable views of the first column of the two aggregate vectors
    ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);

    LO numLocalAggregates = aggregates.GetNumAggregates();

    LO numLocalNodes      = procWinner.size();
    LO numLocalAggregated = numLocalNodes - numNonAggregatedNodes;

    // Acceptance factor: fraction of already aggregated nodes raised to aggFactor.
    // The +1 in the denominator avoids a division by zero when numLocalNodes is 0.
    const double aggFactor = 0.5;
    double       factor    = as<double>(numLocalAggregated)/(numLocalNodes+1);
    factor = pow(factor, aggFactor);

    int              aggIndex = -1;
    size_t           aggSize  =  0;
    // Scratch list of the nodes in the tentative aggregate
    std::vector<int> aggList(graph.getNodeMaxNumRowEntries());

    for (LO rootCandidate = 0; rootCandidate < numRows; rootCandidate++) {
      if (aggStat[rootCandidate] != READY)

      aggSize = 0;

      ArrayView<const LocalOrdinal> neighOfINode = graph.getNeighborVertices(rootCandidate);

      LO numNeighbors = 0;
      for (int j = 0; j < neighOfINode.size(); j++) {
        LO neigh = neighOfINode[j];

        if (neigh != rootCandidate) {
          if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == READY) {
            // If aggregate size does not exceed max size, add node to the tentative aggregate
            // NOTE: We do not exit the loop over all neighbours since we have still
            //       to count all aggregated neighbour nodes for the aggregation criteria
            // NOTE: We check here for the maximum aggregation size. If we would do it below
            //       with all the other check too big aggregates would not be accepted at all.
            if (aggSize < as<size_t>(maxNodesPerAggregate))
              aggList[aggSize++] = neigh;


      // NOTE: ML uses a hardcoded value 3 instead of MinNodesPerAggregate
      if (aggSize > as<size_t>(minNodesPerAggregate) &&
          aggSize > factor*numNeighbors) {
        // Accept new aggregate
        // rootCandidate becomes the root of the newly formed aggregate
        aggIndex = numLocalAggregates++;

        for (size_t k = 0; k < aggSize; k++) {
          aggStat     [aggList[k]] = AGGREGATED;
          vertex2AggId[aggList[k]] = aggIndex;
          procWinner  [aggList[k]] = myRank;

        numNonAggregatedNodes -= aggSize;

    // update aggregate object
Beispiel #10
  void LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::AggregateLeftovers(GraphBase const &graph, Aggregates &aggregates) const {
    Monitor m(*this, "AggregateLeftovers");

    my_size_t nVertices = graph.GetNodeNumVertices();
    int exp_nRows    = aggregates.GetMap()->getNodeNumElements(); // Tentative fix... was previously exp_nRows = nVertices + graph.GetNodeNumGhost();
    int myPid        = graph.GetComm()->getRank();
    my_size_t nAggregates  = aggregates.GetNumAggregates();

    int minNodesPerAggregate = GetMinNodesPerAggregate();

    const RCP<const Map> nonUniqueMap = aggregates.GetMap(); //column map of underlying graph
    const RCP<const Map> uniqueMap    = graph.GetDomainMap();

    MueLu::CoupledAggregationCommHelper<LO,GO,NO,LMO> myWidget(uniqueMap, nonUniqueMap);

    //TODO JJH We want to skip this call
    RCP<Xpetra::Vector<double,LO,GO,NO> > distWeights = Xpetra::VectorFactory<double,LO,GO,NO>::Build(nonUniqueMap);

    // Aggregated vertices not "definitively" assigned to processors are
    // arbitrated by ArbitrateAndCommunicate(). There is some
    // additional logic to prevent losing root nodes in arbitration.
      ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>    weights     = distWeights->getDataNonConst(0);

      for (size_t i=0;i<nonUniqueMap->getNodeNumElements();i++) {
        if (procWinner[i] == MUELU_UNASSIGNED) {
          if (vertex2AggId[i] != MUELU_UNAGGREGATED) {
            weights[i] = 1.;
            if (aggregates.IsRoot(i)) weights[i] = 2.;

      // views on distributed vectors are freed here.

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Tentatively assign any vertex (ghost or local) which neighbors a root
    // to the aggregate associated with the root.
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>   weights      = distWeights->getDataNonConst(0);

      for (my_size_t i = 0; i < nVertices; i++) {
        if ( aggregates.IsRoot(i) && (procWinner[i] == myPid) ) {

          // neighOfINode is the neighbor node list of node 'i'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int colj = *it;
            if (vertex2AggId[colj] == MUELU_UNAGGREGATED) {
              weights[colj]= 1.;
              vertex2AggId[colj] = vertex2AggId[i];

      // views on distributed vectors are freed here.

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Record the number of aggregated vertices
    GO total_phase_one_aggregated = 0;
      ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);

      GO phase_one_aggregated = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] != MUELU_UNAGGREGATED)

      sumAll(graph.GetComm(), phase_one_aggregated, total_phase_one_aggregated);

      GO local_nVertices = nVertices, total_nVertices = 0;
      sumAll(graph.GetComm(), local_nVertices, total_nVertices);

      /* Among unaggregated points, see if we can make a reasonable size    */
      /* aggregate out of it. We do this by looking at neighbors and seeing */
      /* how many are unaggregated and on my processor. Loosely,            */
      /* base the number of new aggregates created on the percentage of     */
      /* unaggregated nodes.                                                */

      ArrayRCP<double>    weights      = distWeights->getDataNonConst(0);

      double factor = 1.;
      factor = ((double) total_phase_one_aggregated)/((double)(total_nVertices + 1));
      factor = pow(factor, GetPhase3AggCreation());

      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED)

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);
            int rowi_N = neighOfINode.size();

            int nonaggd_neighbors = 0;
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int colj = *it;
              if (vertex2AggId[colj] == MUELU_UNAGGREGATED && colj < nVertices)
            if (  (nonaggd_neighbors > minNodesPerAggregate) &&
                  (((double) nonaggd_neighbors)/((double) rowi_N) > factor))
                vertex2AggId[i] = (nAggregates)++;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int colj = *it;
                  if (vertex2AggId[colj]==MUELU_UNAGGREGATED) {
                    vertex2AggId[colj] = vertex2AggId[i];
                    if (colj < nVertices) weights[colj] = 2.;
                    else                  weights[colj] = 1.;
                weights[i] = 2.;
      } // for (i = 0; i < nVertices; i++)

      // views on distributed vectors are freed here.

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    //All tentatively assigned vertices are now definitive

    if (IsPrint(Statistics1)) {
      GO Nphase1_agg = nAggregates;
      GO total_aggs;

      sumAll(graph.GetComm(), Nphase1_agg, total_aggs);

      GetOStream(Statistics1, 0) << "Phase 1 - nodes aggregated = " << total_phase_one_aggregated << std::endl;
      GetOStream(Statistics1, 0) << "Phase 1 - total aggregates = " << total_aggs << std::endl;

      GO i = nAggregates - Nphase1_agg;
      { GO ii; sumAll(graph.GetComm(),i,ii); i = ii; }
      GetOStream(Statistics1, 0) << "Phase 3 - additional aggregates = " << i << std::endl;

    // Determine vertices that are not shared by setting Temp to all ones
    // and doing NonUnique2NonUnique(..., ADD). This sums values of all
    // local copies associated with each Gid. Thus, sums > 1 are shared.

    //         std::cout << "exp_nrows=" << exp_nRows << " (nVertices= " << nVertices << ", numGhost=" << graph.GetNodeNumGhost() << ")" << std::endl;
    //         std::cout << "nonUniqueMap=" << nonUniqueMap->getNodeNumElements() << std::endl;

    RCP<Xpetra::Vector<double,LO,GO,NO> > temp_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap,false); //no need to zero out vector in ctor

    RCP<Xpetra::Vector<double,LO,GO,NO> > tempOutput_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap);

    myWidget.NonUnique2NonUnique(*temp_, *tempOutput_, Xpetra::ADD);

    std::vector<bool> gidNotShared(exp_nRows);
      ArrayRCP<const double> tempOutput = tempOutput_->getData(0);
      for (int i = 0; i < exp_nRows; i++) {
        if (tempOutput[i] > 1.)
          gidNotShared[i] = false;
          gidNotShared[i] = true;

    // Phase 4.
    double nAggregatesTarget;
    nAggregatesTarget = ((double)  uniqueMap->getGlobalNumElements())* (((double) uniqueMap->getGlobalNumElements())/ ((double) graph.GetGlobalNumEdges()));

    GO nAggregatesLocal=nAggregates, nAggregatesGlobal; sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

    LO minNAggs; minAll(graph.GetComm(), nAggregates, minNAggs);
    LO maxNAggs; maxAll(graph.GetComm(), nAggregates, maxNAggs);

    // Only do this phase if things look really bad. THIS
    if ((nAggregatesGlobal < graph.GetComm()->getSize()) &&
        (2.5*nAggregatesGlobal < nAggregatesTarget) &&
        (minNAggs ==0) && (maxNAggs <= 1)) {

      // Modify seed of the random algorithm used by temp_->randomize()
        typedef Teuchos::ScalarTraits<double> scalarTrait; // temp_ is of type double.
        scalarTrait::seedrandom(static_cast<unsigned int>(myPid*2 + (int) (11*scalarTrait::random())));
        int k = (int)ceil( (10.*myPid)/graph.GetComm()->getSize());
        for (int i = 0; i < k+7; i++) scalarTrait::random();
        temp_->setSeed(static_cast<unsigned int>(scalarTrait::random()));


      ArrayRCP<double> temp = temp_->getDataNonConst(0);

      // build a list of candidate root nodes (vertices not adjacent
      // to aggregated vertices)

      my_size_t nCandidates = 0;
      global_size_t nCandidatesGlobal;

      ArrayRCP<LO> candidates = Teuchos::arcp<LO>(nVertices+1);

      double priorThreshold = 0.;
      for (int kkk = 0; kkk < MUELU_PHASE4BUCKETS; kkk++) {

          ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
          ArrayView<const LO> vertex2AggIdView = vertex2AggId();
          RootCandidates(nVertices, vertex2AggIdView, graph, candidates, nCandidates, nCandidatesGlobal);
          // views on distributed vectors are freed here.

        double nTargetNewGuys =  nAggregatesTarget - nAggregatesGlobal;
        double threshold      =  priorThreshold + (1. - priorThreshold)*nTargetNewGuys/(nCandidatesGlobal + .001);

        threshold = (threshold*(kkk+1.))/((double) MUELU_PHASE4BUCKETS);
        priorThreshold = threshold;

          ArrayRCP<LO>     vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
          ArrayRCP<double> weights      = distWeights->getDataNonConst(0);

          for (int k = 0; k < nCandidates; k++ ) {
            int i = candidates[k];
            if ((vertex2AggId[i] == MUELU_UNAGGREGATED) && (fabs(temp[i])  < threshold)) {
              // Note: priorThreshold <= fabs(temp[i]) <= 1

              // neighOfINode is the neighbor node list of node 'iNode'.
              ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

              if (neighOfINode.size() > minNodesPerAggregate) { //TODO: check if this test is exactly was we want to do
                int count = 0;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int Adjacent    = *it;
                  // This might not be true if someone close to i
                  // is chosen as a root via fabs(temp[]) < Threshold
                  if (vertex2AggId[Adjacent] == MUELU_UNAGGREGATED){
                    vertex2AggId[Adjacent] = nAggregates;
                    weights[Adjacent] = 1.;
                if (count >= minNodesPerAggregate) {
                  vertex2AggId[i] = nAggregates++;
                  weights[i] = 2.;
                else { // undo things
                  for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                    int Adjacent    = *it;
                    if (vertex2AggId[Adjacent] == nAggregates){
                      vertex2AggId[Adjacent] = MUELU_UNAGGREGATED;
                      weights[Adjacent] = 0.;
          // views on distributed vectors are freed here.
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
        sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

        // check that there are no aggregates sizes below minNodesPerAggregate


        RemoveSmallAggs(aggregates, minNodesPerAggregate, distWeights, myWidget);

        nAggregates = aggregates.GetNumAggregates();
      }   // one possibility

    // Initialize things for Phase 5. This includes building the transpose
    // of the matrix ONLY for transposed rows that correspond to unaggregated
    // ghost vertices. Further, the transpose is only a local transpose.
    // Nonzero edges which exist on other processors are not represented.

    int observedNAgg=-1; //number of aggregates that contain vertices on this process

      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      for(LO k = 0; k < vertex2AggId.size(); ++k )

    ArrayRCP<int> Mark = Teuchos::arcp<int>(exp_nRows+1);
    ArrayRCP<int> agg_incremented = Teuchos::arcp<int>(observedNAgg);
    ArrayRCP<int> SumOfMarks = Teuchos::arcp<int>(observedNAgg);

    for (int i = 0; i < exp_nRows; i++)   Mark[i] = MUELU_DISTONE_VERTEX_WEIGHT;
    for (int i = 0; i < agg_incremented.size(); i++) agg_incremented[i] = 0;
    for (int i = 0; i < SumOfMarks.size(); i++) SumOfMarks[i] = 0;

    // Grab the transpose matrix graph for unaggregated ghost vertices.
    //     a) count the number of nonzeros per row in the transpose
    std::vector<int> RowPtr(exp_nRows+1-nVertices);
    ArrayRCP<const LO> vertex2AggIdCst = aggregates.GetVertex2AggId()->getData(0);

    for (int i = nVertices; i < exp_nRows;  i++) RowPtr[i-nVertices] = 0;
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){

    //     b) Convert RowPtr[i] to point to the first nnz spot in row i.

    int iSum = 0, iTemp;
    for (int i = nVertices; i < exp_nRows;  i++) {
      iTemp = RowPtr[i-nVertices];
      RowPtr[i-nVertices] = iSum;
      iSum += iTemp;
    RowPtr[exp_nRows-nVertices] = iSum;
    std::vector<LO> cols(iSum+1);

    //     c) Traverse matrix and insert entries in proper location.
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          cols[RowPtr[j-nVertices]++] = i;

    //     d) RowPtr[i] points to beginning of row i+1 so shift by one location.
    for (int i = exp_nRows; i > nVertices;  i--)
      RowPtr[i-nVertices] = RowPtr[i-1-nVertices];
    RowPtr[0] = 0;

    // views on distributed vectors are freed here.
    vertex2AggIdCst = Teuchos::null;

    int bestScoreCutoff;
    int thresholds[10] = {300,200,100,50,25,13,7,4,2,0};

    // Stick unaggregated vertices into existing aggregates as described above.

      int ncalls=0;

      for (int kk = 0; kk < 10; kk += 2) {
        bestScoreCutoff = thresholds[kk];

        ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
        ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);
        ArrayRCP<double> weights       = distWeights->getDataNonConst(0);

        for (int i = 0; i < exp_nRows; i++) {

          if (vertex2AggId[i] == MUELU_UNAGGREGATED) {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode;

            // Grab neighboring vertices which is either in graph for local ids
            // or sits in transposed fragment just constructed above for ghosts.
            if (i < nVertices) {
              neighOfINode = graph.getNeighborVertices(i);
            else {
              LO *rowi_col = NULL, rowi_N;
              rowi_col = &(cols[RowPtr[i-nVertices]]);
              rowi_N   = RowPtr[i+1-nVertices] - RowPtr[i-nVertices];

              neighOfINode = ArrayView<const LO>(rowi_col, rowi_N);
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];

              //Adjacent is aggregated and either I own the aggregate
              // or I could own the aggregate after arbitration.
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED))){
                SumOfMarks[AdjacentAgg] += Mark[Adjacent];
            int best_score = MUELU_NOSCORE;
            int best_agg = -1;
            int BestMark = -1;
            bool cannotLoseAllFriends=false; // Used to address possible loss of vertices in arbitration of shared nodes discussed above. (Initialized to false only to avoid a compiler warning).

            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              //Adjacent is aggregated, has some value and no
              //other processor has definitively claimed him
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  (SumOfMarks[AdjacentAgg] != 0) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED ))) {

                // first figure out the penalty associated with
                // AdjacentAgg having already been incremented
                // during this phase, then compute score.

                double penalty = (double) (INCR_SCALING*agg_incremented[AdjacentAgg]);
                if (penalty > MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]))
                  penalty = MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]);
                int score = SumOfMarks[AdjacentAgg]- ((int) floor(penalty));

                if (score > best_score) {
                  best_agg             = AdjacentAgg;
                  best_score           = score;
                  BestMark             = Mark[Adjacent];
                  cannotLoseAllFriends = false;

                  // This addresses the issue mentioned above by checking whether
                  // Adjacent could be lost in arbitration. weight==0 means that
                  // Adjacent was not set during this loop of Phase 5 (and so it
                  // has already undergone arbitration). GidNotShared == true
                  // obviously implies that Adjacent cannot be lost to arbitration
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                // Another vertex within current best aggregate found.
                // We should have (best_score == score). We need to see
                // if we can improve BestMark and cannotLoseAllFriends.
                else if (best_agg == AdjacentAgg) {
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                  if (Mark[Adjacent] > BestMark) BestMark = Mark[Adjacent];
            // Clean up
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              if (AdjacentAgg >= 0) SumOfMarks[AdjacentAgg] = 0;
            // Tentatively assign vertex to best_agg.
            if ( (best_score >= bestScoreCutoff) && (cannotLoseAllFriends)) {

              TEUCHOS_TEST_FOR_EXCEPTION(best_agg == -1 || BestMark == -1, MueLu::Exceptions::RuntimeError, "MueLu::CoupledAggregationFactory internal error"); // should never happen

              vertex2AggId[i] = best_agg;
              weights[i] = best_score;
              Mark[i] = (int) ceil(   ((double) BestMark)/2.);

          // views on distributed vectors are freed here.

        vertex2AggId = Teuchos::null;
        procWinner   = Teuchos::null;
        weights      = Teuchos::null;

        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive

      //       if (graph.GetComm()->getRank()==0)
      //         std::cout << "#calls to Arb&Comm=" << ncalls << std::endl;

    // Phase 6: Aggregate remaining unaggregated vertices and try at all costs
    //          to avoid small aggregates.
    //          One case where we can find ourselves in this situation
    //          is if all vertices vk adjacent to v have already been
    //          put in other processor's aggregates and v does not have
    //          a direct connection to a local vertex in any of these
    //          aggregates.

    int Nleftover = 0, Nsingle = 0;

      ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<double> weights       = distWeights->getDataNonConst(0);
      ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);

      int count = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED) {

          // neighOfINode is the neighbor node list of node 'iNode'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          // We don't want too small of an aggregate. So lets see if there is an
          // unaggregated neighbor that we can also put with this vertex

          vertex2AggId[i] = nAggregates;
          weights[i] = 1.;
          if (count == 0) aggregates.SetIsRoot(i);
          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int j = *it;
            if ((j != i)&&(vertex2AggId[j] == MUELU_UNAGGREGATED)&&
                (j < nVertices)) {
              vertex2AggId[j] = nAggregates;
              weights[j] = 1.;
          if ( count >= minNodesPerAggregate) {
            count = 0;

      // We have something which is under minNodesPerAggregate when count != 0.
      if (count != 0) {
#ifdef FIXME
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            if ((vertex2AggId[i] == nAggregates) && (procWinner[i] == myPid)) {
              vertex2AggId[i] = 0;
        else {
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            // TW: This is not a real fix. This may produce ugly bad aggregates!
            // I removed the procWinner[i] == myPid check. it makes no sense to me since
            // it leaves vertex2AggId[i] == nAggregates -> crash in ComputeAggregateSizes().
            // Maybe it's better to add the leftovers to the last generated agg on the current proc.
            // The best solution would be to add them to the "next"/nearest aggregate, which may be
            // on another processor.
            if (vertex2AggId[i] == nAggregates) {
              vertex2AggId[i] = nAggregates-1; //0;
        else {

      // views on distributed vectors are freed here.

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, false);

    if (IsPrint(Statistics1)) {
      GO total_Nsingle=0;   sumAll(graph.GetComm(), (GO)Nsingle,     total_Nsingle);
      GO total_Nleftover=0; sumAll(graph.GetComm(), (GO)Nleftover,   total_Nleftover);
      // GO total_aggs;        sumAll(graph.GetComm(), (GO)nAggregates, total_aggs);
      // GetOStream(Statistics1, 0) << "Phase 6 - total aggregates = " << total_aggs << std::endl;
      GetOStream(Statistics1, 0) << "Phase 6 - leftovers = " << total_Nleftover << " and singletons = " << total_Nsingle << std::endl;


  } //AggregateLeftovers