std::string MHDRAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::PrintLoadBalancingInfo(const Matrix & Ac, const std::string & msgTag) {
    std::stringstream ss(std::stringstream::out);

    // TODO: provide an option to skip this (to avoid global communication)
    // TODO: skip if nproc == 1

    //nonzero imbalance
    size_t numMyNnz  = Ac.getNodeNumEntries();
    GO maxNnz, minNnz;
    RCP<const Teuchos::Comm<int> > comm = Ac.getRowMap()->getComm();
    maxAll(comm,(GO)numMyNnz,maxNnz);
    //min nnz over all proc (disallow any processors with 0 nnz)
    minAll(comm, (GO)((numMyNnz > 0) ? numMyNnz : maxNnz), minNnz);
    double imbalance = ((double) maxNnz) / minNnz;
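    // imbalance >= 1.0; a value of 1.0 means the nonzeros are perfectly balanced
    // across the processes that own at least one nonzero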

    size_t numMyRows = Ac.getNodeNumRows();
    //Check whether Ac is spread over more than one process.
    GO numActiveProcesses=0;
    sumAll(comm, (GO)((numMyRows > 0) ? 1 : 0), numActiveProcesses);

    //min, max, and avg # rows per proc
    GO minNumRows, maxNumRows;
    double avgNumRows;
    maxAll(comm, (GO)numMyRows, maxNumRows);
    minAll(comm, (GO)((numMyRows > 0) ? numMyRows : maxNumRows), minNumRows);
    assert(numActiveProcesses > 0);
    avgNumRows = ((double) Ac.getGlobalNumRows()) / numActiveProcesses;

    ss << msgTag << " # processes with rows = " << numActiveProcesses << std::endl;
    ss << msgTag << " min # rows per proc = " << minNumRows << ", max # rows per proc = " << maxNumRows << ", avg # rows per proc = " << avgNumRows << std::endl;
    ss << msgTag << " nonzero imbalance = " << imbalance << std::endl;

    return ss.str();
  }
Example #2
Max7219::Max7219(byte dataInPin, byte loadPin, byte clockPin, byte numMax)
	: m_dataInPin(dataInPin), m_loadPin(loadPin), m_clockPin(clockPin), m_numMax(numMax)
#ifdef SUPPORT_PERCENTAGE
	, m_percentMaxValue(100)
	, m_percentLastValue(0)
#endif
#ifdef SUPPORT_SCROLLING
	, m_scrollText(0), m_scrollIndex(0), m_currScrollPixRowCol(0), m_inverseScroll(false)
#endif
{
	pinMode(m_dataInPin, OUTPUT);
	pinMode(m_clockPin, OUTPUT);
	pinMode(m_loadPin, OUTPUT);

	digitalWrite(13, HIGH);
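	// NOTE: pin 13 is usually the Arduino's built-in LED; presumably driven here as a simple "initializing" indicator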

	// initiating the max 7219
	maxAll(max7219_reg_scanLimit, 0x07);
	maxAll(max7219_reg_decodeMode, 0x00);					// using an led matrix (not digits)
	maxAll(max7219_reg_shutdown, 0x01);						// not in shutdown mode
	maxAll(max7219_reg_displayTest, 0x00);				// no display test
	for(byte e = 1; e <= 8; e++)									// empty registers, turn all LEDs off
		maxAll(e, 0);

	setIntensity(15);
} // ctor
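// Usage sketch (hypothetical wiring; the pin numbers below are placeholders, not from the original sketch):
//   Max7219 matrix(/*dataIn*/ 12, /*load*/ 10, /*clock*/ 11, /*numMax*/ 1);
//   matrix.setIntensity(8);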
Example #3
void max72_setup () {


  pinMode(dataIn, OUTPUT);
  pinMode(clock,  OUTPUT);
  pinMode(load,   OUTPUT);

  //beginSerial(9600);
  digitalWrite(13, HIGH);  

  //initiation of the max 7219
  maxAll(max7219_reg_scanLimit, 0x07);      
  maxAll(max7219_reg_decodeMode, 0x00);  // using an led matrix (not digits)
  maxAll(max7219_reg_shutdown, 0x01);    // not in shutdown mode
  maxAll(max7219_reg_displayTest, 0x00); // no display test
   for (byte e = 1; e <= 8; e++) {    // empty registers, turn all LEDs off
    maxAll(e,0);
  }
  maxAll(max7219_reg_intensity, 0x0f & 0x0f);    // the first 0x0f is the value you can set
                                                  // range: 0x00 to 0x0f
}  
// Reset
void IRCar_DotMatrix_Reset(Reset_t Reset)
{
	if (RESET_INIT == Reset)
	{
		pinMode(IO_OUT_SPI_DATA, OUTPUT);
		pinMode(IO_OUT_SPI_LOAD, OUTPUT);
		pinMode(IO_OUT_SPI_CLK,  OUTPUT);

		//initiation of the max 7219
		maxAll(max7219_reg_scanLimit,   0x07);
		maxAll(max7219_reg_decodeMode,  0x00);  // using an led matrix (not digits)
		maxAll(max7219_reg_shutdown,    0x01);    // not in shutdown mode
		maxAll(max7219_reg_displayTest, 0x00); // no display test
		maxAll(max7219_reg_intensity,   0x0f & 0x0f); // the first 0x0f is the value you can set range: 0x00 to 0x0f

		// empty registers, turn all LEDs off
		for (U8 e=1; e<=8; e++)
		{
			maxAll(e, 0);
		}
	}

	if (RESET_NONE != Reset)
	{
		DotMatrix_Cmd();
	}
	else
	{
		// Update Global Time
		IRCar_TimeMs = millis();

		// Restart TimeOut for 5 sec
		DotMatrix_Timer_Duration = 5*1000UL;
		DotMatrix_TimerON = IRCar_TimeMs;
		// no null Timer => reserved for stopped state
		if (!DotMatrix_TimerON) DotMatrix_TimerON = 1;
	}
}
  void RebalanceTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    const ParameterList& pL = GetParameterList();

    int implicit   = !pL.get<bool>("repartition: rebalance P and R");
    int writeStart = pL.get<int> ("write start");
    int writeEnd   = pL.get<int> ("write end");

    if (writeStart == 0 && fineLevel.GetLevelID() == 0 && writeStart <= writeEnd && IsAvailable(fineLevel, "Coordinates")) {
      std::string fileName = "coordinates_level_0.m";
      RCP<MultiVector> fineCoords = fineLevel.Get< RCP<MultiVector> >("Coordinates");
      if (fineCoords != Teuchos::null)
        Utils::Write(fileName, *fineCoords);
    }

    RCP<const Import> importer = Get<RCP<const Import> >(coarseLevel, "Importer");
    if (implicit) {
      // Save the importer, we'll need it for solve
      coarseLevel.Set("Importer", importer, NoFactory::get());
    }

    RCP<ParameterList> params = rcp(new ParameterList());
    params->set("printLoadBalancingInfo", true);
    params->set("printCommInfo",          true);

    std::string transferType = pL.get<std::string>("type");
    if (transferType == "Interpolation") {
      RCP<Matrix> originalP = Get< RCP<Matrix> >(coarseLevel, "P");

      {
        // This line must be after the Get call
        SubFactoryMonitor m1(*this, "Rebalancing prolongator", coarseLevel);

        if (implicit || importer.is_null()) {
          GetOStream(Runtime0) << "Using original prolongator" << std::endl;
          Set(coarseLevel, "P", originalP);

        } else {
          // P is the transfer operator from the coarse grid to the fine grid.
          // P must transfer the data from the newly reordered coarse A to the
          // (unchanged) fine A.  This means that the domain map (coarse) of P
          // must be changed according to the new partition. The range map
          // (fine) is kept unchanged.
          //
          // The domain map of P must match the range map of R.  See also note
          // below about domain/range map of R and its implications for P.
          //
          // To change the domain map of P, P needs to be fillCompleted again
          // with the new domain map.  To achieve this, P is copied into a new
          // matrix that is not fill-completed.  The doImport() operation is
          // just used here to make a copy of P: the importer is trivial and
          // there is no data movement involved.  The reordering actually
          // happens during the fillComplete() with domainMap == importer->getTargetMap().
          RCP<Matrix> rebalancedP = originalP;
          RCP<const CrsMatrixWrap> crsOp = rcp_dynamic_cast<const CrsMatrixWrap>(originalP);
          TEUCHOS_TEST_FOR_EXCEPTION(crsOp == Teuchos::null, Exceptions::BadCast, "Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");

          RCP<CrsMatrix> rebalancedP2 = crsOp->getCrsMatrix();
          TEUCHOS_TEST_FOR_EXCEPTION(rebalancedP2 == Teuchos::null, std::runtime_error, "Xpetra::CrsMatrixWrap doesn't have a CrsMatrix");

          {
            SubFactoryMonitor subM(*this, "Rebalancing prolongator -- fast map replacement", coarseLevel);

            RCP<const Import> newImporter = ImportFactory::Build(importer->getTargetMap(), rebalancedP->getColMap());
            rebalancedP2->replaceDomainMapAndImporter(importer->getTargetMap(), newImporter);
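            // Note: this "fast map replacement" swaps in the rebalanced domain map and a matching
            // importer in place, so the copy + fillComplete approach described in the comment above
            // is not needed here.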
          }

          ///////////////////////// EXPERIMENTAL
          // TODO FIXME somehow we have to transfer the striding information of the permuted domain/range maps.
          // That is probably something for an external permutation factory
          //   if (originalP->IsView("stridedMaps"))
          //     rebalancedP->CreateView("stridedMaps", originalP);
          ///////////////////////// EXPERIMENTAL

          Set(coarseLevel, "P", rebalancedP);

          if (IsPrint(Statistics1))
            GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*rebalancedP, "P (rebalanced)", params);
        }
      }

      if (importer.is_null()) {
        if (IsAvailable(coarseLevel, "Nullspace"))
          Set(coarseLevel, "Nullspace", Get<RCP<MultiVector> >(coarseLevel, "Nullspace"));

        if (pL.isParameter("Coordinates") && pL.get< RCP<const FactoryBase> >("Coordinates") != Teuchos::null)
          if (IsAvailable(coarseLevel, "Coordinates"))
            Set(coarseLevel, "Coordinates", Get< RCP<MultiVector> >(coarseLevel, "Coordinates"));

        return;
      }

      if (pL.isParameter("Coordinates") &&
          pL.get< RCP<const FactoryBase> >("Coordinates") != Teuchos::null &&
          IsAvailable(coarseLevel, "Coordinates")) {
        RCP<MultiVector> coords = Get<RCP<MultiVector> >(coarseLevel, "Coordinates");

        // This line must be after the Get call
        SubFactoryMonitor subM(*this, "Rebalancing coordinates", coarseLevel);

        LO nodeNumElts = coords->getMap()->getNodeNumElements();

        // If a process has no matrix rows, then we can't calculate blocksize using the formula below.
        LO myBlkSize = 0, blkSize = 0;
        if (nodeNumElts > 0)
          myBlkSize = importer->getSourceMap()->getNodeNumElements() / nodeNumElts;
        maxAll(coords->getMap()->getComm(), myBlkSize, blkSize);
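        // maxAll makes every rank agree on the block size, including ranks with no local rows (myBlkSize == 0)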

        RCP<const Import> coordImporter;
        if (blkSize == 1) {
          coordImporter = importer;

        } else {
          // NOTE: there is an implicit assumption here: we assume that the DOFs of any node are numbered consecutively.
          // A proper fix would require using a decomposition similar to the way we construct the importer in the
          // RepartitionFactory.
          RCP<const Map> origMap   = coords->getMap();
          GO             indexBase = origMap->getIndexBase();

          ArrayView<const GO> OEntries   = importer->getTargetMap()->getNodeElementList();
          LO                  numEntries = OEntries.size()/blkSize;
          ArrayRCP<GO> Entries(numEntries);
          for (LO i = 0; i < numEntries; i++)
            Entries[i] = (OEntries[i*blkSize]-indexBase)/blkSize + indexBase;
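          // Worked example (hypothetical GIDs): with blkSize = 2 and indexBase = 0,
          // dof GIDs {10, 11, 14, 15} collapse to node GIDs {5, 7}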

          RCP<const Map> targetMap = MapFactory::Build(origMap->lib(), origMap->getGlobalNumElements(), Entries(), indexBase, origMap->getComm());
          coordImporter = ImportFactory::Build(origMap, targetMap);
        }

        RCP<MultiVector> permutedCoords  = MultiVectorFactory::Build(coordImporter->getTargetMap(), coords->getNumVectors());
        permutedCoords->doImport(*coords, *coordImporter, Xpetra::INSERT);

        if (pL.get<bool>("useSubcomm") == true)
          permutedCoords->replaceMap(permutedCoords->getMap()->removeEmptyProcesses());

        Set(coarseLevel, "Coordinates", permutedCoords);

        std::string fileName = "rebalanced_coordinates_level_" + toString(coarseLevel.GetLevelID()) + ".m";
        if (writeStart <= coarseLevel.GetLevelID() && coarseLevel.GetLevelID() <= writeEnd && permutedCoords->getMap() != Teuchos::null)
          Utils::Write(fileName, *permutedCoords);
      }

      if (IsAvailable(coarseLevel, "Nullspace")) {
        RCP<MultiVector> nullspace = Get< RCP<MultiVector> >(coarseLevel, "Nullspace");

        // This line must be after the Get call
        SubFactoryMonitor subM(*this, "Rebalancing nullspace", coarseLevel);

        RCP<MultiVector> permutedNullspace = MultiVectorFactory::Build(importer->getTargetMap(), nullspace->getNumVectors());
        permutedNullspace->doImport(*nullspace, *importer, Xpetra::INSERT);

        if (pL.get<bool>("useSubcomm") == true)
          permutedNullspace->replaceMap(permutedNullspace->getMap()->removeEmptyProcesses());

        Set(coarseLevel, "Nullspace", permutedNullspace);
      }

    } else {
      if (pL.get<bool>("transpose: use implicit") == false) {
        RCP<Matrix> originalR = Get< RCP<Matrix> >(coarseLevel, "R");

        SubFactoryMonitor m2(*this, "Rebalancing restriction", coarseLevel);

        if (implicit || importer.is_null()) {
          GetOStream(Runtime0) << "Using original restrictor" << std::endl;
          Set(coarseLevel, "R", originalR);

        } else {
          RCP<Matrix> rebalancedR;
          {
            SubFactoryMonitor subM(*this, "Rebalancing restriction -- fusedImport", coarseLevel);

            RCP<Map> dummy;         // meaning: use originalR's domain map.
            rebalancedR = MatrixFactory::Build(originalR, *importer, dummy, importer->getTargetMap());
          }
          Set(coarseLevel, "R", rebalancedR);

          ///////////////////////// EXPERIMENTAL
          // TODO FIXME somehow we have to transfer the striding information of the permuted domain/range maps.
          // That is probably something for an external permutation factory
          // if (originalR->IsView("stridedMaps"))
          //   rebalancedR->CreateView("stridedMaps", originalR);
          ///////////////////////// EXPERIMENTAL

          if (IsPrint(Statistics1))
            GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*rebalancedR, "R (rebalanced)", params);
        }
      }
    }
  }
Example #6
void Max7219::setIntensity(byte intensity) const
{
	maxAll(max7219_reg_intensity, intensity bitand 0x0f);		// the first 0x0f is the value you can set (range: 0x00 to 0x0f)
}
Example #7
Int_t main(Int_t argc, Char_t *argv[])
{
   ROOT::Mpi::TEnvironment env(argc, argv);
   ROOT::Mpi::TIntraCommunicator world;
   TVectorT<Double_t> mResult;
   Double_t fScalarResult;
   TVectorT<Double_t> v1(elements);
   TVectorT<Double_t> v2(elements);

   for (Int_t i = 0; i < elements; i++) {
      v1[i] = i + (i + world.Size());
      v2[i] = i * (i + world.Size());

   }

///////////////////////////////////////////////
//Testing methods with results in single Rank//
///////////////////////////////////////////////
   ROOT::Mpi::Math::TVectorTWrapper<Double_t> add(v1);
   add.Addition(v2, root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> sub(v1);
   sub.Subtraction(v2, root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> dot(v1);
   dot.Dot(v2, root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm2Sqr(v1);
   norm2Sqr.Norm2Sqr(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm1(v1);
   norm1.Norm1(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> min(v1);
   min.Min(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> max(v1);
   max.Max(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> normalize(v1);
   normalize.Normalize(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> sum(v1);
   sum.Sum(root);

   if (world.Rank() == root) {
      add.GetResult(mResult);
      MpiCompareTVectorTest(mResult, v1 + v2, world.Rank(), "Vector Addition Single");

      sub.GetResult(mResult);
      MpiCompareTVectorTest(mResult, v1 - v2, world.Rank(), "Vector Subtraction Single");

      dot.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, Dot(v1, v2) , world.Rank(), "Vector Dot Product Single");

      norm2Sqr.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Norm2Sqr() , world.Rank(), "Vector Norm2Sqr Single");

      norm1.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Norm1() , world.Rank(), "Vector Norm1 Single");

      min.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Min() , world.Rank(), "Vector Min Single");

      max.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Max() , world.Rank(), "Vector Max Single");

      normalize.GetResult(mResult);
      MpiCompareTest(mResult.Norm2Sqr(), ((1 / TMath::Sqrt(v1.Norm2Sqr()))*v1).Norm2Sqr() , world.Rank(), "Vector Normalize Single");

      sum.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Sum(), world.Rank(), "Vector Sum Single");
   }

///////////////////////////////////////////////
//Testing methods with results in all ranks  //
///////////////////////////////////////////////

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> addAll(v1);
   addAll.Addition(v2);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> subAll(v1);
   subAll.Subtraction(v2);

   addAll.GetResult(mResult);
   MpiCompareTVectorTest(mResult, v1 + v2, world.Rank(), "Vector Addition All");

   subAll.GetResult(mResult);
   MpiCompareTVectorTest(mResult, v1 - v2, world.Rank(), "Vector Subtraction All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> dotAll(v1);
   dotAll.Dot(v2);
   dotAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, Dot(v1, v2) , world.Rank(), "Vector Dot Product All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm2SqrAll(v2);
   norm2SqrAll.Norm2Sqr();
   norm2SqrAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Norm2Sqr() , world.Rank(), "Vector Norm2Sqr All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm1All(v2);
   norm1All.Norm1();
   norm1All.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Norm1() , world.Rank(), "Vector Norm1 All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> minAll(v2);
   minAll.Min();
   minAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Min() , world.Rank(), "Vector Min All");


   ROOT::Mpi::Math::TVectorTWrapper<Double_t> maxAll(v2);
   maxAll.Max();
   maxAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Max() , world.Rank(), "Vector Max All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> normalizeAll(v2);
   normalizeAll.Normalize();
   normalizeAll.GetResult(mResult);
   // if the vector is normalized, the Norm2Sqr of the result should be close to 1
   MpiCompareTest(mResult.Norm2Sqr(), ((1 / TMath::Sqrt(v2.Norm2Sqr()))*v2).Norm2Sqr() , world.Rank(), "Vector Normalize All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> sumAll(v2);
   sumAll.Sum();
   sumAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Sum() , world.Rank(), "Vector Sum All");


   return 0;
}
Example #8
  void CoupledAggregationCommHelper<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::ArbitrateAndCommunicate(Vector &weight_, LOVector &procWinner_, LOVector *companion, const bool perturb) const {
    const RCP<const Map> weightMap = weight_.getMap();
    const size_t nodeNumElements = weightMap->getNodeNumElements();
    const RCP<const Teuchos::Comm<int> > & comm = weightMap->getComm();
    int MyPid = comm->getRank(); // TODO:remove the getMap() step
    ++numCalls_;

    //short-circuit if only one process
    if (comm->getSize() == 1) {
      ArrayRCP<SC> serialWeight = weight_.getDataNonConst(0);
      ArrayRCP<LO> serialProcWinner = procWinner_.getDataNonConst(0);
      for (size_t i=0; i < nodeNumElements; ++i) {
        if (serialWeight[i] > 0) {
          serialWeight[i] = 0;
          serialProcWinner[i] = MyPid;
        }
      }
      //companion doesn't change
      return;
    }

#ifdef COMPARE_IN_OUT_VECTORS
    RCP<Vector> in_weight_ = VectorFactory::Build(weight_.getMap());
    {
      ArrayRCP<SC> in_weight = in_weight_->getDataNonConst(0);
      ArrayRCP<SC> weight = weight_.getDataNonConst(0);
      for (size_t i=0; i < nodeNumElements; ++i) in_weight[i] = weight[i];
    }
    RCP<LOVector> in_procWinner_ = LOVectorFactory::Build(procWinner_.getMap());
    {
      ArrayRCP<LO> in_procWinner = in_procWinner_->getDataNonConst(0);
      ArrayRCP<LO> procWinner = procWinner_.getDataNonConst(0);
      for (size_t i=0; i < nodeNumElements; ++i) in_procWinner[i] = procWinner[i];
    }
    RCP<LOVector> in_companion;
    {
      if (companion != NULL) {
        in_companion = LOVectorFactory::Build(companion->getMap());
        ArrayRCP<LO> in_comp = in_companion->getDataNonConst(0);
        ArrayRCP<LO> comp = companion->getDataNonConst(0);
        for (size_t i=0; i < nodeNumElements; ++i) in_comp[i] = comp[i];
      }
    }
#endif

    if (perturb) {
      if (perturbWt_ == Teuchos::null || !perturbWt_->getMap()->isSameAs(*weightMap)) {
        perturbWt_ = VectorFactory::Build(weightMap,false); //no need to zero out because this will be randomized

        // Modify seed of the random algorithm used by perturbWt_->randomize()
        {
          ST::seedrandom( Teuchos::as<unsigned int>(MyPid*47) );
          for (int i = 0; i < 10; ++i) ST::random();
        }
        //Note that we must not use perturbWt_->randomize().  This produces the same
        //local random vector on each processor.  The whole point of the weights
        //is to provide tie-breaking that isn't based on the highest PID.
        ArrayRCP<SC> lperturbWt = perturbWt_->getDataNonConst(0);
        for (size_t i=0; i < nodeNumElements; ++i)
          lperturbWt[i] = 1e-7*fabs(ST::random()); //FIXME this won't work for general SC
#ifdef COMPARE_IN_OUT_VECTORS
        ArrayRCP<SC> locperturbWt = perturbWt_->getDataNonConst(0);
        for (size_t i=0; i < nodeNumElements; ++i)
          printf("perturbWt[%zu] = %15.10e\n", i, locperturbWt[i]);
#endif
      } //if (perturbWt_ == Teuchos::null || ...

      ArrayRCP<SC> weight = weight_.getDataNonConst(0); // TODO: const?
      ArrayRCP<SC> perturbWt = perturbWt_->getDataNonConst(0);

      // Note: maxValue() not available for Tpetra
      //SC largestGlobalWeight = weight_.maxValue();
      SC largestGlobalWeight = weight_.normInf();
      for (size_t i=0; i < nodeNumElements; ++i) {
        if (weight[i] != 0.) {
          weight[i] += largestGlobalWeight*perturbWt[i];
        }
      }
      //TODO is it necessary to return the *perturbed* weights?
    } //if (perturb)

    // Communicate weights and store the results in postComm_ (which will be copied
    // back into weights later). When multiple processors have different weights
    // for the same GID, we take the largest weight. After this fragment every
    // processor should have the same value in postComm_[] even when multiple
    // copies of the same GID are involved.

    if (postComm_ == Teuchos::null || !postComm_->getMap()->isSameAs(*weightMap) )
      postComm_ = VectorFactory::Build(weightMap);

    //note: postComm_ is zeroed either in build above, or in loop below upon last touch.

    NonUnique2NonUnique(weight_, *postComm_, Xpetra::ABSMAX);

    // Let every processor know who is the procWinner. For nonunique
    // copies of the same Gid, this corresponds to the processor with
    // the highest Wt[]. When several processors have the same positive value
    // for weight[] (which is also the maximum value), the highest proc id
    // is declared the procWinner.
    //
    // Note: This is accomplished by filling a vector with MyPid+1 if weight[k] is
    //       nonzero and postComm[k] == weight[k]. NonUnique2NonUnique(..., ABSMAX)
    //       is invoked to let everyone know the procWinner.
    //       One is then subtracted so that procWinner[i] indicates the
    //       Pid of the winning processor.
    //       When all weights for a GID are zero, the associated procWinner entries
    //       are left untouched.
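    // Example (hypothetical): if GID g has nonunique copies on ranks 2 and 5 and both hold the
    // maximum weight, both write MyPid+1 (3 and 6); ABSMAX keeps 6, so procWinner[g] becomes 5.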

    if (candidateWinners_ == Teuchos::null || !candidateWinners_->getMap()->isSameAs(*weightMap) )
      candidateWinners_ = VectorFactory::Build(weightMap,false);
    //note: candidateWinners_ is initialized below

    ArrayRCP<SC> weight = weight_.getDataNonConst(0);

    {
      ArrayRCP<SC> candidateWinners = candidateWinners_->getDataNonConst(0);
      ArrayRCP<SC> postComm = postComm_->getDataNonConst(0);
      for (size_t i=0; i < nodeNumElements; ++i) {
        if (postComm[i] == weight[i]) candidateWinners[i] = (SC) MyPid+1;
        else                          candidateWinners[i] = 0;
        weight[i]=postComm[i];
      }
    }
    NonUnique2NonUnique(*candidateWinners_, *postComm_, Xpetra::ABSMAX);

    // Note: weight[i] != 0  ==>  on some proc the associated candidateWinners[i]
    //       equals its MyPid+1  ==>  postComm[i] != 0.
    //
    int numMyWinners = 0;
    ArrayRCP<LO> procWinner = procWinner_.getDataNonConst(0);
    {
      ArrayRCP<SC> postComm = postComm_->getDataNonConst(0);
      for (size_t i=0; i < nodeNumElements; ++i)  {
        if ( weight[i] != 0.) procWinner[i] = ((int) (postComm[i])) - 1;
        weight[i] = 0.;    //we are done with weight
        postComm[i] = 0.;  //avoids having to initialize postComm_ on next call to ArbitrateAndCommunicate
        if (procWinner[i] == MyPid) ++numMyWinners;
      }
    }

    weight = Teuchos::null; //TODO why do we do this?

    if (companion != NULL) {
      // Now build a new Map, WinnerMap which just consists of procWinners.
      // This is done by extracting the Gids for Wt, and shoving
      // the subset that correspond to procWinners in MyWinners.
      // WinnerMap is then constructed using MyWinners.
      //
      // In order to avoid regenerating winnerMap_, the following are checked:
      //   1) Do the local number of entries in MyWinners differ?  If so, regenerate/repopulate MyWinners and regenerate winnerMap_.
      //   2) If the local number of entries in MyWinners are the same, do any entries differ?  If so, repopulate MyWinners and
      //      regenerate winnerMap_.

      ArrayView<const GO> myGids = weightMap->getNodeElementList(); //== weightMap->MyGlobalElements(myGids);
      bool realloc=false;
      if (numMyWinners != numMyWinners_ || winnerMap_ == Teuchos::null) {
        // The local number of entries in myWinners_ has changed since the last invocation, so reallocate myWinners_.
        myWinners_ = ArrayRCP<GO>(numMyWinners);
        realloc=true;
        //std::cout << MyPid << ":  numMyWinners has changed : (old) " << numMyWinners_ << ", (new) " << numMyWinners << std::endl;
        numMyWinners_ = numMyWinners;
      }

#ifdef JG_DEBUG
      procWinner = Teuchos::null;
      std::cout << MyPid << ": nodeNumElements=" << nodeNumElements << std::endl;
      std::cout << MyPid << ": procWinner=" << procWinner_ << std::endl;
      procWinner = procWinner_.getDataNonConst(0);
#endif

      if (realloc==true) {
        // The local number of entries in myWinners_ has changed since the last invocation, so repopulate myWinners_.
        numMyWinners = 0;
        for (size_t i = 0; i < nodeNumElements; ++i) {
          if (procWinner[i] == MyPid) {
            myWinners_[numMyWinners++] = myGids[i];
          }
        }
      } else {
        // The local number of entries in myWinners_ is the same as in the last invocation, but
        // we still must check whether any entries differ from the last invocation.
        bool entryMismatch=false;
        numMyWinners = 0;
        for (size_t i = 0; i < nodeNumElements; ++i) {
          if (procWinner[i] == MyPid) {
            if (myWinners_[numMyWinners++] != myGids[i]) {
              entryMismatch=true;
              break;
            }
          }
        }

        if (entryMismatch == true) {
          // Entries differ from last invocation, so repopulate myWinners_.
          realloc=true;
          numMyWinners = 0;
          for (size_t i = 0; i < nodeNumElements; ++i) {
            if (procWinner[i] == MyPid) {
              myWinners_[numMyWinners++] = myGids[i];
            }
          }
        }
      } //if (realloc==true) ... else

      procWinner = Teuchos::null;

#ifdef JG_DEBUG
      std::cout << MyPid << ": numMyWinners=" << numMyWinners << std::endl;
      std::cout << MyPid << ": myWinners_" << myWinners_ << std::endl;
      for(int i=0;i<numMyWinners; i++)
        std::cout << MyPid << ": myWinners_[locId=" << i << "] = " << myWinners_[i] << std::endl;

#endif

#ifdef HAVE_MPI
      //See whether any process has determined that winnerMap_ must be regenerated.
      int irealloc,orealloc;
      if (realloc) irealloc=1;
      else         irealloc=0;
      maxAll(comm,irealloc,orealloc);
      if (orealloc == 1) realloc=true;
      else               realloc=false;
#endif

      if (realloc) {
        // Either the number of entries or the values have changed since the last invocation, so regenerate the map.
        const Xpetra::global_size_t GSTI = Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid();
        winnerMap_ = MapFactory::Build(weightMap->lib(), GSTI, myWinners_(), 0, weightMap->getComm());
      }

      // Pull the Winners out of companion
      //     JustWinners <-- companion[Winners];

      RCP<LOVector> justWinners = LOVectorFactory::Build(winnerMap_);

#ifdef JG_DEBUG
      RCP<Teuchos::FancyOStream> out = rcp(new Teuchos::FancyOStream(rcp(&std::cout,false)));
      std::cout << MyPid << ": justWinners(Vector in)=" << *justWinners << std::endl;
      justWinners->describe(*out, Teuchos::VERB_EXTREME);
#endif

      if ( winnerImport_ == Teuchos::null
           || !winnerImport_->getSourceMap()->isSameAs(*weightMap)
           || !winnerImport_->getTargetMap()->isSameAs(*winnerMap_)  )
        winnerImport_ = ImportFactory::Build(weightMap, winnerMap_);
      RCP<const Import> winnerImport = winnerImport_;
      try
        {
          justWinners->doImport(*companion, *winnerImport, Xpetra::INSERT);
        }
      catch(std::exception& e)
        {
          std::cout << MyPid << ": ERR2: An exception occurred." << std::endl;
          throw e;
        }

      // Put the JustWinner values back into companion so that
      // all nonunique copies of the same Gid have the procWinner's
      // version of the companion.
      //#define JG_DEBUG
#ifdef JG_DEBUG
      RCP<Teuchos::FancyOStream> fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
      fos->setOutputToRootOnly(-1);
      if (!weightMap->getComm()->getRank())
        std::cout << "------ winnerMap_ ------" << std::endl;
      winnerMap_->describe(*fos,Teuchos::VERB_EXTREME);
      if (!weightMap->getComm()->getRank())
        std::cout << "------ weightMap ------" << std::endl;
      weightMap->getComm()->barrier();
      weightMap->describe(*fos,Teuchos::VERB_EXTREME);
      //std::cout << *winnerMap_ << std::endl;
      //std::cout << *weightMap << std::endl;
      sleep(5);
      exit(1);
#endif
#ifdef JG_DEBUG
#undef JG_DEBUG
#endif

      if ( pushWinners_ == Teuchos::null
           || !pushWinners_->getSourceMap()->isSameAs(*winnerMap_)
           || !pushWinners_->getTargetMap()->isSameAs(*weightMap)  )
        pushWinners_ = ImportFactory::Build(winnerMap_,weightMap);
      RCP<Import> pushWinners = pushWinners_;
      //RCP<Import> pushWinners = ImportFactory::Build(winnerMap_, weightMap); // VERSION1
      //RCP<Export> pushWinners = ExportFactory::Build(winnerMap_, weightMap); // VERSION4
      try
        {
          companion->doImport(*justWinners, *pushWinners, Xpetra::INSERT);   // VERSION1 Slow
          //companion->doExport(*justWinners, *winnerImport_, Xpetra::INSERT);   // JJH this should work... but exception
          //             if (weightMap->lib() == Xpetra::UseEpetra)
          //               justWinners->doExport(*companion, *winnerImport, Xpetra::INSERT);  // VERSION2 Tpetra doc is wrong
          //             else if (weightMap->lib() == Xpetra::UseTpetra)
          //               companion->doExport(*justWinners, *winnerImport, Xpetra::INSERT);     // VERSION3 - TODO: will certainly not work with Epetra? (change Xpetra?)
          //companion->doExport(*justWinners, *pushWinners, Xpetra::INSERT);     // VERSION4
          //             else throw "lib()";
        }
      catch(std::exception& e)
        {
          throw e;
        }
      //#define JG_DEBUG
#ifdef JG_DEBUG
      //            RCP<Teuchos::FancyOStream> out = rcp(new Teuchos::FancyOStream(rcp(&std::cout,false)));
      //->describe(*out, Teuchos::VERB_EXTREME);

      // std::cout << MyPid << ": ERR3: An exception occurred." << std::endl;

      std::cout << MyPid << ": numMyWinners=" << numMyWinners << std::endl;

      std::cout << MyPid << ": justWinners(Vector in)=" << std::endl;
      justWinners->describe(*out, Teuchos::VERB_EXTREME);

      std::cout << MyPid << ": companion(Vector out)=" << std::endl;
      companion->describe(*out, Teuchos::VERB_EXTREME);

      // std::cout << MyPid << ": pushWinners(Import(winnerMap_, weight_.Map))=" << *pushWinners << std::endl;
      std::cout << MyPid << ": winnerMap_=" << *winnerMap_ << std::endl;
      std::cout << MyPid << ": weight_.Map=" << *weightMap << std::endl;
#endif
      //  throw e;
      // throw 1;
    }

#ifdef COMPARE_IN_OUT_VECTORS
    if (MyPid == 0) {
      std::cout << "==============" << std::endl;
      std::cout << "call #" << numCalls_ << " (1-based)" << std::endl;
      std::cout << "==============" << std::endl;
    }
    /*
      bool sameWeight=true;
      bool sameWinner=true;
      bool sameComp=true;
    */
    std::string sameOrDiff;
    {
      ArrayRCP<SC> in_weight = in_weight_->getDataNonConst(0);
      ArrayRCP<SC> weight = weight_.getDataNonConst(0);
      if (MyPid == 0) std::cout << "==============\nweight\n==============\n" << std::endl;
      for (size_t i=0; i < weight_.getLocalLength(); ++i) {
        if (in_weight[i] - weight[i] != 0) sameOrDiff = "  <<<<";
        else                           sameOrDiff = " ";
        std::cout << std::setw(3) << i<<": " << in_weight[i] << "   " << weight[i] << sameOrDiff << in_weight[i] - weight[i] << std::endl;
        /*
          if (in_weight[i] != weight[i]) {
          sameWeight=false;
          std::cout << "\n\nin and out weight DIFFER\n\n" << std::endl;
          std::cout << "i="<<i<<", in=" << in_weight[i] << " , out=" << weight[i] << std::endl;
          break;
          }
        */
      }
    }

    {
      ArrayRCP<LO> in_procWinner = in_procWinner_->getDataNonConst(0);
      ArrayRCP<LO> procWinner = procWinner_.getDataNonConst(0);
      if (MyPid == 0) std::cout << "==============\nprocWinner\n==============\n" << std::endl;
      for (size_t i=0; i < procWinner_.getLocalLength(); ++i) {
        if (in_procWinner[i] != procWinner[i]) sameOrDiff = "  <<<<";
        else                           sameOrDiff = " ";
        std::cout << std::setw(3) << i<<": " << in_procWinner[i] << "   " << procWinner[i] << sameOrDiff << std::endl;
        /*
          if (in_procWinner[i] != procWinner[i]) {
          sameWinner=false;
          std::cout << "\n\nin and out procWinner DIFFER\n\n" << std::endl;
          std::cout << "i="<<i<<", in=" << in_procWinner[i] << ", out=" << procWinner[i] << std::endl;
          break;
          }
        */
      }
    }

    {
      if (companion != NULL) {
        ArrayRCP<LO> in_comp = in_companion->getDataNonConst(0);
        ArrayRCP<LO> comp = companion->getDataNonConst(0);
        if (MyPid == 0) std::cout << "==============\ncompanion\n==============\n" << std::endl;
        for (size_t i=0; i < companion->getLocalLength(); ++i) {
          if (in_comp[i] != comp[i]) sameOrDiff = "  <<<<";
          else                           sameOrDiff = " ";
          std::cout << std::setw(3) << i<<": " << in_comp[i] << "   " << comp[i] << sameOrDiff << std::endl;
          /*
            if (in_comp[i] != comp[i]) {
            sameComp=false;
            std::cout << "\n\nin and out companion DIFFER\n\n" << std::endl;
            std::cout << "i="<<i<<", in=" << in_comp[i] << ", out=" << comp[i] << std::endl;
            break;
            }
          */
        }
      }
    }
#endif
  } //ArbitrateAndCommunicate(Vector&, LOVector &, LOVector *, const bool) const
Example #9
  void LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::AggregateLeftovers(GraphBase const &graph, Aggregates &aggregates) const {
    Monitor m(*this, "AggregateLeftovers");

    my_size_t nVertices = graph.GetNodeNumVertices();
    int exp_nRows    = aggregates.GetMap()->getNodeNumElements(); // Tentative fix... was previously exp_nRows = nVertices + graph.GetNodeNumGhost();
    int myPid        = graph.GetComm()->getRank();
    my_size_t nAggregates  = aggregates.GetNumAggregates();

    int minNodesPerAggregate = GetMinNodesPerAggregate();

    const RCP<const Map> nonUniqueMap = aggregates.GetMap(); //column map of underlying graph
    const RCP<const Map> uniqueMap    = graph.GetDomainMap();

    MueLu::CoupledAggregationCommHelper<LO,GO,NO,LMO> myWidget(uniqueMap, nonUniqueMap);

    //TODO JJH We want to skip this call
    RCP<Xpetra::Vector<double,LO,GO,NO> > distWeights = Xpetra::VectorFactory<double,LO,GO,NO>::Build(nonUniqueMap);

    // Aggregated vertices not "definitively" assigned to processors are
    // arbitrated by ArbitrateAndCommunicate(). There is some
    // additional logic to prevent losing root nodes in arbitration.
    {
      ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>    weights     = distWeights->getDataNonConst(0);

      for (size_t i=0;i<nonUniqueMap->getNodeNumElements();i++) {
        if (procWinner[i] == MUELU_UNASSIGNED) {
          if (vertex2AggId[i] != MUELU_UNAGGREGATED) {
            weights[i] = 1.;
            if (aggregates.IsRoot(i)) weights[i] = 2.;
          }
        }
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Tentatively assign any vertex (ghost or local) which neighbors a root
    // to the aggregate associated with the root.
    {
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>   weights      = distWeights->getDataNonConst(0);

      for (my_size_t i = 0; i < nVertices; i++) {
        if ( aggregates.IsRoot(i) && (procWinner[i] == myPid) ) {

          // neighOfINode is the neighbor node list of node 'i'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int colj = *it;
            if (vertex2AggId[colj] == MUELU_UNAGGREGATED) {
              weights[colj]= 1.;
              vertex2AggId[colj] = vertex2AggId[i];
            }
          }
        }
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Record the number of aggregated vertices
    GO total_phase_one_aggregated = 0;
    {
      ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);

      GO phase_one_aggregated = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] != MUELU_UNAGGREGATED)
          phase_one_aggregated++;
      }

      sumAll(graph.GetComm(), phase_one_aggregated, total_phase_one_aggregated);

      GO local_nVertices = nVertices, total_nVertices = 0;
      sumAll(graph.GetComm(), local_nVertices, total_nVertices);

      /* Among unaggregated points, see if we can make a reasonable size    */
      /* aggregate out of it. We do this by looking at neighbors and seeing */
      /* how many are unaggregated and on my processor. Loosely,            */
      /* base the number of new aggregates created on the percentage of     */
      /* unaggregated nodes.                                                */

      ArrayRCP<double>    weights      = distWeights->getDataNonConst(0);

      double factor = 1.;
      factor = ((double) total_phase_one_aggregated)/((double)(total_nVertices + 1));
      factor = pow(factor, GetPhase3AggCreation());

      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED)
          {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);
            int rowi_N = neighOfINode.size();

            int nonaggd_neighbors = 0;
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int colj = *it;
              if (vertex2AggId[colj] == MUELU_UNAGGREGATED && colj < nVertices)
                nonaggd_neighbors++;
            }
            if (  (nonaggd_neighbors > minNodesPerAggregate) &&
                  (((double) nonaggd_neighbors)/((double) rowi_N) > factor))
              {
                vertex2AggId[i] = (nAggregates)++;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int colj = *it;
                  if (vertex2AggId[colj]==MUELU_UNAGGREGATED) {
                    vertex2AggId[colj] = vertex2AggId[i];
                    if (colj < nVertices) weights[colj] = 2.;
                    else                  weights[colj] = 1.;
                  }
                }
                aggregates.SetIsRoot(i);
                weights[i] = 2.;
              }
          }
      } // for (i = 0; i < nVertices; i++)

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    //All tentatively assigned vertices are now definitive

    if (IsPrint(Statistics1)) {
      GO Nphase1_agg = nAggregates;
      GO total_aggs;

      sumAll(graph.GetComm(), Nphase1_agg, total_aggs);

      GetOStream(Statistics1, 0) << "Phase 1 - nodes aggregated = " << total_phase_one_aggregated << std::endl;
      GetOStream(Statistics1, 0) << "Phase 1 - total aggregates = " << total_aggs << std::endl;

      GO i = nAggregates - Nphase1_agg;
      { GO ii; sumAll(graph.GetComm(),i,ii); i = ii; }
      GetOStream(Statistics1, 0) << "Phase 3 - additional aggregates = " << i << std::endl;
    }

    // Determine vertices that are not shared by setting Temp to all ones
    // and doing NonUnique2NonUnique(..., ADD). This sums values of all
    // local copies associated with each Gid. Thus, sums > 1 are shared.

    //         std::cout << "exp_nrows=" << exp_nRows << " (nVertices= " << nVertices << ", numGhost=" << graph.GetNodeNumGhost() << ")" << std::endl;
    //         std::cout << "nonUniqueMap=" << nonUniqueMap->getNodeNumElements() << std::endl;

    RCP<Xpetra::Vector<double,LO,GO,NO> > temp_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap,false); //no need to zero out vector in ctor
    temp_->putScalar(1.);

    RCP<Xpetra::Vector<double,LO,GO,NO> > tempOutput_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap);

    myWidget.NonUnique2NonUnique(*temp_, *tempOutput_, Xpetra::ADD);

    std::vector<bool> gidNotShared(exp_nRows);
    {
      ArrayRCP<const double> tempOutput = tempOutput_->getData(0);
      for (int i = 0; i < exp_nRows; i++) {
        if (tempOutput[i] > 1.)
          gidNotShared[i] = false;
        else
          gidNotShared[i] = true;
      }
    }

    // Phase 4.
    double nAggregatesTarget;
    nAggregatesTarget = ((double)  uniqueMap->getGlobalNumElements())* (((double) uniqueMap->getGlobalNumElements())/ ((double) graph.GetGlobalNumEdges()));
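    // i.e. nAggregatesTarget = N * (N / #edges) = N / (average vertex degree),
    // so the target aggregate size is roughly the average number of edges per vertex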

    GO nAggregatesLocal=nAggregates, nAggregatesGlobal; sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

    LO minNAggs; minAll(graph.GetComm(), nAggregates, minNAggs);
    LO maxNAggs; maxAll(graph.GetComm(), nAggregates, maxNAggs);

    //
    // Only do this phase if things look really bad. THIS
    // CODE IS PRETTY EXPERIMENTAL
    //
#define MUELU_PHASE4BUCKETS 6
    if ((nAggregatesGlobal < graph.GetComm()->getSize()) &&
        (2.5*nAggregatesGlobal < nAggregatesTarget) &&
        (minNAggs ==0) && (maxNAggs <= 1)) {

      // Modify seed of the random algorithm used by temp_->randomize()
      {
        typedef Teuchos::ScalarTraits<double> scalarTrait; // temp_ is of type double.
        scalarTrait::seedrandom(static_cast<unsigned int>(myPid*2 + (int) (11*scalarTrait::random())));
        int k = (int)ceil( (10.*myPid)/graph.GetComm()->getSize());
        for (int i = 0; i < k+7; i++) scalarTrait::random();
        temp_->setSeed(static_cast<unsigned int>(scalarTrait::random()));
      }

      temp_->randomize();

      ArrayRCP<double> temp = temp_->getDataNonConst(0);

      // build a list of candidate root nodes (vertices not adjacent
      // to aggregated vertices)

      my_size_t nCandidates = 0;
      global_size_t nCandidatesGlobal;

      ArrayRCP<LO> candidates = Teuchos::arcp<LO>(nVertices+1);

      double priorThreshold = 0.;
      for (int kkk = 0; kkk < MUELU_PHASE4BUCKETS; kkk++) {

        {
          ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
          ArrayView<const LO> vertex2AggIdView = vertex2AggId();
          RootCandidates(nVertices, vertex2AggIdView, graph, candidates, nCandidates, nCandidatesGlobal);
          // views on distributed vectors are freed here.
        }

        double nTargetNewGuys =  nAggregatesTarget - nAggregatesGlobal;
        double threshold      =  priorThreshold + (1. - priorThreshold)*nTargetNewGuys/(nCandidatesGlobal + .001);

        threshold = (threshold*(kkk+1.))/((double) MUELU_PHASE4BUCKETS);
        priorThreshold = threshold;

        {
          ArrayRCP<LO>     vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
          ArrayRCP<double> weights      = distWeights->getDataNonConst(0);

          for (int k = 0; k < nCandidates; k++ ) {
            int i = candidates[k];
            if ((vertex2AggId[i] == MUELU_UNAGGREGATED) && (fabs(temp[i])  < threshold)) {
              // Note: priorThreshold <= fabs(temp[i]) <= 1

              // neighOfINode is the neighbor node list of node 'iNode'.
              ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

              if (neighOfINode.size() > minNodesPerAggregate) { //TODO: check if this test is exactly what we want to do
                int count = 0;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int Adjacent    = *it;
                  // This might not be true if someone close to i
                  // is chosen as a root via fabs(temp[]) < Threshold
                  if (vertex2AggId[Adjacent] == MUELU_UNAGGREGATED){
                    count++;
                    vertex2AggId[Adjacent] = nAggregates;
                    weights[Adjacent] = 1.;
                  }
                }
                if (count >= minNodesPerAggregate) {
                  vertex2AggId[i] = nAggregates++;
                  weights[i] = 2.;
                  aggregates.SetIsRoot(i);
                }
                else { // undo things
                  for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                    int Adjacent    = *it;
                    if (vertex2AggId[Adjacent] == nAggregates){
                      vertex2AggId[Adjacent] = MUELU_UNAGGREGATED;
                      weights[Adjacent] = 0.;
                    }
                  }
                }
              }
            }
          }
          // views on distributed vectors are freed here.
        }
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
        nAggregatesLocal=nAggregates;
        sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

        // check that there are no aggregates sizes below minNodesPerAggregate

        aggregates.SetNumAggregates(nAggregates);

        RemoveSmallAggs(aggregates, minNodesPerAggregate, distWeights, myWidget);

        nAggregates = aggregates.GetNumAggregates();
      }   // one possibility
    }

    // Initialize things for Phase 5. This includes building the transpose
    // of the matrix ONLY for transposed rows that correspond to unaggregated
    // ghost vertices. Further, the transpose is only a local transpose.
    // Nonzero edges which exist on other processors are not represented.


    int observedNAgg=-1; //number of aggregates that contain vertices on this process

    {
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      for(LO k = 0; k < vertex2AggId.size(); ++k )
        if(vertex2AggId[k]>observedNAgg)
          observedNAgg=vertex2AggId[k];
      observedNAgg++;
    }

    ArrayRCP<int> Mark = Teuchos::arcp<int>(exp_nRows+1);
    ArrayRCP<int> agg_incremented = Teuchos::arcp<int>(observedNAgg);
    ArrayRCP<int> SumOfMarks = Teuchos::arcp<int>(observedNAgg);

    for (int i = 0; i < exp_nRows; i++)   Mark[i] = MUELU_DISTONE_VERTEX_WEIGHT;
    for (int i = 0; i < agg_incremented.size(); i++) agg_incremented[i] = 0;
    for (int i = 0; i < SumOfMarks.size(); i++) SumOfMarks[i] = 0;

    // Grab the transpose matrix graph for unaggregated ghost vertices.
    //     a) count the number of nonzeros per row in the transpose
    std::vector<int> RowPtr(exp_nRows+1-nVertices);
    //{
    ArrayRCP<const LO> vertex2AggIdCst = aggregates.GetVertex2AggId()->getData(0);

    for (int i = nVertices; i < exp_nRows;  i++) RowPtr[i-nVertices] = 0;
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          RowPtr[j-nVertices]++;
        }
      }
    }

    //     b) Convert RowPtr[i] to point to the first nonzero spot in row i.

    int iSum = 0, iTemp;
    for (int i = nVertices; i < exp_nRows;  i++) {
      iTemp = RowPtr[i-nVertices];
      RowPtr[i-nVertices] = iSum;
      iSum += iTemp;
    }
    RowPtr[exp_nRows-nVertices] = iSum;
    std::vector<LO> cols(iSum+1);

    //     c) Traverse matrix and insert entries in proper location.
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          cols[RowPtr[j-nVertices]++] = i;
        }
      }
    }

    //     d) RowPtr[i] points to beginning of row i+1 so shift by one location.
    for (int i = exp_nRows; i > nVertices;  i--)
      RowPtr[i-nVertices] = RowPtr[i-1-nVertices];
    RowPtr[0] = 0;
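    // RowPtr/cols now form a standard CSR layout of the local transpose restricted to
    // unaggregated ghost rows: row i of the transpose lists the local vertices adjacent to ghost vertex i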

    // views on distributed vectors are freed here.
    vertex2AggIdCst = Teuchos::null;
    //}

    int bestScoreCutoff;
    int thresholds[10] = {300,200,100,50,25,13,7,4,2,0};
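    // The loop below steps kk by 2, so only the thresholds 300, 100, 25, 7, and 2 are actually used.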

    // Stick unaggregated vertices into existing aggregates as described above.

    {
      int ncalls=0;

      for (int kk = 0; kk < 10; kk += 2) {
        bestScoreCutoff = thresholds[kk];

        ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
        ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);
        ArrayRCP<double> weights       = distWeights->getDataNonConst(0);

        for (int i = 0; i < exp_nRows; i++) {

          if (vertex2AggId[i] == MUELU_UNAGGREGATED) {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode;

            // Grab neighboring vertices which is either in graph for local ids
            // or sits in transposed fragment just constructed above for ghosts.
            if (i < nVertices) {
              neighOfINode = graph.getNeighborVertices(i);
            }
            else {
              LO *rowi_col = NULL, rowi_N;
              rowi_col = &(cols[RowPtr[i-nVertices]]);
              rowi_N   = RowPtr[i+1-nVertices] - RowPtr[i-nVertices];

              neighOfINode = ArrayView<const LO>(rowi_col, rowi_N);
            }
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];

              //Adjacent is aggregated and either I own the aggregate
              // or I could own the aggregate after arbitration.
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED))){
                SumOfMarks[AdjacentAgg] += Mark[Adjacent];
              }
            }
            int best_score = MUELU_NOSCORE;
            int best_agg = -1;
            int BestMark = -1;
            bool cannotLoseAllFriends=false; // Used to address possible loss of vertices in arbitration of shared nodes discussed above. (Initialized to false only to avoid a compiler warning).

            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              //Adjacent is unaggregated, has some value and no
              //other processor has definitively claimed him
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  (SumOfMarks[AdjacentAgg] != 0) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED ))) {

                // first figure out the penalty associated with
                // AdjacentAgg having already been incremented
                // during this phase, then compute score.

                double penalty = (double) (INCR_SCALING*agg_incremented[AdjacentAgg]);
                if (penalty > MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]))
                  penalty = MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]);
                int score = SumOfMarks[AdjacentAgg]- ((int) floor(penalty));

                if (score > best_score) {
                  best_agg             = AdjacentAgg;
                  best_score           = score;
                  BestMark             = Mark[Adjacent];
                  cannotLoseAllFriends = false;

                  // This addresses the issue mentioned above by checking whether
                  // Adjacent could be lost in arbitration. weight==0 means that
                  // Adjacent was not set during this loop of Phase 5 (and so it
                  // has already undergone arbitration). gidNotShared == true
                  // obviously implies that Adjacent cannot be lost to arbitration.
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                }
                // Another vertex within current best aggregate found.
                // We should have (best_score == score). We need to see
                // if we can improve BestMark and cannotLoseAllFriends.
                else if (best_agg == AdjacentAgg) {
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                  if (Mark[Adjacent] > BestMark) BestMark = Mark[Adjacent];
                }
              }
            }
            // Clean up
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              if (AdjacentAgg >= 0) SumOfMarks[AdjacentAgg] = 0;
            }
            // Tentatively assign vertex to best_agg.
            if ( (best_score >= bestScoreCutoff) && (cannotLoseAllFriends)) {

              TEUCHOS_TEST_FOR_EXCEPTION(best_agg == -1 || BestMark == -1, MueLu::Exceptions::RuntimeError, "MueLu::CoupledAggregationFactory internal error"); // should never happen

              vertex2AggId[i] = best_agg;
              weights[i] = best_score;
              agg_incremented[best_agg]++;
              Mark[i] = (int) ceil(   ((double) BestMark)/2.);
            }
          }

          // views on distributed vectors are freed here.
        }

        vertex2AggId = Teuchos::null;
        procWinner   = Teuchos::null;
        weights      = Teuchos::null;

        ++ncalls;
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
      }

      //       if (graph.GetComm()->getRank()==0)
      //         std::cout << "#calls to Arb&Comm=" << ncalls << std::endl;
    }

    // Phase 6: Aggregate remaining unaggregated vertices and try at all costs
    //          to avoid small aggregates.
    //          One case where we can find ourselves in this situation
    //          is if all vertices vk adjacent to v have already been
    //          put in other processor's aggregates and v does not have
    //          a direct connection to a local vertex in any of these
    //          aggregates.

    int Nleftover = 0, Nsingle = 0;
    {

      ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<double> weights       = distWeights->getDataNonConst(0);
      ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);

      int count = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED) {
          Nleftover++;

          // neighOfINode is the neighbor node list of node 'iNode'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          // We don't want too small an aggregate, so let's see if there is an
          // unaggregated neighbor that we can also put with this vertex

          vertex2AggId[i] = nAggregates;
          weights[i] = 1.;
          if (count == 0) aggregates.SetIsRoot(i);
          count++;
          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int j = *it;
            if ((j != i)&&(vertex2AggId[j] == MUELU_UNAGGREGATED)&&
                (j < nVertices)) {
              vertex2AggId[j] = nAggregates;
              weights[j] = 1.;
              count++;
            }
          }
          if ( count >= minNodesPerAggregate) {
            nAggregates++;
            count = 0;
          }
        }
      }

      // If count != 0, we are left with an aggregate that is under minNodesPerAggregate
      if (count != 0) {
#ifdef FIXME
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            if ((vertex2AggId[i] == nAggregates) && (procWinner[i] == myPid)) {
              vertex2AggId[i] = 0;
              aggregates.SetIsRoot(i,false);
            }
          }
        }
        else {
          Nsingle++;
          nAggregates++;
        }
#else
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            // TW: This is not a real fix. This may produce ugly bad aggregates!
            // I removed the procWinner[i] == myPid check. it makes no sense to me since
            // it leaves vertex2AggId[i] == nAggregates -> crash in ComputeAggregateSizes().
            // Maybe it's better to add the leftovers to the last generated agg on the current proc.
            // The best solution would be to add them to the "next"/nearest aggregate, which may be
            // on another processor
            if (vertex2AggId[i] == nAggregates) {
              vertex2AggId[i] = nAggregates-1; //0;
              aggregates.SetIsRoot(i,false);
            }
          }
        }
        else {
          Nsingle++;
          nAggregates++;
        }
#endif
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, false);

    if (IsPrint(Statistics1)) {
      GO total_Nsingle=0;   sumAll(graph.GetComm(), (GO)Nsingle,     total_Nsingle);
      GO total_Nleftover=0; sumAll(graph.GetComm(), (GO)Nleftover,   total_Nleftover);
      // GO total_aggs;        sumAll(graph.GetComm(), (GO)nAggregates, total_aggs);
      // GetOStream(Statistics1, 0) << "Phase 6 - total aggregates = " << total_aggs << std::endl;
      GetOStream(Statistics1, 0) << "Phase 6 - leftovers = " << total_Nleftover << " and singletons = " << total_Nsingle << std::endl;
    }

    aggregates.SetNumAggregates(nAggregates);

  } //AggregateLeftovers