Example No. 1
Mark* score4K(Dice *dice[])
{
    int i, j, count = 0;
    Mark *mark = (Mark *)malloc(sizeof(Mark));
    mark->value = 0; /* default score when no 4-of-a-kind is found */

    /* test each die value for a 4-of-a-kind or better */
    for(i = 1; i <= 6; i++)
    {
        count = 0;

        /* count the occurrences of this die value */
        for(j = 0; j < 5; j++)
            if(i == dice[j]->value)
                count++;

        /* a 4-of-a-kind or better scores the sum of all dice; stop looking */
        if(count >= 4)
        {
            mark->value = sumAll(dice);
            break;
        }
    }

    mark->vol = (count >= 4) ? 0 : 1;
    return mark;
}
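For reference, a minimal sketch of the sumAll helper these dice examples rely on, assuming a hand of five dice and an int value field on Dice (both assumptions; the helper itself is not shown in any of the snippets):

/* Hypothetical helper: sum the face values of a five-die hand. */
int sumAll(Dice *dice[])
{
    int j, sum = 0;
    for (j = 0; j < 5; j++)
        sum += dice[j]->value; /* assumes Dice exposes an int 'value' member */
    return sum;
}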
Example No. 2
  std::string MHDRAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::PrintLoadBalancingInfo(const Matrix & Ac, const std::string & msgTag) {
    std::stringstream ss(std::stringstream::out);

    // TODO: provide an option to skip this (to avoid global communication)
    // TODO: skip if nproc == 1

    //nonzero imbalance
    size_t numMyNnz  = Ac.getNodeNumEntries();
    GO maxNnz, minNnz;
    RCP<const Teuchos::Comm<int> > comm = Ac.getRowMap()->getComm();
    maxAll(comm,(GO)numMyNnz,maxNnz);
    //min nnz over all proc (disallow any processors with 0 nnz)
    minAll(comm, (GO)((numMyNnz > 0) ? numMyNnz : maxNnz), minNnz);
    double imbalance = ((double) maxNnz) / minNnz;

    size_t numMyRows = Ac.getNodeNumRows();
    //Check whether Ac is spread over more than one process.
    GO numActiveProcesses=0;
    sumAll(comm, (GO)((numMyRows > 0) ? 1 : 0), numActiveProcesses);

    //min, max, and avg # rows per proc
    GO minNumRows, maxNumRows;
    double avgNumRows;
    maxAll(comm, (GO)numMyRows, maxNumRows);
    minAll(comm, (GO)((numMyRows > 0) ? numMyRows : maxNumRows), minNumRows);
    assert(numActiveProcesses > 0);
    avgNumRows = ((double) Ac.getGlobalNumRows()) / numActiveProcesses;

    ss << msgTag << " # processes with rows = " << numActiveProcesses << std::endl;
    ss << msgTag << " min # rows per proc = " << minNumRows << ", max # rows per proc = " << maxNumRows << ", avg # rows per proc = " << avgNumRows << std::endl;
    ss << msgTag << " nonzero imbalance = " << imbalance << std::endl;

    return ss.str();
  }
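A hedged usage sketch (the call site below is an assumption, not part of the snippet): the returned summary string is meant to be streamed to the factory's log, for example:

    // Hypothetical call site: print the load-balancing summary for Ac.
    GetOStream(Statistics1, 0) << PrintLoadBalancingInfo(Ac, "MHDRAP: ");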
Example No. 3
  void LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::RootCandidates(my_size_t nVertices,
                      ArrayView<const LO> &vertex2AggId, GraphBase const &graph,
                      ArrayRCP<LO> &candidates, my_size_t &nCandidates, global_size_t &nCandidatesGlobal) const
  {
    nCandidates = 0;

    for (my_size_t i = 0; i < nVertices; i++ ) {
      if (vertex2AggId[i] == MUELU_UNAGGREGATED) {
        bool noAggdNeighbors = true;

        // neighOfINode is the neighbor node list of node 'iNode'.
        ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

        for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
          int adjacent    = *it;
          if (vertex2AggId[adjacent] != MUELU_UNAGGREGATED)
            noAggdNeighbors = false;
        }
        if (noAggdNeighbors == true) candidates[nCandidates++] = i;
      }
    }

    sumAll(graph.GetComm(), (GO)nCandidates, nCandidatesGlobal);

  } //RootCandidates
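Note that the neighbor scan keeps iterating even after noAggdNeighbors has turned false; since only the final boolean is used, breaking out of the inner loop at the first aggregated neighbor would be a safe micro-optimization.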
Example No. 4
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::CheckMainDiagonal(RCP<Matrix> & Ac) const {
    // plausibility check: no zeros on diagonal
    RCP<Vector> diagVec = VectorFactory::Build(Ac->getRowMap());
    Ac->getLocalDiagCopy(*diagVec);

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    LO lZeroDiags = 0;
    Teuchos::ArrayRCP< Scalar > diagVal = diagVec->getDataNonConst(0);
    for (size_t r = 0; r < Ac->getRowMap()->getNodeNumElements(); r++) {
      if (diagVal[r] == zero) {
        lZeroDiags++;

        if (repairZeroDiagonals_) {
          GO grid = Ac->getRowMap()->getGlobalElement(r);
          LO lcid = Ac->getColMap()->getLocalElement(grid);
          Teuchos::ArrayRCP<LO> indout(1, lcid);
          Teuchos::ArrayRCP<SC> valout(1, one);

          Ac->insertLocalValues(r, indout.view(0, indout.size()), valout.view(0, valout.size()));
        }
      }
    }

    if (IsPrint(Warnings0)) {
      const RCP<const Teuchos::Comm<int> > & comm = Ac->getRowMap()->getComm();
      GO lZeroDiagsGO = Teuchos::as<GO>(lZeroDiags); /* LO->GO conversion */
      GO gZeroDiags   = 0;
      sumAll(comm, lZeroDiagsGO, gZeroDiags);
      if (repairZeroDiagonals_) GetOStream(Warnings0,0) << "RAPFactory (WARNING): repaired " << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;
      else                      GetOStream(Warnings0,0) << "RAPFactory (WARNING): found "    << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;
    }
  }
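Note that insertLocalValues can only add entries while the matrix still accepts structural changes; for Tpetra/Xpetra-backed matrices this repair is therefore assumed to run before Ac is fillComplete'd.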
Example No. 5
// Returns the roll's value for a given scoring choice
const int DiceRoll::rollValue(const ChoiceAction choice) const
{
	switch(choice) {
	case ACES:
		return sumIfEqual(1);
	case TWOS:
		return sumIfEqual(2);
	case THREES:
		return sumIfEqual(3);
	case FOURS:
		return sumIfEqual(4);
	case FIVES:
		return sumIfEqual(5);
	case SIXES:
		return sumIfEqual(6);
	case THREE_OF_A_KIND:
		if (isThreeOfAKind())
			return sumAll();
		return 0;
	case FOUR_OF_A_KIND:
		if (isFourOfAKind())
			return sumAll();
		return 0;
	case FULL_HOUSE:
		if (isFullHouse())
			return 25;
		return 0;
	case SMALL_STRAIGHT:
		if (isSmallStraight())
			return 30;
		return 0;
	case LARGE_STRAIGHT:
		if (isLargeStraight())
			return 40;
		return 0;
	case YAHTZEE:
		if (isYahtzee())
			return 50;
		return 0;
	case CHANCE:
		return sumAll();
	case ROLL:
		assert(false); // Unscorable!
	}
	return 0; // not reachable for scorable choices; avoids UB if assertions are disabled
}
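A minimal sketch of the two helpers this switch relies on, assuming DiceRoll keeps its five face values in an int dice_[5] member (a hypothetical layout; the class definition is not shown):

int DiceRoll::sumIfEqual(const int target) const
{
	int sum = 0;
	for (int i = 0; i < 5; ++i)
		if (dice_[i] == target) // hypothetical dice_ member
			sum += dice_[i];
	return sum;
}

int DiceRoll::sumAll() const
{
	int sum = 0;
	for (int i = 0; i < 5; ++i)
		sum += dice_[i]; // hypothetical dice_ member
	return sum;
}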
Example No. 6
Mark* scoreChance(Dice *dice[])
{
    Mark *temp = score4K(dice), 
         *mark = (Mark *)malloc(sizeof(Mark));

    mark->value  = sumAll(dice);
    mark->vol    = (temp->value > 0) ? 0 : 1;
    free(temp);

    return mark;
}
Example No. 7
#include <stdio.h>

/* minValue, productOfAll, maxValue, minIndex, sumAll and maxIndex are assumed
   to be declared in an accompanying header. */
int main(void)
{
	int j[] = {4,10,19,42,15,53,36}, capacity = 7;

	printf("The smallest value in the vector is: %d\n", minValue(j, capacity));
	printf("The product of all the values in the vector is: %d\n", productOfAll(j, capacity));
	printf("The maximum value in the vector is: %d\n", maxValue(j, capacity));
	printf("The index of the minimum value of the vector is: %d\n", minIndex(j, capacity));
	printf("The sum of the vector's values is: %d\n", sumAll(j, capacity));
	printf("The index of the maximum value is: %d\n", maxIndex(j, capacity));

	return 0;
}
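A minimal sketch of the array-based sumAll this snippet (and Example No. 13) calls, with the signature inferred from the call sites:

/* Hypothetical implementation: sum the first 'capacity' elements of v. */
int sumAll(int v[], int capacity)
{
	int i, sum = 0;
	for (i = 0; i < capacity; i++)
		sum += v[i];
	return sum;
}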
Example No. 8
 GlobalOrdinal Aggregates<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::GetNumGlobalAggregates() const {
   LO nAggregates = GetNumAggregates();
   GO nGlobalAggregates; sumAll(vertex2AggId_->getMap()->getComm(), (GO)nAggregates, nGlobalAggregates);
   return nGlobalAggregates;
 }
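In the MueLu snippets, sumAll (like minAll and maxAll) is a convenience wrapper over Teuchos' all-reduce. A minimal sketch of the sum case, assuming it simply forwards to Teuchos::reduceAll (the actual macro definition is not part of these snippets):

// Hypothetical equivalent of the sumAll(comm, in, out) macro used above;
// Teuchos::reduceAll comes from <Teuchos_CommHelpers.hpp>.
template <class T>
void sumAllSketch(const Teuchos::RCP<const Teuchos::Comm<int> > &comm, T in, T &out) {
  Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, in, Teuchos::outArg(out));
}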
Example No. 9
  void Newton::solveNonLinear() {
    casadi_log("Newton::solveNonLinear:begin");

    // Set up timers for profiling
    double time_zero=0;
    double time_start=0;
    double time_stop=0;
    if (CasadiOptions::profiling && !CasadiOptions::profilingBinary) {
      time_zero = getRealTime();
      CasadiOptions::profilingLog  << "start " << this << ":" <<getOption("name") << std::endl;
    }

    // Pass the inputs to J
    for (int i=0; i<getNumInputs(); ++i) {
      if (i!=iin_) jac_.setInput(input(i), i);
    }

    // Aliases
    DMatrix &u = output(iout_);
    DMatrix &J = jac_.output(0);
    DMatrix &F = jac_.output(1+iout_);

    // Perform the Newton iterations
    int iter=0;

    bool success = true;

    while (true) {
      // Break if maximum number of iterations already reached
      if (iter >= max_iter_) {
        log("evaluate", "Max. iterations reached.");
        stats_["return_status"] = "max_iteration_reached";
        success = false;
        break;
      }

      // Start a new iteration
      iter++;

      // Print progress
      if (monitored("step") || monitored("stepsize")) {
        std::cout << "Step " << iter << "." << std::endl;
      }

      if (monitored("step")) {
        std::cout << "  u = " << u << std::endl;
      }

      // Use u to evaluate J
      jac_.setInput(u, iin_);
      for (int i=0; i<getNumInputs(); ++i)
        if (i!=iin_) jac_.setInput(input(i), i);

      if (CasadiOptions::profiling) {
        time_start = getRealTime(); // Start timer
      }

      jac_.evaluate();

      // Write out profiling information
      if (CasadiOptions::profiling && !CasadiOptions::profilingBinary) {
        time_stop = getRealTime(); // Stop timer
        CasadiOptions::profilingLog
            << (time_stop-time_start)*1e6 << " us | "
            << (time_stop-time_zero)*1e3 << " ms | "
            << this << ":" << getOption("name") << ":0|" << jac_.get() << ":"
            << jac_.getOption("name") << "|evaluate jacobian" << std::endl;
      }

      if (monitored("F")) std::cout << "  F = " << F << std::endl;
      if (monitored("normF"))
        std::cout << "  F (min, max, 1-norm, 2-norm) = "
                  << (*std::min_element(F.data().begin(), F.data().end()))
                  << ", " << (*std::max_element(F.data().begin(), F.data().end()))
                  << ", " << sumAll(fabs(F)) << ", " << sqrt(sumAll(F*F)) << std::endl;
      if (monitored("J")) std::cout << "  J = " << J << std::endl;

      double abstol = 0;
      if (std::numeric_limits<double>::infinity() != abstol_) {
        abstol = std::max((*std::max_element(F.data().begin(), F.data().end())),
                          -(*std::min_element(F.data().begin(), F.data().end())));
        if (abstol <= abstol_) {
          casadi_log("Converged to acceptable tolerance - abstol: " << abstol_);
          break;
        }
      }

      // Prepare the linear solver with J
      linsol_.setInput(J, LINSOL_A);

      if (CasadiOptions::profiling) {
        time_start = getRealTime(); // Start timer
      }
      linsol_.prepare();
      // Write out profiling information
      if (CasadiOptions::profiling && !CasadiOptions::profilingBinary) {
        time_stop = getRealTime(); // Stop timer
        CasadiOptions::profilingLog
            << (time_stop-time_start)*1e6 << " us | "
            << (time_stop-time_zero)*1e3 << " ms | "
            << this << ":" << getOption("name")
            << ":1||prepare linear system" << std::endl;
      }

      if (CasadiOptions::profiling) {
        time_start = getRealTime(); // Start timer
      }
      // Solve against F
      linsol_.solve(&F.front(), 1, false);
      if (CasadiOptions::profiling && !CasadiOptions::profilingBinary) {
        time_stop = getRealTime(); // Stop timer
        CasadiOptions::profilingLog
            << (time_stop-time_start)*1e6 << " us | "
            << (time_stop-time_zero)*1e3 << " ms | "
            << this << ":" << getOption("name") << ":2||solve linear system" << std::endl;
      }

      if (monitored("step")) {
        std::cout << "  step = " << F << std::endl;
      }

      double abstolStep=0;
      if (std::numeric_limits<double>::infinity() != abstolStep_) {
        abstolStep = std::max((*std::max_element(F.data().begin(), F.data().end())),
                              -(*std::min_element(F.data().begin(), F.data().end())));
        if (monitored("stepsize")) {
          std::cout << "  stepsize = " << abstolStep << std::endl;
        }
        if (abstolStep <= abstolStep_) {
          casadi_log("Converged to acceptable tolerance - abstolStep: " << abstolStep_);
          break;
        }
      }

      if (print_iteration_) {
        // Only print iteration header once in a while
        if (iter % 10==0) {
          printIteration(std::cout);
        }

        // Print iteration information
        printIteration(std::cout, iter, abstol, abstolStep);
      }

      // Update Xk+1 = Xk - J^(-1) F
      std::transform(u.begin(), u.end(), F.begin(), u.begin(), std::minus<double>());

      // Get auxiliary outputs
      for (int i=0; i<getNumOutputs(); ++i) {
        if (i!=iout_) jac_.getOutput(output(i), 1+i);
      }
    }

    // Store the iteration count
    if (gather_stats_) stats_["iter"] = iter;

    if (success) stats_["return_status"] = "success";

    // Factorization up-to-date
    fact_up_to_date_ = true;

    casadi_log("Newton::solveNonLinear():end after " << iter << " steps");
  }
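The step commented "Update Xk+1 = Xk - J^(-1) F" is the classical Newton update: linsol_.solve overwrites F in place with J^(-1) F, and the std::transform then computes u_{k+1} = u_k - J(u_k)^(-1) F(u_k) componentwise.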
Example No. 10
  void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    ParameterList pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    if (pL.get<std::string>("aggregation: mode") == "old") {
      if (pL.get<bool>("UseOnePtAggregationAlgorithm")             == true)   algos_.push_back(rcp(new OnePtAggregationAlgorithm             (graphFact)));
      if (pL.get<bool>("UsePreserveDirichletAggregationAlgorithm") == true)   algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
      if (pL.get<bool>("UseUncoupledAggregationAlgorithm")         == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm            (graphFact)));
      if (pL.get<bool>("UseMaxLinkAggregationAlgorithm")           == true)   algos_.push_back(rcp(new MaxLinkAggregationAlgorithm           (graphFact)));
      if (pL.get<bool>("UseEmergencyAggregationAlgorithm")         == true)   algos_.push_back(rcp(new EmergencyAggregationAlgorithm         (graphFact)));
                                                                              algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));

    } else {
      if (pL.get<bool>("aggregation: preserve Dirichlet points")   == true)   algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 1" )             == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm            (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 2a")             == true)   algos_.push_back(rcp(new AggregationPhase2aAlgorithm           (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 2b")             == true)   algos_.push_back(rcp(new AggregationPhase2bAlgorithm           (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 3" )             == true)   algos_.push_back(rcp(new AggregationPhase3Algorithm            (graphFact)));
                                                                              algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));
    }

    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name");
    RCP<const Map> OnePtMap;
    if (mapOnePtName.length()) {
      RCP<const FactoryBase> mapOnePtFact = GetFactory("OnePt aggregate map factory");
      OnePtMap = currentLevel.Get<RCP<const Map> >(mapOnePtName, mapOnePtFact.get());
    }

    RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
    aggregates->setObjectLabel("UC");

    const LO numRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(numRows, READY);

    ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    if (dirichletBoundaryMap != Teuchos::null)
      for (LO i = 0; i < numRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = BOUNDARY;

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (OnePtMap != Teuchos::null) {
      for (LO i = 0; i < numRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

        for (LO kr = 0; kr < nDofsPerNode; kr++)
          if (OnePtMap->isNodeGlobalElement(grid + kr))
            aggStat[i] = ONEPT;
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      sumAll(comm, as<GO>(numRows), numGlobalRows);

    LO numNonAggregatedNodes = numRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);

      if (IsPrint(Statistics1)) {

        GO numLocalAggregated = numRows - numNonAggregatedNodes, numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(),  numGlobalAggs = 0;
        sumAll(comm, numLocalAggregated, numGlobalAggregated);
        sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        if (aggPercent > 99.99 && aggPercent < 100.00) {
          // Due to round off (for instance, for 140465733/140466897), we could
          // get 100.00% display even if there are some remaining nodes. This
          // is bad from the users point of view. It is much better to change
          // it to display 99.99%.
          aggPercent = 99.99;
        }
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
Example No. 11
  void LocalAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::CoarsenUncoupled(GraphBase const & graph, Aggregates & aggregates) const {
    Monitor m(*this, "Coarsen Uncoupled");

    std::string orderingType;
    switch(ordering_) {
      case NATURAL:
        orderingType="Natural";
        break;
      case RANDOM:
        orderingType="Random";
        break;
      case GRAPH:
        orderingType="Graph";
        break;
      default:
        break;
    }
    GetOStream(Runtime1) << "Ordering:                  " << orderingType << std::endl;
    GetOStream(Runtime1) << "Min nodes per aggregate:   " << minNodesPerAggregate_ << std::endl;
    GetOStream(Runtime1) << "Max nbrs already selected: " << maxNeighAlreadySelected_ << std::endl;

    /* Create Aggregation object */
    my_size_t nAggregates = 0;

    /* ============================================================= */
    /* aggStat indicates whether this node has been aggregated, and  */
    /* vertex2AggId stores the aggregate number where this node has  */
    /* been aggregated into.                                         */
    /* ============================================================= */

    Teuchos::ArrayRCP<NodeState> aggStat;
    const my_size_t nRows = graph.GetNodeNumVertices();
    if (nRows > 0) aggStat = Teuchos::arcp<NodeState>(nRows);
    for ( my_size_t i = 0; i < nRows; ++i ) aggStat[i] = READY;

    /* ============================================================= */
    /* Phase 1  :                                                    */
    /*    for all nodes, form a new aggregate with its neighbors     */
    /*    if the number of its neighbors having been aggregated does */
    /*    not exceed a given threshold                               */
    /*    (GetMaxNeighAlreadySelected() = 0 ===> Vanek's scheme) */
    /* ============================================================= */

    /* some general variable declarations */
    Teuchos::ArrayRCP<LO> randomVector;
    RCP<MueLu::LinkedList> nodeList; /* list storing the next node to pick as a root point for ordering_ == GRAPH */
    MueLu_SuperNode  *aggHead=NULL, *aggCurrent=NULL, *supernode=NULL;
    /**/

    if ( ordering_ == RANDOM )       /* random ordering */
      {
        //TODO: could be stored in a class that respect interface of LinkedList

        randomVector = Teuchos::arcp<LO>(nRows); //size_t or int ?-> to be propagated
        for (my_size_t i = 0; i < nRows; ++i) randomVector[i] = i;
        RandomReorder(randomVector);
      }
    else if ( ordering_ == GRAPH )  /* graph ordering */
      {
        nodeList = rcp(new MueLu::LinkedList());
        nodeList->Add(0);
      }

    /* main loop */
    {
      LO iNode  = 0;
      LO iNode2 = 0;

      Teuchos::ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); // output only: contents ignored

      while (iNode2 < nRows)
        {

          /*------------------------------------------------------ */
          /* pick the next node to aggregate                       */
          /*------------------------------------------------------ */

          if      ( ordering_ == NATURAL ) iNode = iNode2++;
          else if ( ordering_ == RANDOM )  iNode = randomVector[iNode2++];
          else if ( ordering_ == GRAPH )
            {
              if ( nodeList->IsEmpty() )
                {
                  for ( int jNode = 0; jNode < nRows; ++jNode )
                    {
                      if ( aggStat[jNode] == READY )
                        {
                          nodeList->Add(jNode); //TODO optim: not necessary to create a node. Can just set iNode value and skip the end
                          break;
                        }
                    }
                }
              if ( nodeList->IsEmpty() ) break; /* end of the while loop */ //TODO: coding style :(

              iNode = nodeList->Pop();
            }
          else {
            throw(Exceptions::RuntimeError("CoarsenUncoupled: bad aggregation ordering option"));
          }

          /*------------------------------------------------------ */
          /* consider further only if the node is in READY mode    */
          /*------------------------------------------------------ */

          if ( aggStat[iNode] == READY )
            {
              // neighOfINode is the neighbor node list of node 'iNode'.
              Teuchos::ArrayView<const LO> neighOfINode = graph.getNeighborVertices(iNode);
              typename Teuchos::ArrayView<const LO>::size_type length = neighOfINode.size();

              supernode = new MueLu_SuperNode;
              try {
                supernode->list = Teuchos::arcp<int>(length+1);
              } catch (std::bad_alloc&) {
                TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu::LocalAggregationAlgorithm::CoarsenUncoupled(): Error: couldn't allocate memory for supernode! length=" + Teuchos::toString(length));
              }

              supernode->maxLength = length;
              supernode->length = 1;
              supernode->list[0] = iNode;

              int selectFlag = 1;
              {
                /*--------------------------------------------------- */
                /* count the no. of neighbors having been aggregated  */
                /*--------------------------------------------------- */

                int count = 0;
                for (typename Teuchos::ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it)
                  {
                    int index = *it;
                    if ( index < nRows )
                      {
                        if ( aggStat[index] == READY ||
                             aggStat[index] == NOTSEL )
                          supernode->list[supernode->length++] = index;
                        else count++;

                      }
                  }

                /*--------------------------------------------------- */
                /* if there are too many neighbors aggregated or the  */
                /* number of nodes in the new aggregate is too few,   */
                /* don't do this one                                  */
                /*--------------------------------------------------- */

                if ( count > GetMaxNeighAlreadySelected() ) selectFlag = 0;
              }

              // Note: the supernode length is actually 1 more than the
              //       number of nodes in the candidate aggregate. The
              //       root is counted twice. I'm not sure if this is
              //       a bug or a feature ... so I'll leave it and change
              //       < to <= in the if just below.

              if (selectFlag != 1 ||
                  supernode->length <= GetMinNodesPerAggregate())
                {
                  aggStat[iNode] = NOTSEL;
                  delete supernode;
                  if ( ordering_ == GRAPH ) /* if graph ordering */
                    {
                      for (typename Teuchos::ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it)
                        {
                          int index = *it;
                          if  ( index < nRows && aggStat[index] == READY )
                            {
                              nodeList->Add(index);
                            }
                        }
                    }
                }
              else
                {
                  aggregates.SetIsRoot(iNode);
                  for ( int j = 0; j < supernode->length; ++j )
                    {
                      int jNode = supernode->list[j];
                      aggStat[jNode] = SELECTED;
                      vertex2AggId[jNode] = nAggregates;
                      if ( ordering_ == GRAPH ) /* if graph ordering */
                        {

                          Teuchos::ArrayView<const LO> neighOfJNode = graph.getNeighborVertices(jNode);

                          for (typename Teuchos::ArrayView<const LO>::const_iterator it = neighOfJNode.begin(); it != neighOfJNode.end(); ++it)
                            {
                              int index = *it;
                              if ( index < nRows && aggStat[index] == READY )
                                {
                                  nodeList->Add(index);
                                }
                            }
                        }
                    }
                  supernode->next = NULL;
                  supernode->index = nAggregates;
                  if ( nAggregates == 0 )
                    {
                      aggHead = supernode;
                      aggCurrent = supernode;
                    }
                  else
                    {
                      aggCurrent->next = supernode;
                      aggCurrent = supernode;
                    }
                  nAggregates++;
                  // unused aggCntArray[nAggregates] = supernode->length;
                }
            }
        } // end of 'while'

      // views on distributed vectors are freed here.

    } // end of 'main loop'

    nodeList = Teuchos::null;

    /* Update aggregate object */
    aggregates.SetNumAggregates(nAggregates);

    /* Verbose */
    {
      const RCP<const Teuchos::Comm<int> > & comm = graph.GetComm();

      if (IsPrint(Warnings0)) {
        GO localReady=0, globalReady;

        // Compute 'localReady'
        for ( my_size_t i = 0; i < nRows; ++i )
          if (aggStat[i] == READY) localReady++;

        // Compute 'globalReady'
        sumAll(comm, localReady, globalReady);

        if(globalReady > 0)
          GetOStream(Warnings0) << "Warning: " << globalReady << " READY nodes left" << std::endl;
      }

      if (IsPrint(Statistics1)) {
        // Compute 'localSelected'
        LO localSelected=0;
        for ( my_size_t i = 0; i < nRows; ++i )
          if ( aggStat[i] == SELECTED ) localSelected++;

        // Compute 'globalSelected'
        GO globalSelected; sumAll(comm, (GO)localSelected, globalSelected);

        // Compute 'globalNRows'
        GO globalNRows; sumAll(comm, (GO)nRows, globalNRows);

        GetOStream(Statistics1) << "Nodes aggregated = " << globalSelected << " (" << globalNRows << ")" << std::endl;
      }

      if (IsPrint(Statistics1)) {
        GO nAggregatesGlobal; sumAll(comm, (GO)nAggregates, nAggregatesGlobal);
        GetOStream(Statistics1) << "Total aggregates = " << nAggregatesGlobal << std::endl;
      }

    } // verbose

    /* ------------------------------------------------------------- */
    /* clean up                                                       */
    /* ------------------------------------------------------------- */

    aggCurrent = aggHead;
    while ( aggCurrent != NULL )
      {
        supernode = aggCurrent;
        aggCurrent = aggCurrent->next;
        delete supernode;
      }

  } // CoarsenUncoupled
Example No. 12
  void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const ParameterList& pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    bool bUseOnePtAggregationAlgorithm             = pL.get<bool>("UseOnePtAggregationAlgorithm");
    bool bUseSmallAggregationAlgorithm             = pL.get<bool>("UseSmallAggregatesAggregationAlgorithm");
    bool bUsePreserveDirichletAggregationAlgorithm = pL.get<bool>("UsePreserveDirichletAggregationAlgorithm");
    bool bUseUncoupledAggregationAlgorithm         = pL.get<bool>("UseUncoupledAggregationAlgorithm");
    bool bUseMaxLinkAggregationAlgorithm           = pL.get<bool>("UseMaxLinkAggregationAlgorithm");
    bool bUseIsolatedNodeAggregationAlgorithm      = pL.get<bool>("UseIsolatedNodeAggregationAlgorithm");
    bool bUseEmergencyAggregationAlgorithm         = pL.get<bool>("UseEmergencyAggregationAlgorithm");

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    if (bUseOnePtAggregationAlgorithm)             algos_.push_back(rcp(new OnePtAggregationAlgorithm             (graphFact)));
    if (bUseSmallAggregationAlgorithm)             algos_.push_back(rcp(new SmallAggregationAlgorithm             (graphFact)));
    if (bUseUncoupledAggregationAlgorithm)         algos_.push_back(rcp(new UncoupledAggregationAlgorithm         (graphFact)));
    if (bUseMaxLinkAggregationAlgorithm)           algos_.push_back(rcp(new MaxLinkAggregationAlgorithm           (graphFact)));
    if (bUsePreserveDirichletAggregationAlgorithm) algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
    if (bUseIsolatedNodeAggregationAlgorithm)      algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));
    if (bUseEmergencyAggregationAlgorithm)         algos_.push_back(rcp(new EmergencyAggregationAlgorithm         (graphFact)));


    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name"), mapSmallAggName = pL.get<std::string>("SmallAgg aggregate map name");
    RCP<const Map> OnePtMap, SmallAggMap;
    if (mapOnePtName.length()) {
      RCP<const FactoryBase> mapOnePtFact = GetFactory("OnePt aggregate map factory");
      OnePtMap = currentLevel.Get<RCP<const Map> >(mapOnePtName, mapOnePtFact.get());
    }
    if (mapSmallAggName.length()) {
      RCP<const FactoryBase> mapSmallAggFact = GetFactory("SmallAgg aggregate map factory");
      SmallAggMap = currentLevel.Get<RCP<const Map> >(mapSmallAggName, mapSmallAggFact.get());
    }

    RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
    aggregates->setObjectLabel("UC");

    const LO nRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(nRows, NodeStats::READY);

    ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    if (dirichletBoundaryMap != Teuchos::null) {
      for (LO i = 0; i < nRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = NodeStats::BOUNDARY;
    }

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (SmallAggMap != Teuchos::null || OnePtMap != Teuchos::null) {
      for (LO i = 0; i < nRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

        if (SmallAggMap != Teuchos::null) {
          for (LO kr = 0; kr < nDofsPerNode; kr++) {
            if (SmallAggMap->isNodeGlobalElement(grid + kr))
              aggStat[i] = MueLu::NodeStats::SMALLAGG;
          }
        }

        if (OnePtMap != Teuchos::null) {
          for (LO kr = 0; kr < nDofsPerNode; kr++) {
            if (OnePtMap->isNodeGlobalElement(grid + kr))
              aggStat[i] = MueLu::NodeStats::ONEPT;
          }
        }
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      sumAll(comm, as<GO>(nRows), numGlobalRows);

    LO numNonAggregatedNodes = nRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);

      if (IsPrint(Statistics1)) {

        GO numLocalAggregated = nRows - numNonAggregatedNodes,  numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(), numGlobalAggs = 0;
        sumAll(comm, numLocalAggregated, numGlobalAggregated);
        sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
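Examples No. 10 and No. 12 are two revisions of the same UncoupledAggregationFactory::Build method: the former selects the algorithm chain via the "aggregation: mode" parameter, while the latter reads an individual boolean flag per algorithm.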
Example No. 13
#include <stdio.h>

/* sumAll is assumed to be declared in an accompanying header
   (see the sketch after Example No. 7). */
int main(int argc, char **argv) {
  int numbers[5] = {3, 4, 1, 7, 4};
  int sum = sumAll(numbers, 5);
  printf("sum is: %d\n", sum);
  return 0;
}
Example No. 14
Int_t main(Int_t argc, Char_t *argv[])
{
   ROOT::Mpi::TEnvironment env(argc, argv);
   ROOT::Mpi::TIntraCommunicator world;
   TVectorT<Double_t> mResult;
   Double_t fScalarResult;
   TVectorT<Double_t> v1(elements);
   TVectorT<Double_t> v2(elements);

   for (Int_t i = 0; i < elements; i++) {
      v1[i] = i + (i + world.Size());
      v2[i] = i * (i + world.Size());

   }

///////////////////////////////////////////////
//Testing methods with results in single Rank//
///////////////////////////////////////////////
   ROOT::Mpi::Math::TVectorTWrapper<Double_t> add(v1);
   add.Addition(v2, root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> sub(v1);
   sub.Subtraction(v2, root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> dot(v1);
   dot.Dot(v2, root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm2Sqr(v1);
   norm2Sqr.Norm2Sqr(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm1(v1);
   norm1.Norm1(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> min(v1);
   min.Min(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> max(v1);
   max.Max(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> normalize(v1);
   normalize.Normalize(root);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> sum(v1);
   sum.Sum(root);

   if (world.Rank() == root) {
      add.GetResult(mResult);
      MpiCompareTVectorTest(mResult, v1 + v2, world.Rank(), "Vector Addition Single");

      sub.GetResult(mResult);
      MpiCompareTVectorTest(mResult, v1 - v2, world.Rank(), "Vector Subtraction Single");

      dot.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, Dot(v1, v2) , world.Rank(), "Vector Dot Product Single");

      norm2Sqr.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Norm2Sqr() , world.Rank(), "Vector Norm2Sqr Single");

      norm1.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Norm1() , world.Rank(), "Vector Norm1 Single");

      min.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Min() , world.Rank(), "Vector Min Single");

      max.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Max() , world.Rank(), "Vector Max Single");

      normalize.GetResult(mResult);
      MpiCompareTest(mResult.Norm2Sqr(), ((1 / TMath::Sqrt(v1.Norm2Sqr()))*v1).Norm2Sqr() , world.Rank(), "Vector Normalize Single");

      sum.GetResult(fScalarResult);
      MpiCompareTest(fScalarResult, v1.Sum(), world.Rank(), "Vector Sum Single");
   }

///////////////////////////////////////////////
//Testing methods with results in all ranks  //
///////////////////////////////////////////////

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> addAll(v1);
   addAll.Addition(v2);

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> subAll(v1);
   subAll.Subtraction(v2);

   addAll.GetResult(mResult);
   MpiCompareTVectorTest(mResult, v1 + v2, world.Rank(), "Vector Addition All");

   subAll.GetResult(mResult);
   MpiCompareTVectorTest(mResult, v1 - v2, world.Rank(), "Vector Subtraction All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> dotAll(v1);
   dotAll.Dot(v2);
   dotAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, Dot(v1, v2) , world.Rank(), "Vector Dot Product All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm2SqrAll(v2);
   norm2SqrAll.Norm2Sqr();
   norm2SqrAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Norm2Sqr() , world.Rank(), "Vector Norm2Sqr All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> norm1All(v2);
   norm1All.Norm1();
   norm1All.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Norm1() , world.Rank(), "Vector Norm1 All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> minAll(v2);
   minAll.Min();
   minAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Min() , world.Rank(), "Vector Min All");


   ROOT::Mpi::Math::TVectorTWrapper<Double_t> maxAll(v2);
   maxAll.Max();
   maxAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Max() , world.Rank(), "Vector Max All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> normalizeAll(v2);
   normalizeAll.Normalize();
   normalizeAll.GetResult(mResult);
   //if the vector is normalized, the Norm2Sqr of the result should be near 1
   MpiCompareTest(mResult.Norm2Sqr(), ((1 / TMath::Sqrt(v2.Norm2Sqr()))*v2).Norm2Sqr() , world.Rank(), "Vector Normalize All");

   ROOT::Mpi::Math::TVectorTWrapper<Double_t> sumAll(v2);
   sumAll.Sum();
   sumAll.GetResult(fScalarResult);
   MpiCompareTest(fScalarResult, v2.Sum() , world.Rank(), "Vector Sum All");


   return 0;
}
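Note that root, elements, MpiCompareTest and MpiCompareTVectorTest are assumed to be defined elsewhere in this test's translation unit; they are not part of the snippet.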
Example No. 15
  void LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::AggregateLeftovers(GraphBase const &graph, Aggregates &aggregates) const {
    Monitor m(*this, "AggregateLeftovers");

    my_size_t nVertices = graph.GetNodeNumVertices();
    int exp_nRows    = aggregates.GetMap()->getNodeNumElements(); // Tentative fix... was previously exp_nRows = nVertices + graph.GetNodeNumGhost();
    int myPid        = graph.GetComm()->getRank();
    my_size_t nAggregates  = aggregates.GetNumAggregates();

    int minNodesPerAggregate = GetMinNodesPerAggregate();

    const RCP<const Map> nonUniqueMap = aggregates.GetMap(); //column map of underlying graph
    const RCP<const Map> uniqueMap    = graph.GetDomainMap();

    MueLu::CoupledAggregationCommHelper<LO,GO,NO,LMO> myWidget(uniqueMap, nonUniqueMap);

    //TODO JJH We want to skip this call
    RCP<Xpetra::Vector<double,LO,GO,NO> > distWeights = Xpetra::VectorFactory<double,LO,GO,NO>::Build(nonUniqueMap);

    // Aggregated vertices not "definitively" assigned to processors are
    // arbitrated by ArbitrateAndCommunicate(). There is some
    // additional logic to prevent losing root nodes in arbitration.
    {
      ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>    weights     = distWeights->getDataNonConst(0);

      for (size_t i=0;i<nonUniqueMap->getNodeNumElements();i++) {
        if (procWinner[i] == MUELU_UNASSIGNED) {
          if (vertex2AggId[i] != MUELU_UNAGGREGATED) {
            weights[i] = 1.;
            if (aggregates.IsRoot(i)) weights[i] = 2.;
          }
        }
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Tentatively assign any vertex (ghost or local) which neighbors a root
    // to the aggregate associated with the root.
    {
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>   weights      = distWeights->getDataNonConst(0);

      for (my_size_t i = 0; i < nVertices; i++) {
        if ( aggregates.IsRoot(i) && (procWinner[i] == myPid) ) {

          // neighOfINode is the neighbor node list of node 'i'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int colj = *it;
            if (vertex2AggId[colj] == MUELU_UNAGGREGATED) {
              weights[colj]= 1.;
              vertex2AggId[colj] = vertex2AggId[i];
            }
          }
        }
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Record the number of aggregated vertices
    GO total_phase_one_aggregated = 0;
    {
      ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);

      GO phase_one_aggregated = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] != MUELU_UNAGGREGATED)
          phase_one_aggregated++;
      }

      sumAll(graph.GetComm(), phase_one_aggregated, total_phase_one_aggregated);

      GO local_nVertices = nVertices, total_nVertices = 0;
      sumAll(graph.GetComm(), local_nVertices, total_nVertices);

      /* Among unaggregated points, see if we can make a reasonable size    */
      /* aggregate out of it. We do this by looking at neighbors and seeing */
      /* how many are unaggregated and on my processor. Loosely,            */
      /* base the number of new aggregates created on the percentage of     */
      /* unaggregated nodes.                                                */

      ArrayRCP<double>    weights      = distWeights->getDataNonConst(0);

      double factor = 1.;
      factor = ((double) total_phase_one_aggregated)/((double)(total_nVertices + 1));
      factor = pow(factor, GetPhase3AggCreation());

      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED)
          {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);
            int rowi_N = neighOfINode.size();

            int nonaggd_neighbors = 0;
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int colj = *it;
              if (vertex2AggId[colj] == MUELU_UNAGGREGATED && colj < nVertices)
                nonaggd_neighbors++;
            }
            if (  (nonaggd_neighbors > minNodesPerAggregate) &&
                  (((double) nonaggd_neighbors)/((double) rowi_N) > factor))
              {
                vertex2AggId[i] = (nAggregates)++;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int colj = *it;
                  if (vertex2AggId[colj]==MUELU_UNAGGREGATED) {
                    vertex2AggId[colj] = vertex2AggId[i];
                    if (colj < nVertices) weights[colj] = 2.;
                    else                  weights[colj] = 1.;
                  }
                }
                aggregates.SetIsRoot(i);
                weights[i] = 2.;
              }
          }
      } // for (i = 0; i < nVertices; i++)

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    //All tentatively assigned vertices are now definitive

    if (IsPrint(Statistics1)) {
      GO Nphase1_agg = nAggregates;
      GO total_aggs;

      sumAll(graph.GetComm(), Nphase1_agg, total_aggs);

      GetOStream(Statistics1, 0) << "Phase 1 - nodes aggregated = " << total_phase_one_aggregated << std::endl;
      GetOStream(Statistics1, 0) << "Phase 1 - total aggregates = " << total_aggs << std::endl;

      GO i = nAggregates - Nphase1_agg;
      { GO ii; sumAll(graph.GetComm(),i,ii); i = ii; }
      GetOStream(Statistics1, 0) << "Phase 3 - additional aggregates = " << i << std::endl;
    }

    // Determine vertices that are not shared by setting Temp to all ones
    // and doing NonUnique2NonUnique(..., ADD). This sums values of all
    // local copies associated with each Gid. Thus, sums > 1 are shared.

    //         std::cout << "exp_nrows=" << exp_nRows << " (nVertices= " << nVertices << ", numGhost=" << graph.GetNodeNumGhost() << ")" << std::endl;
    //         std::cout << "nonUniqueMap=" << nonUniqueMap->getNodeNumElements() << std::endl;

    RCP<Xpetra::Vector<double,LO,GO,NO> > temp_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap,false); //no need to zero out vector in ctor
    temp_->putScalar(1.);

    RCP<Xpetra::Vector<double,LO,GO,NO> > tempOutput_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap);

    myWidget.NonUnique2NonUnique(*temp_, *tempOutput_, Xpetra::ADD);

    std::vector<bool> gidNotShared(exp_nRows);
    {
      ArrayRCP<const double> tempOutput = tempOutput_->getData(0);
      for (int i = 0; i < exp_nRows; i++) {
        if (tempOutput[i] > 1.)
          gidNotShared[i] = false;
        else
          gidNotShared[i] = true;
      }
    }

    // Phase 4.
    double nAggregatesTarget;
    nAggregatesTarget = ((double)  uniqueMap->getGlobalNumElements())* (((double) uniqueMap->getGlobalNumElements())/ ((double) graph.GetGlobalNumEdges()));

    GO nAggregatesLocal=nAggregates, nAggregatesGlobal; sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

    LO minNAggs; minAll(graph.GetComm(), nAggregates, minNAggs);
    LO maxNAggs; maxAll(graph.GetComm(), nAggregates, maxNAggs);

    //
    // Only do this phase if things look really bad. THIS
    // CODE IS PRETTY EXPERIMENTAL
    //
#define MUELU_PHASE4BUCKETS 6
    if ((nAggregatesGlobal < graph.GetComm()->getSize()) &&
        (2.5*nAggregatesGlobal < nAggregatesTarget) &&
        (minNAggs ==0) && (maxNAggs <= 1)) {

      // Modify seed of the random algorithm used by temp_->randomize()
      {
        typedef Teuchos::ScalarTraits<double> scalarTrait; // temp_ is of type double.
        scalarTrait::seedrandom(static_cast<unsigned int>(myPid*2 + (int) (11*scalarTrait::random())));
        int k = (int)ceil( (10.*myPid)/graph.GetComm()->getSize());
        for (int i = 0; i < k+7; i++) scalarTrait::random();
        temp_->setSeed(static_cast<unsigned int>(scalarTrait::random()));
      }

      temp_->randomize();

      ArrayRCP<double> temp = temp_->getDataNonConst(0);

      // build a list of candidate root nodes (vertices not adjacent
      // to aggregated vertices)

      my_size_t nCandidates = 0;
      global_size_t nCandidatesGlobal;

      ArrayRCP<LO> candidates = Teuchos::arcp<LO>(nVertices+1);

      double priorThreshold = 0.;
      for (int kkk = 0; kkk < MUELU_PHASE4BUCKETS; kkk++) {

        {
          ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
          ArrayView<const LO> vertex2AggIdView = vertex2AggId();
          RootCandidates(nVertices, vertex2AggIdView, graph, candidates, nCandidates, nCandidatesGlobal);
          // views on distributed vectors are freed here.
        }

        double nTargetNewGuys =  nAggregatesTarget - nAggregatesGlobal;
        double threshold      =  priorThreshold + (1. - priorThreshold)*nTargetNewGuys/(nCandidatesGlobal + .001);

        threshold = (threshold*(kkk+1.))/((double) MUELU_PHASE4BUCKETS);
        priorThreshold = threshold;

        {
          ArrayRCP<LO>     vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
          ArrayRCP<double> weights      = distWeights->getDataNonConst(0);

          for (int k = 0; k < nCandidates; k++ ) {
            int i = candidates[k];
            if ((vertex2AggId[i] == MUELU_UNAGGREGATED) && (fabs(temp[i])  < threshold)) {
              // Note: priorThreshold <= fabs(temp[i]) <= 1

              // neighOfINode is the neighbor node list of node 'iNode'.
              ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

              if (neighOfINode.size() > minNodesPerAggregate) { //TODO: check if this test is exactly what we want to do
                int count = 0;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int Adjacent    = *it;
                  // This might not be true if someone close to i
                  // is chosen as a root via fabs(temp[]) < Threshold
                  if (vertex2AggId[Adjacent] == MUELU_UNAGGREGATED){
                    count++;
                    vertex2AggId[Adjacent] = nAggregates;
                    weights[Adjacent] = 1.;
                  }
                }
                if (count >= minNodesPerAggregate) {
                  vertex2AggId[i] = nAggregates++;
                  weights[i] = 2.;
                  aggregates.SetIsRoot(i);
                }
                else { // undo things
                  for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                    int Adjacent    = *it;
                    if (vertex2AggId[Adjacent] == nAggregates){
                      vertex2AggId[Adjacent] = MUELU_UNAGGREGATED;
                      weights[Adjacent] = 0.;
                    }
                  }
                }
              }
            }
          }
          // views on distributed vectors are freed here.
        }
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
        nAggregatesLocal=nAggregates;
        sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

        // check that there are no aggregates sizes below minNodesPerAggregate

        aggregates.SetNumAggregates(nAggregates);

        RemoveSmallAggs(aggregates, minNodesPerAggregate, distWeights, myWidget);

        nAggregates = aggregates.GetNumAggregates();
      }   // one possibility
    }

    // Initialize things for Phase 5. This includes building the transpose
    // of the matrix ONLY for transposed rows that correspond to unaggregated
    // ghost vertices. Further, the transpose is only a local transpose.
    // Nonzero edges which exist on other processors are not represented.


    int observedNAgg=-1; //number of aggregates that contain vertices on this process

    {
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      for(LO k = 0; k < vertex2AggId.size(); ++k )
        if(vertex2AggId[k]>observedNAgg)
          observedNAgg=vertex2AggId[k];
      observedNAgg++;
    }

    ArrayRCP<int> Mark = Teuchos::arcp<int>(exp_nRows+1);
    ArrayRCP<int> agg_incremented = Teuchos::arcp<int>(observedNAgg);
    ArrayRCP<int> SumOfMarks = Teuchos::arcp<int>(observedNAgg);

    for (int i = 0; i < exp_nRows; i++)   Mark[i] = MUELU_DISTONE_VERTEX_WEIGHT;
    for (int i = 0; i < agg_incremented.size(); i++) agg_incremented[i] = 0;
    for (int i = 0; i < SumOfMarks.size(); i++) SumOfMarks[i] = 0;

    // Grab the transpose matrix graph for unaggregated ghost vertices.
    //     a) count the number of nonzeros per row in the transpose
    std::vector<int> RowPtr(exp_nRows+1-nVertices);
    //{
    ArrayRCP<const LO> vertex2AggIdCst = aggregates.GetVertex2AggId()->getData(0);

    for (int i = nVertices; i < exp_nRows;  i++) RowPtr[i-nVertices] = 0;
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          RowPtr[j-nVertices]++;
        }
      }
    }

    //     b) Convert RowPtr[i] to point to the first nnz spot in row i.

    int iSum = 0, iTemp;
    for (int i = nVertices; i < exp_nRows;  i++) {
      iTemp = RowPtr[i-nVertices];
      RowPtr[i-nVertices] = iSum;
      iSum += iTemp;
    }
    RowPtr[exp_nRows-nVertices] = iSum;
    std::vector<LO> cols(iSum+1);

    //     c) Traverse matrix and insert entries in proper location.
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          cols[RowPtr[j-nVertices]++] = i;
        }
      }
    }

    //     d) RowPtr[i] points to beginning of row i+1 so shift by one location.
    for (int i = exp_nRows; i > nVertices;  i--)
      RowPtr[i-nVertices] = RowPtr[i-1-nVertices];
    RowPtr[0] = 0;

    // views on distributed vectors are freed here.
    vertex2AggIdCst = Teuchos::null;
    //}

    int bestScoreCutoff;
    int thresholds[10] = {300,200,100,50,25,13,7,4,2,0};

    // Stick unaggregated vertices into existing aggregates as described above.

    {
      int ncalls=0;

      for (int kk = 0; kk < 10; kk += 2) {
        bestScoreCutoff = thresholds[kk];

        ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
        ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);
        ArrayRCP<double> weights       = distWeights->getDataNonConst(0);

        for (int i = 0; i < exp_nRows; i++) {

          if (vertex2AggId[i] == MUELU_UNAGGREGATED) {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode;

            // Grab the neighboring vertices, which either live in the graph
            // (for local ids) or sit in the transposed fragment just constructed above (for ghosts).
            if (i < nVertices) {
              neighOfINode = graph.getNeighborVertices(i);
            }
            else {
              LO *rowi_col = NULL, rowi_N;
              rowi_col = &(cols[RowPtr[i-nVertices]]);
              rowi_N   = RowPtr[i+1-nVertices] - RowPtr[i-nVertices];

              neighOfINode = ArrayView<const LO>(rowi_col, rowi_N);
            }
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];

              //Adjacent is aggregated and either I own the aggregate
              // or I could own the aggregate after arbitration.
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED))){
                SumOfMarks[AdjacentAgg] += Mark[Adjacent];
              }
            }
            int best_score = MUELU_NOSCORE;
            int best_agg = -1;
            int BestMark = -1;
            bool cannotLoseAllFriends=false; // Used to address possible loss of vertices in arbitration of shared nodes discussed above. (Initialized to false only to avoid a compiler warning).

            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              //Adjacent is aggregated, its aggregate has accumulated some
              //marks, and no other processor has definitively claimed it
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  (SumOfMarks[AdjacentAgg] != 0) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED ))) {

                // first figure out the penalty associated with
                // AdjacentAgg having already been incremented
                // during this phase, then compute score.

                double penalty = (double) (INCR_SCALING*agg_incremented[AdjacentAgg]);
                if (penalty > MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]))
                  penalty = MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]);
                int score = SumOfMarks[AdjacentAgg]- ((int) floor(penalty));
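
                // Worked example (hypothetical numbers; the actual values of
                // INCR_SCALING and MUELU_PENALTYFACTOR are set elsewhere):
                // with INCR_SCALING = 3, MUELU_PENALTYFACTOR = 0.3,
                // SumOfMarks = 100 and 12 vertices already added to this
                // aggregate, the raw penalty 3*12 = 36 is capped at
                // 0.3*100 = 30, so score = 100 - 30 = 70.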

                if (score > best_score) {
                  best_agg             = AdjacentAgg;
                  best_score           = score;
                  BestMark             = Mark[Adjacent];
                  cannotLoseAllFriends = false;

                  // This addresses the issue mentioned above by checking whether
                  // Adjacent could be lost in arbitration. weight==0 means that
                  // Adjacent was not set during this loop of Phase 5 (and so it
                  // has already undergone arbitration). GidNotShared == true
                  // obviously implies that Adjacent cannot be lost to arbitration
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                }
                // Another vertex within current best aggregate found.
                // We should have (best_score == score). We need to see
                // if we can improve BestMark and cannotLoseAllFriends.
                else if (best_agg == AdjacentAgg) {
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                  if (Mark[Adjacent] > BestMark) BestMark = Mark[Adjacent];
                }
              }
            }
            // Clean up
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              if (AdjacentAgg >= 0) SumOfMarks[AdjacentAgg] = 0;
            }
            // Tentatively assign vertex to best_agg.
            if ( (best_score >= bestScoreCutoff) && (cannotLoseAllFriends)) {

              TEUCHOS_TEST_FOR_EXCEPTION(best_agg == -1 || BestMark == -1, MueLu::Exceptions::RuntimeError, "MueLu::CoupledAggregationFactory internal error"); // should never happen

              vertex2AggId[i] = best_agg;
              weights[i] = best_score;
              agg_incremented[best_agg]++;
              Mark[i] = (int) ceil(   ((double) BestMark)/2.);
            }
          }

          // views on distributed vectors are freed here.
        }

        vertex2AggId = Teuchos::null;
        procWinner   = Teuchos::null;
        weights      = Teuchos::null;

        ++ncalls;
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
      }

      //       if (graph.GetComm()->getRank()==0)
      //         std::cout << "#calls to Arb&Comm=" << ncalls << std::endl;
    }

    // Phase 6: Aggregate remaining unaggregated vertices and try at all costs
    //          to avoid small aggregates.
    //          One case where we can find ourselves in this situation
    //          is if all vertices vk adjacent to v have already been
    //          put in other processor's aggregates and v does not have
    //          a direct connection to a local vertex in any of these
    //          aggregates.

    int Nleftover = 0, Nsingle = 0;
    {

      ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<double> weights       = distWeights->getDataNonConst(0);
      ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);

      int count = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED) {
          Nleftover++;

          // neighOfINode is the neighbor node list of node 'iNode'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          // We don't want too small of an aggregate. So let's see if there is an
          // unaggregated neighbor that we can also put with this vertex.

          vertex2AggId[i] = nAggregates;
          weights[i] = 1.;
          if (count == 0) aggregates.SetIsRoot(i);
          count++;
          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int j = *it;
            if ((j != i)&&(vertex2AggId[j] == MUELU_UNAGGREGATED)&&
                (j < nVertices)) {
              vertex2AggId[j] = nAggregates;
              weights[j] = 1.;
              count++;
            }
          }
          if ( count >= minNodesPerAggregate) {
            nAggregates++;
            count = 0;
          }
        }
      }

      // If count != 0, the last aggregate created above is still under minNodesPerAggregate.
      if (count != 0) {
#ifdef FIXME
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            if ((vertex2AggId[i] == nAggregates) && (procWinner[i] == myPid)) {
              vertex2AggId[i] = 0;
              aggregates.SetIsRoot(i,false);
            }
          }
        }
        else {
          Nsingle++;
          nAggregates++;
        }
#else
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            // TW: This is not a real fix. This may produce ugly bad aggregates!
            // I removed the procWinner[i] == myPid check. It makes no sense to me since
            // it leaves vertex2AggId[i] == nAggregates -> crash in ComputeAggregateSizes().
            // Maybe it's better to add the leftovers to the last generated agg on the current proc.
            // The best solution would be to add them to the "next"/nearest aggregate, which may be
            // on another processor.
            if (vertex2AggId[i] == nAggregates) {
              vertex2AggId[i] = nAggregates-1; //0;
              aggregates.SetIsRoot(i,false);
            }
          }
        }
        else {
          Nsingle++;
          nAggregates++;
        }
#endif
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, false);

    if (IsPrint(Statistics1)) {
      GO total_Nsingle=0;   sumAll(graph.GetComm(), (GO)Nsingle,     total_Nsingle);
      GO total_Nleftover=0; sumAll(graph.GetComm(), (GO)Nleftover,   total_Nleftover);
      // GO total_aggs;        sumAll(graph.GetComm(), (GO)nAggregates, total_aggs);
      // GetOStream(Statistics1, 0) << "Phase 6 - total aggregates = " << total_aggs << std::endl;
      GetOStream(Statistics1, 0) << "Phase 6 - leftovers = " << total_Nleftover << " and singletons = " << total_Nsingle << std::endl;
    }

    aggregates.SetNumAggregates(nAggregates);

  } //AggregateLeftovers
Ejemplo n.º 16
0
  void AlgebraicPermutationStrategy<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildPermutation(const Teuchos::RCP<Matrix> & A, const Teuchos::RCP<const Map> permRowMap, Level & currentLevel, const FactoryBase* genFactory) const {
#ifndef HAVE_MUELU_INST_COMPLEX_INT_INT

  const Teuchos::RCP< const Teuchos::Comm< int > > comm = A->getRowMap()->getComm();
  int numProcs = comm->getSize();
  int myRank   = comm->getRank();

  /*if( permRowMap == Teuchos::null ) {
    permRowMap = A->getRowMap(); // use full row map of A
  }*/

  size_t nDofsPerNode = 1;
  if (A->IsView("stridedMaps")) {
    Teuchos::RCP<const Map> permRowMapStrided = A->getRowMap("stridedMaps");
    nDofsPerNode = Teuchos::rcp_dynamic_cast<const StridedMap>(permRowMapStrided)->getFixedBlockSize();
  }

  //GetOStream(Runtime0, 0) << "Perform generation of permutation operators on " << mapName_ << " map with " << permRowMap->getGlobalNumElements() << " elements" << std::endl;

  std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> > permutedDiagCandidates;
  std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> > keepDiagonalEntries;
  std::vector<Scalar> Weights;

  // loop over all local rows in matrix A and keep diagonal entries if corresponding
  // matrix rows are not contained in permRowMap
  for (size_t row = 0; row < A->getRowMap()->getNodeNumElements(); row++) {
    GlobalOrdinal grow = A->getRowMap()->getGlobalElement(row);

    if(permRowMap->isNodeGlobalElement(grow) == true) continue;

    size_t nnz = A->getNumEntriesInLocalRow(row);

    // extract local row information from matrix
    Teuchos::ArrayView<const LocalOrdinal> indices;
    Teuchos::ArrayView<const Scalar> vals;
    A->getLocalRowView(row, indices, vals);

    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(indices.size()) != nnz, Exceptions::RuntimeError, "MueLu::PermutationFactory::Build: number of nonzeros not equal to number of indices? Error.");

    // find column entry with max absolute value
    GlobalOrdinal gMaxValIdx = 0;
    Scalar norm1 = 0.0;
    Scalar maxVal = 0.0;
    for (size_t j = 0; j < Teuchos::as<size_t>(indices.size()); j++) {
      norm1 += std::abs(vals[j]);
      if(std::abs(vals[j]) > maxVal) {
        maxVal = std::abs(vals[j]);
        gMaxValIdx = A->getColMap()->getGlobalElement(indices[j]);
      }
    }

    if(grow == gMaxValIdx) // only keep the row/col pair if the row is diagonally dominant!!!
      keepDiagonalEntries.push_back(std::make_pair(grow,grow));
  }

  //////////
  // handle rows that are marked to be relevant for permutations
  for (size_t row = 0; row < permRowMap->getNodeNumElements(); row++) {
    GlobalOrdinal grow = permRowMap->getGlobalElement(row);
    LocalOrdinal lArow = A->getRowMap()->getLocalElement(grow);
    size_t nnz = A->getNumEntriesInLocalRow(lArow);

    // extract local row information from matrix
    Teuchos::ArrayView<const LocalOrdinal> indices;
    Teuchos::ArrayView<const Scalar> vals;
    A->getLocalRowView(lArow, indices, vals);

    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(indices.size()) != nnz, Exceptions::RuntimeError, "MueLu::PermutationFactory::Build: number of nonzeros not equal to number of indices? Error.");

    // find column entry with max absolute value
    GlobalOrdinal gMaxValIdx = 0;
    Scalar norm1 = 0.0;
    Scalar maxVal = 0.0;
    for (size_t j = 0; j < Teuchos::as<size_t>(indices.size()); j++) {
      norm1 += std::abs(vals[j]);
      if(std::abs(vals[j]) > maxVal) {
        maxVal = std::abs(vals[j]);
        gMaxValIdx = A->getColMap()->getGlobalElement(indices[j]);
      }
    }

    if(std::abs(maxVal) > 0.0) { // keep only rows whose max entry != 0.0
      permutedDiagCandidates.push_back(std::make_pair(grow,gMaxValIdx));
      Weights.push_back(maxVal/(norm1*Teuchos::as<Scalar>(nnz)));
    } else {
      std::cout << "ATTENTION: row " << grow << " has only zero entries -> singular matrix!" << std::endl;
    }

  }
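
  // Worked example of the weight (hypothetical numbers): a row with entries
  // (3, -1, 0.5) has nnz = 3, maxVal = 3 and norm1 = 4.5, so its weight is
  // 3 / (4.5 * 3) ~ 0.22. The more the largest entry dominates its row, the
  // larger the weight, and high-weight rows get first pick of a column below.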

  // sort Weights in descending order
  std::vector<int> permutation;
  sortingPermutation(Weights,permutation);

  // create new vector with exactly one possible entry for each column

  // each processor which requests the global column id gcid adds 1 to gColVec
  // gColVec will be summed up over all processors and communicated to gDomVec
  // which is based on the non-overlapping domain map of A.

  Teuchos::RCP<Vector> gColVec = VectorFactory::Build(A->getColMap());
  Teuchos::RCP<Vector> gDomVec = VectorFactory::Build(A->getDomainMap());
  gColVec->putScalar(0.0);
  gDomVec->putScalar(0.0);

  // put in all keep diagonal entries
  for (typename std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> >::const_iterator p = keepDiagonalEntries.begin(); p != keepDiagonalEntries.end(); ++p) {
    gColVec->sumIntoGlobalValue((*p).second,1.0);
  }

  Teuchos::RCP<Export> exporter = ExportFactory::Build(gColVec->getMap(), gDomVec->getMap());
  gDomVec->doExport(*gColVec,*exporter,Xpetra::ADD);  // communicate blocked gcolids to all procs
  gColVec->doImport(*gDomVec,*exporter,Xpetra::INSERT);
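
  // After this exchange, gDomVec holds on the non-overlapping domain map the
  // global number of requests per column id (ADD sums the contributions), and
  // the INSERT import copies those totals back into the overlapping gColVec.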

  std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> > permutedDiagCandidatesFiltered; // TODO reserve memory
  std::map<GlobalOrdinal, Scalar> gColId2Weight;

  Teuchos::ArrayRCP< Scalar > ddata = gColVec->getDataNonConst(0);
  for(size_t i = 0; i < permutedDiagCandidates.size(); ++i) {
    // loop over all candidates
    std::pair<GlobalOrdinal, GlobalOrdinal> pp = permutedDiagCandidates[permutation[i]];
    GlobalOrdinal grow = pp.first;
    GlobalOrdinal gcol = pp.second;

    LocalOrdinal lcol = A->getColMap()->getLocalElement(gcol);
    //Teuchos::ArrayRCP< Scalar > ddata = gColVec->getDataNonConst(0);
    if(ddata[lcol] > 0.0){
      continue; // skip lcol: column already handled by another row
    }

    // mark column as already taken
    ddata[lcol]++;

    permutedDiagCandidatesFiltered.push_back(std::make_pair(grow,gcol));
    gColId2Weight[gcol] = Weights[permutation[i]];
  }

  // communicate how often each column index is requested by the different procs
  gDomVec->doExport(*gColVec,*exporter,Xpetra::ADD);
  gColVec->doImport(*gDomVec,*exporter,Xpetra::INSERT); // probably not needed // TODO check me

  //*****************************************************************************************
  // first communicate ALL global ids of column indices which are requested by more
  // than one proc to all other procs
  // detect which global col indices are requested by more than one proc
  // and store them in the multipleColRequests vector
  std::vector<GlobalOrdinal> multipleColRequests; // store all global column indices from current processor that are also
                                                  // requested by another processor. This is possible, since they are stored
                                                  // in gDomVec which is based on the nonoverlapping domain map. That is, each
                                                  // global col id is handled by exactly one proc.
  std::queue<GlobalOrdinal> unusedColIdx; // unused column indices on current processor

  for(size_t sz = 0; sz<gDomVec->getLocalLength(); ++sz) {
    Teuchos::ArrayRCP< const Scalar > arrDomVec = gDomVec->getData(0);
    if(arrDomVec[sz] > 1.0) {
      multipleColRequests.push_back(gDomVec->getMap()->getGlobalElement(sz));
    } else if(arrDomVec[sz] == 0.0) {
      unusedColIdx.push(gDomVec->getMap()->getGlobalElement(sz));
    }
  }

  // communicate the global number of column indices which are requested by more than one proc
  LocalOrdinal localMultColRequests  = Teuchos::as<LocalOrdinal>(multipleColRequests.size());
  LocalOrdinal globalMultColRequests = 0;

  // sum up all entries in multipleColRequests over all processors
  sumAll(gDomVec->getMap()->getComm(), (LocalOrdinal)localMultColRequests, globalMultColRequests);

  if(globalMultColRequests > 0) {
    // special handling: two processors request the same global column id.
    // decide which processor gets it

    // distribute the number of multipleColRequests to all processors
    // each processor stores how many column ids for exchange are handled by the current proc
    std::vector<GlobalOrdinal> numMyMultColRequests(numProcs,0);
    std::vector<GlobalOrdinal> numGlobalMultColRequests(numProcs,0);
    numMyMultColRequests[myRank] = localMultColRequests;
    Teuchos::reduceAll(*comm,Teuchos::REDUCE_MAX,numProcs,&numMyMultColRequests[0],&numGlobalMultColRequests[0]);

    // communicate multipleColRequests entries to all processors
    int nMyOffset = 0;
    for (int i=0; i<myRank; i++)
      nMyOffset += numGlobalMultColRequests[i]; // calculate offset to store this proc's entries in procMultRequestedColIds

    GlobalOrdinal zero=0;
    std::vector<GlobalOrdinal> procMultRequestedColIds(globalMultColRequests,zero);
    std::vector<GlobalOrdinal> global_procMultRequestedColIds(globalMultColRequests,zero);

    // loop over all local column GIDs that are also requested by other procs
    for(size_t i = 0; i < multipleColRequests.size(); i++) {
      procMultRequestedColIds[nMyOffset + i] = multipleColRequests[i]; // all weights are > 0 ?
    }

    // reduceAll over GlobalOrdinal entries; REDUCE_MAX replicates the ids, since each slot is written by exactly one proc (all others leave it at zero)
    Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, Teuchos::as<int>(globalMultColRequests), &procMultRequestedColIds[0], &global_procMultRequestedColIds[0]);

    // loop over global_procMultRequestedColIds and eliminate wrong entries...
    for (size_t k = 0; k<global_procMultRequestedColIds.size(); k++) {
      GlobalOrdinal globColId = global_procMultRequestedColIds[k];

      std::vector<Scalar> MyWeightForColId(numProcs,0);
      std::vector<Scalar> GlobalWeightForColId(numProcs,0);

      if(gColVec->getMap()->isNodeGlobalElement(globColId)) {
        MyWeightForColId[myRank] = gColId2Weight[globColId];
      } else {
        MyWeightForColId[myRank] = 0.0;
      }

      Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, numProcs, &MyWeightForColId[0], &GlobalWeightForColId[0]);

      if(gColVec->getMap()->isNodeGlobalElement(globColId)) {
        // note: 2 procs could have the same weight for a column index.
        // pick the first one.
        Scalar winnerValue = 0.0;
        int winnerProcRank = 0;
        for (int proc = 0; proc < numProcs; proc++) {
          if(GlobalWeightForColId[proc] > winnerValue) {
            winnerValue = GlobalWeightForColId[proc];
            winnerProcRank = proc;
          }
        }

        // winnerProcRank is the winner for handling globColId.
        // winnerProcRank is unique (even if two procs have the same weight for a column index)

        if(myRank != winnerProcRank) {
          // remove corresponding entry from permutedDiagCandidatesFiltered
          typename std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> >::iterator p = permutedDiagCandidatesFiltered.begin();
          while(p != permutedDiagCandidatesFiltered.end() )
          {
            if((*p).second == globColId)
              p = permutedDiagCandidatesFiltered.erase(p);
            else
              p++;
          }
        }

      } // end if isNodeGlobalElement
    } // end loop over global_procMultRequestedColIds and eliminate wrong entries...
  } // end if globalMultColRequests > 0
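
  // At this point each contested column id has exactly one surviving owner:
  // the processor whose candidate row carries the largest weight for it
  // (ties are broken in favor of the lowest rank, see winnerProcRank above).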

  // put together all pairs:
  //size_t sizeRowColPairs = keepDiagonalEntries.size() + permutedDiagCandidatesFiltered.size();
  std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> > RowColPairs;
  RowColPairs.insert( RowColPairs.end(), keepDiagonalEntries.begin(), keepDiagonalEntries.end());
  RowColPairs.insert( RowColPairs.end(), permutedDiagCandidatesFiltered.begin(), permutedDiagCandidatesFiltered.end());

#ifdef DEBUG_OUTPUT
  //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
  // plausibility check
  gColVec->putScalar(0.0);
  gDomVec->putScalar(0.0);
  typename std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> >::iterator pl = RowColPairs.begin();
  while(pl != RowColPairs.end() )
  {
    //GlobalOrdinal ik = (*pl).first;
    GlobalOrdinal jk = (*pl).second;

    gColVec->sumIntoGlobalValue(jk,1.0);
    pl++;
  }
  gDomVec->doExport(*gColVec,*exporter,Xpetra::ADD);
  for(size_t sz = 0; sz<gDomVec->getLocalLength(); ++sz) {
    Teuchos::ArrayRCP< const Scalar > arrDomVec = gDomVec->getData(0);
    if(arrDomVec[sz] > 1.0) {
      GetOStream(Runtime0,0) << "RowColPairs has multiple column [" << sz << "]=" << arrDomVec[sz] << std::endl;
    } else if(arrDomVec[sz] == 0.0) {
      GetOStream(Runtime0,0) << "RowColPairs has empty column [" << sz << "]=" << arrDomVec[sz] << std::endl;
    }
  }
  //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
#endif

  //////////////////////////////////////////////////
  // assumption: on each processor RowColPairs now contains
  // a valid set of (row,column) pairs, where the row entries
  // are a subset of the processor's rows and the column entries
  // are unique throughout all processors.
  // Note: the RowColPairs are only defined for a subset of all rows,
  // so there might be rows without an entry in RowColPairs.
  // It can be that some rows seem to be missing in RowColPairs, since
  // the entry in that row with maximum absolute value has already been
  // reserved by another row (e.g. as an already diagonally dominant row
  // outside of permRowMap).
  // In fact, the RowColPairs vector only defines the (row,column) pairs
  // that will be definitely moved to the diagonal after permutation.

#ifdef DEBUG_OUTPUT
  //  for (typename std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> >::const_iterator p = RowColPairs.begin(); p != RowColPairs.end(); ++p) {
  //    std::cout << "proc: " << myRank << " r/c: " << (*p).first << "/" << (*p).second << std::endl;
  //  }
  //    for (typename std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> >::const_iterator p = RowColPairs.begin(); p != RowColPairs.end(); ++p)
  //    {
  ////      if((*p).first != (*p).second) std::cout << "difference: " << (*p).first << " " << (*p).second << std::endl;
  //      std::cout << (*p).first +1 << " " << (*p).second+1 << std::endl;
  //    }
  //    std::cout << "\n";
#endif

  // vectors to store permutation information
  Teuchos::RCP<Vector> Pperm  = VectorFactory::Build(A->getRowMap());
  Teuchos::RCP<Vector> Qperm  = VectorFactory::Build(A->getDomainMap()); // global variant (based on domain map)
  Teuchos::RCP<Vector> lQperm = VectorFactory::Build(A->getColMap());  // local variant (based on column map)

  Teuchos::ArrayRCP< Scalar > PpermData = Pperm->getDataNonConst(0);
  Teuchos::ArrayRCP< Scalar > QpermData = Qperm->getDataNonConst(0);

  Pperm->putScalar(0.0);
  Qperm->putScalar(0.0);
  lQperm->putScalar(0.0);

  // setup exporter for Qperm
  Teuchos::RCP<Export> QpermExporter = ExportFactory::Build(lQperm->getMap(), Qperm->getMap());

  Teuchos::RCP<Vector> RowIdStatus  = VectorFactory::Build(A->getRowMap());
  Teuchos::RCP<Vector> ColIdStatus  = VectorFactory::Build(A->getDomainMap()); // global variant (based on domain map)
  Teuchos::RCP<Vector> lColIdStatus = VectorFactory::Build(A->getColMap()); // local variant (based on column map)
  Teuchos::RCP<Vector> ColIdUsed   = VectorFactory::Build(A->getDomainMap()); // mark column ids to be already in use
  Teuchos::ArrayRCP< Scalar > RowIdStatusArray = RowIdStatus->getDataNonConst(0);
  Teuchos::ArrayRCP< Scalar > ColIdStatusArray = ColIdStatus->getDataNonConst(0);
  Teuchos::ArrayRCP< Scalar > lColIdStatusArray = lColIdStatus->getDataNonConst(0);
  Teuchos::ArrayRCP< Scalar > ColIdUsedArray   = ColIdUsed->getDataNonConst(0); // not sure about this
  RowIdStatus->putScalar(0.0);
  ColIdStatus->putScalar(0.0);
  lColIdStatus->putScalar(0.0);
  ColIdUsed->putScalar(0.0);   // no column ids are used

  // count wide-range permutations
  // a wide-range permutation is defined as a permutation of rows/columns which do not
  // belong to the same node
  LocalOrdinal lWideRangeRowPermutations = 0;
  GlobalOrdinal gWideRangeRowPermutations = 0;
  LocalOrdinal lWideRangeColPermutations = 0;
  GlobalOrdinal gWideRangeColPermutations = 0;

  // run 1: mark all "identity" permutations
  typename std::vector<std::pair<GlobalOrdinal, GlobalOrdinal> >::iterator p = RowColPairs.begin();
  while(p != RowColPairs.end() )
  {
    GlobalOrdinal ik = (*p).first;
    GlobalOrdinal jk = (*p).second;

    LocalOrdinal lik = A->getRowMap()->getLocalElement(ik);
    LocalOrdinal ljk = A->getColMap()->getLocalElement(jk);

    if(RowIdStatusArray[lik] == 0.0) {
      RowIdStatusArray[lik] = 1.0; // use this row id
      lColIdStatusArray[ljk] = 1.0; // use this column id
      Pperm->replaceLocalValue(lik, ik);
      lQperm->replaceLocalValue(ljk, ik); // use column map
      ColIdUsed->replaceGlobalValue(ik,1.0); // ik is now used
      p = RowColPairs.erase(p);

      // detect wide range permutations
      if(floor(ik/nDofsPerNode) != floor(jk/nDofsPerNode)) {
        lWideRangeColPermutations++;
      }
    }
    else
      p++;
  }
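
  // Example (hypothetical, nDofsPerNode = 1): for a pair (ik, jk) = (0, 2),
  // run 1 sets Pperm[l(0)] = 0 and lQperm[l(2)] = 0, i.e. row 0 stays in
  // place and column 2 is permuted to position 0, which moves the selected
  // entry A(0,2) onto the diagonal of permP * A * permQ^T built below.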

  // communicate column map -> domain map
  Qperm->doExport(*lQperm,*QpermExporter,Xpetra::ABSMAX);
  ColIdStatus->doExport(*lColIdStatus,*QpermExporter,Xpetra::ABSMAX);

  // plausibility check
  if(RowColPairs.size()>0) GetOStream(Warnings0,0) << "MueLu::PermutationFactory: There are Row/Col pairs left!!!" << std::endl; // TODO fix me

  // close Pperm

  // count how many row permutations are missing on the current proc
  size_t cntFreeRowIdx = 0;
  std::queue<GlobalOrdinal> qFreeGRowIdx;  // store global row ids of "free" rows
  for (size_t lik = 0; lik < RowIdStatus->getLocalLength(); ++lik) {
    if(RowIdStatusArray[lik] == 0.0) {
      cntFreeRowIdx++;
      qFreeGRowIdx.push(RowIdStatus->getMap()->getGlobalElement(lik));
    }
  }

  // fix Pperm
  for (size_t lik = 0; lik < RowIdStatus->getLocalLength(); ++lik) {
    if(RowIdStatusArray[lik] == 0.0) {
      RowIdStatusArray[lik] = 1.0; // use this row id
      Pperm->replaceLocalValue(lik, qFreeGRowIdx.front());
      // detect wide range permutations
      if(floor(qFreeGRowIdx.front()/nDofsPerNode) != floor(RowIdStatus->getMap()->getGlobalElement(lik)/nDofsPerNode)) {
        lWideRangeRowPermutations++;
      }
      qFreeGRowIdx.pop();
    }
  }

  // close Qperm (free permutation entries in Qperm)
  size_t cntFreeColIdx = 0;
  std::queue<GlobalOrdinal> qFreeGColIdx;  // store global column ids of "free" available columns
  for (size_t ljk = 0; ljk < ColIdStatus->getLocalLength(); ++ljk) {
    if(ColIdStatusArray[ljk] == 0.0) {
      cntFreeColIdx++;
      qFreeGColIdx.push(ColIdStatus->getMap()->getGlobalElement(ljk));
    }
  }

  size_t cntUnusedColIdx = 0;
  std::queue<GlobalOrdinal> qUnusedGColIdx;  // store global column ids of still unused columns
  for (size_t ljk = 0; ljk < ColIdUsed->getLocalLength(); ++ljk) {
    if(ColIdUsedArray[ljk] == 0.0) {
      cntUnusedColIdx++;
      qUnusedGColIdx.push(ColIdUsed->getMap()->getGlobalElement(ljk));
    }
  }

  // fix Qperm with local entries
  for (size_t ljk = 0; ljk < ColIdStatus->getLocalLength(); ++ljk) {
    // stop if no (local) unused column idx are left
    if(cntUnusedColIdx == 0) break;

    if(ColIdStatusArray[ljk] == 0.0) {
      ColIdStatusArray[ljk] = 1.0; // use this column id
      Qperm->replaceLocalValue(ljk, qUnusedGColIdx.front()); // loop over ColIdStatus (lives on domain map)
      ColIdUsed->replaceGlobalValue(qUnusedGColIdx.front(),1.0); // this column id is now used
      // detect wide range permutations
      if(floor(qUnusedGColIdx.front()/nDofsPerNode) != floor(ColIdStatus->getMap()->getGlobalElement(ljk)/nDofsPerNode)) {
        lWideRangeColPermutations++;
      }
      qUnusedGColIdx.pop();
      cntUnusedColIdx--;
      cntFreeColIdx--;
    }
  }

  //Qperm->doExport(*lQperm,*QpermExporter,Xpetra::ABSMAX); // no export necessary, since changes only locally
  //ColIdStatus->doExport(*lColIdStatus,*QpermExporter,Xpetra::ABSMAX);

  // count how many unused column indices are still needed on the current
  // processor to complete Qperm
  cntFreeColIdx = 0;
  for (size_t ljk = 0; ljk < ColIdStatus->getLocalLength(); ++ljk) { // TODO avoid this loop
    if(ColIdStatusArray[ljk] == 0.0) {
      cntFreeColIdx++;
    }
  }

  GlobalOrdinal global_cntFreeColIdx = 0;
  LocalOrdinal  local_cntFreeColIdx = cntFreeColIdx;
  sumAll(comm, Teuchos::as<GlobalOrdinal>(local_cntFreeColIdx), global_cntFreeColIdx);
#ifdef DEBUG_OUTPUT
  std::cout << "global # of empty column idx entries in Qperm: " << global_cntFreeColIdx << std::endl;
#endif

  // avoid global communication if possible
  if(global_cntFreeColIdx > 0) {

    // 1) count how many unused column ids are left
    GlobalOrdinal global_cntUnusedColIdx = 0;
    LocalOrdinal  local_cntUnusedColIdx = cntUnusedColIdx;
    sumAll(comm, Teuchos::as<GlobalOrdinal>(local_cntUnusedColIdx), global_cntUnusedColIdx);
#ifdef DEBUG_OUTPUT
    std::cout << "global # of unused column idx: " << global_cntUnusedColIdx << std::endl;
#endif

    // 2) communicate how many unused column ids are available on procs
    std::vector<LocalOrdinal> local_UnusedColIdxOnProc (numProcs);
    std::vector<LocalOrdinal> global_UnusedColIdxOnProc(numProcs);
    local_UnusedColIdxOnProc[myRank] = local_cntUnusedColIdx;
    Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, numProcs, &local_UnusedColIdxOnProc[0], &global_UnusedColIdxOnProc[0]);

#ifdef DEBUG_OUTPUT
    std::cout << "PROC " << myRank << " global num unused indices per proc: ";
    for (size_t ljk = 0; ljk < global_UnusedColIdxOnProc.size(); ++ljk) {
      std::cout << " " << global_UnusedColIdxOnProc[ljk];
    }
    std::cout << std::endl;
#endif

    // 3) build array of length global_cntUnusedColIdx to globally replicate unused column idx
    std::vector<GlobalOrdinal> local_UnusedColIdxVector(Teuchos::as<size_t>(global_cntUnusedColIdx));
    std::vector<GlobalOrdinal> global_UnusedColIdxVector(Teuchos::as<size_t>(global_cntUnusedColIdx));
    GlobalOrdinal global_cntUnusedColIdxStartIter = 0;
    for(int proc=0; proc<myRank; proc++) {
      global_cntUnusedColIdxStartIter += global_UnusedColIdxOnProc[proc];
    }
    for(GlobalOrdinal k = global_cntUnusedColIdxStartIter; k < global_cntUnusedColIdxStartIter+local_cntUnusedColIdx; k++) {
      local_UnusedColIdxVector[k] = qUnusedGColIdx.front();
      qUnusedGColIdx.pop();
    }
    Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, Teuchos::as<int>(global_cntUnusedColIdx), &local_UnusedColIdxVector[0], &global_UnusedColIdxVector[0]);
#ifdef DEBUG_OUTPUT
    std::cout << "PROC " << myRank << " global UnusedGColIdx: ";
    for (size_t ljk = 0; ljk < global_UnusedColIdxVector.size(); ++ljk) {
      std::cout << " " << global_UnusedColIdxVector[ljk];
    }
    std::cout << std::endl;
#endif



    // 4) communicate how many column indices are needed on each processor
    //    to complete Qperm
    std::vector<LocalOrdinal> local_EmptyColIdxOnProc (numProcs);
    std::vector<LocalOrdinal> global_EmptyColIdxOnProc(numProcs);
    local_EmptyColIdxOnProc[myRank] = local_cntFreeColIdx;
    Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, numProcs, &local_EmptyColIdxOnProc[0], &global_EmptyColIdxOnProc[0]);

#ifdef DEBUG_OUTPUT
    std::cout << "PROC " << myRank << " global num of needed column indices: ";
    for (size_t ljk = 0; ljk < global_EmptyColIdxOnProc.size(); ++ljk) {
      std::cout << " " << global_EmptyColIdxOnProc[ljk];
    }
    std::cout << std::endl;
#endif

    // 5) determine first index in global_UnusedColIdxVector for unused column indices,
    //    that are marked to be used by this processor
    GlobalOrdinal global_UnusedColStartIdx = 0;
    for(int proc=0; proc<myRank; proc++) {
      global_UnusedColStartIdx += global_EmptyColIdxOnProc[proc];
    }

#ifdef DEBUG_OUTPUT
    GetOStream(Statistics0,0) << "PROC " << myRank << " is allowd to use the following column gids: ";
    for(GlobalOrdinal k = global_UnusedColStartIdx; k < global_UnusedColStartIdx + Teuchos::as<GlobalOrdinal>(cntFreeColIdx); k++) {
      GetOStream(Statistics0,0) << global_UnusedColIdxVector[k] << " ";
    }
    GetOStream(Statistics0,0) << std::endl;
#endif

    // 6) fix Qperm with global entries
    GlobalOrdinal array_iter = 0;
    for (size_t ljk = 0; ljk < ColIdStatus->getLocalLength(); ++ljk) {

      if(ColIdStatusArray[ljk] == 0.0) {
        ColIdStatusArray[ljk] = 1.0; // use this column id
        Qperm->replaceLocalValue(ljk, global_UnusedColIdxVector[global_UnusedColStartIdx + array_iter]);
        ColIdUsed->replaceGlobalValue(global_UnusedColIdxVector[global_UnusedColStartIdx + array_iter],1.0);
        // detect wide range permutations
        if(floor(global_UnusedColIdxVector[global_UnusedColStartIdx + array_iter]/nDofsPerNode) != floor(ColIdStatus->getMap()->getGlobalElement(ljk)/nDofsPerNode)) {
          lWideRangeColPermutations++;
        }
        array_iter++;
        //cntUnusedColIdx--; // check me
      }
    }
  } // end if global_cntFreeColIdx > 0
  /////////////////// Qperm should be fine now...


  // create new empty Matrix
  Teuchos::RCP<CrsMatrixWrap> permPTmatrix = Teuchos::rcp(new CrsMatrixWrap(A->getRowMap(),1,Xpetra::StaticProfile));
  Teuchos::RCP<CrsMatrixWrap> permQTmatrix = Teuchos::rcp(new CrsMatrixWrap(A->getRowMap(),1,Xpetra::StaticProfile));

  for(size_t row=0; row<A->getNodeNumRows(); row++) {
    Teuchos::ArrayRCP<GlobalOrdinal> indoutP(1,Teuchos::as<GO>(PpermData[row])); // column idx for Perm^T
    Teuchos::ArrayRCP<GlobalOrdinal> indoutQ(1,Teuchos::as<GO>(QpermData[row])); // column idx for Qperm
    Teuchos::ArrayRCP<Scalar> valout(1,1.0);
    permPTmatrix->insertGlobalValues(A->getRowMap()->getGlobalElement(row), indoutP.view(0,indoutP.size()), valout.view(0,valout.size()));
    permQTmatrix->insertGlobalValues (A->getRowMap()->getGlobalElement(row), indoutQ.view(0,indoutQ.size()), valout.view(0,valout.size()));
  }

  permPTmatrix->fillComplete();
  permQTmatrix->fillComplete();

  Teuchos::RCP<Matrix> permPmatrix = Utils2::Transpose(permPTmatrix,true);

  for(size_t row=0; row<permPTmatrix->getNodeNumRows(); row++) {
    if(permPTmatrix->getNumEntriesInLocalRow(row) != 1)
      GetOStream(Warnings0,0) <<"#entries in row " << row << " of permPTmatrix is " << permPTmatrix->getNumEntriesInLocalRow(row) << std::endl;
    if(permPmatrix->getNumEntriesInLocalRow(row) != 1)
      GetOStream(Warnings0,0) <<"#entries in row " << row << " of permPmatrix is " << permPmatrix->getNumEntriesInLocalRow(row) << std::endl;
    if(permQTmatrix->getNumEntriesInLocalRow(row) != 1)
      GetOStream(Warnings0,0) <<"#entries in row " << row << " of permQmatrix is " << permQTmatrix->getNumEntriesInLocalRow(row) << std::endl;
  }

  // build permP * A * permQT
  Teuchos::RCP<Matrix> ApermQt = Utils::Multiply(*A, false, *permQTmatrix, false);
  Teuchos::RCP<Matrix> permPApermQt = Utils::Multiply(*permPmatrix, false, *ApermQt, false);
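
  // permPApermQt = permP * A * permQ^T now carries the selected large entries
  // on its diagonal; the diagonal scaling built below normalizes them to one.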

  /*
  MueLu::Utils<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Write("A.mat", *A);
  MueLu::Utils<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Write("permP.mat", *permPmatrix);
  MueLu::Utils<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Write("permQt.mat", *permQTmatrix);
  MueLu::Utils<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Write("permPApermQt.mat", *permPApermQt);
  */
  // build scaling matrix
  Teuchos::RCP<Vector> diagVec = VectorFactory::Build(permPApermQt->getRowMap(),true);
  Teuchos::RCP<Vector> invDiagVec = VectorFactory::Build(permPApermQt->getRowMap(),true);
  Teuchos::ArrayRCP< const Scalar > diagVecData = diagVec->getData(0);
  Teuchos::ArrayRCP< Scalar > invDiagVecData = invDiagVec->getDataNonConst(0);

  permPApermQt->getLocalDiagCopy(*diagVec);
  for(size_t i = 0; i<diagVec->getMap()->getNodeNumElements(); ++i) {
    if(diagVecData[i] != 0.0)
      invDiagVecData[i] = 1/diagVecData[i];
    else {
      invDiagVecData[i] = 1.0;
      GetOStream(Statistics0,0) << "MueLu::PermutationFactory: found zero on diagonal in row " << i << std::endl;
    }
  }

  Teuchos::RCP<CrsMatrixWrap> diagScalingOp = Teuchos::rcp(new CrsMatrixWrap(permPApermQt->getRowMap(),1,Xpetra::StaticProfile));

  for(size_t row=0; row<A->getNodeNumRows(); row++) {
    Teuchos::ArrayRCP<GlobalOrdinal> indout(1,permPApermQt->getRowMap()->getGlobalElement(row)); // diagonal column idx for the scaling matrix
    Teuchos::ArrayRCP<Scalar> valout(1,invDiagVecData[row]);
    diagScalingOp->insertGlobalValues(A->getRowMap()->getGlobalElement(row), indout.view(0,indout.size()), valout.view(0,valout.size()));
  }
  diagScalingOp->fillComplete();

  Teuchos::RCP<Matrix> scaledA = Utils::Multiply(*diagScalingOp, false, *permPApermQt, false);
  currentLevel.Set("A", Teuchos::rcp_dynamic_cast<Matrix>(scaledA), genFactory/*this*/);

  currentLevel.Set("permA", Teuchos::rcp_dynamic_cast<Matrix>(permPApermQt), genFactory/*this*/);  // TODO careful with this!!!
  currentLevel.Set("permP", Teuchos::rcp_dynamic_cast<Matrix>(permPmatrix), genFactory/*this*/);
  currentLevel.Set("permQT", Teuchos::rcp_dynamic_cast<Matrix>(permQTmatrix), genFactory/*this*/);
  currentLevel.Set("permScaling", Teuchos::rcp_dynamic_cast<Matrix>(diagScalingOp), genFactory/*this*/);

  //// count row permutations
  // count zeros on diagonal in P -> number of row permutations
  Teuchos::RCP<Vector> diagPVec = VectorFactory::Build(permPmatrix->getRowMap(),true);
  permPmatrix->getLocalDiagCopy(*diagPVec);
  Teuchos::ArrayRCP< const Scalar > diagPVecData = diagPVec->getData(0);
  LocalOrdinal lNumRowPermutations = 0;
  GlobalOrdinal gNumRowPermutations = 0;
  for(size_t i = 0; i<diagPVec->getMap()->getNodeNumElements(); ++i) {
    if(diagPVecData[i] == 0.0) {
      lNumRowPermutations++;
    }
  }

  // sum up the local row permutation counts over all processors
  sumAll(diagPVec->getMap()->getComm(), Teuchos::as<GlobalOrdinal>(lNumRowPermutations), gNumRowPermutations);

  //// count column permutations
  // count zeros on diagonal in Q^T -> number of column permutations
  Teuchos::RCP<Vector> diagQTVec = VectorFactory::Build(permQTmatrix->getRowMap(),true);
  permQTmatrix->getLocalDiagCopy(*diagQTVec);
  Teuchos::ArrayRCP< const Scalar > diagQTVecData = diagQTVec->getData(0);
  LocalOrdinal lNumColPermutations = 0;
  GlobalOrdinal gNumColPermutations = 0;
  for(size_t i = 0; i<diagQTVec->getMap()->getNodeNumElements(); ++i) {
    if(diagQTVecData[i] == 0.0) {
      lNumColPermutations++;
    }
  }

  // sum up the local column permutation counts over all processors
  sumAll(diagQTVec->getMap()->getComm(), Teuchos::as<GlobalOrdinal>(lNumColPermutations), gNumColPermutations);

  currentLevel.Set("#RowPermutations", gNumRowPermutations, genFactory/*this*/);
  currentLevel.Set("#ColPermutations", gNumColPermutations, genFactory/*this*/);
  currentLevel.Set("#WideRangeRowPermutations", gWideRangeRowPermutations, genFactory/*this*/);
  currentLevel.Set("#WideRangeColPermutations", gWideRangeColPermutations, genFactory/*this*/);

  GetOStream(Statistics0, 0) << "#Row    permutations/max possible permutations: " << gNumRowPermutations << "/" << diagPVec->getMap()->getGlobalNumElements() << std::endl;
  GetOStream(Statistics0, 0) << "#Column permutations/max possible permutations: " << gNumColPermutations << "/" << diagQTVec->getMap()->getGlobalNumElements() << std::endl;
  GetOStream(Runtime1, 0) << "#wide range row permutations: " << gWideRangeRowPermutations << " #wide range column permutations: " << gWideRangeColPermutations << std::endl;

#else
#warning PermutationFactory not compiling/working for Scalar==complex.
#endif // #ifndef HAVE_MUELU_INST_COMPLEX_INT_INT


  }
Ejemplo n.º 17
0
Matrix& LASSO::train(Matrix& X, Matrix& Y) {

	/*XNX = [X, -X];
		H_G = XNX' * XNX;
		D = repmat(diag(H_G), [1, n_y]);
		XNXTY = XNX' * Y;
	    A = (X' * X + lambda  * eye(p)) \ (X' * Y);*/

	Matrix& XNX = horzcat(2, &X, &uminus(X));
	Matrix& H_G = XNX.transpose().mtimes(XNX);
	double* Q = new double[size(H_G, 1)];
	for (int i = 0; i < size(H_G, 1); i++) {
		Q[i] = H_G.getEntry(i, i);
	}
	Matrix& XNXTY = XNX.transpose().mtimes(Y);
	Matrix& A = mldivide(
			plus(X.transpose().mtimes(X), times(lambda, eye(p))),
			X.transpose().mtimes(Y)
	);
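
	// The splitting XNX = [X, -X] with AA = [A+; A-] (so A = A+ - A-) turns
	// the l1 penalty into a linear term over nonnegative variables, and the
	// LASSO becomes a nonnegative quadratic program which the loop below
	// solves by cyclic projected coordinate descent.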

	/*AA = [subplus(A); subplus(-A)];
		C = -XNXTY + lambda;
		Grad = C + H_G * AA;
		tol = epsilon * norm(Grad);
		PGrad = zeros(size(Grad));*/

	Matrix& AA = vertcat(2, &subplus(A), &subplus(uminus(A)));
	Matrix& C = plus(uminus(XNXTY), lambda);
	Matrix& Grad = plus(C, mtimes(H_G, AA));
	double tol = epsilon * norm(Grad);
	Matrix& PGrad = zeros(size(Grad));

	std::list<double> J;
	double fval = 0;
	// J(1) = sum(sum((Y - X * A).^2)) / 2 + lambda * sum(sum(abs(A)));
	if (calc_OV) {
		fval = sumAll(pow(minus(Y, mtimes(X, A)), 2)) / 2 +
				lambda * sum(sum(abs(A)));
		J.push_back(fval);
	}

	/*Matrix I_k = null;
		Matrix I_k_com = null;*/
	Matrix& I_k = Grad.copy();
	double d = 0;
	int k = 0;

	DenseVector& SFPlusCi = *new DenseVector(AA.getColumnDimension());
	Matrix& S = H_G;
	Vector** SRows = null;
	if (typeid(H_G) == typeid(DenseMatrix))
		SRows = denseMatrix2DenseRowVectors(S);
	else
		SRows = sparseMatrix2SparseRowVectors(S);

	Vector** CRows = null;
	if (typeid(C) == typeid(DenseMatrix))
		CRows = denseMatrix2DenseRowVectors(C);
	else
		CRows = sparseMatrix2SparseRowVectors(C);

	double** FData = ((DenseMatrix&) AA).getData();
	double* FRow = null;
	double* pr = null;
	int K = 2 * p;

	while (true) {

		/*I_k = Grad < 0 | AA > 0;
		    I_k_com = not(I_k);
		    PGrad(I_k) = Grad(I_k);
		    PGrad(I_k_com) = 0;*/

		_or(I_k, lt(Grad, 0), gt(AA, 0));
		Matrix& I_k_com = _not(I_k);
		assign(PGrad, Grad);
		logicalIndexingAssignment(PGrad, I_k_com, 0);

		d = norm(PGrad, inf);
		if (d < tol) {
			if (verbose)
				println("Converge successfully!");
			break;
		}

		/*for i = 1:2*p
		            AA(i, :) = max(AA(i, :) - (C(i, :) + H_G(i, :) * AA) ./ (D(i, :)), 0);
		    end
		    A = AA(1:p,:) - AA(p+1:end,:);*/
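
		// Here S(i,:) = H_G(i,:), C(i,:) = -XNXTY(i,:) + lambda, and D(i) is
		// the diagonal entry Q[i] of H_G, so each row update below is an
		// exact coordinate minimization followed by projection onto >= 0.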

		for (int i = 0; i < K; i++) {
			// SFPlusCi = SRows[i].operate(AA);
			operate(SFPlusCi, *SRows[i], AA);
			plusAssign(SFPlusCi, *CRows[i]);
			timesAssign(SFPlusCi, 1 / Q[i]);
			pr = SFPlusCi.getPr();
			// F(i, :) = max(F(i, :) - (S(i, :) * F + C(i, :)) / D[i]), 0);
			// F(i, :) = max(F(i, :) - SFPlusCi, 0)
			FRow = FData[i];
			for (int j = 0; j < AA.getColumnDimension(); j++) {
				FRow[j] = max(FRow[j] - pr[j], 0);
			}
		}

		// Grad = plus(C, mtimes(H_G, AA));
		plus(Grad, C, mtimes(H_G, AA));

		k = k + 1;
		if (k > maxIter) {
			if (verbose)
				println("Maximal iterations");
			break;
		}

		if (calc_OV) {
			fval = sum(sum(pow(minus(Y, mtimes(XNX, AA)), 2))) / 2 +
					lambda * sum(sum(abs(AA)));
			J.push_back(fval);
		}

		if (k % 10 == 0 && verbose) {
			if (calc_OV)
				fprintf("Iter %d - ||PGrad||: %f, ofv: %f\n", k, d, J.back());
			else
				fprintf("Iter %d - ||PGrad||: %f\n", k, d);

		}

	}

	Matrix& res = minus(
			AA.getSubMatrix(0, p - 1, 0, ny - 1),
			AA.getSubMatrix(p, 2 * p - 1, 0, ny - 1)
	);
	return res;

}