void CoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const
  {
    FactoryMonitor m(*this, "Build", currentLevel);

    RCP<Aggregates> aggregates;
    {
      //TODO check for reuse of aggregates here

      // Level Get
      RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

      // Build
      aggregates = rcp(new Aggregates(*graph));
      aggregates->setObjectLabel("UC");

      algo1_.CoarsenUncoupled(*graph, *aggregates);
      algo2_.AggregateLeftovers(*graph, *aggregates);

    }

    aggregates->AggregatesCrossProcessors(true);

    // Level Set
    Set(currentLevel, "Aggregates", aggregates);

    if (IsPrint(Statistics0)) {
      aggregates->describe(GetOStream(Statistics0, 0), getVerbLevel());
    }

  }
Esempio n. 2
0
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::CheckMainDiagonal(RCP<Matrix> & Ac) const {
    // plausibility check: no zeros on diagonal
    RCP<Vector> diagVec = VectorFactory::Build(Ac->getRowMap());
    Ac->getLocalDiagCopy(*diagVec);

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    LO lZeroDiags = 0;
    Teuchos::ArrayRCP< Scalar > diagVal = diagVec->getDataNonConst(0);
    for (size_t r = 0; r < Ac->getRowMap()->getNodeNumElements(); r++) {
      if (diagVal[r] == zero) {
        lZeroDiags++;

        if (repairZeroDiagonals_) {
          GO grid = Ac->getRowMap()->getGlobalElement(r);
          LO lcid = Ac->getColMap()->getLocalElement(grid);
          Teuchos::ArrayRCP<LO> indout(1, lcid);
          Teuchos::ArrayRCP<SC> valout(1, one);

          Ac->insertLocalValues(r, indout.view(0, indout.size()), valout.view(0, valout.size()));
        }
      }
    }

    if (IsPrint(Warnings0)) {
      const RCP<const Teuchos::Comm<int> > & comm = Ac->getRowMap()->getComm();
      GO lZeroDiagsGO = Teuchos::as<GO>(lZeroDiags); /* LO->GO conversion */
      GO gZeroDiags   = 0;
      sumAll(comm, lZeroDiagsGO, gZeroDiags);
      if (repairZeroDiagonals_) GetOStream(Warnings0,0) << "RAPFactory (WARNING): repaired " << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;
      else                      GetOStream(Warnings0,0) << "RAPFactory (WARNING): found "    << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;
    }
  }
Esempio n. 3
0
/*
 * clean_string - clean up a string possibly containing garbage
 *
 * *sigh* Before the kiddies find this new and exciting way of
 * annoying opers, lets clean up what is sent to local opers
 * -Dianora
 */
char *
clean_string(char *dest, const unsigned char *src, ssize_t len)
{
  char *d    = dest;
  assert(0 != dest);
  assert(0 != src);

  if (dest == NULL || src == NULL)
    return NULL;

  len -= 3;  /* allow for worst case, '^A\0' */

  while (*src && (len > 0))
  {
    if (*src & 0x80)             /* if high bit is set */
      *d++ = '.';
    else if (!IsPrint(*src))       /* if NOT printable */
    {
      *d++ = '^';
      --len;
      *d++ = 0x40 + *src;   /* turn it into a printable */
    }
    else
      *d++ = *src;

    ++src, --len;
  }

  *d = '\0';
  return dest;
}
Esempio n. 4
0
void mutt_display_sanitize (char *s)
{
  for (; *s; s++) {
    if (!IsPrint (*s))
      *s = '?';
  }
}
Esempio n. 5
0
    /*! @brief Constructor

        @param[in] object      Reference to the class instance that is creating this SubMonitor.
        @param[in] msg         String that indicates what the SubMonitor is monitoring, e.g., "Build"
        @param[in] level       The MueLu Level object.
        @param[in] msgLevel    Governs whether information should be printed.
        @param[in] timerLevel  Governs whether timing information should be *gathered*.  Setting this to NoTimeReport prevents the creation of timers.
    */
    SubFactoryMonitor(const BaseClass& object, const std::string & msg, const Level & level, MsgType msgLevel = Runtime1, MsgType timerLevel = Timings1)
      : SubMonitor(object, msg, msgLevel, timerLevel)
    {
      if (IsPrint(TimingsByLevel)) {
        levelTimeMonitor_ = rcp(new TimeMonitor(object, object.ShortClassName() + ": " +  msg + " (sub, total, level=" + Teuchos::Utils::toString(level.GetLevelID()) + ")", timerLevel));
      }
    }
Esempio n. 6
0
void DrawCharacters(Page *page)
{
    size_t start = page->start;
    size_t i;
    for (i = start; i < page->end; i++) {
        short x = character_positions[i].x;
        short y = character_positions[i].y;

        if (IsPrint(text[i])) {
draw:
            ;
            XChar2b font_code;
            XFontStruct *font = SelectFont(text[i], &font_code);
            GC gc = XCreateGC(disp, back_buffer, 0, NULL);
            XCopyGC(disp, default_gc, GCForeground | GCBackground, gc);
            XSetFont(disp, gc, font->fid);
            XDrawString16(disp, back_buffer, gc,
                          x, y + (font->ascent - default_font->ascent), &font_code, 1);
            XFreeGC(disp, gc);
        } else {
            if (EqAscii2b(text[i], '\n')) {
                // DOWNWARDS ARROW WITH TIP LEFTWARDS
                XChar2b symbol = { .byte1 = 0x21, .byte2 = 0xb2 };
                XDrawString16(disp, back_buffer, control_gc,
                              x, y,
                              &symbol, 1);
            } else if (EqAscii2b(text[i], '\t')) {
                ;
            } else {
                goto draw;
            }
        }
    }
void mutt_safe_path (char *s, size_t l, ADDRESS *a)
{
  char *p;

  mutt_save_path (s, l, a);
  for (p = s; *p; p++)
    if (*p == '/' || ISSPACE (*p) || !IsPrint ((unsigned char) *p))
      *p = '_';
}
static void clean_error_buf(void)
{
  char *s;
  for(s = Errorbuf; *s; s++)
  {
    if(!IsPrint(*s))
      *s = '.';
  }
}
Esempio n. 9
0
    /*! @brief Constructor

        @param[in] object      Reference to the class instance that is creating this SubMonitor.
        @param[in] msg         String that indicates what the SubMonitor is monitoring, e.g., "Build".
        @param[in] level       The MueLu Level object.
        @param[in] msgLevel    Governs whether information should be printed.
        @param[in] timerLevel  Governs whether timing information should be *gathered*.  Setting this to NoTimeReport prevents the creation of timers.

      TODO: code factorization
    */
    FactoryMonitor(const BaseClass& object, const std::string & msg, const Level & level, MsgType msgLevel = static_cast<MsgType>(Test | Runtime0), MsgType timerLevel = Timings0)
      : Monitor(object, msg, msgLevel, timerLevel),
      timerMonitorExclusive_(object, object.ShortClassName() + " : " + msg, timerLevel)
    {
      if (IsPrint(TimingsByLevel)) {
        levelTimeMonitor_ = rcp(new TimeMonitor(object, object.ShortClassName() + ": " +  msg + " (total, level=" + Teuchos::Utils::toString(level.GetLevelID()) + ")", timerLevel));
        levelTimeMonitorExclusive_ = rcp(new MutuallyExclusiveTimeMonitor<Level>(object, object.ShortClassName() + " " + MUELU_TIMER_AS_STRING + " : " + msg + " (level=" + Teuchos::Utils::toString(level.GetLevelID()) + ")", timerLevel));
      }
    }
Esempio n. 10
0
  Teuchos::FancyOStream & VerboseObject::GetOStream(MsgType type, int thisProcRankOnly) const {
    if (!IsPrint(type, thisProcRankOnly))
      return *blackHole_;

    Teuchos::FancyOStream& os = *getOStream();
    if (!(type & ((Extreme | Test) ^ Warnings)))
      os << "\n******* WARNING *******" << std::endl;

    return os;
  }
Esempio n. 11
0
     /*! @brief Constructor

        @param[in] object      Reference to the class instance that is creating this MutuallyExclusiveTimeMonitor.
        @param[in] msg         String that indicates what the Monitor is monitoring, e.g., "Build"
        @param[in] timerLevel  Governs whether timing information should be *gathered*.  Setting this to NoTimeReport prevents the creation of timers.
    */
    MutuallyExclusiveTimeMonitor(const BaseClass& object, const std::string& msg, MsgType timerLevel = Timings0)
    {
      // Inherit verbosity from 'object'
      SetVerbLevel(object.GetVerbLevel());
      setOStream(object.getOStream());

      if (IsPrint(timerLevel) &&
          /* disable timer if never printed: */ (IsPrint(RuntimeTimings) || (!IsPrint(NoTimeReport)))) {

        if (!IsPrint(NoTimeReport)) {
          timer_ = MutuallyExclusiveTime<TagName>::getNewTimer("MueLu: " + msg /*+ " (MutuallyExclusive)" */);
        } else {
          timer_ = rcp(new MutuallyExclusiveTime<TagName>     ("MueLu: " + msg /*+ " (MutuallyExclusive)" */));
        }

        timer_->start();
        timer_->incrementNumCalls();
      }
    }
Esempio n. 12
0
    TimeMonitor(const BaseClass& object, const std::string& msg, MsgType timerLevel = Timings0)
    {

      // Inherit verbosity from 'object'
      SetVerbLevel(object.GetVerbLevel());
      setOStream(object.getOStream());

      if (IsPrint(timerLevel) &&
          /* disable timer if never printed: */ (IsPrint(RuntimeTimings) || (!IsPrint(NoTimeReport)))) {

        if (!IsPrint(NoTimeReport)) {
          // TODO: there is no function to register a timer in Teuchos::TimeMonitor after the creation of the timer. But would be useful...
          timer_ = Teuchos::TimeMonitor::getNewTimer("MueLu: " + msg);
        } else {
          timer_ = rcp(new Teuchos::Time("MueLu: " + msg));
        }

        // Start the timer (this is what is done by Teuchos::TimeMonitor)
        timer_->start();
        timer_->incrementNumCalls();
      }
    }
Esempio n. 13
0
    //! Constructor
    PrintMonitor(const BaseClass& object, const std::string& msg, MsgType msgLevel = Runtime0) {

      // Inherit verbosity from 'object'
      SetVerbLevel(object.GetVerbLevel());
      setOStream(object.getOStream());

      // Print description and new indent
      if (IsPrint(msgLevel)) {
        GetOStream(msgLevel, 0) << msg << std::endl;
        tab_ = rcp(new Teuchos::OSTab(getOStream()));
      }

    }
Esempio n. 14
0
  void UserPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    RCP<Matrix>      A             = Get< RCP<Matrix> >      (fineLevel, "A");
    RCP<MultiVector> fineNullspace = Get< RCP<MultiVector> > (fineLevel, "Nullspace");

    TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() != 1, Exceptions::RuntimeError, "Block size > 1 has not been implemented");

    const Teuchos::ParameterList& pL = GetParameterList();

    std::string    mapFile   = pL.get<std::string>("mapFileName");
    RCP<const Map> rowMap    = A->getRowMap();
    RCP<const Map> coarseMap = Utils2::ReadMap(mapFile, rowMap->lib(), rowMap->getComm());
    Set(coarseLevel, "CoarseMap", coarseMap);

    std::string matrixFile = pL.get<std::string>("matrixFileName");
    RCP<Matrix> P          = Utils::Read(matrixFile, rowMap, coarseMap, coarseMap, rowMap);
#if 1
    Set(coarseLevel, "P", P);
#else
    // Expand column map by 1
    RCP<Matrix> P1 = Utils::Multiply(*A, false, *P, false);
    P = Utils::Read(matrixFile, rowMap, P1->getColMap(), coarseMap, rowMap);
    Set(coarseLevel, "P", P);
#endif

    RCP<MultiVector> coarseNullspace = MultiVectorFactory::Build(coarseMap, fineNullspace->getNumVectors());
    P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits<SC>::one(), Teuchos::ScalarTraits<SC>::zero());
    Set(coarseLevel, "Nullspace", coarseNullspace);

    // Coordinates transfer
    size_t n = Teuchos::as<size_t>(sqrt(coarseMap->getGlobalNumElements()));
    TEUCHOS_TEST_FOR_EXCEPTION(n*n != coarseMap->getGlobalNumElements(), Exceptions::RuntimeError, "Unfortunately, this is not the case, don't know what to do");

    RCP<MultiVector> coarseCoords = MultiVectorFactory::Build(coarseMap, 2);
    ArrayRCP<Scalar> x = coarseCoords->getDataNonConst(0), y = coarseCoords->getDataNonConst(1);
    for (size_t LID = 0; LID < coarseMap->getNodeNumElements(); ++LID) {
      GlobalOrdinal GID = coarseMap->getGlobalElement(LID) - coarseMap->getIndexBase();
      GlobalOrdinal i = GID % n, j = GID/n;
      x[LID] = i;
      y[LID] = j;
    }
    Set(coarseLevel, "Coordinates", coarseCoords);

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*P, "P", params);
    }
  }
Esempio n. 15
0
    ~TimeMonitor() {
      if (timer_ != Teuchos::null) {

        // Stop the timer
        timer_->stop();

        if (IsPrint(RuntimeTimings)) {
          //FIXME: creates lot of barriers. An option to report time of proc0 only instead would be nice
          //FIXME: MPI_COMM_WORLD only... BTW, it is also the case in Teuchos::TimeMonitor...
          //
          // mfh 11 Nov 2012: Actually, Teuchos::TimeMonitor::summarize() has multiple overloads that take a Teuchos::Comm.
          ArrayRCP<double> stats = ReduceMaxMinAvg(timer_->totalElapsedTime(), *Teuchos::DefaultComm<int>::getComm ());

          //FIXME: Not very important for now, but timer will be printed even if verboseLevel of Monitor/Object changed
          //       between Monitor constructor and destructor.
          GetOStream(RuntimeTimings, 0) << "Timer: " << " max=" << stats[0] << " min=" << stats[1] << " avg=" << stats[2] << std::endl;
        }
      }
    }
Esempio n. 16
0
static int exec(FILE* fp, ENC_INFO* einfo)
{
#define NCOL  8

  int c, val, enc;

  enc = einfo->num;

  fprintf(fp, "static const unsigned short Enc%s_CtypeTable[256] = {\n",
	  einfo->name);

  for (c = 0; c < 256; c++) {
    val = 0;
    if (IsNewline(enc, c))  val |= BIT_CTYPE_NEWLINE;
    if (IsAlpha (enc, c))   val |= (BIT_CTYPE_ALPHA | BIT_CTYPE_ALNUM);
    if (IsBlank (enc, c))   val |= BIT_CTYPE_BLANK;
    if (IsCntrl (enc, c))   val |= BIT_CTYPE_CNTRL;
    if (IsDigit (enc, c))   val |= (BIT_CTYPE_DIGIT | BIT_CTYPE_ALNUM);
    if (IsGraph (enc, c))   val |= BIT_CTYPE_GRAPH;
    if (IsLower (enc, c))   val |= BIT_CTYPE_LOWER;
    if (IsPrint (enc, c))   val |= BIT_CTYPE_PRINT;
    if (IsPunct (enc, c))   val |= BIT_CTYPE_PUNCT;
    if (IsSpace (enc, c))   val |= BIT_CTYPE_SPACE;
    if (IsUpper (enc, c))   val |= BIT_CTYPE_UPPER;
    if (IsXDigit(enc, c))   val |= BIT_CTYPE_XDIGIT;
    if (IsWord  (enc, c))   val |= BIT_CTYPE_WORD;
    if (IsAscii (enc, c))   val |= BIT_CTYPE_ASCII;

    if (c % NCOL == 0) fputs("  ", fp);
    fprintf(fp, "0x%04x", val);
    if (c != 255) fputs(",", fp);
    if (c != 0 && c % NCOL == (NCOL-1))
      fputs("\n", fp);
    else
      fputs(" ", fp);
  }
  fprintf(fp, "};\n");
  return 0;
}
  void LocalAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::CoarsenUncoupled(GraphBase const & graph, Aggregates & aggregates) const {
    Monitor m(*this, "Coarsen Uncoupled");

    std::string orderingType;
    switch(ordering_) {
      case NATURAL:
        orderingType="Natural";
        break;
      case RANDOM:
        orderingType="Random";
        break;
      case GRAPH:
        orderingType="Graph";
        break;
      default:
        break;
    }
    GetOStream(Runtime1) << "Ordering:                  " << orderingType << std::endl;
    GetOStream(Runtime1) << "Min nodes per aggregate:   " << minNodesPerAggregate_ << std::endl;
    GetOStream(Runtime1) << "Max nbrs already selected: " << maxNeighAlreadySelected_ << std::endl;

    /* Create Aggregation object */
    my_size_t nAggregates = 0;

    /* ============================================================= */
    /* aggStat indicates whether this node has been aggreated, and   */
    /* vertex2AggId stores the aggregate number where this node has  */
    /* been aggregated into.                                         */
    /* ============================================================= */

    Teuchos::ArrayRCP<NodeState> aggStat;
    const my_size_t nRows = graph.GetNodeNumVertices();
    if (nRows > 0) aggStat = Teuchos::arcp<NodeState>(nRows);
    for ( my_size_t i = 0; i < nRows; ++i ) aggStat[i] = READY;

    /* ============================================================= */
    /* Phase 1  :                                                    */
    /*    for all nodes, form a new aggregate with its neighbors     */
    /*    if the number of its neighbors having been aggregated does */
    /*    not exceed a given threshold                               */
    /*    (GetMaxNeighAlreadySelected() = 0 ===> Vanek's scheme) */
    /* ============================================================= */

    /* some general variable declarations */
    Teuchos::ArrayRCP<LO> randomVector;
    RCP<MueLu::LinkedList> nodeList; /* list storing the next node to pick as a root point for ordering_ == GRAPH */
    MueLu_SuperNode  *aggHead=NULL, *aggCurrent=NULL, *supernode=NULL;
    /**/

    if ( ordering_ == RANDOM )       /* random ordering */
      {
        //TODO: could be stored in a class that respect interface of LinkedList

        randomVector = Teuchos::arcp<LO>(nRows); //size_t or int ?-> to be propagated
        for (my_size_t i = 0; i < nRows; ++i) randomVector[i] = i;
        RandomReorder(randomVector);
      }
    else if ( ordering_ == GRAPH )  /* graph ordering */
      {
        nodeList = rcp(new MueLu::LinkedList());
        nodeList->Add(0);
      }

    /* main loop */
    {
      LO iNode  = 0;
      LO iNode2 = 0;

      Teuchos::ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); // output only: contents ignored

      while (iNode2 < nRows)
        {

          /*------------------------------------------------------ */
          /* pick the next node to aggregate                       */
          /*------------------------------------------------------ */

          if      ( ordering_ == NATURAL ) iNode = iNode2++;
          else if ( ordering_ == RANDOM )  iNode = randomVector[iNode2++];
          else if ( ordering_ == GRAPH )
            {
              if ( nodeList->IsEmpty() )
                {
                  for ( int jNode = 0; jNode < nRows; ++jNode )
                    {
                      if ( aggStat[jNode] == READY )
                        {
                          nodeList->Add(jNode); //TODO optim: not necessary to create a node. Can just set iNode value and skip the end
                          break;
                        }
                    }
                }
              if ( nodeList->IsEmpty() ) break; /* end of the while loop */ //TODO: coding style :(

              iNode = nodeList->Pop();
            }
          else {
            throw(Exceptions::RuntimeError("CoarsenUncoupled: bad aggregation ordering option"));
          }

          /*------------------------------------------------------ */
          /* consider further only if the node is in READY mode    */
          /*------------------------------------------------------ */

          if ( aggStat[iNode] == READY )
            {
              // neighOfINode is the neighbor node list of node 'iNode'.
              Teuchos::ArrayView<const LO> neighOfINode = graph.getNeighborVertices(iNode);
              typename Teuchos::ArrayView<const LO>::size_type length = neighOfINode.size();

              supernode = new MueLu_SuperNode;
              try {
                supernode->list = Teuchos::arcp<int>(length+1);
              } catch (std::bad_alloc&) {
                TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu::LocalAggregationAlgorithm::CoarsenUncoupled(): Error: couldn't allocate memory for supernode! length=" + Teuchos::toString(length));
              }

              supernode->maxLength = length;
              supernode->length = 1;
              supernode->list[0] = iNode;

              int selectFlag = 1;
              {
                /*--------------------------------------------------- */
                /* count the no. of neighbors having been aggregated  */
                /*--------------------------------------------------- */

                int count = 0;
                for (typename Teuchos::ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it)
                  {
                    int index = *it;
                    if ( index < nRows )
                      {
                        if ( aggStat[index] == READY ||
                             aggStat[index] == NOTSEL )
                          supernode->list[supernode->length++] = index;
                        else count++;

                      }
                  }

                /*--------------------------------------------------- */
                /* if there are too many neighbors aggregated or the  */
                /* number of nodes in the new aggregate is too few,   */
                /* don't do this one                                  */
                /*--------------------------------------------------- */

                if ( count > GetMaxNeighAlreadySelected() ) selectFlag = 0;
              }

              // Note: the supernode length is actually 1 more than the
              //       number of nodes in the candidate aggregate. The
              //       root is counted twice. I'm not sure if this is
              //       a bug or a feature ... so I'll leave it and change
              //       < to <= in the if just below.

              if (selectFlag != 1 ||
                  supernode->length <= GetMinNodesPerAggregate())
                {
                  aggStat[iNode] = NOTSEL;
                  delete supernode;
                  if ( ordering_ == GRAPH ) /* if graph ordering */
                    {
                      for (typename Teuchos::ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it)
                        {
                          int index = *it;
                          if  ( index < nRows && aggStat[index] == READY )
                            {
                              nodeList->Add(index);
                            }
                        }
                    }
                }
              else
                {
                  aggregates.SetIsRoot(iNode);
                  for ( int j = 0; j < supernode->length; ++j )
                    {
                      int jNode = supernode->list[j];
                      aggStat[jNode] = SELECTED;
                      vertex2AggId[jNode] = nAggregates;
                      if ( ordering_ == GRAPH ) /* if graph ordering */
                        {

                          Teuchos::ArrayView<const LO> neighOfJNode = graph.getNeighborVertices(jNode);

                          for (typename Teuchos::ArrayView<const LO>::const_iterator it = neighOfJNode.begin(); it != neighOfJNode.end(); ++it)
                            {
                              int index = *it;
                              if ( index < nRows && aggStat[index] == READY )
                                {
                                  nodeList->Add(index);
                                }
                            }
                        }
                    }
                  supernode->next = NULL;
                  supernode->index = nAggregates;
                  if ( nAggregates == 0 )
                    {
                      aggHead = supernode;
                      aggCurrent = supernode;
                    }
                  else
                    {
                      aggCurrent->next = supernode;
                      aggCurrent = supernode;
                    }
                  nAggregates++;
                  // unused aggCntArray[nAggregates] = supernode->length;
                }
            }
        } // end of 'for'

      // views on distributed vectors are freed here.

    } // end of 'main loop'

    nodeList = Teuchos::null;

    /* Update aggregate object */
    aggregates.SetNumAggregates(nAggregates);

    /* Verbose */
    {
      const RCP<const Teuchos::Comm<int> > & comm = graph.GetComm();

      if (IsPrint(Warnings0)) {
        GO localReady=0, globalReady;

        // Compute 'localReady'
        for ( my_size_t i = 0; i < nRows; ++i )
          if (aggStat[i] == READY) localReady++;

        // Compute 'globalReady'
        sumAll(comm, localReady, globalReady);

        if(globalReady > 0)
          GetOStream(Warnings0) << "Warning: " << globalReady << " READY nodes left" << std::endl;
      }

      if (IsPrint(Statistics1)) {
        // Compute 'localSelected'
        LO localSelected=0;
        for ( my_size_t i = 0; i < nRows; ++i )
          if ( aggStat[i] == SELECTED ) localSelected++;

        // Compute 'globalSelected'
        GO globalSelected; sumAll(comm, (GO)localSelected, globalSelected);

        // Compute 'globalNRows'
        GO globalNRows; sumAll(comm, (GO)nRows, globalNRows);

        GetOStream(Statistics1) << "Nodes aggregated = " << globalSelected << " (" << globalNRows << ")" << std::endl;
      }

      if (IsPrint(Statistics1)) {
        GO nAggregatesGlobal; sumAll(comm, (GO)nAggregates, nAggregatesGlobal);
        GetOStream(Statistics1) << "Total aggregates = " << nAggregatesGlobal << std::endl;
      }

    } // verbose

      /* ------------------------------------------------------------- */
      /* clean up                                                      */
      /* ------------------------------------------------------------- */

    aggCurrent = aggHead;
    while ( aggCurrent != NULL )
      {
        supernode = aggCurrent;
        aggCurrent = aggCurrent->next;
        delete supernode;
      }

  } // CoarsenUncoupled
Esempio n. 18
0
// 次のページの開始位置、あるいは文書の終端 (== text_length) を返す。
size_t FillPage(size_t start, Page *page, bool draw)
{
    XWindowAttributes attrs;
    XGetWindowAttributes(disp, win, &attrs);

    // ページのサイズ。
    const int LEFT_MARGIN	= 50;
    const int RIGHT_MARGIN	= attrs.width - LEFT_MARGIN;
    const int TOP_MARGIN	= 50;
    const int BOTTOM_MARGIN	= attrs.height - TOP_MARGIN;

    const XChar2b sp = { 0x00, 0x21 };
    const int EM = GetCharWidth(sp);

    // 行の高さ。
    const int LINE_HEIGHT = 22;

    // 現在の文字の描画位置。
    int x = LEFT_MARGIN, y = TOP_MARGIN + font->ascent;

    size_t i;
    for (i = start; i < text_length; i++) {
	// カーソルの描画
	if (draw && i == cursor_position) {
	    XFillRectangle(disp, win, cursor_gc,
			   x, y - font->ascent,
			   CURSOR_WIDTH, font->ascent + font->descent);
	}

	if (IsPrint(text[i])) {
	    // 印字可能文字の場合。

	    int width = GetCharWidth(text[i]);

	    // この文字を描画すると右マージンにかかるようなら改行する。
	    // ただし、行頭に居る場合は改行しない。
	    if ( x + width > RIGHT_MARGIN &&
		 !ForbiddenAtStart(text[i]) && // 行頭禁止文字ならばぶらさげる
		 x != LEFT_MARGIN ) {
		y += LINE_HEIGHT;
		x = LEFT_MARGIN;

		// ページにも収まらない場合、この位置で終了する。
		if (y + font->descent > BOTTOM_MARGIN) {
		    page->start = start;
		    page->end = i;
		    return i;
		}
	    }

	    if (draw) XDrawString16(disp, win, gc,
				    x, y,
				    &text[i], 1);
	    x += width;
	} else {
	    // ラインフィードで改行する。
	    if (EqAscii2b(text[i], '\n')) {
		if (draw) {
		    // DOWNWARDS ARROW WITH TIP LEFTWARDS
		    XChar2b symbol = { .byte1 = 0x21, .byte2 = 0xb2 };
		    XDrawString16(disp, win, control_gc,
				  x, y,
				  &symbol, 1);
		}
		y += LINE_HEIGHT;
		x = LEFT_MARGIN;

		// ページにも収まらない場合、次の位置で終了する。
		// ページ区切り位置での改行は持ち越さない。
		if (y + font->descent > BOTTOM_MARGIN) {
		    page->start = start;
		    page->end = i + 1;
		    return i + 1;
		}
	    } else if (EqAscii2b(text[i], '\t')) {
		int tab = EM * 8;
		x = LEFT_MARGIN + (((x - LEFT_MARGIN) / tab) + 1) * tab;
	    }
	}
    }
    if (draw && i == cursor_position) {
	XFillRectangle(disp, win, cursor_gc,
		       x, y - font->ascent,
		       CURSOR_WIDTH, font->ascent + font->descent);
    }
    if (draw) XDrawString(disp, win, control_gc,
			  x, y,
			  "[EOF]", 5);
    // 全てのテキストを配置した。
    page->start = start;
    page->end = text_length;
    return text_length;
}
  void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const ParameterList& pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    bool bUseOnePtAggregationAlgorithm             = pL.get<bool>("UseOnePtAggregationAlgorithm");
    bool bUseSmallAggregationAlgorithm             = pL.get<bool>("UseSmallAggregatesAggregationAlgorithm");
    bool bUsePreserveDirichletAggregationAlgorithm = pL.get<bool>("UsePreserveDirichletAggregationAlgorithm");
    bool bUseUncoupledAggregationAglorithm         = pL.get<bool>("UseUncoupledAggregationAlgorithm");
    bool bUseMaxLinkAggregationAlgorithm           = pL.get<bool>("UseMaxLinkAggregationAlgorithm");
    bool bUseIsolatedNodeAggregationAglorithm      = pL.get<bool>("UseIsolatedNodeAggregationAlgorithm");
    bool bUseEmergencyAggregationAlgorithm         = pL.get<bool>("UseEmergencyAggregationAlgorithm");

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    if (bUseOnePtAggregationAlgorithm)             algos_.push_back(rcp(new OnePtAggregationAlgorithm             (graphFact)));
    if (bUseSmallAggregationAlgorithm)             algos_.push_back(rcp(new SmallAggregationAlgorithm             (graphFact)));
    if (bUseUncoupledAggregationAglorithm)         algos_.push_back(rcp(new UncoupledAggregationAlgorithm         (graphFact)));
    if (bUseMaxLinkAggregationAlgorithm)           algos_.push_back(rcp(new MaxLinkAggregationAlgorithm           (graphFact)));
    if (bUsePreserveDirichletAggregationAlgorithm) algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
    if (bUseIsolatedNodeAggregationAglorithm)      algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));
    if (bUseEmergencyAggregationAlgorithm)         algos_.push_back(rcp(new EmergencyAggregationAlgorithm         (graphFact)));


    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name"), mapSmallAggName = pL.get<std::string>("SmallAgg aggregate map name");
    RCP<const Map> OnePtMap, SmallAggMap;
    if (mapOnePtName.length()) {
      RCP<const FactoryBase> mapOnePtFact = GetFactory("OnePt aggregate map factory");
      OnePtMap = currentLevel.Get<RCP<const Map> >(mapOnePtName, mapOnePtFact.get());
    }
    if (mapSmallAggName.length()) {
      RCP<const FactoryBase> mapSmallAggFact = GetFactory("SmallAgg aggregate map factory");
      SmallAggMap = currentLevel.Get<RCP<const Map> >(mapSmallAggName, mapSmallAggFact.get());
    }

    RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
    aggregates->setObjectLabel("UC");

    const LO nRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(nRows, NodeStats::READY);

    ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    if (dirichletBoundaryMap != Teuchos::null) {
      for (LO i = 0; i < nRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = NodeStats::BOUNDARY;
    }

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (SmallAggMap != Teuchos::null || OnePtMap != Teuchos::null) {
      for (LO i = 0; i < nRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

        if (SmallAggMap != null) {
          for (LO kr = 0; kr < nDofsPerNode; kr++) {
            if (SmallAggMap->isNodeGlobalElement(grid + kr))
              aggStat[i] = MueLu::NodeStats::SMALLAGG;
          }
        }

        if (OnePtMap != null) {
          for (LO kr = 0; kr < nDofsPerNode; kr++) {
            if (OnePtMap->isNodeGlobalElement(grid + kr))
              aggStat[i] = MueLu::NodeStats::ONEPT;
          }
        }
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      sumAll(comm, as<GO>(nRows), numGlobalRows);

    LO numNonAggregatedNodes = nRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);

      if (IsPrint(Statistics1)) {

        GO numLocalAggregated = nRows - numNonAggregatedNodes,  numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(), numGlobalAggs = 0;
        sumAll(comm, numLocalAggregated, numGlobalAggregated);
        sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::CheckRepairMainDiagonal(RCP<Matrix>& Ac) const {
    const Teuchos::ParameterList& pL = GetParameterList();
    bool repairZeroDiagonals = pL.get<bool>("RepairMainDiagonal");
    bool checkAc             = pL.get<bool>("CheckMainDiagonal");

    if (!checkAc && !repairZeroDiagonals)
      return;

    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::rcp(new Teuchos::ParameterList());
    p->set("DoOptimizeStorage", true);

    RCP<const Map> rowMap = Ac->getRowMap();
    RCP<Vector> diagVec = VectorFactory::Build(rowMap);
    Ac->getLocalDiagCopy(*diagVec);

    LO lZeroDiags = 0;
    Teuchos::ArrayRCP< Scalar > diagVal = diagVec->getDataNonConst(0);

    for (size_t i = 0; i < rowMap->getNodeNumElements(); i++) {
      if (diagVal[i] == zero) {
        lZeroDiags++;
      }
    }
    GO gZeroDiags;
    MueLu_sumAll(rowMap->getComm(), Teuchos::as<GO>(lZeroDiags), gZeroDiags);

    if (repairZeroDiagonals && gZeroDiags > 0) {
      // TAW: If Ac has empty rows, put a 1 on the diagonal of Ac. Be aware that Ac might have empty rows AND columns.
      // The columns might not exist in the column map at all.
      //
      // It would be nice to add the entries to the original matrix Ac. But then we would have to use
      // insertLocalValues. However we cannot add new entries for local column indices that do not exist in the column map
      // of Ac (at least Epetra is not able to do this).
      //
      // Here we build a diagonal matrix with zeros on the diagonal and ones on the diagonal for the rows where Ac has empty rows
      // We have to build a new matrix to be able to use insertGlobalValues. Then we add the original matrix Ac to our new block
      // diagonal matrix and use the result as new (non-singular) matrix Ac.
      // This is very inefficient.
      //
      // If you know something better, please let me know.
      RCP<Matrix> fixDiagMatrix = Teuchos::null;
      fixDiagMatrix = MatrixFactory::Build(rowMap, 1);
      for (size_t r = 0; r < rowMap->getNodeNumElements(); r++) {
        if (diagVal[r] == zero) {
          GO grid = rowMap->getGlobalElement(r);
          Teuchos::ArrayRCP<GO> indout(1,grid);
          Teuchos::ArrayRCP<SC> valout(1, one);
          fixDiagMatrix->insertGlobalValues(grid,indout.view(0, 1), valout.view(0, 1));
        }
      }
      {
        Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete1"));
        Ac->fillComplete(p);
      }

      MueLu::Utils2<Scalar, LocalOrdinal, GlobalOrdinal, Node>::TwoMatrixAdd(*Ac, false, 1.0, *fixDiagMatrix, 1.0);
      if (Ac->IsView("stridedMaps"))
        fixDiagMatrix->CreateView("stridedMaps", Ac);

      Ac = Teuchos::null;     // free singular coarse level matrix
      Ac = fixDiagMatrix;     // set fixed non-singular coarse level matrix
    }

    // call fillComplete with optimized storage option set to true
    // This is necessary for new faster Epetra MM kernels.
    {
      Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete2"));
      Ac->fillComplete(p);
    }

    // print some output
    if (IsPrint(Warnings0))
      GetOStream(Warnings0) << "RAPFactory (WARNING): " << (repairZeroDiagonals ? "repaired " : "found ")
          << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;

#ifdef HAVE_MUELU_DEBUG // only for debugging
    // check whether Ac has been repaired...
    Ac->getLocalDiagCopy(*diagVec);
    Teuchos::ArrayRCP< Scalar > diagVal2 = diagVec->getDataNonConst(0);
    for (size_t r = 0; r < Ac->getRowMap()->getNodeNumElements(); r++) {
      if (diagVal2[r] == zero) {
        GetOStream(Errors,-1) << "Error: there are zeros left on diagonal after repair..." << std::endl;
        break;
      }
    }
#endif
  }
Esempio n. 21
0
static lexicon
Get()
{
    lexicon c;
    register lexicon *pC = &c;
    register int character;

    if (!Empty) {
	*pC = lifo[rp];
	rp++;
	if (rp == sizeof lifo/sizeof (lexicon)) {
	    rp = 0;
	}
	if (rp == wp) {
	    Empty = 1;
	}
	Full = 0;
    } else {
	character = GetC();
	switch (character) {
	case EOF:
	    pC->type = LEX_END_OF_FILE;
	    break;
	case '^':
	    character = GetC();
	    if (!IsPrint(character)) {
		pC->type = LEX_ILLEGAL;
	    } else {
		pC->type = LEX_CARETED;
		if (character == '?') {
		    character |= 0x40;	/* rubout */
		} else {
		    character &= 0x1f;
		}
	    }
	    break;
	case '\\':
	    character = GetC();
	    if (!IsPrint(character)) {
		pC->type = LEX_ILLEGAL;
	    } else {
		pC->type = LEX_ESCAPED;
		switch (character) {
		case 'E': case 'e':
		    character = ESCAPE;
		    break;
		case 't':
		    character = TAB;
		    break;
		case 'n':
		    character = NEWLINE;
		    break;
		case 'r':
		    character = CARRIAGE_RETURN;
		    break;
		default:
		    pC->type = LEX_ILLEGAL;
		    break;
		}
	    }
	    break;
	default:
	    if ((IsPrint(character)) || isspace(character)) {
		pC->type = LEX_CHAR;
	    } else {
		pC->type = LEX_ILLEGAL;
	    }
	    break;
	}
	pC->value = character;
    }
    return(*pC);
}
  void SaPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildP(Level &fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);
    std::ostringstream levelstr;
    levelstr << coarseLevel.GetLevelID();

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    // Get default tentative prolongator factory
    // Getting it that way ensure that the same factory instance will be used for both SaPFactory and NullspaceFactory.
    // -- Warning: Do not use directly initialPFact_. Use initialPFact instead everywhere!
    RCP<const FactoryBase> initialPFact = GetFactory("P");
    if (initialPFact == Teuchos::null) { initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }

    // Level Get
    RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
    RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

    if(restrictionMode_) {
      SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);
      A = Utils2::Transpose(*A, true); // build transpose of A explicitely
    }

    //Build final prolongator
    RCP<Matrix> finalP; // output

    const ParameterList & pL = GetParameterList();
    Scalar dampingFactor = as<Scalar>(pL.get<double>("sa: damping factor"));
    LO maxEigenIterations = as<LO>(pL.get<int>("sa: eigenvalue estimate num iterations"));
    bool estimateMaxEigen = pL.get<bool>("sa: calculate eigenvalue estimate");
    if (dampingFactor != Teuchos::ScalarTraits<Scalar>::zero()) {

      Scalar lambdaMax;
      {
        SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);
        lambdaMax = A->GetMaxEigenvalueEstimate();
        if (lambdaMax == -Teuchos::ScalarTraits<SC>::one() || estimateMaxEigen) {
          GetOStream(Statistics1) << "Calculating max eigenvalue estimate now (max iters = "<< maxEigenIterations << ")" << std::endl;
          Magnitude stopTol = 1e-4;
          lambdaMax = Utils::PowerMethod(*A, true, maxEigenIterations, stopTol);
          A->SetMaxEigenvalueEstimate(lambdaMax);
        } else {
          GetOStream(Statistics1) << "Using cached max eigenvalue estimate" << std::endl;
        }
        GetOStream(Statistics0) << "Prolongator damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
      }

      {
        SubFactoryMonitor m2(*this, "Fused (I-omega*D^{-1} A)*Ptent", coarseLevel);
        Teuchos::RCP<Vector> invDiag = Utils::GetMatrixDiagonalInverse(*A);

        SC omega = dampingFactor / lambdaMax;

        // finalP = Ptent + (I - \omega D^{-1}A) Ptent
        finalP = Utils::Jacobi(omega, *invDiag, *A, *Ptent, finalP, GetOStream(Statistics2),std::string("MueLu::SaP-")+levelstr.str());
      }

    } else {
      finalP = Ptent;
    }

    // Level Set
    if (!restrictionMode_) {
      // prolongation factory is in prolongation mode
      Set(coarseLevel, "P", finalP);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        finalP->CreateView("stridedMaps", Ptent);

    } else {
      // prolongation factory is in restriction mode
      RCP<Matrix> R = Utils2::Transpose(*finalP, true); // use Utils2 -> specialization for double
      Set(coarseLevel, "R", R);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        R->CreateView("stridedMaps", Ptent, true);
    }

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      params->set("printCommInfo",          true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), params);
    }

  } //Build()
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
    {
      FactoryMonitor m(*this, "Computing Ac", coarseLevel);
      std::ostringstream levelstr;
      levelstr << coarseLevel.GetLevelID();

      TEUCHOS_TEST_FOR_EXCEPTION(hasDeclaredInput_==false, Exceptions::RuntimeError,
                                 "MueLu::RAPFactory::Build(): CallDeclareInput has not been called before Build!");

      // Set "Keeps" from params
      const Teuchos::ParameterList& pL = GetParameterList();
      if (pL.get<bool>("Keep AP Pattern"))
        coarseLevel.Keep("AP Pattern",  this);
      if (pL.get<bool>("Keep RAP Pattern"))
        coarseLevel.Keep("RAP Pattern", this);

      RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
      RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P"), AP, Ac;

      // Reuse pattern if available (multiple solve)
      if (coarseLevel.IsAvailable("AP Pattern", this)) {
        GetOStream(Runtime0) << "Ac: Using previous AP pattern" << std::endl;

        AP = Get< RCP<Matrix> >(coarseLevel, "AP Pattern");
      }

      {
        SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);

        AP = Utils::Multiply(*A, false, *P, false, AP, GetOStream(Statistics2),true,true,std::string("MueLu::A*P-")+levelstr.str());
      }
      if (pL.get<bool>("Keep AP Pattern"))
        Set(coarseLevel, "AP Pattern", AP);

      // Reuse coarse matrix memory if available (multiple solve)
      if (coarseLevel.IsAvailable("RAP Pattern", this)) {
        GetOStream(Runtime0) << "Ac: Using previous RAP pattern" << std::endl;

        Ac = Get< RCP<Matrix> >(coarseLevel, "RAP Pattern");
        // Some eigenvalue may have been cached with the matrix in the previous run.
        // As the matrix values will be updated, we need to reset the eigenvalue.
        Ac->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
      }

      // If we do not modify matrix later, allow optimization of storage.
      // This is necessary for new faster Epetra MM kernels.
      bool doOptimizeStorage = !pL.get<bool>("RepairMainDiagonal");

      const bool doTranspose    = true;
      const bool doFillComplete = true;
      if (pL.get<bool>("transpose: use implicit") == true) {
        SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);
        Ac = Utils::Multiply(*P,  doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-implicit-")+levelstr.str());

      } else {
        RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");

        SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);
        Ac = Utils::Multiply(*R, !doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-explicit-")+levelstr.str());
      }

      CheckRepairMainDiagonal(Ac);

      if (IsPrint(Statistics1)) {
        RCP<ParameterList> params = rcp(new ParameterList());;
        params->set("printLoadBalancingInfo", true);
        params->set("printCommInfo",          true);
        GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ac, "Ac", params);
      }

      Set(coarseLevel, "A",           Ac);
      if (pL.get<bool>("Keep RAP Pattern"))
        Set(coarseLevel, "RAP Pattern", Ac);
    }

    if (transferFacts_.begin() != transferFacts_.end()) {
      SubFactoryMonitor m(*this, "Projections", coarseLevel);

      // call Build of all user-given transfer factories
      for (std::vector<RCP<const FactoryBase> >::const_iterator it = transferFacts_.begin(); it != transferFacts_.end(); ++it) {
        RCP<const FactoryBase> fac = *it;
        GetOStream(Runtime0) << "RAPFactory: call transfer factory: " << fac->description() << std::endl;
        fac->CallBuild(coarseLevel);
        // Coordinates transfer is marginally different from all other operations
        // because it is *optional*, and not required. For instance, we may need
        // coordinates only on level 4 if we start repartitioning from that level,
        // but we don't need them on level 1,2,3. As our current Hierarchy setup
        // assumes propagation of dependencies only through three levels, this
        // means that we need to rely on other methods to propagate optional data.
        //
        // The method currently used is through RAP transfer factories, which are
        // simply factories which are called at the end of RAP with a single goal:
        // transfer some fine data to coarser level. Because these factories are
        // kind of outside of the mainline factories, they behave different. In
        // particular, we call their Build method explicitly, rather than through
        // Get calls. This difference is significant, as the Get call is smart
        // enough to know when to release all factory dependencies, and Build is
        // dumb. This led to the following CoordinatesTransferFactory sequence:
        // 1. Request level 0
        // 2. Request level 1
        // 3. Request level 0
        // 4. Release level 0
        // 5. Release level 1
        //
        // The problem is missing "6. Release level 0". Because it was missing,
        // we had outstanding request on "Coordinates", "Aggregates" and
        // "CoarseMap" on level 0.
        //
        // This was fixed by explicitly calling Release on transfer factories in
        // RAPFactory. I am still unsure how exactly it works, but now we have
        // clear data requests for all levels.
        coarseLevel.Release(*fac);
      }
    }

  }
Esempio n. 24
0
bool 
Input::Test( )
{
    bool ok = true;
    cout << "Testing Input" << endl;

    Input & input = Input::Instance();

    try
    {
#if defined(USE_CWIID)
        cout << "To use a Wiimote, hold buttons 1 & 2 down." << endl;
#endif
        cout << "Init()" << endl;
        input.Init( );

        cout << "Number of devices: " << input.NumDevices() << endl;
        cout << "Index\tType\t\tButtons\tPnters\tAxes\tAccels\tName" << endl;
        for ( int i = 0; i < input.NumDevices(); ++i )
        {
            shared_ptr< InputDevice const > pDev =  input.Device( i );
            Assert( pDev != 0 );
            cout << i << "\t" << DeviceTypeName( pDev->Type() ) << "   "
                 << "\t" << pDev->NumButtons() << "\t" << pDev->NumPointers()
                 << "\t" << pDev->NumAxes() << "\t" << pDev->NumAccelerometers()
                 << "\t" << pDev->Name() << endl;
        }
        cout << endl;

        cout << "Shutdown()" << endl;
        input.Shutdown( );

        cout << "Init()" << endl;
        input.Init( );
        for ( int i = 0; i < input.NumDevices(); ++i )
        {
            shared_ptr< InputDevice const > pDev =  input.Device( i );
            Assert( pDev != 0 );
        }
        cout << "Init() again" << endl;
        input.Init( );
        cout << endl;

        cout << "Number of devices: " << input.NumDevices() << endl;
        cout << "Index\tType\t\tButtons\tPnters\tAxes\tAccels\tName" << endl;
        for ( int i = 0; i < input.NumDevices(); ++i )
        {
            shared_ptr< InputDevice const > pDev =  input.Device( i );
            Assert( pDev != 0 );
            cout << i << "\t" << DeviceTypeName( pDev->Type() ) << "   "
                 << "\t" << pDev->NumButtons() << "\t" << pDev->NumPointers()
                 << "\t" << pDev->NumAxes() << "\t" << pDev->NumAccelerometers()
                 << "\t" << pDev->Name() << endl;
        }
        cout << endl;

        cout << "Ready to report events." << endl;
        Timer realTime;
        while ( true )
        {
            input.Update( );
            shared_ptr< InputEvent const > pEvent = input.CheckEvent( );
            if ( pEvent )
            {
                shared_ptr< InputDevice const > pDev = pEvent->Device();
                int button = pEvent->Button();
                cout << "Button=" << hex << setw(2) << setfill('0') << button
                     << dec << setw(0) << setfill(' ');
                if ( (pDev->Type() == InputDevice::Keyboard)
                     && (button <= 0xFF) && IsPrint( (char)button ) )
                    cout << " (" << (char)button << ")";
                cout << "  Device: Type=" << DeviceTypeName( pDev->Type() )
                     << " Name=\"" << pDev->Name() << "\"" << endl;
                cout << "  Buttons down: ";
                for ( int i = 0; i < pDev->NumButtons(); ++i )
                    if ( pDev->ButtonDown( i ) )
                        cout << hex << setw(2) << setfill('0') << i
                             << dec << setw(0) << setfill(' ') << " ";
                cout << endl;
                if ( pDev->NumPointers() > 0 )
                {
                    cout << "  Pointers: ";
                    for ( int i = 0; i < pDev->NumPointers(); ++i )
                        cout << i << ": " << pDev->Pointer( i ) << "   ";
                    cout << endl;
                }
                if ( pDev->NumAxes() > 0 )
                {
                    cout << "  Axes: ";
                    for ( int i = 0; i < pDev->NumAxes(); ++i )
                        cout << i << ": " << pDev->Axis( i ) << "   ";
                    cout << endl;
                }
                if ( pDev->NumAccelerometers() > 0 )
                {
                    cout << "  Accelerometers: ";
                    for ( int i = 0; i < pDev->NumAccelerometers(); ++i )
                        cout << i << ": " << "Accel=" << pDev->Acceleration( i )
                             << " Gravity=" << pDev->Gravity( i ) << "   ";
                    cout << endl;
                }
            }
            if ( realTime.Seconds( ) > 60. )
            {
                cout << "Time's up" << endl;
                break;
            }
        }

        realTime.Reset();
        cout << "SetTextInput( true )" << endl;
        input.SetTextInput( true );
        while ( true )
        {
            input.Update( );
            shared_ptr< InputEvent const > pEvent = input.CheckEvent( );
            if ( pEvent )
            {
                if ( pEvent->Device()->Type() == InputDevice::Keyboard )
                {
                    int button = pEvent->Button();
                    if ( (button < MaximumCodePoint)
                         && IsPrint( (wchar_t)button ) )
                        wcout << (wchar_t)button << flush;
                    else if ( button > 0 )
                        cout << "[" << hex << setw(2) << setfill('0') << button
                             << dec << setw(0) << setfill(' ') << "]" << flush;
                }
            }
            if ( realTime.Seconds( ) > 60. )
            {
                cout << endl << "Time's up" << endl;
                break;
            }
        }
        wcout << endl;
    }
    catch ( Exception & except )
    {
        cout << except.Description( ) << endl;
        ok = false;
    }

    if ( ok )
        cout << "Input PASSED." << endl << endl;
    else
        cout << "Input FAILED." << endl << endl;
    return ok;
}
Esempio n. 25
0
  void ByteSink::Vprintf(const char * format, va_list & ap)
  {
    MFM_API_ASSERT_NONNULL(format);

    u8 p;
    Format::Type type;
    bool alt;
    s32 fieldWidth;
    u8 padChar;
    while ((p = *format++)) {
      if (p != '%') {
        if (p == '\n')          // '\n's _in_the_format_string_ are
          Println();            // treated as packet delimiters!
        else
          Print(p,Format::BYTE);
        continue;
      }

    alt = false;
    fieldWidth = -1;
    padChar = ' ';
  again:
    switch (p = *format++) {
    case '#': alt = true; goto again;

    case '0':
      if (fieldWidth < 0) {
        padChar = '0';
        fieldWidth = 0;
      } else fieldWidth *= 10;
      goto again;

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9':
      if (fieldWidth < 0) fieldWidth = 0;
      fieldWidth = fieldWidth * 10 + (p - '0');
      goto again;

    case 'c':
      {
        u32 ch = va_arg(ap,int);
        if (!alt || IsPrint(ch)) Print(ch,Format::BYTE);
        else {
          Print('[',Format::BYTE);
          if (ch < 0x10)
            Print('0', Format::BYTE);
          Print(ch, Format::HEX);
          Print(']', Format::BYTE);
        }
      }
      break;

    case '<':  // Print the rest of a given &ByteSource
      {
        ByteSource * bs = va_arg(ap,ByteSource*);
        if (!bs) Print("(null)");
        else Copy(*bs);
      }
      break;

    case '@':
      {
        s32 argument = 0;
        if (alt) argument = va_arg(ap,s32);
        ByteSerializable * bs = va_arg(ap,ByteSerializable*);
        if (!bs) Print("(null)");
        else Print(*bs, argument);
      }
      break;

    case 'q':
      Print(va_arg(ap,u64), Format::BEU64);
      break;
    case 'H': type = Format::LEXHD; goto print;
    case 'h': type = Format::BEU16; goto print;
    case 'l': type = Format::BEU32; goto print;

    case 'b': type = Format::BIN; goto printbase;
    case 'o': type = Format::OCT; goto printbase;
    case 'u': type = Format::DEC; goto printbase;
    case 'd':
      if (alt) {
        type = Format::DEC;
        goto printbase;
      } else {
        Print(va_arg(ap,s32), fieldWidth, padChar);
        break;
      }
    case 'x': type = Format::HEX; goto printbase;
    case 't': type = Format::B36; goto printbase;

    case 'D': type = Format::LEX32; goto print;
    case 'X': type = Format::LXX32; goto print;

    printbase:
      PrintInBase(va_arg(ap,u32), type, fieldWidth, padChar);
      break;
    print:
      Print(va_arg(ap,u32),type, fieldWidth, padChar);
      break;

    case 'f': 
    {
      FAIL(INCOMPLETE_CODE);
      /*
      double v =  va_arg(ap,double);
      ByteSink::Print(face,v);
      break;
      */
    }

    /* %Z: Print a null-terminated string INCLUDING a trailing NULL */
    case 'Z': 
    {
      const char * s = va_arg(ap,const char *);
      if (!s) Print("(null)", fieldWidth, padChar);
      else Print(s, fieldWidth, padChar);
    }
    /* FALL THROUGH */
    /* %z: Print a null byte.  Consumes no args */
    case 'z': 
      WriteByte('\0');
      break;

    case 's': {
      const char * s = va_arg(ap,const char *);
      if (!s) Print("(null)", fieldWidth, padChar);
      else Print(s, fieldWidth, padChar);
      break;
    }

    case 'S': {
      const char * s = va_arg(ap,const char *);
      if (!s) Print("(null)", fieldWidth, padChar);
      else PrintDoubleQuotedCStringWithLength(s);
      break;
    }

    case 'p': {
      const void * p = va_arg(ap,void *);
      if (!p) Print("(nullp)");
      else {
        Print("0x");
        Print((uptr) p, Format::HEX);
      }
      break;
    }

    case '%':
      Print(p, Format::BYTE);
      break;

    default:                    // Either I don't know that code, or you're bogus.
      FAIL(BAD_FORMAT_ARG);     // Either way, I die.  You're welcome.
    }
    }
  }
  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const Teuchos::ParameterList & pL = GetParameterList();
    // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
    // TODO (JG): I don't really know if we want to do this.
    const int    startLevel          = pL.get<int>   ("repartition: start level");
    const LO     minRowsPerProcessor = pL.get<LO>    ("repartition: min rows per proc");
    const double nonzeroImbalance    = pL.get<double>("repartition: max imbalance");
    const bool   remapPartitions     = pL.get<bool>  ("repartition: remap parts");

    // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
    RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");

    // ======================================================================================================
    // Determine whether partitioning is needed
    // ======================================================================================================
    // NOTE: most tests include some global communication, which is why we currently only do tests until we make
    // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
    // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.

    // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
    if (currentLevel.GetLevelID() < startLevel) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" <<
          "\n  current level = " << Teuchos::toString(currentLevel.GetLevelID()) <<
          ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    RCP<const Map> rowMap = A->getRowMap();

    // NOTE: Teuchos::MPIComm::duplicate() calls MPI_Bcast inside, so this is
    // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it
    // does not matter.
    RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
    RCP<const Teuchos::Comm<int> > comm     = origComm->duplicate();

    // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
    // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
    // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
    {
      int numActiveProcesses = 0;
      MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses);

      if (numActiveProcesses == 1) {
        GetOStream(Statistics0) << "Repartitioning?  NO:" <<
            "\n  # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    bool test3 = false, test4 = false;
    std::string msg3, msg4;

    // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
    // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3)
    if (minRowsPerProcessor > 0) {
      LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
      LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0;
      MueLu_sumAll(comm, haveFewRows, numWithFewRows);
      MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

      // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
      // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
      // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
      if (numWithFewRows > 0)
        test3 = true;

      msg3 = "\n  min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor);
    }

    // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
    if (!test3) {
      GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
      MueLu_maxAll(comm, numMyNnz,                           maxNnz);
      MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
      double imbalance = Teuchos::as<double>(maxNnz)/minNnz;

      if (imbalance > nonzeroImbalance)
        test4 = true;

      msg4 = "\n  nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
    }

    if (!test3 && !test4) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" << msg3 + msg4 << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

    GO                     indexBase = rowMap->getIndexBase();
    Xpetra::UnderlyingLib  lib       = rowMap->lib();
    int myRank   = comm->getRank();
    int numProcs = comm->getSize();

    RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
    TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

    // ======================================================================================================
    // Calculate number of partitions
    // ======================================================================================================
    // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
    // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
    GO numPartitions;
    if (currentLevel.IsAvailable("number of partitions")) {
      numPartitions = currentLevel.Get<GO>("number of partitions");
      GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

    } else {
      if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) {
        // System is too small, migrate it to a single processor
        numPartitions = 1;

      } else {
        // Make sure that each processor has approximately minRowsPerProcessor
        numPartitions = A->getGlobalNumRows() / minRowsPerProcessor;
      }
      numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs));

      currentLevel.Set("number of partitions", numPartitions, NoFactory::get());
    }
    GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl;

    // ======================================================================================================
    // Construct decomposition vector
    // ======================================================================================================
    RCP<GOVector> decomposition;
    if (numPartitions == 1) {
      // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
      // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
      // in Zoltan[12]Interface).
      // TODO: We can probably skip more work in this case (like building all extra data structures)
      GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl;
      decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true);

    } else {
      decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

      if (decomposition.is_null()) {
        GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    // ======================================================================================================
    // Remap if necessary
    // ======================================================================================================
    // From a user perspective, we want user to not care about remapping, thinking of it as only a performance feature.
    // There are two problems, however.
    // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
    //     This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
    //     each partition is the same.
    // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
    //     Let us consider the following example. Lets assume that when we don't do remapping, processor 0 would have
    //     GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and
    //     processor 1 {0,1,2}. Now, when we run repartitioning algorithm on the next level (say Zoltan1 RCB), it may
    //     be dependent on whether whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
    //     algorithm can resolve these differently. For instance, running
    //         mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
    //     with
    //         <ParameterList name="MueLu">
    //           <Parameter name="coarse: max size"                type="int"      value="1"/>
    //           <Parameter name="repartition: enable"             type="bool"     value="true"/>
    //           <Parameter name="repartition: min rows per proc"  type="int"      value="2"/>
    //           <ParameterList name="level 1">
    //             <Parameter name="repartition: remap parts"      type="bool"     value="false/true"/>
    //           </ParameterList>
    //         </ParameterList>
    //     produces different repartitioning for level 2.
    //     This different repartitioning may then escalate into different aggregation for the next level.
    //
    // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
    // Fixing (2) is more complicated.
    // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
    // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
    // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
    // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
    // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
    // This rank permutation requires help from Epetra/Tpetra, both of which have no such API in place.
    // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in
    // MPI_COMM_WORLD, which may lead to issues for logging.
    if (remapPartitions) {
      SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);

      DeterminePartitionPlacement(*A, *decomposition, numPartitions);
    }

    // ======================================================================================================
    // Construct importer
    // ======================================================================================================
    // At this point, the following is true:
    //  * Each processors owns 0 or 1 partitions
    //  * If a processor owns a partition, that partition number is equal to the processor rank
    //  * The decomposition vector contains the partitions ids that the corresponding GID belongs to

    ArrayRCP<const GO> decompEntries;
    if (decomposition->getLocalLength() > 0)
      decompEntries = decomposition->getData(0);

#ifdef HAVE_MUELU_DEBUG
    // Test range of partition ids
    int incorrectRank = -1;
    for (int i = 0; i < decompEntries.size(); i++)
      if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
        incorrectRank = myRank;
        break;
      }

    int incorrectGlobalRank = -1;
    MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
    TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank >- 1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number is that out-of-range");
#endif

    Array<GO> myGIDs;
    myGIDs.reserve(decomposition->getLocalLength());

    // Step 0: Construct mapping
    //    part number -> GIDs I own which belong to this part
    // NOTE: my own part GIDs are not part of the map
    typedef std::map<GO, Array<GO> > map_type;
    map_type sendMap;
    for (LO i = 0; i < decompEntries.size(); i++) {
      GO id  = decompEntries[i];
      GO GID = rowMap->getGlobalElement(i);

      if (id == myRank)
        myGIDs     .push_back(GID);
      else
        sendMap[id].push_back(GID);
    }
    decompEntries = Teuchos::null;

    if (IsPrint(Statistics2)) {
      GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
      MueLu_sumAll(comm,numLocalKept, numGlobalKept);
      GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
    }

    int numSend = sendMap.size(), numRecv;

    // Arrayify map keys
    Array<GO> myParts(numSend), myPart(1);
    int cnt = 0;
    myPart[0] = myRank;
    for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
      myParts[cnt++] = it->first;

    // Step 1: Find out how many processors send me data
    // partsIndexBase starts from zero, as the processors ids start from zero
    GO partsIndexBase = 0;
    RCP<Map>    partsIHave  = MapFactory   ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
    RCP<Map>    partsIOwn   = MapFactory   ::Build(lib,                                                 numProcs,  myPart(), partsIndexBase, comm);
    RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);

    RCP<GOVector> partsISend    = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
    RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
    if (numSend) {
      ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
      for (int i = 0; i < numSend; i++)
        partsISendData[i] = 1;
    }
    (numPartsIRecv->getDataNonConst(0))[0] = 0;

    numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
    numRecv = (numPartsIRecv->getData(0))[0];

    // Step 2: Get my GIDs from everybody else
    MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
    int msgTag = 12345;  // TODO: use Comm::dup for all internal messaging

    // Post sends
    Array<MPI_Request> sendReqs(numSend);
    cnt = 0;
    for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
      MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);

    map_type recvMap;
    size_t totalGIDs = myGIDs.size();
    for (int i = 0; i < numRecv; i++) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);

      // Get rank and number of elements from status
      int fromRank = status.MPI_SOURCE, count;
      MPI_Get_count(&status, MpiType, &count);

      recvMap[fromRank].resize(count);
      MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);

      totalGIDs += count;
    }

    // Do waits on send requests
    if (numSend) {
      Array<MPI_Status> sendStatuses(numSend);
      MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
    }

    // Merge GIDs
    myGIDs.reserve(totalGIDs);
    for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
      int offset = myGIDs.size(), len = it->second.size();
      if (len) {
        myGIDs.resize(offset + len);
        memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
      }
    }
    // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks
    // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
    std::sort(myGIDs.begin(), myGIDs.end());

    // Step 3: Construct importer
    RCP<Map>          newRowMap      = MapFactory   ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
    RCP<const Import> rowMapImporter;
    {
      SubFactoryMonitor m1(*this, "Import construction", currentLevel);
      rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
    }

    Set(currentLevel, "Importer", rowMapImporter);

    // ======================================================================================================
    // Print some data
    // ======================================================================================================
    if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
      // Print the grid of processors
      GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;

      char amActive = (myGIDs.size() ? 1 : 0);
      std::vector<char> areActive(numProcs, 0);
      MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);

      int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
      for (int proc = 0; proc < numProcs; proc += rowWidth) {
        for (int j = 0; j < rowWidth; j++)
          if (proc + j < numProcs)
            GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
          else
          GetOStream(Statistics2) << " ";

        GetOStream(Statistics2) << "      " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
      }
    }

  } // Build
  void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    ParameterList pL = GetParameterList();
    bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

    // define aggregation algorithms
    RCP<const FactoryBase> graphFact = GetFactory("Graph");

    // TODO Can we keep different aggregation algorithms over more Build calls?
    algos_.clear();
    if (pL.get<std::string>("aggregation: mode") == "old") {
      if (pL.get<bool>("UseOnePtAggregationAlgorithm")             == true)   algos_.push_back(rcp(new OnePtAggregationAlgorithm             (graphFact)));
      if (pL.get<bool>("UsePreserveDirichletAggregationAlgorithm") == true)   algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
      if (pL.get<bool>("UseUncoupledAggregationAlgorithm")         == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm            (graphFact)));
      if (pL.get<bool>("UseMaxLinkAggregationAlgorithm")           == true)   algos_.push_back(rcp(new MaxLinkAggregationAlgorithm           (graphFact)));
      if (pL.get<bool>("UseEmergencyAggregationAlgorithm")         == true)   algos_.push_back(rcp(new EmergencyAggregationAlgorithm         (graphFact)));
                                                                              algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));

    } else {
      if (pL.get<bool>("aggregation: preserve Dirichlet points")   == true)   algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 1" )             == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm            (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 2a")             == true)   algos_.push_back(rcp(new AggregationPhase2aAlgorithm           (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 2b")             == true)   algos_.push_back(rcp(new AggregationPhase2bAlgorithm           (graphFact)));
      if (pL.get<bool>("aggregation: enable phase 3" )             == true)   algos_.push_back(rcp(new AggregationPhase3Algorithm            (graphFact)));
                                                                              algos_.push_back(rcp(new IsolatedNodeAggregationAlgorithm      (graphFact)));
    }

    std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name");
    RCP<const Map> OnePtMap;
    if (mapOnePtName.length()) {
      RCP<const FactoryBase> mapOnePtFact = GetFactory("OnePt aggregate map factory");
      OnePtMap = currentLevel.Get<RCP<const Map> >(mapOnePtName, mapOnePtFact.get());
    }

    RCP<const GraphBase> graph = Get< RCP<GraphBase> >(currentLevel, "Graph");

    // Build
    RCP<Aggregates> aggregates = rcp(new Aggregates(*graph));
    aggregates->setObjectLabel("UC");

    const LO numRows = graph->GetNodeNumVertices();

    // construct aggStat information
    std::vector<unsigned> aggStat(numRows, READY);

    ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
    if (dirichletBoundaryMap != Teuchos::null)
      for (LO i = 0; i < numRows; i++)
        if (dirichletBoundaryMap[i] == true)
          aggStat[i] = BOUNDARY;

    LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
    GO indexBase = graph->GetDomainMap()->getIndexBase();
    if (OnePtMap != Teuchos::null) {
      for (LO i = 0; i < numRows; i++) {
        // reconstruct global row id (FIXME only works for contiguous maps)
        GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

        for (LO kr = 0; kr < nDofsPerNode; kr++)
          if (OnePtMap->isNodeGlobalElement(grid + kr))
            aggStat[i] = ONEPT;
      }
    }


    const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
    GO numGlobalRows = 0;
    if (IsPrint(Statistics1))
      sumAll(comm, as<GO>(numRows), numGlobalRows);

    LO numNonAggregatedNodes = numRows;
    GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
    for (size_t a = 0; a < algos_.size(); a++) {
      std::string phase = algos_[a]->description();
      SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

      algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);

      if (IsPrint(Statistics1)) {

        GO numLocalAggregated = numRows - numNonAggregatedNodes, numGlobalAggregated = 0;
        GO numLocalAggs       = aggregates->GetNumAggregates(),  numGlobalAggs = 0;
        sumAll(comm, numLocalAggregated, numGlobalAggregated);
        sumAll(comm, numLocalAggs,       numGlobalAggs);

        double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
        if (aggPercent > 99.99 && aggPercent < 100.00) {
          // Due to round off (for instance, for 140465733/140466897), we could
          // get 100.00% display even if there are some remaining nodes. This
          // is bad from the users point of view. It is much better to change
          // it to display 99.99%.
          aggPercent = 99.99;
        }
        GetOStream(Statistics1) << "  aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                                   << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                                   << "  remaining  : " << numGlobalRows - numGlobalAggregated << "\n"
                                   << "  aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
        numGlobalAggregatedPrev = numGlobalAggregated;
        numGlobalAggsPrev       = numGlobalAggs;
      }
    }

    TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

    aggregates->AggregatesCrossProcessors(false);

    Set(currentLevel, "Aggregates", aggregates);

    GetOStream(Statistics0) << aggregates->description() << std::endl;
  }
void mutt_FormatString (char *dest,		/* output buffer */
			size_t destlen,		/* output buffer len */
			const char *src,	/* template string */
			format_t *callback,	/* callback for processing */
			unsigned long data,	/* callback data */
			format_flag flags)	/* callback flags */
{
  char prefix[SHORT_STRING], buf[LONG_STRING], *cp, *wptr = dest, ch;
  char ifstring[SHORT_STRING], elsestring[SHORT_STRING];
  size_t wlen, count, len;

  destlen--; /* save room for the terminal \0 */
  wlen = (flags & M_FORMAT_ARROWCURSOR && option (OPTARROWCURSOR)) ? 3 : 0;
    
  while (*src && wlen < destlen)
  {
    if (*src == '%')
    {
      if (*++src == '%')
      {
	*wptr++ = '%';
	wlen++;
	src++;
	continue;
      }

      if (*src == '?')
      {
	flags |= M_FORMAT_OPTIONAL;
	src++;
      }
      else
      {
	flags &= ~M_FORMAT_OPTIONAL;

	/* eat the format string */
	cp = prefix;
	count = 0;
	while (count < sizeof (prefix) &&
	       (isdigit ((unsigned char) *src) || *src == '.' || *src == '-'))
	{
	  *cp++ = *src++;
	  count++;
	}
	*cp = 0;
      }

      if (!*src)
	break; /* bad format */

      ch = *src++; /* save the character to switch on */

      if (flags & M_FORMAT_OPTIONAL)
      {
        if (*src != '?')
          break; /* bad format */
        src++;

        /* eat the `if' part of the string */
        cp = ifstring;
	count = 0;
        while (count < sizeof (ifstring) && *src && *src != '?' && *src != '&')
	{
          *cp++ = *src++;
	  count++;
	}
        *cp = 0;

	/* eat the `else' part of the string (optional) */
	if (*src == '&')
	  src++; /* skip the & */
	cp = elsestring;
	count = 0;
	while (count < sizeof (elsestring) && *src && *src != '?')
	{
	  *cp++ = *src++;
	  count++;
	}
	*cp = 0;

	if (!*src)
	  break; /* bad format */

        src++; /* move past the trailing `?' */
      }

      /* handle generic cases first */
      if (ch == '>')
      {
	/* right justify to EOL */
	ch = *src++; /* pad char */
	/* calculate space left on line.  if we've already written more data
	   than will fit on the line, ignore the rest of the line */
	count = (COLS < destlen ? COLS : destlen);
	if (count > wlen)
	{
	  count -= wlen; /* how many chars left on this line */
	  mutt_FormatString (buf, sizeof (buf), src, callback, data, flags);
	  len = mutt_strlen (buf);
	  if (count > len)
	  {
	    count -= len; /* how many chars to pad */
	    memset (wptr, ch, count);
	    wptr += count;
	    wlen += count;
	  }
	  if (len + wlen > destlen)
	    len = destlen - wlen;
	  memcpy (wptr, buf, len);
	  wptr += len;
	  wlen += len;
	}
	break; /* skip rest of input */
      }
      else if (ch == '|')
      {
	/* pad to EOL */
	ch = *src++;
	if (destlen > COLS)
	  destlen = COLS;
	if (destlen > wlen)
	{
	  count = destlen - wlen;
	  memset (wptr, ch, count);
	  wptr += count;
	}
	break; /* skip rest of input */
      }
      else
      {
	short tolower =  0;
	
	if (ch == '_')
	{
	  ch = *src++;
	  tolower = 1;
	}
	
	/* use callback function to handle this case */
	src = callback (buf, sizeof (buf), ch, src, prefix, ifstring, elsestring, data, flags);

	if (tolower)
	  mutt_strlower (buf);
	
	if ((len = mutt_strlen (buf)) + wlen > destlen)
	  len = (destlen - wlen > 0) ? (destlen - wlen) : 0;

	memcpy (wptr, buf, len);
	wptr += len;
	wlen += len;
      }
    }
    else if (*src == '\\')
    {
      if (!*++src)
	break;
      switch (*src)
      {
	case 'n':
	  *wptr = '\n';
	  break;
	case 't':
	  *wptr = '\t';
	  break;
	case 'r':
	  *wptr = '\r';
	  break;
	case 'f':
	  *wptr = '\f';
	  break;
	case 'v':
	  *wptr = '\v';
	  break;
	default:
	  *wptr = *src;
	  break;
      }
      src++;
      wptr++;
      wlen++;
    }
    else
    {
      *wptr++ = *src++;
      wlen++;
    }
  }
  *wptr = 0;

  if (flags & M_FORMAT_MAKEPRINT)
  {
    /* Make sure that the string is printable by changing all non-printable
       chars to dots, or spaces for non-printable whitespace */
    for (cp = dest ; *cp ; cp++)
      if (!IsPrint (*cp) &&
	  !((flags & M_FORMAT_TREE) && (*cp <= M_TREE_MAX)))
	*cp = isspace ((unsigned char) *cp) ? ' ' : '.';
  }
}
  void Q2Q1PFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel, "A");

    RCP<const Map>        rowMap  = A->getRowMap();

    Xpetra::global_size_t N = rowMap->getGlobalNumElements();

    int V;
    size_t n = as<size_t>(sqrt(N));
    if (N == n*n) {
      // pressure mode
      V = 1;
      GetOStream(Runtime1) << "Pressure mode" << std::endl;

    } else {
      n = as<size_t>(sqrt(N/2));

      if (N == 2*n*n) {
        // velocity mode
        V = 2;
        GetOStream(Runtime1) << "Velocity mode" << std::endl;
      } else {
        throw Exceptions::RuntimeError("Matrix size (" + toString(N) + ") is incompatible with both velocity and pressure");
      }
    }

    const int             C = 4;
    Xpetra::global_size_t nc = (n-1)/C + 1;
    TEUCHOS_TEST_FOR_EXCEPTION(C*(nc-1)+1 != n, Exceptions::InvalidArgument, "Incorrect dim size: " << n);


    ArrayView<const GO> elementList = rowMap->getNodeElementList();
    GO indexBase = rowMap->getIndexBase();

    // Calculate offsets
    GO offset       = (V == 2 ? 0 : 2*(2*n -1)*(2*n -1));
    GO coarseOffset = (V == 2 ? 0 : 2*(2*nc-1)*(2*nc-1));

    GetOStream(Runtime1) << "offset = " << offset << ", coarseOffset = " << coarseOffset << std::endl;

    Array<GO> coarseList;
    for (LO k = 0; k < elementList.size(); k += V) {
      GO GID = elementList[k] - offset - indexBase;
      GO i = (GID / V) % n, ii = i/C;
      GO j = (GID / V) / n, jj = j/C;

      if (i % C == 0 && j % C == 0)
        for (int q = 0; q < V; q++)
          coarseList.push_back(V*(jj*nc + ii) + q + coarseOffset);
    }

    typedef Teuchos::ScalarTraits<SC> STS;
    SC one = STS::one();

    Xpetra::global_size_t INVALID = Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid();
    std::vector<size_t> stridingInfo(1,1);
    const int           stridedBlockId = -1;
    RCP<Map>         coarseMap       = StridedMapFactory ::Build(rowMap->lib(), INVALID, coarseList, indexBase, stridingInfo, rowMap->getComm(), stridedBlockId, coarseOffset);
    RCP<MultiVector> coarseNullspace = MultiVectorFactory::Build(coarseMap, 1);
    coarseNullspace->putScalar(one);

    int nnzEstimate = 4;
    RCP<Matrix> P = MatrixFactory::Build(rowMap, coarseMap, nnzEstimate);

    Array<GO> inds(nnzEstimate), inds1(nnzEstimate);
    Array<SC> vals(nnzEstimate, one);
    int sz;
    for (LO k = 0; k < elementList.size(); k += V) {
      GO GID = elementList[k] - offset - indexBase;
      GO i = (GID/V) % n, ii = i/C;
      GO j = (GID/V) / n, jj = j/C;

      if        (i % C == 0 && j % C == 0) {
        sz = 1;
        inds[0] =  jj   *nc + ii  ;

      } else if (i % C == 0 && j % C != 0) {
        sz = 2;
        inds[0] =  jj   *nc + ii  ;
        inds[1] = (jj+1)*nc + ii  ;

      } else if (i % C != 0 && j % C == 0) {
        sz = 2;
        inds[0] =  jj   *nc + ii  ;
        inds[1] =  jj   *nc + ii+1;

      } else {
        sz = 4;
        inds[0] =  jj   *nc + ii  ;
        inds[1] =  jj   *nc + ii+1;
        inds[2] = (jj+1)*nc + ii  ;
        inds[3] = (jj+1)*nc + ii+1;
      }

      for (int q = 0; q < V; q++) {
        for (int p = 0; p < sz; p++)
          inds1[p] = V*inds[p]+q + coarseOffset;

        P->insertGlobalValues(elementList[k]+q, inds1.view(0,sz), vals.view(0,sz));
      }
    }

    P->fillComplete(coarseMap, A->getDomainMap());

    // Level Set
    Set(coarseLevel, "Nullspace", coarseNullspace);
    Set(coarseLevel, "P",         P);
    Set(fineLevel,   "CoarseMap", coarseMap);

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      params->set("printCommInfo",          true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*P, "P", params);
    }

  }
  void RebalanceTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    const ParameterList& pL = GetParameterList();

    int implicit   = !pL.get<bool>("repartition: rebalance P and R");
    int writeStart = pL.get<int> ("write start");
    int writeEnd   = pL.get<int> ("write end");

    if (writeStart == 0 && fineLevel.GetLevelID() == 0 && writeStart <= writeEnd && IsAvailable(fineLevel, "Coordinates")) {
      std::string fileName = "coordinates_level_0.m";
      RCP<MultiVector> fineCoords = fineLevel.Get< RCP<MultiVector> >("Coordinates");
      if (fineCoords != Teuchos::null)
        Utils::Write(fileName, *fineCoords);
    }

    RCP<const Import> importer = Get<RCP<const Import> >(coarseLevel, "Importer");
    if (implicit) {
      // Save the importer, we'll need it for solve
      coarseLevel.Set("Importer", importer, NoFactory::get());
    }

    RCP<ParameterList> params = rcp(new ParameterList());;
    params->set("printLoadBalancingInfo", true);
    params->set("printCommInfo",          true);

    std::string transferType = pL.get<std::string>("type");
    if (transferType == "Interpolation") {
      RCP<Matrix> originalP = Get< RCP<Matrix> >(coarseLevel, "P");

      {
        // This line must be after the Get call
        SubFactoryMonitor m1(*this, "Rebalancing prolongator", coarseLevel);

        if (implicit || importer.is_null()) {
          GetOStream(Runtime0) << "Using original prolongator" << std::endl;
          Set(coarseLevel, "P", originalP);

        } else {
          // P is the transfer operator from the coarse grid to the fine grid.
          // P must transfer the data from the newly reordered coarse A to the
          // (unchanged) fine A.  This means that the domain map (coarse) of P
          // must be changed according to the new partition. The range map
          // (fine) is kept unchanged.
          //
          // The domain map of P must match the range map of R.  See also note
          // below about domain/range map of R and its implications for P.
          //
          // To change the domain map of P, P needs to be fillCompleted again
          // with the new domain map.  To achieve this, P is copied into a new
          // matrix that is not fill-completed.  The doImport() operation is
          // just used here to make a copy of P: the importer is trivial and
          // there is no data movement involved.  The reordering actually
          // happens during the fillComplete() with domainMap == importer->getTargetMap().
          RCP<Matrix> rebalancedP = originalP;
          RCP<const CrsMatrixWrap> crsOp = rcp_dynamic_cast<const CrsMatrixWrap>(originalP);
          TEUCHOS_TEST_FOR_EXCEPTION(crsOp == Teuchos::null, Exceptions::BadCast, "Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");

          RCP<CrsMatrix> rebalancedP2 = crsOp->getCrsMatrix();
          TEUCHOS_TEST_FOR_EXCEPTION(rebalancedP2 == Teuchos::null, std::runtime_error, "Xpetra::CrsMatrixWrap doesn't have a CrsMatrix");

          {
            SubFactoryMonitor subM(*this, "Rebalancing prolongator -- fast map replacement", coarseLevel);

            RCP<const Import> newImporter = ImportFactory::Build(importer->getTargetMap(), rebalancedP->getColMap());
            rebalancedP2->replaceDomainMapAndImporter(importer->getTargetMap(), newImporter);
          }

          ///////////////////////// EXPERIMENTAL
          // TODO FIXME somehow we have to transfer the striding information of the permuted domain/range maps.
          // That is probably something for an external permutation factory
          //   if (originalP->IsView("stridedMaps"))
          //     rebalancedP->CreateView("stridedMaps", originalP);
          ///////////////////////// EXPERIMENTAL

          Set(coarseLevel, "P", rebalancedP);

          if (IsPrint(Statistics1))
            GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*rebalancedP, "P (rebalanced)", params);
        }
      }

      if (importer.is_null()) {
        if (IsAvailable(coarseLevel, "Nullspace"))
          Set(coarseLevel, "Nullspace", Get<RCP<MultiVector> >(coarseLevel, "Nullspace"));

        if (pL.isParameter("Coordinates") && pL.get< RCP<const FactoryBase> >("Coordinates") != Teuchos::null)
          if (IsAvailable(coarseLevel, "Coordinates"))
            Set(coarseLevel, "Coordinates", Get< RCP<MultiVector> >(coarseLevel, "Coordinates"));

        return;
      }

      if (pL.isParameter("Coordinates") &&
          pL.get< RCP<const FactoryBase> >("Coordinates") != Teuchos::null &&
          IsAvailable(coarseLevel, "Coordinates")) {
        RCP<MultiVector> coords = Get<RCP<MultiVector> >(coarseLevel, "Coordinates");

        // This line must be after the Get call
        SubFactoryMonitor subM(*this, "Rebalancing coordinates", coarseLevel);

        LO nodeNumElts = coords->getMap()->getNodeNumElements();

        // If a process has no matrix rows, then we can't calculate blocksize using the formula below.
        LO myBlkSize = 0, blkSize = 0;
        if (nodeNumElts > 0)
          myBlkSize = importer->getSourceMap()->getNodeNumElements() / nodeNumElts;
        maxAll(coords->getMap()->getComm(), myBlkSize, blkSize);

        RCP<const Import> coordImporter;
        if (blkSize == 1) {
          coordImporter = importer;

        } else {
          // NOTE: there is an implicit assumption here: we assume that dof any node are enumerated consequently
          // Proper fix would require using decomposition similar to how we construct importer in the
          // RepartitionFactory
          RCP<const Map> origMap   = coords->getMap();
          GO             indexBase = origMap->getIndexBase();

          ArrayView<const GO> OEntries   = importer->getTargetMap()->getNodeElementList();
          LO                  numEntries = OEntries.size()/blkSize;
          ArrayRCP<GO> Entries(numEntries);
          for (LO i = 0; i < numEntries; i++)
            Entries[i] = (OEntries[i*blkSize]-indexBase)/blkSize + indexBase;

          RCP<const Map> targetMap = MapFactory::Build(origMap->lib(), origMap->getGlobalNumElements(), Entries(), indexBase, origMap->getComm());
          coordImporter = ImportFactory::Build(origMap, targetMap);
        }

        RCP<MultiVector> permutedCoords  = MultiVectorFactory::Build(coordImporter->getTargetMap(), coords->getNumVectors());
        permutedCoords->doImport(*coords, *coordImporter, Xpetra::INSERT);

        if (pL.get<bool>("useSubcomm") == true)
          permutedCoords->replaceMap(permutedCoords->getMap()->removeEmptyProcesses());

        Set(coarseLevel, "Coordinates", permutedCoords);

        std::string fileName = "rebalanced_coordinates_level_" + toString(coarseLevel.GetLevelID()) + ".m";
        if (writeStart <= coarseLevel.GetLevelID() && coarseLevel.GetLevelID() <= writeEnd && permutedCoords->getMap() != Teuchos::null)
          Utils::Write(fileName, *permutedCoords);
      }

      if (IsAvailable(coarseLevel, "Nullspace")) {
        RCP<MultiVector> nullspace = Get< RCP<MultiVector> >(coarseLevel, "Nullspace");

        // This line must be after the Get call
        SubFactoryMonitor subM(*this, "Rebalancing nullspace", coarseLevel);

        RCP<MultiVector> permutedNullspace = MultiVectorFactory::Build(importer->getTargetMap(), nullspace->getNumVectors());
        permutedNullspace->doImport(*nullspace, *importer, Xpetra::INSERT);

        if (pL.get<bool>("useSubcomm") == true)
          permutedNullspace->replaceMap(permutedNullspace->getMap()->removeEmptyProcesses());

        Set(coarseLevel, "Nullspace", permutedNullspace);
      }

    } else {
      if (pL.get<bool>("transpose: use implicit") == false) {
        RCP<Matrix> originalR = Get< RCP<Matrix> >(coarseLevel, "R");

        SubFactoryMonitor m2(*this, "Rebalancing restriction", coarseLevel);

        if (implicit || importer.is_null()) {
          GetOStream(Runtime0) << "Using original restrictor" << std::endl;
          Set(coarseLevel, "R", originalR);

        } else {
          RCP<Matrix> rebalancedR;
          {
            SubFactoryMonitor subM(*this, "Rebalancing restriction -- fusedImport", coarseLevel);

            RCP<Map> dummy;         // meaning: use originalR's domain map.
            rebalancedR = MatrixFactory::Build(originalR, *importer, dummy, importer->getTargetMap());
          }
          Set(coarseLevel, "R", rebalancedR);

          ///////////////////////// EXPERIMENTAL
          // TODO FIXME somehow we have to transfer the striding information of the permuted domain/range maps.
          // That is probably something for an external permutation factory
          // if (originalR->IsView("stridedMaps"))
          //   rebalancedR->CreateView("stridedMaps", originalR);
          ///////////////////////// EXPERIMENTAL

          if (IsPrint(Statistics1))
            GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*rebalancedR, "R (rebalanced)", params);
        }
      }
    }
  }