// Builds a filtered copy of the matrix "A" found on currentLevel and stores the
// result back on the level under the same name "A".
//
// For each local row, an entry is copied to the output matrix only if
//   * its global row id and global column id have the same remainder modulo the
//     fixed block size of the input matrix (same DOF id), and
//   * its real part is not negative.
// All other entries are dropped. The fixed block size of the input matrix is
// carried over to the output matrix.
//
// NOTE: the sign test used to compare the magnitude of the entry against the
// magnitude of zero. A magnitude is never negative, so that test accepted every
// entry and nothing was dropped because of its sign.
void DropNegativeEntriesFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Matrix filtering (springs)", currentLevel);

    typedef Teuchos::ScalarTraits<Scalar>                  STS;
    typedef typename STS::magnitudeType                    Magnitude;
    const Magnitude zeroMagnitude = Teuchos::ScalarTraits<Magnitude>::zero();

    RCP<Matrix> Ain = Get< RCP<Matrix> >(currentLevel, "A");

    LocalOrdinal nDofsPerNode = Ain->GetFixedBlockSize();

    // create new empty Operator
    Teuchos::RCP<Matrix> Aout = MatrixFactory::Build(Ain->getRowMap(), Ain->getGlobalMaxNumRowEntries(), Xpetra::StaticProfile);

    size_t numLocalRows = Ain->getNodeNumRows();
    for(size_t row=0; row<numLocalRows; row++) {
        GlobalOrdinal grid = Ain->getRowMap()->getGlobalElement(row);

        int rDofID = Teuchos::as<int>(grid % nDofsPerNode);

        // extract row information from input matrix
        Teuchos::ArrayView<const LocalOrdinal> indices;
        Teuchos::ArrayView<const Scalar> vals;
        Ain->getLocalRowView(row, indices, vals);

        // output buffers, sized for the worst case (every entry of the row is kept)
        Teuchos::ArrayRCP<GlobalOrdinal> indout(indices.size(),Teuchos::ScalarTraits<GlobalOrdinal>::zero());
        Teuchos::ArrayRCP<Scalar>        valout(indices.size(),STS::zero());

        size_t nNonzeros = 0;
        for(size_t i=0; i<(size_t)indices.size(); i++) {
            GlobalOrdinal gcid = Ain->getColMap()->getGlobalElement(indices[i]); // global column id

            int cDofID = Teuchos::as<int>(gcid % nDofsPerNode);

            // keep only same-DOF couplings that are not negative
            if(rDofID == cDofID && STS::real(vals[i]) >= zeroMagnitude) {
                indout [nNonzeros] = gcid;
                valout [nNonzeros] = vals[i];
                nNonzeros++;
            }
        }
        // shrink the buffers to the number of entries actually kept
        indout.resize(nNonzeros);
        valout.resize(nNonzeros);

        Aout->insertGlobalValues(grid, indout.view(0,indout.size()), valout.view(0,valout.size()));
    }

    Aout->fillComplete(Ain->getDomainMap(), Ain->getRangeMap());

    // copy block size information
    Aout->SetFixedBlockSize(nDofsPerNode);

    GetOStream(Statistics0, 0) << "Nonzeros in A (input): " << Ain->getGlobalNumEntries() << ", Nonzeros after filtering A: " << Aout->getGlobalNumEntries() << std::endl;

    Set(currentLevel, "A", Aout);
}
Ejemplo n.º 2
0
  // Builds the prolongator "P" for coarseLevel from user-provided files.
  //
  // The coarse map is read from the file named by the "mapFileName" parameter and
  // the prolongator from the file named by the "matrixFileName" parameter.
  // The following data is stored on coarseLevel:
  //   "CoarseMap"   - the map read from file
  //   "P"           - the matrix read from file
  //   "Nullspace"   - the transpose of P applied to the fine level "Nullspace"
  //   "Coordinates" - two vectors holding (GID % n, GID / n) for the index-base
  //                   corrected GID, where n*n is the global size of the coarse map
  //
  // Throws Exceptions::RuntimeError if the fixed block size of the fine level "A"
  // is not 1, or if the global size of the coarse map is not a perfect square.
  void UserPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    RCP<Matrix>      fineA  = Get< RCP<Matrix> >      (fineLevel, "A");
    RCP<MultiVector> nsFine = Get< RCP<MultiVector> > (fineLevel, "Nullspace");

    TEUCHOS_TEST_FOR_EXCEPTION(fineA->GetFixedBlockSize() != 1, Exceptions::RuntimeError, "Block size > 1 has not been implemented");

    const Teuchos::ParameterList& factoryParams = GetParameterList();

    // --- coarse map -------------------------------------------------------
    std::string    mapFileName = factoryParams.get<std::string>("mapFileName");
    RCP<const Map> fineRowMap  = fineA->getRowMap();
    RCP<const Map> coarseMap   = Utils2::ReadMap(mapFileName, fineRowMap->lib(), fineRowMap->getComm());
    Set(coarseLevel, "CoarseMap", coarseMap);

    // --- prolongator ------------------------------------------------------
    std::string matrixFileName = factoryParams.get<std::string>("matrixFileName");
    RCP<Matrix> P              = Utils::Read(matrixFileName, fineRowMap, coarseMap, coarseMap, fineRowMap);
#if 0
    // Disabled alternative: expand column map by 1
    {
      RCP<Matrix> P1 = Utils::Multiply(*fineA, false, *P, false);
      P = Utils::Read(matrixFileName, fineRowMap, P1->getColMap(), coarseMap, fineRowMap);
    }
#endif
    Set(coarseLevel, "P", P);

    // --- coarse nullspace: transpose(P) * fine nullspace --------------------
    RCP<MultiVector> nsCoarse = MultiVectorFactory::Build(coarseMap, nsFine->getNumVectors());
    P->apply(*nsFine, *nsCoarse, Teuchos::TRANS, Teuchos::ScalarTraits<SC>::one(), Teuchos::ScalarTraits<SC>::zero());
    Set(coarseLevel, "Nullspace", nsCoarse);

    // --- coarse coordinates -----------------------------------------------
    // The coarse unknowns are laid out on an n x n grid; anything else is rejected.
    size_t n = Teuchos::as<size_t>(sqrt(coarseMap->getGlobalNumElements()));
    TEUCHOS_TEST_FOR_EXCEPTION(n*n != coarseMap->getGlobalNumElements(), Exceptions::RuntimeError, "Unfortunately, this is not the case, don't know what to do");

    RCP<MultiVector> coarseCoords = MultiVectorFactory::Build(coarseMap, 2);
    ArrayRCP<Scalar> xCoord = coarseCoords->getDataNonConst(0);
    ArrayRCP<Scalar> yCoord = coarseCoords->getDataNonConst(1);

    const size_t numLocalCoarse = coarseMap->getNodeNumElements();
    for (size_t lid = 0; lid < numLocalCoarse; ++lid) {
      // zero-based global id of the local element
      GlobalOrdinal gid0 = coarseMap->getGlobalElement(lid) - coarseMap->getIndexBase();
      GlobalOrdinal ix   = gid0 % n;
      GlobalOrdinal iy   = gid0/n;
      xCoord[lid] = ix;
      yCoord[lid] = iy;
    }
    Set(coarseLevel, "Coordinates", coarseCoords);

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> printParams = rcp(new ParameterList());
      printParams->set("printLoadBalancingInfo", true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*P, "P", printParams);
    }
  }
Ejemplo n.º 3
0
  // Stores the coarse level matrix "A" on coarseLevel, rebalanced if an importer exists.
  //
  // If the "Importer" found on coarseLevel is null, the existing "A" is stored
  // unchanged. Otherwise a new matrix is built from the existing one with the
  // importer, using the importer's target map as domain and range map; the fixed
  // block size is copied over and matrix statistics are printed.
  // With the "useSubcomm" parameter set, the build is asked to restrict the
  // communicator; in that case the built matrix may be Teuchos::null on this
  // process and is stored as such.
  void RebalanceAcFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Computing Ac", coarseLevel);

    RCP<Matrix>       originalAc = Get< RCP<Matrix> >(coarseLevel, "A");
    RCP<const Import> importer   = Get< RCP<const Import> >(coarseLevel, "Importer");

    if (importer.is_null()) {
      // Ac already built by the load balancing process and no load balancing needed
      GetOStream(Warnings0, 0) << "No rebalancing" << std::endl;
      GetOStream(Warnings0, 0) << "Jamming A into Level " << coarseLevel.GetLevelID() << " w/ generating factory "
                               << this << std::endl;
      Set(coarseLevel, "A", originalAc);
      return;
    }

    RCP<Matrix> rebalancedAc;
    {
      // The sub-monitor covers only the actual rebalancing, not the statistics output below
      SubFactoryMonitor subM(*this, "Rebalancing existing Ac", coarseLevel);
      RCP<const Map> targetMap = importer->getTargetMap();

      const ParameterList & factoryParams = GetParameterList();

      ParameterList buildParams;
      if (factoryParams.get<bool>("useSubcomm")) {
        GetOStream(Runtime0,0) << "Replacing maps with a subcommunicator" << std::endl;
        buildParams.set("Restrict Communicator",true);
      }

      // NOTE: If the communicator is restricted away, Build returns Teuchos::null.
      // buildParams lives on the stack, hence the non-owning RCP.
      rebalancedAc = MatrixFactory::Build(originalAc, *importer, targetMap, targetMap, rcp(&buildParams,false));

      const bool haveMatrix = !rebalancedAc.is_null();
      if (haveMatrix)
        rebalancedAc->SetFixedBlockSize(originalAc->GetFixedBlockSize());

      Set(coarseLevel, "A", rebalancedAc);
    }

    if (rebalancedAc.is_null())
      return;

    RCP<ParameterList> printParams = rcp(new ParameterList());
    printParams->set("printLoadBalancingInfo", true);
    GetOStream(Statistics0, 0) << Utils::PrintMatrixInfo(*rebalancedAc, "Ac (rebalanced)", printParams);

  } //Build()
  // Builds the filtered version of the level matrix "A" and stores it on
  // currentLevel as "A".
  //
  // If the level variable "Filtering" is false, the input matrix is passed
  // through unchanged. Otherwise the result shares the graph of the input matrix
  // and holds a copy of its values in which every entry is set to zero whose
  // column does not belong to a neighbor vertex (as listed by "Graph") of the
  // vertex owning the row. Vertex i owns rows i*blkSize .. i*blkSize+blkSize-1,
  // where blkSize is the fixed block size of the input matrix.
  //
  // Parameters read:
  //   "lumping"                           - add the values of dropped entries to
  //                                         the (kept) diagonal entry of the row
  //   "filtered matrix: reuse eigenvalue" - copy the max eigenvalue estimate of
  //                                         the input matrix to the result
  void FilteredAFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Matrix filtering", currentLevel);

    RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
    if (currentLevel.Get<bool>("Filtering", currentLevel.GetFactoryManager()->GetFactory("Filtering").get()) == false) {
      GetOStream(Runtime0) << "Filtered matrix is not being constructed as no filtering is being done" << std::endl;
      Set(currentLevel, "A", A);
      return;
    }
    size_t blkSize = A->GetFixedBlockSize();

    const ParameterList& pL = GetParameterList();
    bool lumping = pL.get<bool>("lumping");
    if (lumping)
      GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;

    RCP<GraphBase> G = Get< RCP<GraphBase> >(currentLevel, "Graph");

    SC zero = Teuchos::ScalarTraits<SC>::zero();

    // Both Epetra and Tpetra matrix-matrix multiply use the following trick:
    // if an entry of the left matrix is zero, it does not compute or store the
    // zero value.
    //
    // This trick allows us to bypass constructing a new matrix. Instead, we
    // make a deep copy of the original one, and fill it in with zeros, which
    // are ignored during the prolongator smoothing.
    RCP<Matrix> filteredA = MatrixFactory::Build(A->getCrsGraph());

    filteredA->resumeFill();

    ArrayView<const LO> inds;
    ArrayView<const SC> valsA;
#ifdef ASSUME_DIRECT_ACCESS_TO_ROW
    ArrayView<SC>       vals;
#else
    Array<SC>           vals;
#endif
    // filter[c] != 0 means: local column c is kept for the vertex currently processed
    Array<char> filter(blkSize * G->GetImportMap()->getNodeNumElements(), 0);

    size_t numGRows = G->GetNodeNumVertices();
    for (size_t i = 0; i < numGRows; i++) {
      // Set up filtering array: mark all dofs of all neighbor vertices of vertex i
      ArrayView<const LO> indsG = G->getNeighborVertices(i);
      for (size_t j = 0; j < as<size_t>(indsG.size()); j++)
        for (size_t k = 0; k < blkSize; k++)
          filter[indsG[j]*blkSize+k] = 1;

      for (size_t k = 0; k < blkSize; k++) {
        LO row = i*blkSize + k;

        A->getLocalRowView(row, inds, valsA);

        size_t nnz = inds.size();
        if (nnz == 0)
          continue;

#ifdef ASSUME_DIRECT_ACCESS_TO_ROW
        // Transform ArrayView<const SC> into ArrayView<SC>
        ArrayView<const SC> vals1;
        filteredA->getLocalRowView(row, inds, vals1);
        vals = ArrayView<SC>(const_cast<SC*>(vals1.getRawPtr()), nnz);

        memcpy(vals.getRawPtr(), valsA.getRawPtr(), nnz*sizeof(SC));
#else
        vals = Array<SC>(valsA);
#endif

        if (lumping == false) {
          for (size_t j = 0; j < nnz; j++)
            if (!filter[inds[j]])
              vals[j] = zero;

        } else {
          LO diagIndex = -1;
          SC diagExtra = zero;

          for (size_t j = 0; j < nnz; j++) {
            if (filter[inds[j]]) {
              // Entry is kept. Remember the diagonal position so that the
              // dropped values can be added to it below. This has to happen
              // for the kept diagonal: recording it only for a dropped diagonal
              // discards the lumped values whenever the diagonal is kept.
              if (inds[j] == row)
                diagIndex = j;

              continue;
            }

            // Entry is dropped: collect its value and zero it out
            diagExtra += vals[j];

            vals[j] = zero;
          }

          // Lump dropped entries
          // NOTE
          //  * Does it make sense to lump for elasticity?
          //  * Is it different for diffusion and elasticity?
          //  * If the diagonal itself is dropped there is no entry to lump into
          //    and the collected values are discarded.
          if (diagIndex != -1)
            vals[diagIndex] += diagExtra;
        }

#ifndef ASSUME_DIRECT_ACCESS_TO_ROW
        // The matrix was constructed from the graph of A, so all entries already
        // exist and local column indices can be used: overwrite the row in place
        filteredA->replaceLocalValues(row, inds, vals);
#endif
      }

      // Reset filtering array
      for (size_t j = 0; j < as<size_t> (indsG.size()); j++)
        for (size_t k = 0; k < blkSize; k++)
          filter[indsG[j]*blkSize+k] = 0;
    }

    RCP<ParameterList> fillCompleteParams(new ParameterList);
    fillCompleteParams->set("No Nonlocal Changes", true);
    filteredA->fillComplete(fillCompleteParams);

    filteredA->SetFixedBlockSize(blkSize);

    if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
      // Reuse max eigenvalue from A
      // It is unclear what eigenvalue is the best for the smoothing, but we already may have
      // the D^{-1}A estimate in A, may as well use it.
      // NOTE: ML does that too
      filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
    }

    Set(currentLevel, "A", filteredA);
  }
Ejemplo n.º 5
0
// Driver that reads a linear problem from Matrix Market files.
//
// The visible part parses the command line, reads the matrix, sets up the fine
// level null space (from file or a default one), optionally reads fine level
// coordinates and builds a map of "special" dofs from a text file.
// NOTE(review): this listing is cut off inside the try block; the remainder of
// the function is not visible here.
int main(int argc, char *argv[]) {
// presumably provides the short type names used below (SC, LO, GO, Matrix, ...) — TODO confirm
#include <MueLu_UseShortNames.hpp>

  using Teuchos::RCP; // reference count pointers
  using Teuchos::rcp;
  using Teuchos::TimeMonitor;

  // =========================================================================
  // MPI initialization using Teuchos
  // =========================================================================
  Teuchos::GlobalMPISession mpiSession(&argc, &argv, NULL);

  // NOTE(review): neither flag is used in the visible part of the function
  bool success = false;
  bool verbose = true;
  try {
    RCP< const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();

    // =========================================================================
    // Convenient definitions
    // =========================================================================
    SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

    // Instead of checking each time for rank, create a rank 0 stream
    RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
    Teuchos::FancyOStream& fancyout = *fancy;
    fancyout.setOutputToRootOnly(0);


    // =========================================================================
    // Parameters initialization
    // =========================================================================
    Teuchos::CommandLineProcessor clp(false);

    //GO nx = 100, ny = 100, nz = 100;
    //Galeri::Xpetra::Parameters<GO> matrixParameters(clp, nx, ny, nz, "Laplace2D"); // manage parameters of the test case
    Xpetra::Parameters             xpetraParameters(clp);                          // manage parameters of Xpetra

    // Each line declares an option variable with its default value and registers it with the parser
    std::string xmlFileName = "driver.xml";      clp.setOption("xml",                   &xmlFileName,     "read parameters from a file. Otherwise, this example uses by default 'scalingTest.xml'");
    int    amgAsPrecond     = 1;                 clp.setOption("precond",               &amgAsPrecond,     "apply multigrid as preconditioner");
    int    amgAsSolver      = 0;                 clp.setOption("fixPoint",              &amgAsSolver,      "apply multigrid as solver");
    bool   printTimings     = true;              clp.setOption("timings", "notimings",  &printTimings,     "print timings to screen");
    int    writeMatricesOPT = -2;                clp.setOption("write",                 &writeMatricesOPT, "write matrices to file (-1 means all; i>=0 means level i)");
    double tol              = 1e-6;             clp.setOption("tol",                   &tol,              "solver convergence tolerance");
    std::string krylovMethod = "gmres"; clp.setOption("krylov",                   &krylovMethod,     "outer Krylov method");
    int maxIts = 100; clp.setOption("maxits",           &maxIts,   "maximum number of Krylov iterations");
    int output = 1; clp.setOption("output",           &output,   "how often to print Krylov residual history");
    std::string matrixFileName = "crada1/crada_A.mm"; clp.setOption("matrixfile",            &matrixFileName,   "matrix market file containing matrix");
    std::string rhsFileName = "crada1/crada_b.mm";    clp.setOption("rhsfile",               &rhsFileName,      "matrix market file containing right-hand side");
    std::string nspFileName = "crada1/crada_ns.mm";   clp.setOption("nspfile",               &nspFileName,      "matrix market file containing fine level null space");
    std::string cooFileName = "crada1/crada_coordinates.mm"; clp.setOption("coordinatesfile",&cooFileName,      "matrix market file containing fine level coordinates");
    std::string spcFileName = "crada1/crada_special.mm"; clp.setOption("specialfile",        &spcFileName,      "matrix market file containing fine level special dofs");
    int nPDE = 3; clp.setOption("numpdes",           &nPDE,   "number of PDE equations");
    int nNspVectors = 3; clp.setOption("numnsp", &nNspVectors, "number of nullspace vectors. Only used if null space is read from file. Must be smaller or equal than the number of null space vectors read in from file.");
    std::string convType = "r0"; clp.setOption("convtype",                   &convType,     "convergence type (r0 or none)");

    // Leave early if help was printed or the command line could not be parsed
    switch (clp.parse(argc,argv)) {
      case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS; break;
      case Teuchos::CommandLineProcessor::PARSE_ERROR:
      case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE; break;
      case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:                               break;
    }

    // =========================================================================
    // Problem construction
    // =========================================================================
    RCP<TimeMonitor> globalTimeMonitor = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("MatrixRead: S - Global Time"))), tm;

    comm->barrier();
    tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("Driver: 1 - Matrix Build")));

    // NOTE(review): A stays Teuchos::null when matrixFileName is empty, but is
    // dereferenced unconditionally right after this if — confirm an empty name
    // is not a supported input.
    RCP<Matrix> A = Teuchos::null;
    if (matrixFileName != "") {
      fancyout << "Read matrix from file " << matrixFileName << std::endl;
      A = Xpetra::IO<SC,LO,GO,Node>::Read(std::string(matrixFileName), xpetraParameters.GetLib(), comm);
    }
    RCP<const Map>   map = A->getRowMap();
    RCP<MultiVector> nullspace = MultiVectorFactory::Build(A->getDomainMap(),nPDE);
    A->SetFixedBlockSize(nPDE);
    fancyout << "#pdes = " << A->GetFixedBlockSize() << std::endl;

    if (nspFileName != "") {
      // Null space from file, reduced to the first nNspVectors vectors if fewer are requested
      fancyout << "Read null space from file " << nspFileName << std::endl;
      nullspace = Xpetra::IO<SC,LO,GO,Node>::ReadMultiVector(std::string(nspFileName), A->getRowMap());
      fancyout << "Found " << nullspace->getNumVectors() << " null space vectors" << std::endl;
      // More vectors requested than provided by the file: use all that are provided
      if (nNspVectors > Teuchos::as<int>(nullspace->getNumVectors())) {
        fancyout << "Set number of null space vectors from " << nNspVectors << " to " << nullspace->getNumVectors() << " as only " << nullspace->getNumVectors() << " are provided by " << nspFileName << std::endl;
        nNspVectors = nullspace->getNumVectors();
      }
      // Fewer than one vector requested: use all that are provided
      if (nNspVectors < 1) {
        fancyout << "Set number of null space vectors from " << nNspVectors << " to " << nullspace->getNumVectors() << ". Note: we need at least one null space vector!!!" << std::endl;
        nNspVectors = nullspace->getNumVectors();
      }
      // Fewer vectors requested than provided: copy the first nNspVectors vectors
      if (nNspVectors < Teuchos::as<int>(nullspace->getNumVectors())) {
        RCP<MultiVector> temp = MultiVectorFactory::Build(A->getDomainMap(),nNspVectors);
        for(int j=0; j<nNspVectors; j++) {
          Teuchos::ArrayRCP<SC> tempData = temp->getDataNonConst(j);
          Teuchos::ArrayRCP<const SC> nsData   = nullspace->getData(j);
          for (int i=0; i<nsData.size(); ++i) {
            tempData[i] = nsData[i];
          }
        }
        nullspace = Teuchos::null;
        nullspace = temp;
      }
    } else {
      // Default null space: vector i is one on every dof whose zero-based
      // global id is congruent to i modulo nPDE
      if (nPDE == 1)
        nullspace->putScalar( Teuchos::ScalarTraits<SC>::one() );
      else {
        for (int i=0; i<nPDE; ++i) {
          Teuchos::ArrayRCP<SC> nsData = nullspace->getDataNonConst(i);
          for (int j=0; j<nsData.size(); ++j) {
            GO gel = A->getDomainMap()->getGlobalElement(j) - A->getDomainMap()->getIndexBase();
            if ((gel-i) % nPDE == 0)
              nsData[j] = Teuchos::ScalarTraits<SC>::one();
          }
        }
      }
    }

    RCP<MultiVector> coordinates = Teuchos::null; //MultiVectorFactory::Build(A->getDomainMap(),1);
    if (cooFileName != "") {
      // One coordinate entry per block of GetFixedBlockSize() consecutive local rows
      std::vector<GO> myGIDs (map->getNodeNumElements() / A->GetFixedBlockSize());
      // reconstruct map for coordinates
      for(LO r = 0; r < Teuchos::as<LO>(map->getNodeNumElements() / A->GetFixedBlockSize()); ++r) {
        // the global id of the first row of the block identifies the entry
        GO gid = map->getGlobalElement(r * A->GetFixedBlockSize());
        myGIDs[r] = gid;
      }

      // global number of coordinate entries = sum of the local counts
      GO gCntGIDs  = 0;
      GO glCntGIDs = Teuchos::as<GlobalOrdinal>(myGIDs.size());
      MueLu_sumAll(comm,glCntGIDs,gCntGIDs);

      Teuchos::Array<GlobalOrdinal> eltList(myGIDs);
      RCP<const Map> myCoordMap = MapFactory::Build (xpetraParameters.GetLib(),gCntGIDs,eltList(),0,comm);

      fancyout << "Read fine level coordinates from file " << cooFileName << std::endl;
      coordinates = Xpetra::IO<SC,LO,GO,Node>::ReadMultiVector(std::string(cooFileName), myCoordMap);
      // NOTE(review): this message reports the number of null space vectors although
      // coordinates were just read — looks like a copy-paste leftover, confirm.
      fancyout << "Found " << nullspace->getNumVectors() << " null space vectors of length " << myCoordMap->getGlobalNumElements() << std::endl;
    }

    RCP<Map> mySpecialMap = Teuchos::null;
    if (spcFileName != "") {
      // read file on each processor and pick out the special dof numbers which belong to the current proc
      std::ifstream infile(spcFileName);
      std::string line;
      std::vector<GlobalOrdinal> mySpecialGids;
      GlobalOrdinal cnt = 0;   // count overall number of gids
      GlobalOrdinal mycnt = 0; // count only local gids
      while ( std::getline(infile, line)) {
        // lines starting with '%' are skipped
        if(0 == line.find("%")) continue;
        // only lines starting with a blank are parsed as a dof number
        if(0 == line.find(" ")) {
          cnt++;
          GlobalOrdinal gid;
          std::istringstream iss(line);
          iss >> gid;
          gid--; // note, that the matlab vector starts counting at 1 and not 0!
          if(map->isNodeGlobalElement(gid)) {
            mySpecialGids.push_back(gid);
            mycnt++;
          }
        }
      }

      Teuchos::Array<GlobalOrdinal> eltList(mySpecialGids);
      mySpecialMap = MapFactory::Build (xpetraParameters.GetLib(),cnt,eltList(),0,comm);

      // empty processors
      // Every rank writes its local count into its own slot of a zero-filled array;
      // the element-wise maximum over all ranks then holds the count of every rank.
      std::vector<size_t> lelePerProc(comm->getSize(),0);
      std::vector<size_t> gelePerProc(comm->getSize(),0);
      lelePerProc[comm->getRank()] = mySpecialMap->getNodeNumElements();
      Teuchos::reduceAll(*comm,Teuchos::REDUCE_MAX,comm->getSize(),&lelePerProc[0],&gelePerProc[0]);
      if(comm->getRank() == 0) {
        fancyout << "Distribution of " << cnt << " special dofs over processors:" << std::endl;
        fancyout << "Proc   #DOFs" << std::endl;
        for(int i=0; i<comm->getSize(); i++) {
         fancyout  << i << "      " << gelePerProc[i] << std::endl;
        }
      }
    }
Ejemplo n.º 6
0
// Scaling test driver: reads a matrix from a Matrix Market file, sets up a MueLu
// hierarchy from an XML parameter file and solves A x = b.
//
// Steps:
//   1. parse the command line (see the option descriptions below),
//   2. read the matrix and build a default null space for nPDE equations,
//   3. set up the multigrid hierarchy,
//   4. build the right-hand side (read from file, or A times a random vector),
//   5. solve, either with the hierarchy as fixed point solver ("fixPoint") or,
//      if Belos is available, with a Krylov method preconditioned by the
//      hierarchy ("precond"),
//   6. optionally print the timer summary.
//
// All progress output goes through the rank 0 stream so that it is printed once
// and not once per process.
//
// Returns EXIT_SUCCESS if help was printed, EXIT_FAILURE if the command line
// could not be parsed or the Belos problem could not be set up, and 0 otherwise
// (independent of whether the solver converged).
int main(int argc, char *argv[]) {
#include <MueLu_UseShortNames.hpp>

  using Teuchos::RCP; // reference count pointers
  using Teuchos::rcp;
  using Teuchos::TimeMonitor;

  // =========================================================================
  // MPI initialization using Teuchos
  // =========================================================================
  Teuchos::GlobalMPISession mpiSession(&argc, &argv, NULL);
  RCP< const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();

  // =========================================================================
  // Convenient definitions
  // =========================================================================
  SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

  // Instead of checking each time for rank, create a rank 0 stream
  RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
  Teuchos::FancyOStream& fancyout = *fancy;
  fancyout.setOutputToRootOnly(0);


  // =========================================================================
  // Parameters initialization
  // =========================================================================
  Teuchos::CommandLineProcessor clp(false);

  GO nx = 100, ny = 100, nz = 100;
  Galeri::Xpetra::Parameters<GO> matrixParameters(clp, nx, ny, nz, "Laplace2D"); // manage parameters of the test case
  Xpetra::Parameters             xpetraParameters(clp);                          // manage parameters of Xpetra

  // Each line declares an option variable with its default value and registers it with the parser
  std::string xmlFileName = "scalingTest.xml"; clp.setOption("xml",                   &xmlFileName,     "read parameters from a file. Otherwise, this example uses by default 'scalingTest.xml'");
  int    amgAsPrecond     = 1;                 clp.setOption("precond",               &amgAsPrecond,     "apply multigrid as preconditioner");
  int    amgAsSolver      = 0;                 clp.setOption("fixPoint",              &amgAsSolver,      "apply multigrid as solver");
  bool   printTimings     = true;              clp.setOption("timings", "notimings",  &printTimings,     "print timings to screen");
  int    writeMatricesOPT = -2;                clp.setOption("write",                 &writeMatricesOPT, "write matrices to file (-1 means all; i>=0 means level i)");
  double tol              = 1e-12;             clp.setOption("tol",                   &tol,              "solver convergence tolerance");
  std::string krylovMethod = "cg"; clp.setOption("krylov",                   &krylovMethod,     "outer Krylov method");
  int maxIts = 100; clp.setOption("maxits",           &maxIts,   "maximum number of Krylov iterations");
  int output = 1; clp.setOption("output",           &output,   "how often to print Krylov residual history");
  std::string matrixFileName = "A.mm"; clp.setOption("matrixfile",           &matrixFileName,   "matrix market file containing matrix");
  std::string rhsFileName = ""; clp.setOption("rhsfile",           &rhsFileName,   "matrix market file containing right-hand side");
  int nPDE = 1; clp.setOption("numpdes",           &nPDE,   "number of PDE equations");
  std::string convType = "r0"; clp.setOption("convtype",                   &convType,     "convergence type (r0 or none)");

  // Leave early if help was printed or the command line could not be parsed
  switch (clp.parse(argc,argv)) {
    case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS; break;
    case Teuchos::CommandLineProcessor::PARSE_ERROR:
    case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE; break;
    case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:                               break;
  }

  fancyout << "========================================================\n" << xpetraParameters << matrixParameters;

  // =========================================================================
  // Problem construction
  // =========================================================================
  RCP<TimeMonitor> globalTimeMonitor = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("MatrixRead: S - Global Time"))), tm;

  comm->barrier();
  tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 1 - Matrix Build")));

  RCP<Matrix> A = Utils::Read(string(matrixFileName), xpetraParameters.GetLib(), comm);
  RCP<const Map>   map = A->getRowMap();
  RCP<MultiVector> nullspace = MultiVectorFactory::Build(A->getDomainMap(),nPDE);
  //RCP<MultiVector> fakeCoordinates = MultiVectorFactory::Build(A->getDomainMap(),1);
  A->SetFixedBlockSize(nPDE);
  // rank 0 stream: the value is the same on every process
  fancyout << "#pdes = " << A->GetFixedBlockSize() << std::endl;

  // Default null space: vector i is one on every dof whose zero-based global id
  // is congruent to i modulo nPDE
  if (nPDE == 1)
    nullspace->putScalar( Teuchos::ScalarTraits<SC>::one() );
  else {
    for (int i=0; i<nPDE; ++i) {
      Teuchos::ArrayRCP<SC> nsData = nullspace->getDataNonConst(i);
      for (int j=0; j<nsData.size(); ++j) {
        GO gel = A->getDomainMap()->getGlobalElement(j) - A->getDomainMap()->getIndexBase();
        if ((gel-i) % nPDE == 0)
          nsData[j] = Teuchos::ScalarTraits<SC>::one();
      }
    }
  }

  comm->barrier();
  tm = Teuchos::null;

  fancyout << "Galeri complete.\n========================================================" << std::endl;

  // =========================================================================
  // Preconditioner construction
  // =========================================================================
  comm->barrier();
  tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 1.5 - MueLu read XML")));
  ParameterListInterpreter mueLuFactory(xmlFileName, *comm);

  comm->barrier();
  tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 2 - MueLu Setup")));

  RCP<Hierarchy> H = mueLuFactory.CreateHierarchy();

  // By default, we use Extreme. However, typically the xml file contains verbosity parameter
  // which is used instead
  H->SetDefaultVerbLevel(MueLu::Extreme);

  H->GetLevel(0)->Set("A",           A);
  H->GetLevel(0)->Set("Nullspace",   nullspace);
  //H->GetLevel(0)->Set("Coordinates", fakeCoordinates);

  mueLuFactory.SetupHierarchy(*H);

  comm->barrier();
  tm = Teuchos::null;

  // Print out the hierarchy stats. We should not need this line, but for some reason the
  // print out in the hierarchy construction does not work.
  H->print(fancyout);

  // =========================================================================
  // System solution (Ax = b)
  // =========================================================================
  comm->barrier();
  tm = rcp (new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 3 - LHS and RHS initialization")));

  RCP<Vector> X = VectorFactory::Build(map,1);
  RCP<MultiVector> B = VectorFactory::Build(map,1);

  if (rhsFileName != "")
    B = Utils2::ReadMultiVector(string(rhsFileName), A->getRowMap());
  else
  {
    // No right-hand side given: use B = A * X for a random X
    // we set seed for reproducibility
    X->setSeed(846930886);
    bool useSameRandomGen = false;
    X->randomize(useSameRandomGen);
    A->apply(*X, *B, Teuchos::NO_TRANS, one, zero);

    Teuchos::Array<Teuchos::ScalarTraits<SC>::magnitudeType> norms(1);
    B->norm2(norms);
    //B->scale(1.0/norms[0]);
  }
  // zero initial guess
  X->putScalar(zero);
  tm = Teuchos::null;

  if (writeMatricesOPT > -2)
    H->Write(writeMatricesOPT, writeMatricesOPT);

  comm->barrier();
  if (amgAsSolver) {
    tm = rcp (new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 4 - Fixed Point Solve")));

    H->IsPreconditioner(false);
    Teuchos::Array<Teuchos::ScalarTraits<SC>::magnitudeType> norms(1);
    // The residual norm is computed on all processes but printed through the
    // rank 0 stream only.
    norms = Utils::ResidualNorm(*A,*X,*B);
    fancyout << "                iter:    0           residual = " << norms[0] << std::endl;
    for (int i=0; i< maxIts; ++i) {
      H->Iterate(*B, *X);
      norms = Utils::ResidualNorm(*A,*X,*B);
      fancyout << "                iter:    " << i+1 << "           residual = " << norms[0] << std::endl;
    }

  } else if (amgAsPrecond) {
#ifdef HAVE_MUELU_BELOS
    tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 5 - Belos Solve")));
    // Operator and Multivector type that will be used with Belos
    typedef MultiVector          MV;
    typedef Belos::OperatorT<MV> OP;
    H->IsPreconditioner(true);

    // Define Operator and Preconditioner
    Teuchos::RCP<OP> belosOp   = Teuchos::rcp(new Belos::XpetraOp<SC, LO, GO, NO, LMO>(A)); // Turns a Xpetra::Matrix object into a Belos operator
    Teuchos::RCP<OP> belosPrec = Teuchos::rcp(new Belos::MueLuOp<SC, LO, GO, NO, LMO>(H));  // Turns a MueLu::Hierarchy object into a Belos operator

    // Construct a Belos LinearProblem object
    RCP< Belos::LinearProblem<SC, MV, OP> > belosProblem = rcp(new Belos::LinearProblem<SC, MV, OP>(belosOp, X, B));
    belosProblem->setRightPrec(belosPrec);

    bool set = belosProblem->setProblem();
    if (set == false) {
      fancyout << "\nERROR:  Belos::LinearProblem failed to set up correctly!" << std::endl;
      return EXIT_FAILURE;
    }

    // Belos parameter list
    Teuchos::ParameterList belosList;
    belosList.set("Maximum Iterations",    maxIts); // Maximum number of iterations allowed
    belosList.set("Convergence Tolerance", tol);    // Relative convergence tolerance requested
    belosList.set("Verbosity",             Belos::Errors + Belos::Warnings + Belos::StatusTestDetails);
    belosList.set("Output Frequency",      output);
    belosList.set("Output Style",          Belos::Brief);
    //belosList.set("Orthogonalization",     "ICGS");
    if (convType == "none") {
      belosList.set("Explicit Residual Scaling",  "None");
      belosList.set("Implicit Residual Scaling",  "None");
    }

    // Create an iterative solver manager
    // (belosList lives on the stack, hence the non-owning RCPs)
    RCP< Belos::SolverManager<SC, MV, OP> > solver;
    if (krylovMethod == "cg") {
      solver = rcp(new Belos::BlockCGSolMgr<SC, MV, OP>(belosProblem, rcp(&belosList, false)));
    } else if (krylovMethod == "gmres") {
      solver = rcp(new Belos::BlockGmresSolMgr<SC, MV, OP>(belosProblem, rcp(&belosList, false)));
    } else {
      TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, "Invalid Krylov method.  Options are \"cg\" or \"gmres\".");
    }

    // Perform solve
    Belos::ReturnType ret = Belos::Unconverged;
    try {
      ret = solver->solve();

      // Get the number of iterations for this solve.
      fancyout << "Number of iterations performed for this solve: " << solver->getNumIters() << std::endl;

    } catch(...) {
      // ret keeps its initial value Belos::Unconverged, so the failure is also
      // reported by the convergence check below
      fancyout << std::endl << "ERROR:  Belos threw an error! " << std::endl;
    }

    // Check convergence
    if (ret != Belos::Converged)
      fancyout << std::endl << "ERROR:  Belos did not converge! " << std::endl;
    else
      fancyout << std::endl << "SUCCESS:  Belos converged!" << std::endl;
#endif //ifdef HAVE_MUELU_BELOS
  }
  comm->barrier();
  tm = Teuchos::null;
  globalTimeMonitor = Teuchos::null;

  if (printTimings) {
    TimeMonitor::summarize(A->getRowMap()->getComm().ptr(), std::cout, false, true, false, Teuchos::Union);
    MueLu::MutuallyExclusiveTime<MueLu::BaseClass>::PrintParentChildPairs();
  }

  return 0;
} //main
// Build a "Partition" vector for the rows of "A" using a Zoltan2 geometric
// partitioner driven by the "Coordinates" multivector.
//
// Inputs read from the level / parameter list:
//   - "A"                     matrix whose rows are partitioned
//   - "Coordinates"           one coordinate tuple per block row (mesh point)
//   - "number of partitions"  1 => trivial all-zero partition,
//                             -1 => a null partition is stored,
//                             otherwise passed to Zoltan2 as num_global_parts
//   - "ParameterList"         optional user Zoltan2 parameters (override defaults)
//   - "rowWeight"             constant added to the nnz count of every row
//
// Output: Set(level, "Partition", ...) with one entry per local row of A; all
// rows belonging to the same block receive the same part number.
//
// Throws Exceptions::Incompatible if the local coordinate count does not match
// the number of local block rows, and Exceptions::RuntimeError if the requested
// algorithm is neither "multijagged" nor "rcb".
void Zoltan2Interface<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& level) const {
    FactoryMonitor m(*this, "Build", level);

    RCP<Matrix>    A      = Get<RCP<Matrix> >(level, "A");
    RCP<const Map> rowMap = A->getRowMap();

    typedef Xpetra::MultiVector<double, LocalOrdinal, GlobalOrdinal, Node> dMultiVector;
    RCP<dMultiVector> coords      = Get<RCP<dMultiVector> >(level, "Coordinates");
    RCP<const Map>    map         = coords->getMap();
    GO                numElements = map->getNodeNumElements();

    LO blkSize  = A->GetFixedBlockSize();

    // Check that the number of local coordinates is consistent with the #rows in A
    TEUCHOS_TEST_FOR_EXCEPTION(rowMap->getNodeNumElements()/blkSize != coords->getLocalLength(), Exceptions::Incompatible,
                               "Coordinate vector length (" << coords->getLocalLength()
                               << ") is incompatible with number of block rows in A ("
                               << rowMap->getNodeNumElements()/blkSize
                               << "). The vector length should be the same as the number of mesh points.");
#ifdef HAVE_MUELU_DEBUG
    GO indexBase = rowMap->getIndexBase();
    GetOStream(Runtime0) << "Checking consistence of row and coordinates maps" << std::endl;
    // Make sure that logical blocks in row map coincide with logical nodes in coordinates map
    ArrayView<const GO> rowElements    = rowMap->getNodeElementList();
    ArrayView<const GO> coordsElements = map   ->getNodeElementList();
    for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
        TEUCHOS_TEST_FOR_EXCEPTION((coordsElements[i]-indexBase)*blkSize + indexBase != rowElements[i*blkSize],
                                   Exceptions::RuntimeError, "i = " << i << ", coords GID = " << coordsElements[i]
                                   << ", row GID = " << rowElements[i*blkSize] << ", blkSize = " << blkSize << std::endl);
#endif

    int numParts = Get<int>(level, "number of partitions");
    if (numParts == 1) {
        // Single partition, decomposition is trivial: all zeros
        RCP<Xpetra::Vector<GO,LO,GO,NO> > decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, true);
        Set(level, "Partition", decomposition);
        return;
    } else if (numParts == -1) {
        // No repartitioning: a null vector is stored as the partition
        RCP<Xpetra::Vector<GO,LO,GO,NO> > decomposition = Teuchos::null; //Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, true);
        //decomposition->putScalar(Teuchos::as<Scalar>(rowMap->getComm()->getRank()));
        Set(level, "Partition", decomposition);
        return;
    }


    const ParameterList& pL = GetParameterList();

    RCP<const ParameterList> providedList = pL.get<RCP<const ParameterList> >("ParameterList");
    ParameterList Zoltan2Params;
    if (providedList != Teuchos::null)
        Zoltan2Params = *providedList;

    // Merge default Zoltan2 parameters with user provided ones.
    // If default and user parameters contain the same parameter name, the user one is always preferred.
    // NOTE: every default is read back as a std::string.
    for (ParameterList::ConstIterator param = defaultZoltan2Params->begin(); param != defaultZoltan2Params->end(); param++) {
        const std::string& pName = defaultZoltan2Params->name(param);
        if (!Zoltan2Params.isParameter(pName))
            Zoltan2Params.set(pName, defaultZoltan2Params->get<std::string>(pName));
    }
    Zoltan2Params.set("num_global_parts", Teuchos::as<int>(numParts));

    GetOStream(Runtime0) << "Zoltan2 parameters:\n----------\n" << Zoltan2Params << "----------" << std::endl;

    const std::string& algo = Zoltan2Params.get<std::string>("algorithm");
    TEUCHOS_TEST_FOR_EXCEPTION(algo != "multijagged" && algo != "rcb", Exceptions::RuntimeError,
                               "Unknown partitioning algorithm: \"" << algo << "\"");

    typedef Zoltan2::XpetraMultiVectorAdapter<dMultiVector>  InputAdapterType;
    typedef Zoltan2::PartitioningProblem<InputAdapterType>   ProblemType;

    int rowWeight = pL.get<int>("rowWeight");
    GetOStream(Runtime0) << "Using weights formula: nnz + " << rowWeight << std::endl;

    // One weight per coordinate (block row): the sum over the block's rows of (nnz + rowWeight)
    Array<double> weightsPerRow(numElements);
    for (LO i = 0; i < numElements; i++) {
        weightsPerRow[i] = 0.0;

        for (LO j = 0; j < blkSize; j++) {
            weightsPerRow[i] += A->getNumEntriesInLocalRow(i*blkSize+j);
            // Zoltan2 pqJagged gets as good partitioning as Zoltan RCB in terms of nnz
            // but Zoltan also gets a good partitioning in rows, which sometimes does not
            // happen for Zoltan2. So here is an attempt to get a better row partitioning
            // without significantly screwing up nnz partitioning
            // NOTE: no good heuristic here, the value was chosen almost randomly
            weightsPerRow[i] += rowWeight;
        }
    }

    std::vector<int>           strides;
    std::vector<const double*> weights(1, weightsPerRow.getRawPtr());

    // Zoltan2 is handed a raw MPI_Comm taken from a duplicate of the row map's communicator
    RCP<const Teuchos::MpiComm<int> >            dupMpiComm = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(rowMap->getComm()->duplicate());
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > zoltanComm = dupMpiComm->getRawMpiComm();

    InputAdapterType adapter(coords, weights, strides);
    RCP<ProblemType> problem(new ProblemType(&adapter, &Zoltan2Params, (*zoltanComm)()));

    {
        SubFactoryMonitor m1(*this, "Zoltan2 " + toString(algo), level);
        problem->solve();
    }

    RCP<Xpetra::Vector<GO,LO,GO,NO> > decomposition = Xpetra::VectorFactory<GO,LO,GO,NO>::Build(rowMap, false);
    ArrayRCP<GO>                      decompEntries = decomposition->getDataNonConst(0);

    const typename InputAdapterType::part_t * parts = problem->getSolution().getPartListView();

    // Expand the per-coordinate part number to every row of the corresponding block
    for (GO i = 0; i < numElements; i++) {
        int partNum = parts[i];

        for (LO j = 0; j < blkSize; j++)
            decompEntries[i*blkSize + j] = partNum;
    }

    Set(level, "Partition", decomposition);
}
  // Compute a row partition of "A" with Zoltan and store it on the level as "Partition".
  //
  // Reads "A" and "Coordinates" through the factory and "number of partitions"
  // directly from the level. When only one partition is requested an all-zero
  // vector is stored. Otherwise Zoltan's LB_Partition is invoked; if it reports
  // a changed decomposition, every local row starts out assigned to the calling
  // rank and the rows of each exported block are re-assigned to the part Zoltan
  // chose. If the decomposition did not change, a null vector is stored.
  //
  // Throws Exceptions::RuntimeError when a mandatory Zoltan parameter cannot be
  // set or when LB_Partition returns ZOLTAN_FATAL.
  void ZoltanInterface<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& level) const {
    FactoryMonitor m(*this, "Build", level);

    typedef Xpetra::Vector<GO, LO, GO, NO>        PartitionVector;
    typedef Xpetra::VectorFactory<GO, LO, GO, NO> PartitionVectorFactory;

    RCP<Matrix>      matrix      = Get< RCP<Matrix> >(level, "A");
    RCP<const Map>   rowMap      = matrix->getRowMap();

    RCP<MultiVector> coordinates = Get< RCP<MultiVector> >(level, "Coordinates");
    size_t           problemDim  = coordinates->getNumVectors();

    GO               partCount   = level.Get<GO>("number of partitions");

    // A single partition needs no call into Zoltan: everything goes to part 0.
    if (partCount == 1) {
      RCP<PartitionVector> trivialDecomposition = PartitionVectorFactory::Build(rowMap, true);
      Set(level, "Partition", trivialDecomposition);
      return;
    }

    float zoltanVersion;
    Zoltan_Initialize(0, NULL, &zoltanVersion);

    // Zoltan works on a raw MPI_Comm extracted from a duplicate of the row map's communicator.
    RCP<const Teuchos::MpiComm<int> >            mpiCommCopy = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(rowMap->getComm()->duplicate());
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawComm     = mpiCommCopy->getRawMpiComm();

    RCP<Zoltan> zoltan = rcp(new Zoltan((*rawComm)()));
    if (zoltan == Teuchos::null)
      throw Exceptions::RuntimeError("MueLu::Zoltan : Unable to create Zoltan data structure");

    // Describe the IDs handed to Zoltan: one entry per global ID, no local IDs,
    // and one weight per object.
    int rv;

    rv = zoltan->Set_Param("num_gid_entries", "1");
    if (rv != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_gid_entries' returned error code " + Teuchos::toString(rv));

    rv = zoltan->Set_Param("num_lid_entries", "0");
    if (rv != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_lid_entries' returned error code " + Teuchos::toString(rv));

    rv = zoltan->Set_Param("obj_weight_dim", "1");
    if (rv != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'obj_weight_dim' returned error code " + Teuchos::toString(rv));

    // Zoltan's own diagnostics are switched on only at the Statistics1 verbosity level.
    zoltan->Set_Param("debug_level", (GetVerbLevel() & Statistics1) ? "1" : "0");

    zoltan->Set_Param("num_global_partitions", toString(partCount));

    // Query callbacks; each one receives the pointer registered alongside it.
    zoltan->Set_Num_Obj_Fn(GetLocalNumberOfRows,      (void *) &*matrix);
    zoltan->Set_Obj_List_Fn(GetLocalNumberOfNonzeros, (void *) &*matrix);
    zoltan->Set_Num_Geom_Fn(GetProblemDimension,      (void *) &problemDim);
    zoltan->Set_Geom_Multi_Fn(GetProblemGeometry,     (void *) coordinates.get());

    // Output arguments of LB_Partition.
    int           decompChanged;        // nonzero if the decomposition has changed
    int           gidEntryCount;        // number of array entries in a global ID
    int           lidEntryCount;        // number of array entries in a local ID
    int           importCount;          // number of objects to be imported
    ZOLTAN_ID_PTR importGids   = NULL;  // global IDs of objects to be imported
    ZOLTAN_ID_PTR importLids   = NULL;  // local IDs of objects to be imported
    int*          importProcs  = NULL;  // owning processes of objects to be imported
    int*          importToPart = NULL;  // parts to which imported objects are assigned
    int           exportCount;          // number of objects to be exported
    ZOLTAN_ID_PTR exportGids   = NULL;  // global IDs of objects to be exported
    ZOLTAN_ID_PTR exportLids   = NULL;  // local IDs of objects to be exported
    int*          exportProcs  = NULL;  // destination processes of objects to be exported
    int*          exportToPart = NULL;  // parts to which exported objects are assigned

    {
      SubFactoryMonitor m1(*this, "Zoltan RCB", level);
      rv = zoltan->LB_Partition(decompChanged, gidEntryCount, lidEntryCount,
                                importCount, importGids, importLids, importProcs, importToPart,
                                exportCount, exportGids, exportLids, exportProcs, exportToPart);
      if (rv == ZOLTAN_FATAL)
        throw Exceptions::RuntimeError("Zoltan::LB_Partition() returned error code");
    }

    // TODO check that A's row map is 1-1.  Zoltan requires this.

    RCP<PartitionVector> decomposition;
    if (decompChanged) {
      // Contents are not initialized by the factory; every entry is written below.
      decomposition = PartitionVectorFactory::Build(rowMap, false);
      ArrayRCP<GO> partOfRow = decomposition->getDataNonConst(0);

      // By default every row stays with the calling rank ...
      const int myRank = rowMap->getComm()->getRank();
      for (typename ArrayRCP<GO>::size_type r = 0; r < partOfRow.size(); ++r)
        partOfRow[r] = myRank;

      // ... and only exported blocks are moved. The Zoltan GID of a block is the
      // GID of its first row (these differ from the GIDs of the Coordinates vector).
      const LO rowsPerBlock = matrix->GetFixedBlockSize();
      for (int e = 0; e < exportCount; ++e) {
        const LO  firstLocalRow = rowMap->getLocalElement(exportGids[e]);
        const int targetPart    = exportToPart[e];
        for (LO offset = 0; offset < rowsPerBlock; ++offset)
          partOfRow[firstLocalRow + offset] = targetPart;
      }
    }

    Set(level, "Partition", decomposition);

    // Release the arrays allocated by Zoltan inside LB_Partition.
    zoltan->LB_Free_Part(&importGids, &importLids, &importProcs, &importToPart);
    zoltan->LB_Free_Part(&exportGids, &exportLids, &exportProcs, &exportToPart);

  } //Build()
Ejemplo n.º 9
0
  // Unit test: builds a 9x9 mesh problem with 3 DOFs per node and a random
  // number of nonzeros per row, partitions it with ZoltanInterface, and compares
  // the number of nonzeros per resulting partition with precomputed values for
  // 1 to 5 processes (the comparison is only compiled on Linux).
  TEUCHOS_UNIT_TEST(Zoltan, Build3PDEs)
  {

    typedef Teuchos::ScalarTraits<Scalar> ST;

    out << "version: " << MueLu::Version() << std::endl;
    out << std::endl;
    out << "This tests that the partitioning produced by Zoltan is \"reasonable\" for a matrix" << std::endl;
    out << "that has a random number of nonzeros per row and 3 DOFs per mesh point.  Good results have been precomputed" << std::endl;
    out << "for up to 5 processors.  The results are the number of nonzeros in the local matrix" << std::endl;
    out << "once the Zoltan repartitioning has been applied." << std::endl;
    out << "The results can be viewed in Paraview by enabling code guarded by the macro MUELU_VISUALIZE_REPARTITIONING" << std::endl;

    RCP<const Teuchos::Comm<int> > comm = TestHelpers::Parameters::getDefaultComm();

    // Expected results exist only for 1..5 processes; otherwise pass trivially.
    if (comm->getSize() > 5) {
      out << std::endl;
      out << "This test must be run on 1 to 5 processes." << std::endl;
      TEST_EQUALITY(true, true);
      return;
    }

    Level level;
    RCP<FactoryManagerBase> factoryHandler = rcp(new FactoryManager());
    level.SetFactoryManager(factoryHandler);
    int nx=9;
    int ny=nx;
    int dofsPerNode = 3;
    GO numGlobalElements = nx*ny*dofsPerNode;
    size_t maxEntriesPerRow=30;

    RCP<const Map> map;
    int numMyNodes = numGlobalElements / dofsPerNode;
    if (comm->getSize() > 1) {
      // In parallel, make sure that the dof's associated with a node all
      // reside on the same processor.
      int numNodes = numGlobalElements / dofsPerNode;
      TEUCHOS_TEST_FOR_EXCEPTION( (numGlobalElements - numNodes * dofsPerNode) != 0, MueLu::Exceptions::RuntimeError,
                                  "Number of matrix rows is not divisible by #dofs" );
      int nproc = comm->getSize();
      // The last rank takes whatever nodes remain after the even split.
      if (comm->getRank() < nproc-1) numMyNodes = numNodes / nproc;
      else numMyNodes = numNodes - (numNodes/nproc) * (nproc-1);
      map = MapFactory::createContigMap(TestHelpers::Parameters::getLib(), numGlobalElements, numMyNodes*dofsPerNode, comm);
    } else {
      map = MapFactory::createUniformContigMap(TestHelpers::Parameters::getLib(), numGlobalElements, comm);
    }

    const size_t numMyElements = map->getNodeNumElements();
    Teuchos::ArrayView<const GlobalOrdinal> myGlobalElements = map->getNodeElementList();
    RCP<Matrix> A = rcp(new CrsMatrixWrap(map, 1)); // Force underlying linear algebra library to allocate more
                                                    // memory on the fly.  While not super efficient, this
                                                    // ensures that no zeros are being stored.  Thus, from
                                                    // Zoltan's perspective the matrix is imbalanced.
    // Populate CrsMatrix with random number of entries (up to maxEntriesPerRow) per row.
    // Create a vector with random integer entries in [1,maxEntriesPerRow].
    ST::seedrandom(666*comm->getRank());
    RCP<Xpetra::Vector<LO,LO,GO,NO> > entriesPerRow = Xpetra::VectorFactory<LO,LO,GO,NO>::Build(map,false);
    Teuchos::ArrayRCP<LO> eprData = entriesPerRow->getDataNonConst(0);
    for (Teuchos::ArrayRCP<LO>::iterator i=eprData.begin(); i!=eprData.end(); ++i) {
      // The formula yields maxEntriesPerRow+1 when the random number is exactly 1;
      // clamp so the fixed-size vals/cols buffers below are never overrun.
      *i = std::min<LO>((LO)(std::floor(((ST::random()+1)*0.5*maxEntriesPerRow)+1)),
                        static_cast<LO>(maxEntriesPerRow));
    }

    RCP<Teuchos::FancyOStream> fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
    fos->setOutputToRootOnly(-1);

    Teuchos::Array<Scalar> vals(maxEntriesPerRow);
    Teuchos::Array<GO> cols(maxEntriesPerRow);
    for (size_t i = 0; i < numMyElements; ++i) {
      Teuchos::ArrayView<SC> av(&vals[0],eprData[i]);
      Teuchos::ArrayView<GO> iv(&cols[0],eprData[i]);
      //stick in ones for values
      for (LO j=0; j< eprData[i]; ++j) vals[j] = ST::one();
      //figure out valid column indices: a contiguous run ending at the row's own GID,
      //shifted so that it never starts below 0
      GO start = std::max<GO>(myGlobalElements[i]-eprData[i]+1,0);
      for (LO j=0; j< eprData[i]; ++j) cols[j] = start+j;
      A->insertGlobalValues(myGlobalElements[i], iv, av);
    }

    A->fillComplete();

    // Now treat the matrix as if it has 3 DOFs per node.
    A->SetFixedBlockSize(dofsPerNode);
    level.Set("A",A);

    //build coordinates
    Teuchos::ParameterList list;
    list.set("nx",nx);
    list.set("ny",ny);
    RCP<const Map> coalescedMap = MapFactory::createContigMap(TestHelpers::Parameters::getLib(), numGlobalElements/dofsPerNode, numMyNodes, comm);
    RCP<MultiVector> XYZ = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("2D",coalescedMap,list);

    // XYZ are the "coalesce" coordinates as it has been generated for 1 DOF/node and we are using them for 3 DOFS/node
    // level.Set("Coordinates",XYZ); "Coordinates" == uncoalesce. "X,Y,ZCoordinates" == coalesce
    {
      RCP<MultiVector> coordinates = XYZ;

      // making a copy because I don't want to keep 'open' the Xpetra_MultiVector
      if (coordinates->getNumVectors() >= 1) {
        Teuchos::ArrayRCP<const SC> coord = coordinates->getData(0);
        Teuchos::ArrayRCP<SC> coordCpy(coord.size());
        for(int i=0; i<coord.size(); i++) {
          coordCpy[i] = coord[i];
        }
        level.Set("XCoordinates", coordCpy);
        //std::cout << coordCpy << std::endl;
      }

      if (coordinates->getNumVectors() >= 2) {
        Teuchos::ArrayRCP<const SC> coord = coordinates->getData(1);
        Teuchos::ArrayRCP<SC> coordCpy(coord.size());
        for(int i=0; i<coord.size(); i++) {
          coordCpy[i] = coord[i];
        }
        level.Set("YCoordinates", coordCpy);
      }

      /*if (coordinates->getNumVectors() >= 3) {
        Teuchos::ArrayRCP<const SC> coord = coordinates->getData(2);
        Teuchos::ArrayRCP<SC> coordCpy(coord.size());
        for(int i=0; i<coord.size(); i++) {
          coordCpy[i] = coord[i];
        }
        level.Set("ZCoordinates", coordCpy);
        }*/
    }

    //coalescedMap->describe(*fos,Teuchos::VERB_EXTREME);
    //sleep(1); comm->barrier();
    //XYZ->describe(*fos,Teuchos::VERB_EXTREME);

    // Stored as GO so that the type matches a level.Get<GO>("number of partitions") lookup.
    GO numPartitions = comm->getSize();
    level.Set("number of partitions",numPartitions);
    RCP<ZoltanInterface> zoltan = rcp(new ZoltanInterface());
    //zoltan->SetOutputLevel(0); //options are 0=none, 1=summary, 2=every pid prints
    level.Request("Partition",zoltan.get());
    zoltan->Build(level);

    RCP<Xpetra::Vector<GO,LO,GO,NO> > decomposition = level.Get<RCP<Xpetra::Vector<GO,LO,GO,NO> > >("Partition",zoltan.get());
    /* //temporary code to have the trivial decomposition (no change)
    ArrayRCP<GO> decompEntries = decomposition->getDataNonConst(0);
    for (ArrayRCP<GO>::iterator i = decompEntries.begin(); i != decompEntries.end(); ++i)
      *i = comm->getRank();
    decompEntries=Teuchos::null;
    */

    //Create vector whose local length is the global number of partitions.
    //This vector will record the local number of nonzeros associated with each partition.
    Teuchos::Array<GO> parts(numPartitions);
    for (int i=0; i<numPartitions; ++i) parts[i] = i;
    Teuchos::ArrayView<GO> partsView(&parts[0],numPartitions);
    RCP<const Map> partitionMap = MapFactory::Build(TestHelpers::Parameters::getLib(),
                                                    Teuchos::OrdinalTraits<global_size_t>::invalid(), partsView,
                                                    map->getIndexBase(),comm);
    RCP<Xpetra::Vector<LO,LO,GO,NO> > localPartsVec = Xpetra::VectorFactory<LO,LO,GO,NO>::Build(partitionMap);

    RCP<Xpetra::Vector<LO,LO,GO,NO> > nnzPerRow = Xpetra::VectorFactory<LO,LO,GO,NO>::Build(A->getRowMap());
    // Both vectors store LO entries, so their data views must be ArrayRCP<LO>.
    Teuchos::ArrayRCP<LO> nnzData = nnzPerRow->getDataNonConst(0);
    //For the local rows in each partition, tally up the number of nonzeros.  This is what
    //Zoltan should be load-balancing.
    Teuchos::ArrayRCP<LO> lpvData = localPartsVec->getDataNonConst(0);
    Teuchos::ArrayRCP<const GO> decompData = decomposition->getData(0);
    for (size_t i=0; i<decomposition->getLocalLength();++i) {
      Teuchos::ArrayView<const LO> c;
      Teuchos::ArrayView<const SC> v;
      A->getLocalRowView(i,c,v);
      lpvData[decompData[i]] += v.size();
      nnzData[i] = v.size();
    }

    // Release the data views before the vectors are used again.
    lpvData = Teuchos::null;
    decompData = Teuchos::null;
    nnzData = Teuchos::null;

    /*
    if (comm->getRank() == 0)
      std::cout << "nnz per row" << std::endl;
    nnzPerRow->describe(*fos,Teuchos::VERB_EXTREME);

    if (comm->getRank() == 0)
      std::cout << "Row-to-partition assignment (from Zoltan)" << std::endl;
    decomposition->describe(*fos,Teuchos::VERB_EXTREME);

    if (comm->getRank() == 0)
      std::cout << "#nonzeros per partition" << std::endl;
    localPartsVec->describe(*fos,Teuchos::VERB_EXTREME);
    */

    //Send the local nnz tallies to pid 0, which can report the global sums.
    size_t mysize=1;
    if (comm->getRank() == 0) mysize = numPartitions;
    RCP<const Map> globalTallyMap = MapFactory::Build(TestHelpers::Parameters::getLib(),
                                                Teuchos::OrdinalTraits<global_size_t>::invalid(),
                                                mysize,
                                                map->getIndexBase(),
                                                comm);
    RCP<Xpetra::Vector<LO,LO,GO,NO> > globalTallyVec = Xpetra::VectorFactory<LO,LO,GO,NO>::Build(globalTallyMap);
    RCP<const Export> exporter = ExportFactory::Build( partitionMap, globalTallyMap);
    globalTallyVec->doExport(*localPartsVec,*exporter,Xpetra::ADD);

    // Precomputed number of nonzeros per partition, indexed by process count.
    ArrayRCP<GO> expectedResults(numPartitions);
    switch (comm->getSize()) {
       case 1:
         expectedResults[0] = 3951;
         break;

       case 2:
         expectedResults[0] = 1955;
         expectedResults[1] = 1910;
         break;

       case 3:
         expectedResults[0] = 1326;
         expectedResults[1] = 1340;
         expectedResults[2] = 1321;
         break;

       case 4:
         expectedResults[0] = 950;
         expectedResults[1] = 922;
         expectedResults[2] = 908;
         expectedResults[3] = 936;
         break;

       case 5:
         expectedResults[0] = 774;
         expectedResults[1] = 735;
         expectedResults[2] = 726;
         expectedResults[3] = 771;
         expectedResults[4] = 759;
         break;

       default:
         break;
    };

    ArrayRCP<const LO> gtvData = globalTallyVec->getData(0);

#ifdef __linux__
    out << "Checking results..." << std::endl;
    for (int i=0; i<numPartitions; ++i) {
      if (comm->getRank() == 0) TEST_EQUALITY( expectedResults[i], gtvData[i]);
    }
#endif

#ifdef MUELU_VISUALIZE_REPARTITIONING
    //
    //Now write everything to a comma-separate list that ParaView can grok
    //
    Teuchos::ArrayRCP<const Scalar> X = XYZ->getData(0);
    Teuchos::ArrayRCP<const Scalar> Y = XYZ->getData(1);
    Teuchos::ArrayRCP<const GO> D = decomposition->getData(0);
    RCP<std::ofstream> outFile;
    std::string fileName = "zoltanResults.csv";

    //write header information
    if (comm->getRank() == 0) {
      outFile = rcp(new std::ofstream(fileName.c_str()));
      *outFile << "x coord, y coord, z coord, partition, row weight" << std::endl;
    }
    comm->barrier();

    //append coordinates
    nnzData = nnzPerRow->getDataNonConst(0);
    for (int j=0; j<comm->getSize(); ++j) {
      int mypid = comm->getRank();
      if (mypid == j) {
        outFile = rcp(new std::ofstream(fileName.c_str(),std::ios::app));
        int blockSize = A->GetFixedBlockSize();
        //Coordinates are for coalesced system, D is for uncoalesced
        for (int i=0; i < D.size()/blockSize; ++i) {
          // Row weight of a mesh point = total nonzeros over the rows of its block
          int nnz=0;
          for (int k=0; k<blockSize; ++k)
            nnz += nnzData[i*blockSize+k];
          *outFile << X[i] << ", " << Y[i] << ", " << ST::zero() << ", "
                   << D[i*blockSize] << ", " << nnz << std::endl;
        }
      }
    } //for (int j=0; j<comm->getSize(); ++j)

    out << std::endl;
    out << "You can view the Zoltan decomposition in ParaView 3.10.1 or later:" << std::endl;
    out << "   1) Load the data file " << fileName << "." << std::endl;
    out << "   2) Run the filter Filters/ Alphabetical/ Table To Points." << std::endl;
    out << "   3) Tell ParaView what columns are the X, Y and Z coordinates." << std::endl;
    out << "   4) Split screen horizontally (Icon, top right)." << std::endl;
    out << "   5) Click on the eyeball in the Pipeline Browser to see the points." << std::endl;
    out << "   6) Under the Display tab, you can color points by scalar value and resize them." << std::endl;
    out << std::endl;
    out << " To display row weights next to each point:" << std::endl;
    out << "   1) Click the \"Select Points Through\" button (2nd row) and select all points." << std::endl;
    out << "   2) Under View pull-down menu, choose the \"Selection Inspector\"." << std::endl;
    out << "   3) Under the Point Label, check the Visible box and set the Label Mode to \"row weight\"." << std::endl;
#endif

  } //Build3PDEs
  // Build a filtered copy of "A" that keeps only the entries whose column
  // belongs to a graph neighbor of the row's node, and store it on the level
  // as "A".
  //
  // If the "Filtering" flag on the level is false, the input matrix is passed
  // through unchanged. Otherwise, for every vertex i of "Graph" the columns of
  // its neighbor vertices (expanded by the fixed block size of A) are marked in
  // a scratch array, and each of the vertex's rows is copied keeping only the
  // marked columns. When the "lumping" parameter is set, the sum of the dropped
  // values of a row is added to that row's diagonal entry.
  void FilteredAFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& currentLevel) const {
    using Teuchos::as;

    FactoryMonitor m(*this, "Matrix filtering", currentLevel);

    RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
    if (currentLevel.Get<bool>("Filtering", currentLevel.GetFactoryManager()->GetFactory("Filtering").get()) == false) {
      GetOStream(Runtime0,0) << "Filtered matrix is not being constructed as no filtering is being done" << std::endl;
      Set(currentLevel, "A", A);
      return;
    }

    const ParameterList& pL = GetParameterList();
    RCP<GraphBase>  G = Get< RCP<GraphBase> >(currentLevel, "Graph");
    bool      lumping = pL.get<bool>("lumping");
    size_t    blkSize = A->GetFixedBlockSize();

    if (lumping)
      GetOStream(Runtime0,0) << "Lumping dropped entries" << std::endl;

    // The filtered matrix shares A's row and column maps; A's maximum row length
    // is an upper bound for every filtered row.
    RCP<Matrix> filteredA = MatrixFactory::Build(A->getRowMap(), A->getColMap(), A->getNodeMaxNumRowEntries(), Xpetra::StaticProfile);

    Array<LO>   newInds;
    Array<SC>   newVals;
    // filter[c] != 0  <=>  local column c is kept for the vertex currently processed
    Array<char> filter(blkSize*G->GetImportMap()->getNodeNumElements(), 0);

    // Sentinel meaning "no diagonal entry was kept in this row"
    const size_t noDiagonal = as<size_t>(-1);

    size_t numGRows = G->GetNodeNumVertices(), numInds = 0, diagIndex;
    SC diagExtra;
    for (size_t i = 0; i < numGRows; i++) {
      // Set up filtering array
      Teuchos::ArrayView<const LO> indsG = G->getNeighborVertices(i);
      for (size_t j = 0; j < as<size_t> (indsG.size()); j++)
        for (size_t k = 0; k < blkSize; k++)
          filter[indsG[j]*blkSize+k] = 1;

      for (size_t k = 0; k < blkSize; k++) {
        LocalOrdinal row = i*blkSize+k;
        ArrayView<const LO> oldInds;
        ArrayView<const SC> oldVals;
        A->getLocalRowView(row, oldInds, oldVals);

        diagIndex = noDiagonal;
        diagExtra = Teuchos::ScalarTraits<SC>::zero();

        newInds.resize(oldInds.size());
        newVals.resize(oldVals.size());
        numInds = 0;
        for (size_t j = 0; j < as<size_t> (oldInds.size()); j++)
          if (filter[oldInds[j]]) {
            newInds[numInds] = oldInds[j];
            newVals[numInds] = oldVals[j];

            // Remember diagonal position
            if (newInds[numInds] == row)
              diagIndex = numInds;
            numInds++;

          } else {
            // Accumulate dropped values so they can be lumped onto the diagonal
            diagExtra += oldVals[j];
          }
        // Lump dropped entries
        // NOTE
        //  * Does it make sense to lump for elasticity?
        //  * Is it different for diffusion and elasticity?
        // Lumping is only possible when a diagonal entry was kept; without this
        // guard newVals would be indexed with the "not found" sentinel.
        if (lumping && diagIndex != noDiagonal)
          newVals[diagIndex] += diagExtra;

        newInds.resize(numInds);
        newVals.resize(numInds);

        // Because we used a column map in the construction of the matrix
        // we can just use insertLocalValues here instead of insertGlobalValues
        filteredA->insertLocalValues(row, newInds, newVals);
      }

      // Clean up filtering array
      for (size_t j = 0; j < as<size_t> (indsG.size()); j++)
        for (size_t k = 0; k < blkSize; k++)
          filter[indsG[j]*blkSize+k] = 0;
    }
    RCP<ParameterList> fillCompleteParams(new ParameterList);
    fillCompleteParams->set("No Nonlocal Changes", true);
    filteredA->fillComplete(A->getDomainMap(), A->getRangeMap(), fillCompleteParams);

    filteredA->SetFixedBlockSize(blkSize);

    // TODO: Can we reuse max eigenvalue from A?
    // filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());

    Set(currentLevel, "A", filteredA);
  }