Example no. 1
    // test output of Build() and BuildSmoother()
    void testBuildCheck(RCP<SmootherFactory> smooFact, Level& level, RCP<SmootherPrototype> smooProtoA, RCP<SmootherPrototype> smooProtoB, MueLu::PreOrPost preOrPost, Teuchos::FancyOStream & out, bool & success) {
      // invariant: smoother prototypes kept unchanged
      check(smooFact, smooProtoA, smooProtoB, out, success);

      // output test
      if (preOrPost == MueLu::BOTH) {

        testBuildCheckOutput(smooFact, level, smooProtoA, "PreSmoother", out, success);
        testBuildCheckOutput(smooFact, level, smooProtoB, "PostSmoother", out, success);

        // Reuse: if the pre and post prototypes are the same, then pre smoother == post smoother;
        // otherwise they are different (a case not covered by the previous tests)
        RCP<SmootherBase> smooA, smooB;
        if (smooProtoA != Teuchos::null) { smooA = level.Get< RCP<SmootherBase> >("PreSmoother",  smooFact.get()); }
        if (smooProtoB != Teuchos::null) { smooB = level.Get< RCP<SmootherBase> >("PostSmoother", smooFact.get()); }
        if (smooProtoA == smooProtoB) { TEST_EQUALITY(smooA, smooB); } else { TEST_INEQUALITY(smooA, smooB); }

      } else if (preOrPost == MueLu::PRE) {

        testBuildCheckOutput(smooFact, level, smooProtoA, "PreSmoother", out, success);
        TEST_EQUALITY(level.IsAvailable("PostSmoother", smooFact.get()), false);

      } else if (preOrPost == MueLu::POST) {

        TEST_EQUALITY(level.IsAvailable("PreSmoother", smooFact.get()), false);
        testBuildCheckOutput(smooFact, level, smooProtoB, "PostSmoother", out, success);

      } else { TEST_EQUALITY(true, false); }

    }
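
A minimal driver sketch for the helper above. This is hedged: the FakeSmootherPrototype constructor, the TestHelpers setup call, and SmootherFactory::Build(level) are assumed from the MueLu test harness used elsewhere in these examples.

    // Sketch only: exercise Build() with both prototypes set, then verify the result.
    void exampleTestDriver(Teuchos::FancyOStream& out, bool& success) {
      RCP<SmootherPrototype> protoA   = rcp(new FakeSmootherPrototype());            // assumed default constructor
      RCP<SmootherPrototype> protoB   = rcp(new FakeSmootherPrototype());
      RCP<SmootherFactory>   smooFact = rcp(new SmootherFactory(protoA, protoB));    // pre-/post-smoother prototypes

      Level level;
      TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(level);
      level.Request("PreSmoother",  smooFact.get());
      level.Request("PostSmoother", smooFact.get());

      smooFact->Build(level); // generates both smoothers from the prototypes

      testBuildCheck(smooFact, level, protoA, protoB, MueLu::BOTH, out, success);
    }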
Example no. 2
  void EminPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator minimization", coarseLevel);

    const ParameterList & pL = GetParameterList();

    // Set keep flags
    if (pL.isParameter("Keep P0") && pL.get<bool>("Keep P0"))
      coarseLevel.Keep("P0",this);
    if (pL.isParameter("Keep Constraint0") && pL.get<bool>("Keep Constraint0"))
      coarseLevel.Keep("Constraint0",this);

    // Reuse
    int Niterations;

    // Get A, B
    RCP<Matrix>      A = Get< RCP<Matrix> >     (fineLevel,   "A");
    RCP<MultiVector> B = Get< RCP<MultiVector> >(fineLevel,   "Nullspace");

    // Get P0 or make P
    RCP<Matrix>      P0;
    if (coarseLevel.IsAvailable("P0", this)) {
      P0          = coarseLevel.Get<RCP<Matrix> >("P0", this);
      Niterations = pL.get<int>("Reuse Niterations");
      GetOStream(Runtime0, 0) << "EminPFactory: Reusing P0"<<std::endl;

    } else {
      P0          = Get< RCP<Matrix> >(coarseLevel, "P");
      Niterations = pL.get<int>("Niterations");
    }

    // Get Constraint0 or make Constraint
    RCP<Constraint> X;
    if (coarseLevel.IsAvailable("Constraint0", this)) {
      X = coarseLevel.Get<RCP<Constraint> >("Constraint0", this);
      GetOStream(Runtime0, 0) << "EminPFactory: Reusing Constraint0"<<std::endl;

    } else {
      X = Get< RCP<Constraint> > (coarseLevel, "Constraint");
    }


    RCP<Matrix> P;
    CGSolver EminSolver(Niterations);
    EminSolver.Iterate(*A, *X, *P0, *B, P);

    Set(coarseLevel, "Constraint0", X);
    Set(coarseLevel, "P",           P);
    Set(coarseLevel, "P0",          P);

    RCP<ParameterList> params = rcp(new ParameterList());
    params->set("printLoadBalancingInfo", true);
    GetOStream(Statistics0,0) << Utils::PrintMatrixInfo(*P, "P", params);
  }
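
The reuse path above is driven purely by parameters and kept level data. A hypothetical configuration sketch follows; the parameter names are taken from the code above, while the SetParameterList setter and the short-name typedefs are assumptions.

  // Sketch only: configure EminPFactory so P0/Constraint0 are kept and reused on later builds.
  RCP<EminPFactory> eminFact = rcp(new EminPFactory());
  Teuchos::ParameterList eminParams;
  eminParams.set("Keep P0",           true); // keep the coarse prolongator on the level for the next Build()
  eminParams.set("Keep Constraint0",  true); // keep the constraint as well
  eminParams.set("Niterations",       3);    // CG iterations on a fresh build
  eminParams.set("Reuse Niterations", 2);    // (typically fewer) CG iterations when P0 is reused
  eminFact->SetParameterList(eminParams);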
Example no. 3
  void MapTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level & fineLevel, Level & coarseLevel) const {
    typedef Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> OperatorClass; //TODO
    typedef Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> MapClass;
    typedef Xpetra::MapFactory<LocalOrdinal, GlobalOrdinal, Node> MapFactoryClass;

    Monitor m(*this, "Contact Map transfer factory");

    if (fineLevel.IsAvailable(mapName_, mapFact_.get())==false) {
        GetOStream(Runtime0, 0) << "MapTransferFactory::Build: User provided map " << mapName_ << " not found in Level class." << std::endl;
    }

    // fetch map extractor from level
    RCP<const MapClass> transferMap = fineLevel.Get<RCP<const MapClass> >(mapName_,mapFact_.get());

    // Get default tentative prolongator factory
    // Getting it this way ensures that the same factory instance is used for both SaPFactory and NullspaceFactory.
    // -- Warning: do not use initialPFact_ directly; use the local handle (tentPFact here) everywhere!
    RCP<const FactoryBase> tentPFact = GetFactory("P");
    if (tentPFact == Teuchos::null) { tentPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }
    TEUCHOS_TEST_FOR_EXCEPTION(!coarseLevel.IsAvailable("P",tentPFact.get()),Exceptions::RuntimeError, "MueLu::MapTransferFactory::Build(): P (generated by TentativePFactory) not available.");
    RCP<OperatorClass> Ptent = coarseLevel.Get<RCP<OperatorClass> >("P", tentPFact.get());

    std::vector<GlobalOrdinal > coarseMapGids;

    // loop over local rows of Ptent
    for(size_t row=0; row<Ptent->getNodeNumRows(); row++) {

      GlobalOrdinal grid = Ptent->getRowMap()->getGlobalElement(row);
      if(transferMap->isNodeGlobalElement(grid)) {

        Teuchos::ArrayView<const LocalOrdinal> indices;
        Teuchos::ArrayView<const Scalar> vals;
        Ptent->getLocalRowView(row, indices, vals);

        for(size_t i=0; i<(size_t)indices.size(); i++) {
          // mark all columns of Ptent(grid,*) as coarse DOFs of the next-level transferMap
          GlobalOrdinal gcid = Ptent->getColMap()->getGlobalElement(indices[i]);
          coarseMapGids.push_back(gcid);
        }
      } // end if isNodeGlobalElement(grid)
    }

    // build coarse transfer map from the collected column GIDs
    std::sort(coarseMapGids.begin(), coarseMapGids.end());
    coarseMapGids.erase(std::unique(coarseMapGids.begin(), coarseMapGids.end()), coarseMapGids.end());
    Teuchos::ArrayView<GlobalOrdinal> coarseMapGidsView (&coarseMapGids[0],coarseMapGids.size());
    Teuchos::RCP<const MapClass> coarseTransferMap = MapFactoryClass::Build(Ptent->getColMap()->lib(), -1, coarseMapGidsView, Ptent->getColMap()->getIndexBase(), Ptent->getColMap()->getComm());

    // store map extractor in coarse level
    coarseLevel.Set(mapName_, coarseTransferMap, mapFact_.get());
  }
Example no. 4
    // test if a smoother created by Build() is correct (check if it corresponds to the prototype)
    void testBuildCheckOutput(RCP<SmootherFactory> smooFact, Level& level, RCP<SmootherPrototype> smooProto, const std::string& tag, Teuchos::FancyOStream & out, bool & success) {
      if (smooProto == Teuchos::null) {
        TEST_EQUALITY(level.IsAvailable(tag, smooFact.get()), false);
      } else {
        RCP<SmootherBase> smoother;
        TEST_NOTHROW(smoother = level.Get< RCP<SmootherBase> >(tag, smooFact.get()));

        TEST_INEQUALITY(smoother, Teuchos::null);
        TEST_INEQUALITY(smoother, smooProto);

        if (smooProto != Teuchos::null) {
          RCP<FakeSmootherPrototype> smootherF;

          // output test: smoother has the same derived class as the prototype
          TEST_NOTHROW(smootherF = rcp_dynamic_cast<FakeSmootherPrototype>(smoother,true));

          if (smootherF != Teuchos::null) {
            // output test: smoother parameters == prototype parameters
            RCP<FakeSmootherPrototype> smooProtoF = rcp_dynamic_cast<FakeSmootherPrototype>(smooProto,true);
            TEST_EQUALITY(smootherF->GetParam(), smooProtoF->GetParam());

            // output test: smoother is ready to be applied
            TEST_EQUALITY(smootherF->IsSetup(), true);

            // setup done only once.
            TEST_EQUALITY(smootherF->GetNumOfSetupCall(), 1);
            TEST_EQUALITY(smootherF->GetNumOfSetup(), 1);
          }
        }
      }
    }
  void EminPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
    Input(fineLevel, "A");

    static bool isAvailableP0          = false;
    static bool isAvailableConstraint0 = false;

    // Here is a tricky little piece of code
    // We don't want to request (aka call Input) when we reuse and P0 is available
    // However, we cannot run something simple like this:
    //   if (!coarseLevel.IsAvailable("P0", this))
    //     Input(coarseLevel, "P");
    // The reason is that it works fine during the request stage, but fails
    // in the release stage, since we _construct_ P0 during the Build process. Therefore,
    // we need to know whether we are in Request or Release mode.
    // NOTE: This is a very unusual situation; please try not to propagate the
    // mode check any further.

    if (coarseLevel.GetRequestMode() == Level::REQUEST) {
      isAvailableP0          = coarseLevel.IsAvailable("P0",          this);
      isAvailableConstraint0 = coarseLevel.IsAvailable("Constraint0", this);
    }

    if (isAvailableP0 == false)
      Input(coarseLevel, "P");

    if (isAvailableConstraint0 == false)
      Input(coarseLevel, "Constraint");
  }
  void CloneRepartitionInterface<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);
    currentLevel.print(GetOStream(Statistics0,0));
    // extract blocked operator A from current level
    Teuchos::RCP<Matrix> A = Get< Teuchos::RCP<Matrix> >     (currentLevel, "A");
    Teuchos::RCP<const Teuchos::Comm< int > > comm = A->getRowMap()->getComm();

    // the number of partitions is only used as a shortcut
    GO numPartitions = 0;
    if (currentLevel.IsAvailable("number of partitions")) {
      numPartitions = currentLevel.Get<GO>("number of partitions");
      GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

    }

    // ======================================================================================================
    // Construct decomposition vector
    // ======================================================================================================
    // extract decomposition vector (check for null before dereferencing)
    RCP<GOVector> decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

    if (decomposition.is_null()) {
      GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    ArrayRCP<const GO> decompEntries = decomposition->getData(0);

    // create new decomposition vector
    Teuchos::RCP<GOVector> ret = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), false);
    ArrayRCP<GO> retDecompEntries = ret->getDataNonConst(0);

    // block size of output vector
    LocalOrdinal blkSize = A->GetFixedBlockSize();

    // plausibility check!
    size_t inLocalLength  = decomposition->getLocalLength();
    size_t outLocalLength = A->getRowMap()->getNodeNumElements();

    size_t numLocalNodes = outLocalLength / blkSize;
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(outLocalLength % blkSize) != 0, MueLu::Exceptions::RuntimeError, "CloneRepartitionInterface: inconsistent number of local DOFs (" << outLocalLength << ") and block size (" << blkSize << ")");

    if (numLocalNodes > 0) {
      TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(inLocalLength  % numLocalNodes) != 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: inconsistent number of local DOFs (" << inLocalLength << ") and number of local nodes (" << numLocalNodes << ")");
      LocalOrdinal inBlkSize = Teuchos::as<LocalOrdinal>(inLocalLength / numLocalNodes);
      //TEUCHOS_TEST_FOR_EXCEPTION(blkSize != inBlkSize, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: input block size = " << inBlkSize << " output block size = " << blkSize << ". They should be the same.");

      for(LO i = 0; i<Teuchos::as<LO>(numLocalNodes); i++) {
        for(LO j = 0; j < blkSize; j++) {
          retDecompEntries[i*blkSize + j] = Teuchos::as<GO>(decompEntries[i*inBlkSize]);
        }
      }
    } // end if numLocalNodes > 0
    Set(currentLevel, "Partition", ret);
  } //Build()
 void RigidBodyModeFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::DeclareInput(Level &currentLevel) const {
   if (currentLevel.IsAvailable(nspName_, NoFactory::get()) == false && currentLevel.GetLevelID() == 0) {
     Input(currentLevel, "A");
     //Input(currentLevel,"Coordinates");
   }
   if (currentLevel.GetLevelID() !=0) {
     currentLevel.DeclareInput("Nullspace", GetFactory(nspName_).get(), this); /* ! "Nullspace" and nspName_ mismatch possible here */
   }
 }
  void TogglePFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator toggle", coarseLevel);
    std::ostringstream levelstr;
    levelstr << coarseLevel.GetLevelID();

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    TEUCHOS_TEST_FOR_EXCEPTION(nspFacts_.size() != prolongatorFacts_.size(), Exceptions::RuntimeError, "MueLu::TogglePFactory::Build: The number of provided prolongator factories and coarse nullspace factories must be identical.");
    TEUCHOS_TEST_FOR_EXCEPTION(nspFacts_.size() != 2, Exceptions::RuntimeError, "MueLu::TogglePFactory::Build: TogglePFactory needs two different transfer operator strategies for toggling."); // TODO adapt this/weaken this as soon as other toggling strategies are introduced.

    // decide which prolongator factory to use
    int nProlongatorFactory = 0; // default behavior: use first prolongator in list

    // extract user parameters
    const Teuchos::ParameterList & pL = GetParameterList();
    std::string mode = Teuchos::as<std::string>(pL.get<std::string>("toggle: mode"));
    int semicoarsen_levels = Teuchos::as<int>(pL.get<int>("semicoarsen: number of levels"));

    TEUCHOS_TEST_FOR_EXCEPTION(mode!="semicoarsen", Exceptions::RuntimeError, "MueLu::TogglePFactory::Build: The 'toggle: mode' parameter must be set to 'semicoarsen'. No other mode supported, yet.");

    LO NumZDir = -1;
    if(fineLevel.IsAvailable("NumZLayers", NoFactory::get())) {
      NumZDir = fineLevel.Get<LO>("NumZLayers", NoFactory::get()); //obtain info
      GetOStream(Runtime1) << "Number of layers for semicoarsening: " << NumZDir << std::endl;
    }

    // Decide which prolongator to use.
    if(fineLevel.GetLevelID() >= semicoarsen_levels || NumZDir == 1) {
      nProlongatorFactory = 1;
    } else {
      nProlongatorFactory = 0;
    }

    RCP<Matrix> P = Teuchos::null;
    RCP<MultiVector> coarseNullspace = Teuchos::null;

    // call Build for selected transfer operator
    GetOStream(Runtime0) << "TogglePFactory: call transfer factory: " << (prolongatorFacts_[nProlongatorFactory])->description() << std::endl;
    prolongatorFacts_[nProlongatorFactory]->CallBuild(coarseLevel);
    P = coarseLevel.Get< RCP<Matrix> >("P", (prolongatorFacts_[nProlongatorFactory]).get());
    coarseNullspace = coarseLevel.Get< RCP<MultiVector> >("Nullspace", (nspFacts_[nProlongatorFactory]).get());

    // Release dependencies of all prolongator and coarse level null spaces
    for(size_t t=0; t<nspFacts_.size(); ++t) {
      coarseLevel.Release(*(prolongatorFacts_[t]));
      coarseLevel.Release(*(nspFacts_[t]));
    }

    // store prolongator with this factory identification.
    Set(coarseLevel, "P", P);
    Set(coarseLevel, "Nullspace", coarseNullspace);

  } //Build()
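
A hypothetical configuration sketch for the toggling decision above. Parameter names are taken from the code; the setter and the registration interface for the two strategies are assumptions.

  // Sketch only: switch to the second transfer strategy once semicoarsening is exhausted.
  RCP<TogglePFactory> toggleFact = rcp(new TogglePFactory());
  Teuchos::ParameterList toggleParams;
  toggleParams.set("toggle: mode",                  std::string("semicoarsen")); // only supported mode so far
  toggleParams.set("semicoarsen: number of levels", 2); // use the second prolongator once the level id reaches 2 (or NumZLayers == 1)
  toggleFact->SetParameterList(toggleParams);
  // The two prolongator strategies and their coarse nullspace factories must be registered
  // beforehand (an AddProlongatorFactory/AddCoarseNullspaceFactory-style interface is assumed here).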
  void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
    static bool isAvailableCoords = false;

    if (coarseLevel.GetRequestMode() == Level::REQUEST)
      isAvailableCoords = coarseLevel.IsAvailable("Coordinates", this);

    if (isAvailableCoords == false) {
      Input(fineLevel, "Coordinates");
      Input(fineLevel, "Aggregates");
      Input(fineLevel, "CoarseMap");
    }
  }
Example no. 10
  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ThresholdAFilterFactory, Basic, Scalar, LocalOrdinal, GlobalOrdinal, Node)
  {
#   include <MueLu_UseShortNames.hpp>
    MUELU_TESTING_SET_OSTREAM;
    MUELU_TESTING_LIMIT_EPETRA_SCOPE(Scalar,GlobalOrdinal,Node);
    out << "version: " << MueLu::Version() << std::endl;

    Level aLevel;
    TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(aLevel);

    RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(20); //can be an empty operator

    RCP<ThresholdAFilterFactory> AfilterFactory0 = rcp(new ThresholdAFilterFactory("A",0.1)); // keep all
    RCP<ThresholdAFilterFactory> AfilterFactory1 = rcp(new ThresholdAFilterFactory("A",1.1)); // keep only diagonal
    RCP<ThresholdAFilterFactory> AfilterFactory2 = rcp(new ThresholdAFilterFactory("A",3));   // keep only diagonal

    aLevel.Set("A",A);

    aLevel.Request("A",AfilterFactory0.get());
    AfilterFactory0->Build(aLevel);
    TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory0.get()), true);
    RCP<Matrix> A0 = aLevel.Get< RCP<Matrix> >("A",AfilterFactory0.get());
    aLevel.Release("A",AfilterFactory0.get());
    TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory0.get()), false);
    TEST_EQUALITY(A0->getNodeNumEntries(), A->getNodeNumEntries());
    TEST_EQUALITY(A0->getGlobalNumEntries(), A->getGlobalNumEntries());

    aLevel.Request("A",AfilterFactory1.get());
    AfilterFactory1->Build(aLevel);
    TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory1.get()), true);
    RCP<Matrix> A1 = aLevel.Get< RCP<Matrix> >("A",AfilterFactory1.get());
    aLevel.Release("A",AfilterFactory1.get());
    TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory1.get()), false);
    TEST_EQUALITY(A1->getGlobalNumEntries(), A1->getGlobalNumRows());

    aLevel.Request("A",AfilterFactory2.get());
    AfilterFactory2->Build(aLevel);
    TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory2.get()), true);
    RCP<Matrix> A2 = aLevel.Get< RCP<Matrix> >("A",AfilterFactory2.get());
    aLevel.Release("A",AfilterFactory2.get());
    TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory2.get()), false);
    TEST_EQUALITY(A2->getGlobalNumEntries(), A2->getGlobalNumRows());


  }
  void SaPFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);

    // Add debugging information
    DeviceType::execution_space::print_configuration(GetOStream(Runtime1));

    typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

    // Get default tentative prolongator factory
    // Getting it this way ensures that the same factory instance is used for both SaPFactory_kokkos and NullspaceFactory.
    // -- Warning: Do not use directly initialPFact_. Use initialPFact instead everywhere!
    RCP<const FactoryBase> initialPFact = GetFactory("P");
    if (initialPFact == Teuchos::null) { initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }

    // Level Get
    RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
    RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

    if(restrictionMode_) {
      SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);

      A = Utilities_kokkos::Transpose(*A, true); // build transpose of A explicitly
    }

    //Build final prolongator
    RCP<Matrix> finalP; // output

    // Reuse pattern if available
    RCP<ParameterList> APparams = rcp(new ParameterList);
    if (coarseLevel.IsAvailable("AP reuse data", this)) {
      GetOStream(static_cast<MsgType>(Runtime0 | Test)) << "Reusing previous AP data" << std::endl;

      APparams = coarseLevel.Get< RCP<ParameterList> >("AP reuse data", this);

      if (APparams->isParameter("graph"))
        finalP = APparams->get< RCP<Matrix> >("graph");
    }

    const ParameterList& pL = GetParameterList();
    SC dampingFactor      = as<SC>(pL.get<double>("sa: damping factor"));
    LO maxEigenIterations = as<LO>(pL.get<int>("sa: eigenvalue estimate num iterations"));
    bool estimateMaxEigen = pL.get<bool>("sa: calculate eigenvalue estimate");
    if (dampingFactor != Teuchos::ScalarTraits<SC>::zero()) {

      SC lambdaMax;
      {
        SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);
        lambdaMax = A->GetMaxEigenvalueEstimate();
        if (lambdaMax == -Teuchos::ScalarTraits<SC>::one() || estimateMaxEigen) {
          GetOStream(Statistics1) << "Calculating max eigenvalue estimate now (max iters = "<< maxEigenIterations << ")" << std::endl;
          Magnitude stopTol = 1e-4;
          lambdaMax = Utilities_kokkos::PowerMethod(*A, true, maxEigenIterations, stopTol);
          A->SetMaxEigenvalueEstimate(lambdaMax);
        } else {
          GetOStream(Statistics1) << "Using cached max eigenvalue estimate" << std::endl;
        }
        GetOStream(Statistics0) << "Prolongator damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
      }

      {
        SubFactoryMonitor m2(*this, "Fused (I-omega*D^{-1} A)*Ptent", coarseLevel);
        RCP<Vector> invDiag = Utilities_kokkos::GetMatrixDiagonalInverse(*A);

        SC omega = dampingFactor / lambdaMax;

        // finalP = (I - \omega D^{-1} A) Ptent
        finalP = Xpetra::IteratorOps<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Jacobi(omega, *invDiag, *A, *Ptent, finalP, GetOStream(Statistics2), std::string("MueLu::SaP-") + toString(coarseLevel.GetLevelID()), APparams);
      }

    } else {
      finalP = Ptent;
    }

    // Level Set
    if (!restrictionMode_) {
      // prolongation factory is in prolongation mode
      Set(coarseLevel, "P", finalP);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        finalP->CreateView("stridedMaps", Ptent);

    } else {
      // prolongation factory is in restriction mode
      RCP<Matrix> R = Utilities_kokkos::Transpose(*finalP, true);
      Set(coarseLevel, "R", R);

      // NOTE: EXPERIMENTAL
      if (Ptent->IsView("stridedMaps"))
        R->CreateView("stridedMaps", Ptent, true);
    }

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      params->set("printCommInfo",          true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), params);
    }

  } //Build()
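
For reference, the damping parameters used above can be summarized in a small sketch. Parameter names are taken from the code; the suggested values are only the commonly used defaults, not mandated by this factory.

  // Sketch only: finalP = (I - omega D^{-1} A) Ptent with omega = dampingFactor / lambdaMax.
  Teuchos::ParameterList saParams;
  saParams.set("sa: damping factor",                     1.33);  // 4/3 is the usual choice; 0.0 skips smoothing (finalP = Ptent)
  saParams.set("sa: eigenvalue estimate num iterations", 10);    // power-method iterations for estimating lambdaMax
  saParams.set("sa: calculate eigenvalue estimate",      false); // true forces recomputation even if a cached estimate exists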
Example no. 12
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &fineLevel, Level &coarseLevel) const { // FIXME make fineLevel const
    {
      FactoryMonitor m(*this, "Computing Ac", coarseLevel);

      // Set "Keeps" from params
      const Teuchos::ParameterList& pL = GetParameterList();
      if (pL.isParameter("Keep AP Pattern")  && pL.get<bool>("Keep AP Pattern"))
        coarseLevel.Keep("AP Pattern",  this);
      if (pL.isParameter("Keep RAP Pattern") && pL.get<bool>("Keep RAP Pattern"))
        coarseLevel.Keep("RAP Pattern", this);

      //
      // Inputs: A, P
      //

      RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
      RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P");

      //
      // Build Ac = RAP
      //

      RCP<Matrix> AP;

      // Reuse pattern if available (multiple solve)
      if (coarseLevel.IsAvailable("AP Pattern", this)){
        GetOStream(Runtime0, 0) << "Ac: Using previous AP pattern"<<std::endl;
        AP = Get< RCP<Matrix> >(coarseLevel, "AP Pattern");
      }

      {
        SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);
        AP = Utils::Multiply(*A, false, *P, false, AP);
        Set(coarseLevel, "AP Pattern", AP);
      }

      bool doOptimizedStorage = !checkAc_; // Storage optimization option: if we do not modify the matrix later (by inserting local values), allow optimized storage.
                                           // This is necessary for the new, faster Epetra MM kernels.

      RCP<Matrix> Ac;

      // Reuse coarse matrix memory if available (multiple solve)
      if (coarseLevel.IsAvailable("RAP Pattern", this)) {
        GetOStream(Runtime0, 0) << "Ac: Using previous RAP pattern" << std::endl;
        Ac = Get< RCP<Matrix> >(coarseLevel, "RAP Pattern");
      }

      if (implicitTranspose_) {
        SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);

        Ac = Utils::Multiply(*P, true, *AP, false, Ac, true, doOptimizedStorage);

      } else {

        SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);

        RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");
        Ac = Utils::Multiply(*R, false, *AP, false, Ac, true, doOptimizedStorage);

      }

      if (checkAc_)
        CheckMainDiagonal(Ac);

      RCP<ParameterList> params = rcp(new ParameterList());
      params->set("printLoadBalancingInfo", true);
      GetOStream(Statistics0, 0) << Utils::PrintMatrixInfo(*Ac, "Ac", params);

      Set(coarseLevel, "A",           Ac);
      Set(coarseLevel, "RAP Pattern", Ac);
    }

    if (transferFacts_.begin() != transferFacts_.end()) {
      SubFactoryMonitor m(*this, "Projections", coarseLevel);

      // call Build of all user-given transfer factories
      for (std::vector<RCP<const FactoryBase> >::const_iterator it = transferFacts_.begin(); it != transferFacts_.end(); ++it) {
        GetOStream(Runtime0, 0) << "Ac: call transfer factory " << (*it).get() << ": " << (*it)->description() << std::endl;
        (*it)->CallBuild(coarseLevel);
      }
    }

  }
  void CloneRepartitionInterface<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);
    currentLevel.print(GetOStream(Statistics0,0));
    // extract blocked operator A from current level
    Teuchos::RCP<Matrix> A = Get< Teuchos::RCP<Matrix> >     (currentLevel, "A");
    Teuchos::RCP<const Teuchos::Comm< int > > comm = A->getRowMap()->getComm();

    // the number of partitions is only used as a shortcut
    GO numPartitions = 0;
    if (currentLevel.IsAvailable("number of partitions")) {
      numPartitions = currentLevel.Get<GO>("number of partitions");
      GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

    }

    // ======================================================================================================
    // Construct decomposition vector
    // ======================================================================================================
    // extract decomposition vector (check for null before dereferencing)
    RCP<GOVector> decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

    if (decomposition.is_null()) {
      GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    ArrayRCP<const GO> decompEntries = decomposition->getData(0);

    // create new decomposition vector
    Teuchos::RCP<GOVector> ret = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), false);
    ArrayRCP<GO> retDecompEntries = ret->getDataNonConst(0);

    // block size of output vector
    LocalOrdinal blkSize = 1;

    // check for blocking/striding information
    if(A->IsView("stridedMaps") &&
       Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap("stridedMaps")) != Teuchos::null) {
      Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps" are always non-overlapping (they correspond to range and domain maps!)
      RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap());
      TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null,Exceptions::BadCast,"MueLu::CloneRepartitionInterface::Build: cast to strided row map failed.");
      LocalOrdinal stridedBlock = strMap->getStridedBlockId();
      if (stridedBlock == -1)
        blkSize = strMap->getFixedBlockSize();
      else {
        std::vector<size_t> strInfo = strMap->getStridingData();
        blkSize = strInfo[stridedBlock];
      }
      oldView = A->SwitchToView(oldView);
      GetOStream(Statistics1) << "CloneRepartitionInterface::Build():" << " found blockdim=" << blkSize << " from strided maps."<< std::endl;
    } else {
      GetOStream(Statistics1) << "CloneRepartitionInterface::Build(): no striding information available. Use blockdim=" << blkSize << " (DofsPerNode)." << std::endl;
      blkSize = A->GetFixedBlockSize();
    }

    // plausibility check!
    size_t inLocalLength  = decomposition->getLocalLength();
    size_t outLocalLength = A->getRowMap()->getNodeNumElements();

    // only for non-strided maps
    size_t numLocalNodes = outLocalLength / blkSize;
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(outLocalLength % blkSize) != 0, MueLu::Exceptions::RuntimeError, "CloneRepartitionInterface: inconsistent number of local DOFs (" << outLocalLength << ") and block size (" << blkSize << ")");

    if (numLocalNodes > 0) {
      TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(inLocalLength  % numLocalNodes) != 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: inconsistent number of local DOFs (" << inLocalLength << ") and number of local nodes (" << numLocalNodes << ")");
      LocalOrdinal inBlkSize = Teuchos::as<LocalOrdinal>(inLocalLength / numLocalNodes);
      //TEUCHOS_TEST_FOR_EXCEPTION(blkSize != inBlkSize, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: input block size = " << inBlkSize << " output block size = " << blkSize << ". They should be the same.");

      for(LO i = 0; i<Teuchos::as<LO>(numLocalNodes); i++) {
        for(LO j = 0; j < blkSize; j++) {
          retDecompEntries[i*blkSize + j] = Teuchos::as<GO>(decompEntries[i*inBlkSize]);
        }
      }
    } // end if numLocalNodes > 0
    Set(currentLevel, "Partition", ret);
  } //Build()
Example no. 14
 bool IsAvailable(Level & level, const std::string & varName) const {
     return level.IsAvailable(varName, GetFactory(varName).get());
 }
  void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
    {
      FactoryMonitor m(*this, "Computing Ac", coarseLevel);
      std::ostringstream levelstr;
      levelstr << coarseLevel.GetLevelID();

      TEUCHOS_TEST_FOR_EXCEPTION(hasDeclaredInput_==false, Exceptions::RuntimeError,
                                 "MueLu::RAPFactory::Build(): CallDeclareInput has not been called before Build!");

      // Set "Keeps" from params
      const Teuchos::ParameterList& pL = GetParameterList();
      if (pL.get<bool>("Keep AP Pattern"))
        coarseLevel.Keep("AP Pattern",  this);
      if (pL.get<bool>("Keep RAP Pattern"))
        coarseLevel.Keep("RAP Pattern", this);

      RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
      RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P"), AP, Ac;

      // Reuse pattern if available (multiple solve)
      if (coarseLevel.IsAvailable("AP Pattern", this)) {
        GetOStream(Runtime0) << "Ac: Using previous AP pattern" << std::endl;

        AP = Get< RCP<Matrix> >(coarseLevel, "AP Pattern");
      }

      {
        SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);

        AP = Utils::Multiply(*A, false, *P, false, AP, GetOStream(Statistics2),true,true,std::string("MueLu::A*P-")+levelstr.str());
      }
      if (pL.get<bool>("Keep AP Pattern"))
        Set(coarseLevel, "AP Pattern", AP);

      // Reuse coarse matrix memory if available (multiple solve)
      if (coarseLevel.IsAvailable("RAP Pattern", this)) {
        GetOStream(Runtime0) << "Ac: Using previous RAP pattern" << std::endl;

        Ac = Get< RCP<Matrix> >(coarseLevel, "RAP Pattern");
        // Some eigenvalue may have been cached with the matrix in the previous run.
        // As the matrix values will be updated, we need to reset the eigenvalue.
        Ac->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
      }

      // If we do not modify matrix later, allow optimization of storage.
      // This is necessary for new faster Epetra MM kernels.
      bool doOptimizeStorage = !pL.get<bool>("RepairMainDiagonal");

      const bool doTranspose    = true;
      const bool doFillComplete = true;
      if (pL.get<bool>("transpose: use implicit") == true) {
        SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);
        Ac = Utils::Multiply(*P,  doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-implicit-")+levelstr.str());

      } else {
        RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");

        SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);
        Ac = Utils::Multiply(*R, !doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-explicit-")+levelstr.str());
      }

      CheckRepairMainDiagonal(Ac);

      if (IsPrint(Statistics1)) {
        RCP<ParameterList> params = rcp(new ParameterList());
        params->set("printLoadBalancingInfo", true);
        params->set("printCommInfo",          true);
        GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ac, "Ac", params);
      }

      Set(coarseLevel, "A",           Ac);
      if (pL.get<bool>("Keep RAP Pattern"))
        Set(coarseLevel, "RAP Pattern", Ac);
    }

    if (transferFacts_.begin() != transferFacts_.end()) {
      SubFactoryMonitor m(*this, "Projections", coarseLevel);

      // call Build of all user-given transfer factories
      for (std::vector<RCP<const FactoryBase> >::const_iterator it = transferFacts_.begin(); it != transferFacts_.end(); ++it) {
        RCP<const FactoryBase> fac = *it;
        GetOStream(Runtime0) << "RAPFactory: call transfer factory: " << fac->description() << std::endl;
        fac->CallBuild(coarseLevel);
        // Coordinates transfer is marginally different from all other operations
        // because it is *optional*, and not required. For instance, we may need
        // coordinates only on level 4 if we start repartitioning from that level,
        // but we don't need them on level 1,2,3. As our current Hierarchy setup
        // assumes propagation of dependencies only through three levels, this
        // means that we need to rely on other methods to propagate optional data.
        //
        // The method currently used is through RAP transfer factories, which are
        // simply factories which are called at the end of RAP with a single goal:
        // transfer some fine data to coarser level. Because these factories are
        // kind of outside of the mainline factories, they behave differently. In
        // particular, we call their Build method explicitly, rather than through
        // Get calls. This difference is significant, as the Get call is smart
        // enough to know when to release all factory dependencies, and Build is
        // dumb. This led to the following CoordinatesTransferFactory sequence:
        // 1. Request level 0
        // 2. Request level 1
        // 3. Request level 0
        // 4. Release level 0
        // 5. Release level 1
        //
        // The problem is missing "6. Release level 0". Because it was missing,
        // we had outstanding request on "Coordinates", "Aggregates" and
        // "CoarseMap" on level 0.
        //
        // This was fixed by explicitly calling Release on transfer factories in
        // RAPFactory. I am still unsure how exactly it works, but now we have
        // clear data requests for all levels.
        coarseLevel.Release(*fac);
      }
    }

  }
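
A hypothetical configuration sketch for the reuse and transfer-factory machinery above. Parameter names are taken from the code; the AddTransferFactory call and the coordTransferFact handle are assumptions shown only in comment form.

  // Sketch only: enable pattern reuse and the implicit transpose, then register optional-data transfers.
  RCP<RAPFactory> rapFact = rcp(new RAPFactory());
  Teuchos::ParameterList rapParams;
  rapParams.set("Keep AP Pattern",         true); // cache A*P so a later rebuild can reuse its pattern
  rapParams.set("Keep RAP Pattern",        true); // cache Ac so its memory/pattern can be reused
  rapParams.set("transpose: use implicit", true); // compute P^T (AP) instead of requesting an explicit R
  rapFact->SetParameterList(rapParams);
  // Optional data (e.g. coordinates) is propagated by the transfer factories whose Build()
  // is called (and released) at the end of RAPFactory::Build() above:
  // rapFact->AddTransferFactory(coordTransferFact); // coordTransferFact is hypothetical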
  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const Teuchos::ParameterList & pL = GetParameterList();
    // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
    // TODO (JG): I don't really know if we want to do this.
    const int    startLevel          = pL.get<int>   ("repartition: start level");
    const LO     minRowsPerProcessor = pL.get<LO>    ("repartition: min rows per proc");
    const double nonzeroImbalance    = pL.get<double>("repartition: max imbalance");
    const bool   remapPartitions     = pL.get<bool>  ("repartition: remap parts");

    // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
    RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");

    // ======================================================================================================
    // Determine whether partitioning is needed
    // ======================================================================================================
    // NOTE: most tests include some global communication, which is why we currently only do tests until we make
    // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
    // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.

    // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
    if (currentLevel.GetLevelID() < startLevel) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" <<
          "\n  current level = " << Teuchos::toString(currentLevel.GetLevelID()) <<
          ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    RCP<const Map> rowMap = A->getRowMap();

    // NOTE: Teuchos::MPIComm::duplicate() calls MPI_Bcast inside, so this is
    // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it
    // does not matter.
    RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
    RCP<const Teuchos::Comm<int> > comm     = origComm->duplicate();

    // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
    // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
    // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
    {
      int numActiveProcesses = 0;
      MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses);

      if (numActiveProcesses == 1) {
        GetOStream(Statistics0) << "Repartitioning?  NO:" <<
            "\n  # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    bool test3 = false, test4 = false;
    std::string msg3, msg4;

    // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
    // NOTE: Test2 ensures that repartitioning is not done when there is only one processor (it may or may not satisfy Test3)
    if (minRowsPerProcessor > 0) {
      LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
      LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0;
      MueLu_sumAll(comm, haveFewRows, numWithFewRows);
      MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

      // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
      // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
      // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
      if (numWithFewRows > 0)
        test3 = true;

      msg3 = "\n  min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor);
    }

    // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
    if (!test3) {
      GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
      MueLu_maxAll(comm, numMyNnz,                           maxNnz);
      MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
      double imbalance = Teuchos::as<double>(maxNnz)/minNnz;

      if (imbalance > nonzeroImbalance)
        test4 = true;

      msg4 = "\n  nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
    }

    if (!test3 && !test4) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" << msg3 + msg4 << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

    GO                     indexBase = rowMap->getIndexBase();
    Xpetra::UnderlyingLib  lib       = rowMap->lib();
    int myRank   = comm->getRank();
    int numProcs = comm->getSize();

    RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
    TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

    // ======================================================================================================
    // Calculate number of partitions
    // ======================================================================================================
    // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
    // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
    GO numPartitions;
    if (currentLevel.IsAvailable("number of partitions")) {
      numPartitions = currentLevel.Get<GO>("number of partitions");
      GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

    } else {
      if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) {
        // System is too small, migrate it to a single processor
        numPartitions = 1;

      } else {
        // Make sure that each processor has approximately minRowsPerProcessor
        numPartitions = A->getGlobalNumRows() / minRowsPerProcessor;
      }
      numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs));

      currentLevel.Set("number of partitions", numPartitions, NoFactory::get());
    }
    GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl;

    // ======================================================================================================
    // Construct decomposition vector
    // ======================================================================================================
    RCP<GOVector> decomposition;
    if (numPartitions == 1) {
      // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
      // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
      // in Zoltan[12]Interface).
      // TODO: We can probably skip more work in this case (like building all extra data structures)
      GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl;
      decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true);

    } else {
      decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

      if (decomposition.is_null()) {
        GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    // ======================================================================================================
    // Remap if necessary
    // ======================================================================================================
    // From a user perspective, we want the user not to care about remapping, treating it as a pure performance feature.
    // There are two problems, however.
    // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
    //     This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
    //     each partition is the same.
    // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
    //     Let us consider the following example. Let's assume that when we don't do remapping, processor 0 would have
    //     GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and
    //     processor 1 {0,1,2}. Now, when we run repartitioning algorithm on the next level (say Zoltan1 RCB), it may
    //     be dependent on whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
    //     algorithm can resolve these differently. For instance, running
    //         mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
    //     with
    //         <ParameterList name="MueLu">
    //           <Parameter name="coarse: max size"                type="int"      value="1"/>
    //           <Parameter name="repartition: enable"             type="bool"     value="true"/>
    //           <Parameter name="repartition: min rows per proc"  type="int"      value="2"/>
    //           <ParameterList name="level 1">
    //             <Parameter name="repartition: remap parts"      type="bool"     value="false/true"/>
    //           </ParameterList>
    //         </ParameterList>
    //     produces different repartitioning for level 2.
    //     This different repartitioning may then escalate into different aggregation for the next level.
    //
    // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
    // Fixing (2) is more complicated.
    // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
    // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
    // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
    // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
    // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
    // This rank permutation requires help from Epetra/Tpetra, both of which have no such API in place.
    // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in
    // MPI_COMM_WORLD, which may lead to issues for logging.
    if (remapPartitions) {
      SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);

      DeterminePartitionPlacement(*A, *decomposition, numPartitions);
    }

    // ======================================================================================================
    // Construct importer
    // ======================================================================================================
    // At this point, the following is true:
    //  * Each processors owns 0 or 1 partitions
    //  * If a processor owns a partition, that partition number is equal to the processor rank
    //  * The decomposition vector contains the partitions ids that the corresponding GID belongs to

    ArrayRCP<const GO> decompEntries;
    if (decomposition->getLocalLength() > 0)
      decompEntries = decomposition->getData(0);

#ifdef HAVE_MUELU_DEBUG
    // Test range of partition ids
    int incorrectRank = -1;
    for (int i = 0; i < decompEntries.size(); i++)
      if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
        incorrectRank = myRank;
        break;
      }

    int incorrectGlobalRank = -1;
    MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
    TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number that is out of range");
#endif

    Array<GO> myGIDs;
    myGIDs.reserve(decomposition->getLocalLength());

    // Step 0: Construct mapping
    //    part number -> GIDs I own which belong to this part
    // NOTE: my own part GIDs are not part of the map
    typedef std::map<GO, Array<GO> > map_type;
    map_type sendMap;
    for (LO i = 0; i < decompEntries.size(); i++) {
      GO id  = decompEntries[i];
      GO GID = rowMap->getGlobalElement(i);

      if (id == myRank)
        myGIDs     .push_back(GID);
      else
        sendMap[id].push_back(GID);
    }
    decompEntries = Teuchos::null;

    if (IsPrint(Statistics2)) {
      GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
      MueLu_sumAll(comm,numLocalKept, numGlobalKept);
      GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
    }

    int numSend = sendMap.size(), numRecv;

    // Arrayify map keys
    Array<GO> myParts(numSend), myPart(1);
    int cnt = 0;
    myPart[0] = myRank;
    for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
      myParts[cnt++] = it->first;

    // Step 1: Find out how many processors send me data
    // partsIndexBase starts from zero, as the processors ids start from zero
    GO partsIndexBase = 0;
    RCP<Map>    partsIHave  = MapFactory   ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
    RCP<Map>    partsIOwn   = MapFactory   ::Build(lib,                                                 numProcs,  myPart(), partsIndexBase, comm);
    RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);

    RCP<GOVector> partsISend    = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
    RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
    if (numSend) {
      ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
      for (int i = 0; i < numSend; i++)
        partsISendData[i] = 1;
    }
    (numPartsIRecv->getDataNonConst(0))[0] = 0;

    numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
    numRecv = (numPartsIRecv->getData(0))[0];

    // Step 2: Get my GIDs from everybody else
    MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
    int msgTag = 12345;  // TODO: use Comm::dup for all internal messaging

    // Post sends
    Array<MPI_Request> sendReqs(numSend);
    cnt = 0;
    for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
      MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);

    map_type recvMap;
    size_t totalGIDs = myGIDs.size();
    for (int i = 0; i < numRecv; i++) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);

      // Get rank and number of elements from status
      int fromRank = status.MPI_SOURCE, count;
      MPI_Get_count(&status, MpiType, &count);

      recvMap[fromRank].resize(count);
      MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);

      totalGIDs += count;
    }

    // Do waits on send requests
    if (numSend) {
      Array<MPI_Status> sendStatuses(numSend);
      MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
    }

    // Merge GIDs
    myGIDs.reserve(totalGIDs);
    for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
      int offset = myGIDs.size(), len = it->second.size();
      if (len) {
        myGIDs.resize(offset + len);
        memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
      }
    }
    // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks
    // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
    std::sort(myGIDs.begin(), myGIDs.end());

    // Step 3: Construct importer
    RCP<Map>          newRowMap      = MapFactory   ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
    RCP<const Import> rowMapImporter;
    {
      SubFactoryMonitor m1(*this, "Import construction", currentLevel);
      rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
    }

    Set(currentLevel, "Importer", rowMapImporter);

    // ======================================================================================================
    // Print some data
    // ======================================================================================================
    if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
      // Print the grid of processors
      GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;

      char amActive = (myGIDs.size() ? 1 : 0);
      std::vector<char> areActive(numProcs, 0);
      MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);

      int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
      for (int proc = 0; proc < numProcs; proc += rowWidth) {
        for (int j = 0; j < rowWidth; j++)
          if (proc + j < numProcs)
            GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
          else
            GetOStream(Statistics2) << " ";

        GetOStream(Statistics2) << "      " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
      }
    }

  } // Build
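
A hypothetical configuration sketch covering the decision tests above. Parameter names are taken from the code; the integer values assume the usual LO = int instantiation and are illustrative only.

  // Sketch only: when RepartitionFactory decides to rebalance.
  Teuchos::ParameterList repartParams;
  repartParams.set("repartition: start level",                  2);     // Test 1: no repartitioning before this level
  repartParams.set("repartition: min rows per proc",            800);   // Test 3: trigger if any active rank falls below this
  repartParams.set("repartition: max imbalance",                1.2);   // Test 4: trigger if the max/min nonzero ratio exceeds this
  repartParams.set("repartition: remap parts",                  true);  // remap partition ids onto owning ranks (see comments above)
  repartParams.set("repartition: print partition distribution", false); // optional '+'/'.' ownership grid at Statistics2 verbosity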
  void SmootherFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildSmoother(Level& currentLevel, PreOrPost const preOrPost) const {
    // SmootherFactory is quite tricky because one of the smoother prototypes may be null.
    // The challenge is that we have no way of knowing how the user uses this factory. For instance, let's say
    // the user wants to use prototype s1 as a presmoother and s2 as a postsmoother. He could either:
    //   (a) create SmootherFactory(s1, s2), or
    //   (b) create SmootherFactory(s1, null) and SmootherFactory(null, s2)
    // It may also happen that somewhere somebody set presmoother factory = postsmoother factory = (a).
    // How do you do DeclareInput in this case? It could easily introduce a bug if a user does not check
    // whether presmoother == postsmoother. Buggy code could look like this:
    //   RCP<SmootherFactory> s = rcp(new SmootherFactory(s1,s2));
    //   level.Request("PreSmoother",  s.get());
    //   level.Request("PostSmoother", s.get());
    //   Get<RCP<SmootherBase> > pre  = Get<RCP<SmootherBase> >("PreSmoother",  s.get());
    //   Get<RCP<SmootherBase> > post = Get<RCP<SmootherBase> >("PostSmoother", s.get());
    // This code would call DeclareInput in request mode twice, but as the Build method generates both pre- and post-
    // smoothers, it would call DeclareInput in release mode only once, leaving outstanding requests.
    // This code has another problem if s2 = Teuchos::null. In that case, despite the request for PostSmoother, the factory
    // would not generate one, and the second Get would throw. The real issue here is that, given a Factory pointer,
    // there is no way to be sure that this factory would generate either "PreSmoother" or "PostSmoother", unless you are
    // able to cast it to SmootherFactory, call GetPrototypes, and check whether any of those is Teuchos::null.

    const Teuchos::ParameterList& pL = GetParameterList();

    RCP<SmootherPrototype> preSmoother, postSmoother;
    ParameterList preSmootherParams, postSmootherParams;

    if ((preOrPost & PRE) && !preSmootherPrototype_.is_null()) {

      if (currentLevel.IsAvailable("PreSmoother data", this))
        preSmoother = currentLevel.Get<RCP<SmootherPrototype> >("PreSmoother data", this);
      else
        preSmoother = preSmootherPrototype_->Copy();

      int oldRank = -1;
      if (!currentLevel.GetComm().is_null())
        oldRank = preSmoother->SetProcRankVerbose(currentLevel.GetComm()->getRank());

      preSmoother->Setup(currentLevel);
      preSmootherParams = preSmoother->GetParameterList();

      if (oldRank != -1)
        preSmoother->SetProcRankVerbose(oldRank);

      currentLevel.Set<RCP<SmootherBase> >("PreSmoother", preSmoother, this);

      if (pL.get<bool>("keep smoother data"))
        Set(currentLevel, "PreSmoother data", preSmoother);
    }

    if ((preOrPost & POST) && !postSmootherPrototype_.is_null()) {
      if (preOrPost == BOTH && preSmootherPrototype_ == postSmootherPrototype_) {
        // Simple reuse
        // Same prototypes for pre- and post-smoothers mean that we need to call Setup only once
        postSmoother = preSmoother;

        // A more complex reuse case would require an implementation of CopyParameters() and smoothers smart
        // enough to know when parameters affect the setup phase:
        //
        //   } else if (preOrPost == BOTH &&
        //              preSmootherPrototype_ != Teuchos::null &&
        //              preSmootherPrototype_->GetType() == postSmootherPrototype_->GetType()) {
        //     // YES: post-smoother == pre-smoother
        //     // => copy the pre-smoother to avoid the setup phase of the post-smoother.
        //     postSmoother = preSmoother->Copy();
        //     // If the post-smoother parameters differ from the pre-smoother's, the parameters stored in the
        //     // post-smoother prototype are copied into the new post-smoother object.
        //     postSmoother->CopyParameters(postSmootherPrototype_);
        //     // If parameters do not influence the Setup phase (as for Jacobi, Chebyshev, ...), the
        //     // post-smoother is already set up and nothing more needs to be done. For ILU, the smoother
        //     // parameters are in fact parameters of the Setup phase; CopyParameters resets the smoother
        //     // (only if the parameters differ) and Setup() must be called again.
        //     postSmoother->Setup(currentLevel);
        //   }
        //
        // TODO: if CopyParameters does not exist, do the setup twice.

      } else {

        if (currentLevel.IsAvailable("PostSmoother data", this)) {
          postSmoother = currentLevel.Get<RCP<SmootherPrototype> >("PostSmoother data", this);
        } else {
          // No reuse:
          //  - either we only do postsmoothing without any presmoothing, or
          //  - our postsmoother is different from the presmoother
          postSmoother = postSmootherPrototype_->Copy();
        }

        int oldRank = -1;
        if (!currentLevel.GetComm().is_null())
          oldRank = postSmoother->SetProcRankVerbose(currentLevel.GetComm()->getRank());

        postSmoother->Setup(currentLevel);
        postSmootherParams = postSmoother->GetParameterList();

        if (oldRank != -1)
          postSmoother->SetProcRankVerbose(oldRank);
      }

      currentLevel.Set<RCP<SmootherBase> >("PostSmoother", postSmoother, this);

      if (pL.get<bool>("keep smoother data"))
        Set(currentLevel, "PostSmoother data", preSmoother);
    }

    ParameterList& paramList = const_cast<ParameterList&>(this->GetParameterList());
    if (postSmoother == preSmoother && !preSmoother.is_null()) {
      paramList.sublist("smoother", false) = preSmoother->GetParameterList();

    } else {
      if (!preSmoother.is_null())
        paramList.sublist("presmoother", false) = preSmootherParams;

      if (!postSmoother.is_null())
        paramList.sublist("postsmoother", false) = postSmootherParams;
    }

  } // BuildSmoother()
  void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level & fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    typedef Xpetra::MultiVector<double,LO,GO,NO> xdMV;

    GetOStream(Runtime0) << "Transferring coordinates" << std::endl;

    if (coarseLevel.IsAvailable("Coordinates", this)) {
      GetOStream(Runtime0) << "Reusing coordinates" << std::endl;
      return;
    }

    RCP<Aggregates>     aggregates = Get< RCP<Aggregates> > (fineLevel, "Aggregates");
    RCP<xdMV>           fineCoords = Get< RCP<xdMV> >       (fineLevel, "Coordinates");
    RCP<const Map>      coarseMap  = Get< RCP<const Map> >  (fineLevel, "CoarseMap");

    // coarseMap is used to set up the domain map of the tentative P, and therefore the row map of Ac.
    // Hence, if we amalgamate coarseMap, logical nodes in the coordinates vector correspond to
    // logical blocks in the matrix.

    ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
    LO                  blkSize      = 1;
    RCP<const StridedMap> stridedCoarseMap = rcp_dynamic_cast<const StridedMap>(coarseMap);
    if (stridedCoarseMap != Teuchos::null)
      blkSize = stridedCoarseMap->getFixedBlockSize();

    GO                  indexBase    = coarseMap->getIndexBase();
    size_t              numElements  = elementAList.size() / blkSize;
    Array<GO>           elementList(numElements);

    // Amalgamate the map
    for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
      elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
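    // Worked example (hypothetical data): with blkSize = 2, indexBase = 0, and block GIDs {10, 11, 14, 15}
    // owned by this rank, the loop above keeps every blkSize-th entry and yields node GIDs {5, 7}.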

    RCP<const Map>   uniqueMap      = fineCoords->getMap();
    RCP<const Map>   coarseCoordMap = MapFactory        ::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), elementList, indexBase, coarseMap->getComm());
    RCP<xdMV> coarseCoords   = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(coarseCoordMap, fineCoords->getNumVectors());

    // Create overlapped fine coordinates to reduce global communication
    RCP<xdMV> ghostedCoords = fineCoords;
    if (aggregates->AggregatesCrossProcessors()) {
      RCP<const Map>    nonUniqueMap = aggregates->GetMap();
      RCP<const Import> importer     = ImportFactory::Build(uniqueMap, nonUniqueMap);

      ghostedCoords = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(nonUniqueMap, fineCoords->getNumVectors());
      ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
    }

    // Get some info about aggregates
    int                         myPID        = uniqueMap->getComm()->getRank();
    LO                          numAggs      = aggregates->GetNumAggregates();
    ArrayRCP<LO>                aggSizes     = aggregates->ComputeAggregateSizes(true,true);
    const ArrayRCP<const LO>    vertex2AggID = aggregates->GetVertex2AggId()->getData(0);
    const ArrayRCP<const LO>    procWinner   = aggregates->GetProcWinner()->getData(0);

    // Fill in coarse coordinates
    for (size_t j = 0; j < fineCoords->getNumVectors(); j++) {
      ArrayRCP<const double> fineCoordsData = ghostedCoords->getData(j);
      ArrayRCP<double>     coarseCoordsData = coarseCoords->getDataNonConst(j);

      for (LO lnode = 0; lnode < vertex2AggID.size(); lnode++)
        if (procWinner[lnode] == myPID)
          coarseCoordsData[vertex2AggID[lnode]] += fineCoordsData[lnode];

      for (LO agg = 0; agg < numAggs; agg++)
        coarseCoordsData[agg] /= aggSizes[agg];
    }
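    // Each coarse coordinate is the sum of the fine coordinates assigned to an aggregate divided by the
    // aggregate size, i.e. the aggregate centroid. For example, an aggregate whose (hypothetical) fine
    // x-coordinates are {0.0, 1.0, 2.0} gets the coarse x-coordinate 1.0.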

    Set<RCP<xdMV> >(coarseLevel, "Coordinates", coarseCoords);

    const ParameterList& pL = GetParameterList();
    int writeStart = pL.get<int>("write start"), writeEnd = pL.get<int>("write end");
    if (writeStart == 0 && fineLevel.GetLevelID() == 0 && writeStart <= writeEnd) {
      std::ostringstream buf;
      buf << fineLevel.GetLevelID();
      std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m";
      Xpetra::IO<double,LO,GO,NO>::Write(fileName,*fineCoords);
    }
    if (writeStart <= coarseLevel.GetLevelID() && coarseLevel.GetLevelID() <= writeEnd) {
      std::ostringstream buf;
      buf << coarseLevel.GetLevelID();
      std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m";
      Xpetra::IO<double,LO,GO,NO>::Write(fileName,*coarseCoords);
    }
  }
  void EminPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level& fineLevel, Level& coarseLevel) const {
    FactoryMonitor m(*this, "Prolongator minimization", coarseLevel);

    const ParameterList & pL = GetParameterList();

    // Get the matrix
    RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel, "A");

    // Get/make initial guess
    RCP<Matrix> P0;
    int         Niterations;
    if (coarseLevel.IsAvailable("P0", this)) {
      // Reuse data
      P0          = coarseLevel.Get<RCP<Matrix> >("P0", this);
      Niterations = pL.get<int>("Reuse Niterations");
      GetOStream(Runtime0, 0) << "Reusing P0" << std::endl;

    } else {
      // Construct data
      P0          = Get< RCP<Matrix> >(coarseLevel, "P");
      Niterations = pL.get<int>("Niterations");
    }
    // NOTE: the main assumption here is that P0 satisfies both constraints:
    //   - nonzero pattern
    //   - nullspace preservation
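    // Background (sketch of the usual energy-minimization formulation, not quoted from this code):
    // the solver roughly computes  min_P trace(P^T A P)  subject to a fixed nonzero pattern and
    // nullspace preservation; the CG iteration below uses P0 as its initial iterate, which is why P0
    // must already satisfy both constraints.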

    // Get/make constraint operator
    RCP<Constraint> X;
    if (coarseLevel.IsAvailable("Constraint0", this)) {
      // Reuse data
      X = coarseLevel.Get<RCP<Constraint> >("Constraint0", this);
      GetOStream(Runtime0, 0) << "Reusing Constraint0" << std::endl;

    } else {
      // Construct data
      X = Get< RCP<Constraint> >(coarseLevel, "Constraint");
    }
    GetOStream(Runtime0, 0) << "Number of emin iterations = " << Niterations << std::endl;

    RCP<Matrix> P;
    CGSolver EminSolver(Niterations);
    EminSolver.Iterate(*A, *X, *P0, P);

    Set(coarseLevel, "P", P);
    if (pL.get<bool>("Keep P0")) {
      // NOTE: we must do Keep _before_ set as the Needs class only sets if
      //  a) data has been requested (which is not the case here), or
      //  b) data has some keep flag
      coarseLevel.Keep("P0", this);
      Set(coarseLevel, "P0", P);
    }
    if (pL.get<bool>("Keep Constraint0")) {
      // NOTE: we must do Keep _before_ set as the Needs class only sets if
      //  a) data has been requested (which is not the case here), or
      //  b) data has some keep flag
      coarseLevel.Keep("Constraint0", this);
      Set(coarseLevel, "Constraint0", X);
    }

    RCP<ParameterList> params = rcp(new ParameterList());
    params->set("printLoadBalancingInfo", true);
    GetOStream(Statistics1,0) << Utils::PrintMatrixInfo(*P, "P", params);
  }
  void RigidBodyModeFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
    FactoryMonitor m(*this, "Rigid body mode factory", currentLevel);
    RCP<MultiVector> nullspace;
    if (currentLevel.GetLevelID() == 0) {
      if (currentLevel.IsAvailable(nspName_, NoFactory::get())) {
        nullspace = currentLevel.Get< RCP<MultiVector> >(nspName_, NoFactory::get());
        GetOStream(Runtime1, 0) << "Use user-given rigid body modes " << nspName_ << ": nullspace dimension=" << nullspace->getNumVectors() << " nullspace length=" << nullspace->getGlobalLength() << std::endl;
      }
      else {
        RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
        GetOStream(Runtime1, 0) << "Generating rigid body modes: dimension = " << numPDEs_ << std::endl;
        RCP<const Map> xmap = A->getDomainMap();
        if      (numPDEs_ == 1) { nullspace = MultiVectorFactory::Build(xmap, 1); }
        else if (numPDEs_ == 2) { nullspace = MultiVectorFactory::Build(xmap, 3); }
        else if (numPDEs_ == 3) { nullspace = MultiVectorFactory::Build(xmap, 6); }
        Scalar zero(0.0);
        nullspace->putScalar(zero);
        RCP<MultiVector> Coords = Get< RCP<MultiVector> >(currentLevel, "Coordinates");
        ArrayRCP<Scalar> xnodes, ynodes, znodes;
        Scalar cx, cy, cz;
        ArrayRCP<Scalar> nsValues0, nsValues1, nsValues2, nsValues3, nsValues4, nsValues5;
        int nDOFs = xmap->getNodeNumElements();
        if (numPDEs_ == 1) {
          nsValues0 = nullspace->getDataNonConst(0);
          for (int j = 0; j < nDOFs; j++) {
            // constant null space for scalar PDE
            nsValues0[j] = 1.0;
          }
        }
        else if (numPDEs_ == 2) {
          xnodes = Coords->getDataNonConst(0);
          ynodes = Coords->getDataNonConst(1);
          cx = Coords->getVector(0)->meanValue();
          cy = Coords->getVector(1)->meanValue();
          nsValues0 = nullspace->getDataNonConst(0);
          nsValues1 = nullspace->getDataNonConst(1);
          nsValues2 = nullspace->getDataNonConst(2);
          for (int j = 0; j < nDOFs; j += numPDEs_) {
            // translation
            nsValues0[j+0] = 1.0;
            nsValues1[j+1] = 1.0;
            // rotate around z-axis (x-y plane)
            nsValues2[j+0] = -(ynodes[j]-cy);
            nsValues2[j+1] =  (xnodes[j]-cx);
          }
        }
        else if (numPDEs_ == 3) {
          xnodes = Coords->getDataNonConst(0);
          ynodes = Coords->getDataNonConst(1);
          znodes = Coords->getDataNonConst(2);
          cx = Coords->getVector(0)->meanValue();
          cy = Coords->getVector(1)->meanValue();
          cz = Coords->getVector(2)->meanValue();
          nsValues0 = nullspace->getDataNonConst(0);
          nsValues1 = nullspace->getDataNonConst(1);
          nsValues2 = nullspace->getDataNonConst(2);
          nsValues3 = nullspace->getDataNonConst(3);
          nsValues4 = nullspace->getDataNonConst(4);
          nsValues5 = nullspace->getDataNonConst(5);
          for (int j = 0; j < nDOFs; j += numPDEs_) {
            // translation
            nsValues0[j+0] = 1.0;
            nsValues1[j+1] = 1.0;
            nsValues2[j+2] = 1.0;
            // rotate around z-axis (x-y plane)
            nsValues3[j+0] = -(ynodes[j]-cy);
            nsValues3[j+1] =  (xnodes[j]-cx);
            // rotate around x-axis (y-z plane)
            nsValues4[j+1] = -(znodes[j]-cz);
            nsValues4[j+2] =  (ynodes[j]-cy);
            // rotate around y-axis (x-z plane)
            nsValues5[j+0] =  (znodes[j]-cz);
            nsValues5[j+2] = -(xnodes[j]-cx);
          }
        }
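        // Derived from the loops above: for a single node at (x, y, z), with dx = x-cx, dy = y-cy,
        // dz = z-cz, its three DOF rows across the six modes form the block
        //   [ 1  0  0  -dy   0    dz ]
        //   [ 0  1  0   dx  -dz   0  ]
        //   [ 0  0  1   0    dy  -dx ]
        // (columns: x/y/z translations, then rotations about the z-, x-, and y-axes).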
      } // end if "Nullspace" not available
    }
    else {
      nullspace = currentLevel.Get< RCP<MultiVector> >("Nullspace", GetFactory(nspName_).get());
    }
    Set(currentLevel, "Nullspace", nullspace);
  }