// test outpout of Build() and BuildSmoother() void testBuildCheck(RCP<SmootherFactory> smooFact, Level& level, RCP<SmootherPrototype> smooProtoA, RCP<SmootherPrototype> smooProtoB, MueLu::PreOrPost preOrPost, Teuchos::FancyOStream & out, bool & success) { // invariant: smoother prototypes kept unchanged check(smooFact, smooProtoA, smooProtoB, out, success); // output test if (preOrPost == MueLu::BOTH) { testBuildCheckOutput(smooFact, level, smooProtoA, "PreSmoother", out, success); testBuildCheckOutput(smooFact, level, smooProtoB, "PostSmoother", out, success); // ReUse: if pre and post prototype are the same, then pre smoother == post smoother // otherwise, they are different (have not been tested by previous tests) RCP<SmootherBase> smooA, smooB; if (smooProtoA != Teuchos::null) { smooA = level.Get< RCP<SmootherBase> >("PreSmoother", smooFact.get()); } if (smooProtoB != Teuchos::null) { smooB = level.Get< RCP<SmootherBase> >("PostSmoother", smooFact.get()); } if (smooProtoA == smooProtoB) { TEST_EQUALITY(smooA, smooB); } else { TEST_INEQUALITY(smooA, smooB); } } else if (preOrPost == MueLu::PRE) { testBuildCheckOutput(smooFact, level, smooProtoA, "PreSmoother", out, success); TEST_EQUALITY(level.IsAvailable("PostSmoother", smooFact.get()), false); } else if (preOrPost == MueLu::POST) { TEST_EQUALITY(level.IsAvailable("PreSmoother", smooFact.get()), false); testBuildCheckOutput(smooFact, level, smooProtoB, "PostSmoother", out, success); } else { TEST_EQUALITY(true, false); } }
// Build the energy-minimized prolongator on coarseLevel.
//
// Reads "A" and "Nullspace" from fineLevel and an initial guess plus a
// constraint from coarseLevel, runs a CG-based minimization for a fixed number
// of iterations, and stores "P", "P0" and "Constraint0" on coarseLevel.
// If "P0" / "Constraint0" generated by this factory are already available on
// coarseLevel they are reused instead of requesting "P" / "Constraint".
void EminPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level& fineLevel, Level& coarseLevel) const {
  FactoryMonitor m(*this, "Prolongator minimization", coarseLevel);

  const ParameterList & pL = GetParameterList();

  // Optional keep flags: ask the level to keep our own output for later reuse
  if (pL.isParameter("Keep P0") && pL.get<bool>("Keep P0")) {
    coarseLevel.Keep("P0", this);
  }
  if (pL.isParameter("Keep Constraint0") && pL.get<bool>("Keep Constraint0")) {
    coarseLevel.Keep("Constraint0", this);
  }

  // Fine level data: operator and near-nullspace
  RCP<Matrix>      A = Get< RCP<Matrix> >     (fineLevel, "A");
  RCP<MultiVector> B = Get< RCP<MultiVector> >(fineLevel, "Nullspace");

  // Initial guess: a previously stored P0 (reuse) or a freshly generated P.
  // The iteration count comes from a different parameter in the reuse case.
  int numIterations;
  RCP<Matrix> initialP;
  if (!coarseLevel.IsAvailable("P0", this)) {
    initialP      = Get< RCP<Matrix> >(coarseLevel, "P");
    numIterations = pL.get<int>("Niterations");

  } else {
    initialP      = coarseLevel.Get<RCP<Matrix> >("P0", this);
    numIterations = pL.get<int>("Reuse Niterations");
    GetOStream(Runtime0, 0) << "EminPFactory: Reusing P0"<<std::endl;
  }

  // Constraint: a previously stored Constraint0 (reuse) or a freshly generated one
  RCP<Constraint> constraint;
  if (!coarseLevel.IsAvailable("Constraint0", this)) {
    constraint = Get< RCP<Constraint> >(coarseLevel, "Constraint");

  } else {
    constraint = coarseLevel.Get<RCP<Constraint> >("Constraint0", this);
    GetOStream(Runtime0, 0) << "EminPFactory: Reusing Constraint0"<<std::endl;
  }

  // Run the minimization; the result is returned through finalP
  RCP<Matrix> finalP;
  CGSolver solver(numIterations);
  solver.Iterate(*A, *constraint, *initialP, *B, finalP);

  // The result is published both as "P" and as "P0" (the next initial guess)
  Set(coarseLevel, "Constraint0", constraint);
  Set(coarseLevel, "P",           finalP);
  Set(coarseLevel, "P0",          finalP);

  RCP<ParameterList> printParams = rcp(new ParameterList());
  printParams->set("printLoadBalancingInfo", true);
  GetOStream(Statistics0,0) << Utils::PrintMatrixInfo(*finalP, "P", printParams);
}
// Transfer a user-provided map from fineLevel to coarseLevel.
//
// The fine map (mapName_, generated by mapFact_) is read from fineLevel. Every
// locally owned row of the tentative prolongator whose global row id is in that
// map contributes the global ids of its columns to the coarse map. The sorted,
// duplicate-free list of those column ids is turned into a new map, which is
// stored on coarseLevel under the same name / generating factory.
//
// Throws Exceptions::RuntimeError if "P" from the tentative prolongator
// factory is not available on coarseLevel.
void MapTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level & fineLevel, Level & coarseLevel) const {
  typedef Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> OperatorClass; //TODO
  typedef Xpetra::Map<LocalOrdinal, GlobalOrdinal, Node> MapClass;
  typedef Xpetra::MapFactory<LocalOrdinal, GlobalOrdinal, Node> MapFactoryClass;

  Monitor m(*this, "Contact Map transfer factory");

  // Only a diagnostic: the Get() below is still attempted when the map is missing.
  if (fineLevel.IsAvailable(mapName_, mapFact_.get())==false) {
    GetOStream(Runtime0, 0) << "MapTransferFactory::Build: User provided map " << mapName_ << " not found in Level class." << std::endl;
  }

  // fetch the map to be transferred from the fine level
  RCP<const MapClass> transferMap = fineLevel.Get<RCP<const MapClass> >(mapName_,mapFact_.get());

  // Get the tentative prolongator factory: the factory registered for "P" on
  // this object, or the factory manager's "Ptent" factory as a fallback.
  RCP<const FactoryBase> tentPFact = GetFactory("P");
  if (tentPFact == Teuchos::null) { tentPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent"); }
  TEUCHOS_TEST_FOR_EXCEPTION(!coarseLevel.IsAvailable("P",tentPFact.get()),Exceptions::RuntimeError, "MueLu::MapTransferFactory::Build(): P (generated by TentativePFactory) not available.");
  RCP<OperatorClass> Ptent = coarseLevel.Get<RCP<OperatorClass> >("P", tentPFact.get());

  // The maps do not change inside the loop, so look them up once.
  const RCP<const MapClass> rowMap = Ptent->getRowMap();
  const RCP<const MapClass> colMap = Ptent->getColMap();

  std::vector<GlobalOrdinal > coarseMapGids;

  // loop over local rows of Ptent
  const size_t numLocalRows = Ptent->getNodeNumRows();
  for(size_t row=0; row<numLocalRows; row++) {
    GlobalOrdinal grid = rowMap->getGlobalElement(row);

    if(!transferMap->isNodeGlobalElement(grid)) continue;

    Teuchos::ArrayView<const LocalOrdinal> indices;
    Teuchos::ArrayView<const Scalar> vals;
    Ptent->getLocalRowView(row, indices, vals);

    // mark all columns in Ptent(grid,*) to be coarse Dofs of next level transferMap
    for(size_t i=0; i<(size_t)indices.size(); i++) {
      GlobalOrdinal gcid = colMap->getGlobalElement(indices[i]);
      coarseMapGids.push_back(gcid);
    }
  }

  // sort and remove duplicate column ids
  std::sort(coarseMapGids.begin(), coarseMapGids.end());
  coarseMapGids.erase(std::unique(coarseMapGids.begin(), coarseMapGids.end()), coarseMapGids.end());

  // Build the view from the vector itself rather than from &coarseMapGids[0]:
  // indexing element 0 of an empty vector is undefined behavior, and the list
  // is empty on every rank that owns no row of the transfer map.
  Teuchos::ArrayView<GlobalOrdinal> coarseMapGidsView(coarseMapGids);

  // -1: global number of elements is not given here
  Teuchos::RCP<const MapClass> coarseTransferMap = MapFactoryClass::Build(colMap->lib(), -1, coarseMapGidsView, colMap->getIndexBase(), colMap->getComm());

  // store the transferred map in the coarse level
  coarseLevel.Set(mapName_, coarseTransferMap, mapFact_.get());
}
// test if a smoother created by Build() is correct (check if it corresponds to the prototype) void testBuildCheckOutput(RCP<SmootherFactory> smooFact, Level& level, RCP<SmootherPrototype> smooProto, const std::string& tag, Teuchos::FancyOStream & out, bool & success) { if (smooProto == Teuchos::null) { TEST_EQUALITY(level.IsAvailable(tag, smooFact.get()), false); } else { RCP<SmootherBase> smoother; TEST_NOTHROW(smoother = level.Get< RCP<SmootherBase> >(tag, smooFact.get())); TEST_INEQUALITY(smoother, Teuchos::null); TEST_INEQUALITY(smoother, smooProto); if (smooProto != Teuchos::null) { RCP<FakeSmootherPrototype> smootherF; // ouput test: smoothers same derived class as prototypes TEST_NOTHROW(smootherF = rcp_dynamic_cast<FakeSmootherPrototype>(smoother,true)); if (smootherF != Teuchos::null) { // output test: smoother parameters == prototype parameters RCP<FakeSmootherPrototype> smooProtoF = rcp_dynamic_cast<FakeSmootherPrototype>(smooProto,true); TEST_EQUALITY(smootherF->GetParam(), smooProtoF->GetParam()); // output test: smoothers are ready to be apply TEST_EQUALITY(smootherF->IsSetup(), true); // setup done only once. TEST_EQUALITY(smootherF->GetNumOfSetupCall(), 1); TEST_EQUALITY(smootherF->GetNumOfSetup(), 1); } } } }
// Declare the data BuildP() needs: always "A" on the fine level; "P" and
// "Constraint" on the coarse level only when no reusable "P0" /
// "Constraint0" generated by this factory was available at request time.
void EminPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
  Input(fineLevel, "A");

  // Why the availability is cached instead of queried on every call:
  //
  // Writing
  //     if (!coarseLevel.IsAvailable("P0", this))
  //       Input(coarseLevel, "P");
  // is correct while requesting, but wrong while releasing: P0 is
  // _constructed_ during Build, so by release time the query would answer
  // differently and the release would not mirror the request. The answer
  // obtained in REQUEST mode is therefore remembered and replayed.
  //
  // This is a very unusual situation; please do not spread the request-mode
  // check to other factories.
  //
  // NOTE(review): function-local statics are shared by every call of this
  // instantiation, i.e. by all factory instances and all levels. Confirm
  // that request/release calls cannot interleave in a way that mixes them up.
  static bool haveP0          = false;
  static bool haveConstraint0 = false;

  const bool requesting = (coarseLevel.GetRequestMode() == Level::REQUEST);
  if (requesting) {
    haveP0          = coarseLevel.IsAvailable("P0",          this);
    haveConstraint0 = coarseLevel.IsAvailable("Constraint0", this);
  }

  if (!haveP0) {
    Input(coarseLevel, "P");
  }
  if (!haveConstraint0) {
    Input(coarseLevel, "Constraint");
  }
}
// Clone an existing node-wise "Partition" vector onto the row map of A.
//
// The input decomposition ("Partition") is read from currentLevel. For each
// local node, the partition id found at the node's first input entry is copied
// to all blkSize DOFs of that node in a new vector living on A's row map,
// which is then stored as "Partition" for this factory.
//
// If the input decomposition is null, a null "Importer" is stored and the
// function returns without producing a "Partition".
//
// Throws MueLu::Exceptions::RuntimeError if the block size is not positive or
// the local lengths are inconsistent with the block size / number of nodes.
void CloneRepartitionInterface<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);
  currentLevel.print(GetOStream(Statistics0,0));

  // extract blocked operator A from current level
  Teuchos::RCP<Matrix> A = Get< Teuchos::RCP<Matrix> > (currentLevel, "A");

  // number of Partitions only used for a shortcut.
  // (The value is read but not used further in this function.)
  GO numPartitions = 0;
  if (currentLevel.IsAvailable("number of partitions")) {
    numPartitions = currentLevel.Get<GO>("number of partitions");
    GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;
  }

  // ======================================================================================================
  // Construct decomposition vector
  // ======================================================================================================

  // extract decomposition vector
  RCP<GOVector> decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

  // The null check must come before the vector is dereferenced.
  if (decomposition.is_null()) {
    GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
    Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
    return;
  }
  ArrayRCP<const GO> decompEntries = decomposition->getData(0);

  // create new decomposition vector
  Teuchos::RCP<GOVector> ret = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), false);
  ArrayRCP<GO> retDecompEntries = ret->getDataNonConst(0);

  // block size of output vector
  LocalOrdinal blkSize = A->GetFixedBlockSize();

  // blkSize is used as a divisor below; reject values that would divide by zero
  // or wrap around when converted to size_t
  TEUCHOS_TEST_FOR_EXCEPTION(blkSize <= 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: invalid block size (" << blkSize << ")");

  // plausibility check!
  size_t inLocalLength  = decomposition->getLocalLength();
  size_t outLocalLength = A->getRowMap()->getNodeNumElements();

  size_t numLocalNodes = outLocalLength / blkSize;
  TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(outLocalLength % blkSize) != 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: inconsistent number of local DOFs (" << outLocalLength << ") and degrees of freedoms ("<<blkSize<<")");

  if (numLocalNodes > 0) {
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(inLocalLength % numLocalNodes) != 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: inconsistent number of local DOFs (" << inLocalLength << ") and number of local nodes (" << numLocalNodes << ")");

    // number of input entries per node; may differ from the output block size
    LocalOrdinal inBlkSize = Teuchos::as<LocalOrdinal>(inLocalLength / numLocalNodes);
    //TEUCHOS_TEST_FOR_EXCEPTION(blkSize != inBlkSize, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: input block size = " << inBlkSize << " outpub block size = " << blkSize << ". They should be the same.");

    // every DOF of node i gets the partition id stored at the node's first input entry
    for(LO i = 0; i<Teuchos::as<LO>(numLocalNodes); i++) {
      for(LO j = 0; j < blkSize; j++) {
        retDecompEntries[i*blkSize + j] = Teuchos::as<GO>(decompEntries[i*inBlkSize]);
      }
    }
  } // end if numLocalNodes > 0

  Set(currentLevel, "Partition", ret);
} //Build()
// Declare the inputs of this factory on currentLevel.
//
// On level 0, "A" is requested unless data named nspName_ is already available
// as user data (NoFactory). On every other level, "Nullspace" is requested
// from the factory registered under nspName_.
void RigidBodyModeFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::DeclareInput(Level &currentLevel) const {
  if (currentLevel.IsAvailable(nspName_, NoFactory::get()) == false && currentLevel.GetLevelID() == 0) {
    Input(currentLevel, "A");
    //Input(currentLevel,"Coordinates");
  }

  if (currentLevel.GetLevelID() != 0) {
    // ! "Nullspace" and nspName_ mismatch possible here: the variable name is
    // hard-coded while the factory is looked up by nspName_.
    currentLevel.DeclareInput("Nullspace", GetFactory(nspName_).get(), this);
  }
}
// Build "P" and "Nullspace" on coarseLevel by delegating to one of two
// registered prolongator strategies.
//
// The second strategy (index 1) is chosen once the fine level id has reached
// "semicoarsen: number of levels" or when "NumZLayers" on the fine level is 1;
// otherwise the first one (index 0) is used. The selected factory's Build is
// called explicitly, its "P" and the matching "Nullspace" are fetched, all
// registered factories are released, and the results are re-published under
// this factory.
//
// Throws Exceptions::RuntimeError if the factory lists do not both have
// exactly two entries or if "toggle: mode" is not "semicoarsen".
void TogglePFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level &coarseLevel) const {
  FactoryMonitor m(*this, "Prolongator toggle", coarseLevel);

  TEUCHOS_TEST_FOR_EXCEPTION(nspFacts_.size() != prolongatorFacts_.size(), Exceptions::RuntimeError, "MueLu::TogglePFactory::Build: The number of provided prolongator factories and coarse nullspace factories must be identical.");
  TEUCHOS_TEST_FOR_EXCEPTION(nspFacts_.size() != 2, Exceptions::RuntimeError, "MueLu::TogglePFactory::Build: TogglePFactory needs two different transfer operator strategies for toggling."); // TODO adapt this/weaken this as soon as other toggling strategies are introduced.

  // extract user parameters
  const Teuchos::ParameterList & pL = GetParameterList();
  const std::string mode               = pL.get<std::string>("toggle: mode");
  const int         semicoarsen_levels = pL.get<int>("semicoarsen: number of levels");

  TEUCHOS_TEST_FOR_EXCEPTION(mode!="semicoarsen", Exceptions::RuntimeError, "MueLu::TogglePFactory::Build: The 'toggle: mode' parameter must be set to 'semicoarsen'. No other mode supported, yet.");

  // number of layers, if provided as user data on the fine level (-1: unknown)
  LO NumZDir = -1;
  if(fineLevel.IsAvailable("NumZLayers", NoFactory::get())) {
    NumZDir = fineLevel.Get<LO>("NumZLayers", NoFactory::get()); //obtain info
    GetOStream(Runtime1) << "Number of layers for semicoarsening: " << NumZDir << std::endl;
  }

  // Decision routine: which prolongator factory is to be used.
  // Default behavior is the first prolongator in the list.
  const bool useSecond           = (fineLevel.GetLevelID() >= semicoarsen_levels || NumZDir == 1);
  const int  nProlongatorFactory = useSecond ? 1 : 0;

  // call Build for selected transfer operator
  GetOStream(Runtime0) << "TogglePFactory: call transfer factory: " << (prolongatorFacts_[nProlongatorFactory])->description() << std::endl;
  prolongatorFacts_[nProlongatorFactory]->CallBuild(coarseLevel);
  RCP<Matrix>      P               = coarseLevel.Get< RCP<Matrix> >("P", (prolongatorFacts_[nProlongatorFactory]).get());
  RCP<MultiVector> coarseNullspace = coarseLevel.Get< RCP<MultiVector> >("Nullspace", (nspFacts_[nProlongatorFactory]).get());

  // Release dependencies of all prolongator and coarse level null spaces
  // (including those of the strategy that was not selected)
  for(size_t t=0; t<nspFacts_.size(); ++t) {
    coarseLevel.Release(*(prolongatorFacts_[t]));
    coarseLevel.Release(*(nspFacts_[t]));
  }

  // store prolongator and nullspace with this factory identification.
  Set(coarseLevel, "P", P);
  Set(coarseLevel, "Nullspace", coarseNullspace);
} //Build()
// Declare the fine-level data needed to transfer coordinates ("Coordinates",
// "Aggregates", "CoarseMap"), unless "Coordinates" generated by this factory
// was already available on the coarse level at request time.
void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
  // The availability is sampled only in REQUEST mode and replayed on later
  // calls, so that the same set of inputs is declared in both modes.
  // NOTE(review): a function-local static is shared by all calls of this
  // instantiation (all factory instances, all levels) - confirm this is intended.
  static bool haveCoarseCoords = false;

  const bool requesting = (coarseLevel.GetRequestMode() == Level::REQUEST);
  if (requesting) {
    haveCoarseCoords = coarseLevel.IsAvailable("Coordinates", this);
  }

  if (haveCoarseCoords) {
    return;
  }

  Input(fineLevel, "Coordinates");
  Input(fineLevel, "Aggregates");
  Input(fineLevel, "CoarseMap");
}
TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ThresholdAFilterFactory, Basic, Scalar, LocalOrdinal, GlobalOrdinal, Node) { # include <MueLu_UseShortNames.hpp> MUELU_TESTING_SET_OSTREAM; MUELU_TESTING_LIMIT_EPETRA_SCOPE(Scalar,GlobalOrdinal,Node); out << "version: " << MueLu::Version() << std::endl; Level aLevel; TestHelpers::TestFactory<SC, LO, GO, NO>::createSingleLevelHierarchy(aLevel); RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(20); //can be an empty operator RCP<ThresholdAFilterFactory> AfilterFactory0 = rcp(new ThresholdAFilterFactory("A",0.1)); // keep all RCP<ThresholdAFilterFactory> AfilterFactory1 = rcp(new ThresholdAFilterFactory("A",1.1)); // keep only diagonal RCP<ThresholdAFilterFactory> AfilterFactory2 = rcp(new ThresholdAFilterFactory("A",3)); // keep only diagonal aLevel.Set("A",A); aLevel.Request("A",AfilterFactory0.get()); AfilterFactory0->Build(aLevel); TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory0.get()), true); RCP<Matrix> A0 = aLevel.Get< RCP<Matrix> >("A",AfilterFactory0.get()); aLevel.Release("A",AfilterFactory0.get()); TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory0.get()), false); TEST_EQUALITY(A0->getNodeNumEntries(), A->getNodeNumEntries()); TEST_EQUALITY(A0->getGlobalNumEntries(), A->getGlobalNumEntries()); aLevel.Request("A",AfilterFactory1.get()); AfilterFactory1->Build(aLevel); TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory1.get()), true); RCP<Matrix> A1 = aLevel.Get< RCP<Matrix> >("A",AfilterFactory1.get()); aLevel.Release("A",AfilterFactory1.get()); TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory1.get()), false); TEST_EQUALITY(A1->getGlobalNumEntries(), A1->getGlobalNumRows()); aLevel.Request("A",AfilterFactory2.get()); AfilterFactory2->Build(aLevel); TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory2.get()), true); RCP<Matrix> A2 = aLevel.Get< RCP<Matrix> >("A",AfilterFactory2.get()); aLevel.Release("A",AfilterFactory2.get()); TEST_EQUALITY(aLevel.IsAvailable("A",AfilterFactory2.get()), 
false); TEST_EQUALITY(A2->getGlobalNumEntries(), A2->getGlobalNumRows()); }
// Build the smoothed-aggregation prolongator (or, in restriction mode, the
// restrictor) on coarseLevel.
//
// With a non-zero "sa: damping factor" the tentative prolongator is smoothed
// with one damped Jacobi step, finalP = (I - omega D^{-1} A) Ptent, where
// omega = dampingFactor / lambdaMax. With a zero damping factor the tentative
// prolongator is used unchanged. In restriction mode A is transposed first and
// the transpose of the result is stored as "R"; otherwise the result is
// stored as "P".
void SaPFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
  FactoryMonitor m(*this, "Prolongator smoothing", coarseLevel);

  // Add debugging information
  DeviceType::execution_space::print_configuration(GetOStream(Runtime1));

  typedef typename Teuchos::ScalarTraits<SC>::magnitudeType Magnitude;

  // Tentative prolongator factory: the one registered for "P" on this object,
  // falling back to the factory manager's "Ptent". Fetching it this way makes
  // sure SaPFactory_kokkos and NullspaceFactory use the same factory instance.
  // -- Warning: do not use initialPFact_ directly; always go through initialPFact.
  RCP<const FactoryBase> initialPFact = GetFactory("P");
  if (initialPFact == Teuchos::null) {
    initialPFact = coarseLevel.GetFactoryManager()->GetFactory("Ptent");
  }

  // Level Get
  RCP<Matrix> A     = Get< RCP<Matrix> >(fineLevel, "A");
  RCP<Matrix> Ptent = coarseLevel.Get< RCP<Matrix> >("P", initialPFact.get());

  if (restrictionMode_) {
    SubFactoryMonitor m2(*this, "Transpose A", coarseLevel);

    // build transpose of A explicitly
    A = Utilities_kokkos::Transpose(*A, true);
  }

  // Result of this routine
  RCP<Matrix> finalP;

  // Reuse pattern if available
  RCP<ParameterList> APparams = rcp(new ParameterList);
  if (coarseLevel.IsAvailable("AP reuse data", this)) {
    GetOStream(static_cast<MsgType>(Runtime0 | Test)) << "Reusing previous AP data" << std::endl;

    APparams = coarseLevel.Get< RCP<ParameterList> >("AP reuse data", this);

    if (APparams->isParameter("graph")) {
      finalP = APparams->get< RCP<Matrix> >("graph");
    }
  }

  const ParameterList& pL = GetParameterList();
  SC   dampingFactor      = as<SC>(pL.get<double>("sa: damping factor"));
  LO   maxEigenIterations = as<LO>(pL.get<int>("sa: eigenvalue estimate num iterations"));
  bool estimateMaxEigen   = pL.get<bool>("sa: calculate eigenvalue estimate");

  if (dampingFactor == Teuchos::ScalarTraits<SC>::zero()) {
    // no smoothing requested
    finalP = Ptent;

  } else {
    SC lambdaMax;
    {
      SubFactoryMonitor m2(*this, "Eigenvalue estimate", coarseLevel);

      // -1 is the "no estimate stored" marker; recompute then, or when forced
      lambdaMax = A->GetMaxEigenvalueEstimate();
      const bool recompute = (lambdaMax == -Teuchos::ScalarTraits<SC>::one() || estimateMaxEigen);
      if (!recompute) {
        GetOStream(Statistics1) << "Using cached max eigenvalue estimate" << std::endl;

      } else {
        GetOStream(Statistics1) << "Calculating max eigenvalue estimate now (max iters = "<< maxEigenIterations << ")" << std::endl;
        Magnitude stopTol = 1e-4;
        lambdaMax = Utilities_kokkos::PowerMethod(*A, true, maxEigenIterations, stopTol);
        A->SetMaxEigenvalueEstimate(lambdaMax);
      }
      GetOStream(Statistics0) << "Prolongator damping factor = " << dampingFactor/lambdaMax << " (" << dampingFactor << " / " << lambdaMax << ")" << std::endl;
    }

    {
      SubFactoryMonitor m2(*this, "Fused (I-omega*D^{-1} A)*Ptent", coarseLevel);

      RCP<Vector> inverseDiagonal = Utilities_kokkos::GetMatrixDiagonalInverse(*A);
      SC omega = dampingFactor / lambdaMax;

      // finalP = (I - \omega D^{-1}A) Ptent
      // (a finalP obtained from the reuse data above is passed in as well)
      finalP = Xpetra::IteratorOps<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Jacobi(omega, *inverseDiagonal, *A, *Ptent, finalP, GetOStream(Statistics2), std::string("MueLu::SaP-") + toString(coarseLevel.GetLevelID()), APparams);
    }
  }

  // Level Set
  if (restrictionMode_) {
    // prolongation factory is in restriction mode
    RCP<Matrix> R = Utilities_kokkos::Transpose(*finalP, true);
    Set(coarseLevel, "R", R);

    // NOTE: EXPERIMENTAL
    if (Ptent->IsView("stridedMaps")) {
      R->CreateView("stridedMaps", Ptent, true);
    }

  } else {
    // prolongation factory is in prolongation mode
    Set(coarseLevel, "P", finalP);

    // NOTE: EXPERIMENTAL
    if (Ptent->IsView("stridedMaps")) {
      finalP->CreateView("stridedMaps", Ptent);
    }
  }

  if (IsPrint(Statistics1)) {
    RCP<ParameterList> printParams = rcp(new ParameterList());
    printParams->set("printLoadBalancingInfo", true);
    printParams->set("printCommInfo",          true);
    // the statistics are those of finalP; only the label depends on the mode
    GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*finalP, (!restrictionMode_ ? "P" : "R"), printParams);
  }
} //BuildP()
// Compute the coarse operator Ac = R A P (or P' A P with implicit transpose),
// store it as "A" on coarseLevel, and afterwards call Build on all user-given
// transfer factories.
//
// The intermediate product A*P and the result are also stored as
// "AP Pattern" / "RAP Pattern"; when those are already available on
// coarseLevel they are handed to the multiply routines for reuse.
void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &fineLevel, Level &coarseLevel) const { // FIXME make fineLevel const
  {
    FactoryMonitor m(*this, "Computing Ac", coarseLevel);

    // Set "Keeps" from params
    const Teuchos::ParameterList& pL = GetParameterList();
    if (pL.isParameter("Keep AP Pattern") && pL.get<bool>("Keep AP Pattern")) {
      coarseLevel.Keep("AP Pattern", this);
    }
    if (pL.isParameter("Keep RAP Pattern") && pL.get<bool>("Keep RAP Pattern")) {
      coarseLevel.Keep("RAP Pattern", this);
    }

    //
    // Inputs: A, P
    //
    RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
    RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P");

    //
    // Build Ac = RAP
    //

    // Step 1: AP = A*P, reusing a previous pattern if available (multiple solve)
    RCP<Matrix> AP;
    if (coarseLevel.IsAvailable("AP Pattern", this)){
      GetOStream(Runtime0, 0) << "Ac: Using previous AP pattern"<<std::endl;
      AP = Get< RCP<Matrix> >(coarseLevel, "AP Pattern");
    }

    {
      SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);
      AP = Utils::Multiply(*A, false, *P, false, AP);
      Set(coarseLevel, "AP Pattern", AP);
    }

    // Optimization storage option. If not modifying matrix later (inserting
    // local values), allow optimization of storage.
    // This is necessary for new faster Epetra MM kernels.
    const bool doOptimizedStorage = !checkAc_;

    // Step 2: Ac = R*(AP), reusing coarse matrix memory if available (multiple solve)
    RCP<Matrix> Ac;
    if (coarseLevel.IsAvailable("RAP Pattern", this)) {
      GetOStream(Runtime0, 0) << "Ac: Using previous RAP pattern" << std::endl;
      Ac = Get< RCP<Matrix> >(coarseLevel, "RAP Pattern");
    }

    if (!implicitTranspose_) {
      SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);
      RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");
      Ac = Utils::Multiply(*R, false, *AP, false, Ac, true, doOptimizedStorage);

    } else {
      SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);
      Ac = Utils::Multiply(*P, true, *AP, false, Ac, true, doOptimizedStorage);
    }

    if (checkAc_) {
      CheckMainDiagonal(Ac);
    }

    RCP<ParameterList> printParams = rcp(new ParameterList());
    printParams->set("printLoadBalancingInfo", true);
    GetOStream(Statistics0, 0) << Utils::PrintMatrixInfo(*Ac, "Ac", printParams);

    Set(coarseLevel, "A",           Ac);
    Set(coarseLevel, "RAP Pattern", Ac);
  }

  if (!transferFacts_.empty()) {
    SubFactoryMonitor m(*this, "Projections", coarseLevel);

    // call Build of all user-given transfer factories
    for (size_t i = 0; i < transferFacts_.size(); ++i) {
      const RCP<const FactoryBase>& fac = transferFacts_[i];
      GetOStream(Runtime0, 0) << "Ac: call transfer factory " << fac.get() << ": " << fac->description() << std::endl;
      fac->CallBuild(coarseLevel);
    }
  }
}
// Clone an existing node-wise "Partition" vector onto the row map of A.
//
// The input decomposition ("Partition") is read from currentLevel. The block
// size of the output is taken from A's "stridedMaps" view when that view holds
// a strided row map, and from A->GetFixedBlockSize() otherwise. For each local
// node, the partition id found at the node's first input entry is copied to
// all blkSize DOFs of that node in a new vector living on A's row map, which
// is then stored as "Partition" for this factory.
//
// If the input decomposition is null, a null "Importer" is stored and the
// function returns without producing a "Partition".
//
// Throws Exceptions::BadCast if the strided row map cast fails, and
// MueLu::Exceptions::RuntimeError if the block size is not positive or the
// local lengths are inconsistent with the block size / number of nodes.
void CloneRepartitionInterface<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);
  currentLevel.print(GetOStream(Statistics0,0));

  // extract blocked operator A from current level
  Teuchos::RCP<Matrix> A = Get< Teuchos::RCP<Matrix> > (currentLevel, "A");

  // number of Partitions only used for a shortcut.
  // (The value is read but not used further in this function.)
  GO numPartitions = 0;
  if (currentLevel.IsAvailable("number of partitions")) {
    numPartitions = currentLevel.Get<GO>("number of partitions");
    GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;
  }

  // ======================================================================================================
  // Construct decomposition vector
  // ======================================================================================================

  // extract decomposition vector
  RCP<GOVector> decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

  // The null check must come before the vector is dereferenced.
  if (decomposition.is_null()) {
    GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
    Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
    return;
  }
  ArrayRCP<const GO> decompEntries = decomposition->getData(0);

  // create new decomposition vector
  Teuchos::RCP<GOVector> ret = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), false);
  ArrayRCP<GO> retDecompEntries = ret->getDataNonConst(0);

  // block size of output vector
  LocalOrdinal blkSize = 1;

  // check for blocking/striding information
  if(A->IsView("stridedMaps") &&
     Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap("stridedMaps")) != Teuchos::null) {
    Xpetra::viewLabel_t oldView = A->SwitchToView("stridedMaps"); // note: "stridedMaps are always non-overlapping (correspond to range and domain maps!)
    RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(A->getRowMap());
    TEUCHOS_TEST_FOR_EXCEPTION(strMap == Teuchos::null,Exceptions::BadCast,"MueLu::CloneRepartitionInterface::Build: cast to strided row map failed.");

    // -1: the map covers the full block; otherwise use the size of the selected sub-block
    LocalOrdinal stridedBlock = strMap->getStridedBlockId();
    if (stridedBlock == -1)
      blkSize = strMap->getFixedBlockSize();
    else {
      std::vector<size_t> strInfo = strMap->getStridingData();
      blkSize = Teuchos::as<LocalOrdinal>(strInfo[stridedBlock]);
    }
    oldView = A->SwitchToView(oldView);
    GetOStream(Statistics1) << "CloneRepartitionInterface::Build():" << " found blockdim=" << blkSize << " from strided maps."<< std::endl;
  } else {
    // Determine the block size first so that the message reports the value
    // that is actually used.
    blkSize = A->GetFixedBlockSize();
    GetOStream(Statistics1) << "CloneRepartitionInterface::Build(): no striding information available. Use blockdim=" << blkSize << " (DofsPerNode)." << std::endl;
  }

  // blkSize is used as a divisor below; reject values that would divide by zero
  // or wrap around when converted to size_t
  TEUCHOS_TEST_FOR_EXCEPTION(blkSize <= 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: invalid block size (" << blkSize << ")");

  // plausibility check!
  size_t inLocalLength  = decomposition->getLocalLength();
  size_t outLocalLength = A->getRowMap()->getNodeNumElements();

  // only for non-strided maps
  size_t numLocalNodes = outLocalLength / blkSize;
  TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(outLocalLength % blkSize) != 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: inconsistent number of local DOFs (" << outLocalLength << ") and degrees of freedoms (" << blkSize <<")");

  if (numLocalNodes > 0) {
    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<size_t>(inLocalLength % numLocalNodes) != 0, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: inconsistent number of local DOFs (" << inLocalLength << ") and number of local nodes (" << numLocalNodes << ")");

    // number of input entries per node; may differ from the output block size
    LocalOrdinal inBlkSize = Teuchos::as<LocalOrdinal>(inLocalLength / numLocalNodes);
    //TEUCHOS_TEST_FOR_EXCEPTION(blkSize != inBlkSize, MueLu::Exceptions::RuntimeError,"CloneRepartitionInterface: input block size = " << inBlkSize << " outpub block size = " << blkSize << ". They should be the same.");

    // every DOF of node i gets the partition id stored at the node's first input entry
    for(LO i = 0; i<Teuchos::as<LO>(numLocalNodes); i++) {
      for(LO j = 0; j < blkSize; j++) {
        retDecompEntries[i*blkSize + j] = Teuchos::as<GO>(decompEntries[i*inBlkSize]);
      }
    }
  } // end if numLocalNodes > 0

  Set(currentLevel, "Partition", ret);
} //Build()
// Return whether `varName`, as generated by the factory that GetFactory()
// reports for that name, is available on `level`.
bool IsAvailable(Level & level, const std::string & varName) const {
  // hold the RCP so the factory pointer stays valid for the query
  const RCP<const FactoryBase> generatingFact = GetFactory(varName);
  return level.IsAvailable(varName, generatingFact.get());
}
// Compute the coarse operator Ac = R A P (or P' A P when
// "transpose: use implicit" is set), store it as "A" on coarseLevel, and
// afterwards build and release all user-given transfer factories.
//
// "AP Pattern" / "RAP Pattern" found on coarseLevel are handed to the multiply
// routines for reuse; they are stored again only when the corresponding
// "Keep ..." parameter is set.
//
// Throws Exceptions::RuntimeError if DeclareInput was not called beforehand.
void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& fineLevel, Level& coarseLevel) const {
  {
    FactoryMonitor m(*this, "Computing Ac", coarseLevel);

    // level id as text, used to label the matrix-matrix products
    std::ostringstream levelstr;
    levelstr << coarseLevel.GetLevelID();

    TEUCHOS_TEST_FOR_EXCEPTION(hasDeclaredInput_==false, Exceptions::RuntimeError,
                               "MueLu::RAPFactory::Build(): CallDeclareInput has not been called before Build!");

    // Set "Keeps" from params
    const Teuchos::ParameterList& pL = GetParameterList();
    const bool keepAPPattern = pL.get<bool>("Keep AP Pattern");
    if (keepAPPattern) {
      coarseLevel.Keep("AP Pattern", this);
    }
    const bool keepRAPPattern = pL.get<bool>("Keep RAP Pattern");
    if (keepRAPPattern) {
      coarseLevel.Keep("RAP Pattern", this);
    }

    RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel,   "A");
    RCP<Matrix> P = Get< RCP<Matrix> >(coarseLevel, "P");
    RCP<Matrix> AP;
    RCP<Matrix> Ac;

    // Reuse pattern if available (multiple solve)
    if (coarseLevel.IsAvailable("AP Pattern", this)) {
      GetOStream(Runtime0) << "Ac: Using previous AP pattern" << std::endl;

      AP = Get< RCP<Matrix> >(coarseLevel, "AP Pattern");
    }

    {
      SubFactoryMonitor subM(*this, "MxM: A x P", coarseLevel);

      AP = Utils::Multiply(*A, false, *P, false, AP, GetOStream(Statistics2),true,true,std::string("MueLu::A*P-")+levelstr.str());
    }
    if (keepAPPattern) {
      Set(coarseLevel, "AP Pattern", AP);
    }

    // Reuse coarse matrix memory if available (multiple solve)
    if (coarseLevel.IsAvailable("RAP Pattern", this)) {
      GetOStream(Runtime0) << "Ac: Using previous RAP pattern" << std::endl;

      Ac = Get< RCP<Matrix> >(coarseLevel, "RAP Pattern");

      // An eigenvalue estimate may have been cached with the matrix in the
      // previous run. The matrix values are about to be overwritten, so the
      // estimate is reset.
      Ac->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
    }

    // If we do not modify matrix later, allow optimization of storage.
    // This is necessary for new faster Epetra MM kernels.
    const bool doOptimizeStorage = !pL.get<bool>("RepairMainDiagonal");

    const bool doTranspose    = true;
    const bool doFillComplete = true;
    const bool useImplicitTranspose = pL.get<bool>("transpose: use implicit");
    if (!useImplicitTranspose) {
      // R is fetched before the sub-monitor starts
      RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");

      SubFactoryMonitor m2(*this, "MxM: R x (AP) (explicit)", coarseLevel);

      Ac = Utils::Multiply(*R, !doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-explicit-")+levelstr.str());

    } else {
      SubFactoryMonitor m2(*this, "MxM: P' x (AP) (implicit)", coarseLevel);

      Ac = Utils::Multiply(*P,  doTranspose, *AP, !doTranspose, Ac, GetOStream(Statistics2), doFillComplete, doOptimizeStorage,std::string("MueLu::R*(AP)-implicit-")+levelstr.str());
    }

    CheckRepairMainDiagonal(Ac);

    if (IsPrint(Statistics1)) {
      RCP<ParameterList> printParams = rcp(new ParameterList());
      printParams->set("printLoadBalancingInfo", true);
      printParams->set("printCommInfo",          true);
      GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ac, "Ac", printParams);
    }

    Set(coarseLevel, "A", Ac);
    if (keepRAPPattern) {
      Set(coarseLevel, "RAP Pattern", Ac);
    }
  }

  if (!transferFacts_.empty()) {
    SubFactoryMonitor m(*this, "Projections", coarseLevel);

    // call Build of all user-given transfer factories
    for (size_t i = 0; i < transferFacts_.size(); ++i) {
      RCP<const FactoryBase> fac = transferFacts_[i];
      GetOStream(Runtime0) << "RAPFactory: call transfer factory: " << fac->description() << std::endl;
      fac->CallBuild(coarseLevel);

      // Why the explicit Release below is needed:
      //
      // Coordinates transfer differs slightly from all other operations in
      // that it is *optional*. For instance, coordinates may only be needed on
      // level 4 if repartitioning starts there, but not on levels 1,2,3. The
      // current Hierarchy setup assumes that dependencies propagate through
      // three levels only, so optional data has to be propagated by other
      // means.
      //
      // The means currently used are RAP transfer factories: factories called
      // at the end of RAP with the single goal of transferring some fine data
      // to the coarser level. Being outside of the mainline factories, they
      // behave differently. In particular, their Build method is called
      // explicitly rather than through Get calls. This matters, because Get
      // knows when to release all factory dependencies and Build does not.
      // For CoordinatesTransferFactory this led to the sequence
      //   1. Request level 0
      //   2. Request level 1
      //   3. Request level 0
      //   4. Release level 0
      //   5. Release level 1
      //
      // What is missing is "6. Release level 0". Without it, requests on
      // "Coordinates", "Aggregates" and "CoarseMap" stayed outstanding on
      // level 0.
      //
      // This was fixed by explicitly calling Release on the transfer factories
      // here. It is still not entirely clear how exactly it works, but data
      // requests are now clean on all levels.
      coarseLevel.Release(*fac);
    }
  }
}
// Decides whether the operator "A" of currentLevel should be repartitioned and, if so, constructs
// the row map importer describing the new distribution.
//
// Output (stored on currentLevel):
//   "Importer"             - RCP<const Import> from the current row map of A to the new row map, or
//                            Teuchos::null when one of the tests below decides not to repartition
//   "number of partitions" - stored with NoFactory when it was not already available on the level
//
// Throws Exceptions::RuntimeError if the communicator is not a Teuchos::MpiComm (and, in debug builds,
// if the decomposition contains an out-of-range partition id).
void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);

  const Teuchos::ParameterList & pL = GetParameterList();
  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
  // TODO (JG): I don't really know if we want to do this.
  const int    startLevel          = pL.get<int>   ("repartition: start level");
  const LO     minRowsPerProcessor = pL.get<LO>    ("repartition: min rows per proc");
  const double nonzeroImbalance    = pL.get<double>("repartition: max imbalance");
  const bool   remapPartitions     = pL.get<bool>  ("repartition: remap parts");

  // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
  RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");

  // ======================================================================================================
  // Determine whether partitioning is needed
  // ======================================================================================================
  // NOTE: most tests include some global communication, which is why we currently only do tests until we make
  // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
  // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.

  // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
  if (currentLevel.GetLevelID() < startLevel) {
    GetOStream(Statistics0) << "Repartitioning? NO:" << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

    Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
    return;
  }

  RCP<const Map> rowMap = A->getRowMap();

  // NOTE: Teuchos::MPIComm::duplicate() calls MPI_Bcast inside, so this is
  // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it
  // does not matter.
  RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
  RCP<const Teuchos::Comm<int> > comm     = origComm->duplicate();

  // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
  // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
  // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
  {
    int numActiveProcesses = 0;
    MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses);

    if (numActiveProcesses == 1) {
      GetOStream(Statistics0) << "Repartitioning? NO:" << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }
  }

  bool test3 = false, test4 = false;
  std::string msg3, msg4;

  // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
  // NOTE: Test2 ensures that repartitioning is not done when there is only one processor (it may or may not satisfy Test3)
  if (minRowsPerProcessor > 0) {
    LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
    LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0;
    MueLu_sumAll(comm, haveFewRows, numWithFewRows);
    // Processes without rows contribute LOMAX so that they do not determine the reported minimum
    MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

    // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
    // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
    // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
    if (numWithFewRows > 0)
      test3 = true;

    msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor);
  }

  // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
  if (!test3) {
    GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
    MueLu_maxAll(comm, numMyNnz,                           maxNnz);
    MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
    double imbalance = Teuchos::as<double>(maxNnz)/minNnz;

    if (imbalance > nonzeroImbalance)
      test4 = true;

    msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
  }

  if (!test3 && !test4) {
    GetOStream(Statistics0) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;

    Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
    return;
  }

  GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

  GO                     indexBase = rowMap->getIndexBase();
  Xpetra::UnderlyingLib  lib       = rowMap->lib();
  int                    myRank    = comm->getRank();
  int                    numProcs  = comm->getSize();

  RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
  TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
  RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

  // ======================================================================================================
  // Calculate number of partitions
  // ======================================================================================================
  // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
  // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
  GO numPartitions;
  if (currentLevel.IsAvailable("number of partitions")) {
    numPartitions = currentLevel.Get<GO>("number of partitions");
    GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

  } else {
    if (minRowsPerProcessor <= 0) {
      // The minimum-rows criterion is disabled (Test3 is skipped for non-positive values), so we may only
      // get here through Test4. Dividing by minRowsPerProcessor would be a division by zero (or by a
      // negative value promoted to an unsigned type). Behave as if one row per partition were allowed;
      // the result is capped by the number of processes below.
      numPartitions = std::max(Teuchos::as<GO>(A->getGlobalNumRows()), Teuchos::as<GO>(1));

    } else if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) {
      // System is too small, migrate it to a single processor
      numPartitions = 1;

    } else {
      // Make sure that each processor has approximately minRowsPerProcessor
      numPartitions = A->getGlobalNumRows() / minRowsPerProcessor;
    }
    numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs));

    currentLevel.Set("number of partitions", numPartitions, NoFactory::get());
  }
  GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl;

  // ======================================================================================================
  // Construct decomposition vector
  // ======================================================================================================
  RCP<GOVector> decomposition;
  if (numPartitions == 1) {
    // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
    // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
    // in Zoltan[12]Interface).
    // TODO: We can probably skip more work in this case (like building all extra data structures)
    GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl;
    decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true);

  } else {
    decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

    if (decomposition.is_null()) {
      GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }
  }

  // ======================================================================================================
  // Remap if necessary
  // ======================================================================================================
  // From a user perspective, we want user to not care about remapping, thinking of it as only a performance feature.
  // There are two problems, however.
  // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
  //     This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
  //     each partition is the same.
  // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
  //     Let us consider the following example. Lets assume that when we don't do remapping, processor 0 would have
  //     GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and
  //     processor 1 {0,1,2}. Now, when we run repartitioning algorithm on the next level (say Zoltan1 RCB), it may
  //     be dependent on whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
  //     algorithm can resolve these differently. For instance, running
  //         mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
  //     with
  //         <ParameterList name="MueLu">
  //           <Parameter name="coarse: max size" type="int" value="1"/>
  //           <Parameter name="repartition: enable" type="bool" value="true"/>
  //           <Parameter name="repartition: min rows per proc" type="int" value="2"/>
  //           <ParameterList name="level 1">
  //             <Parameter name="repartition: remap parts" type="bool" value="false/true"/>
  //           </ParameterList>
  //         </ParameterList>
  //     produces different repartitioning for level 2.
  //     This different repartitioning may then escalate into different aggregation for the next level.
  //
  // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
  // Fixing (2) is more complicated.
  // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
  // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
  // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
  // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
  // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
  // This rank permutation requires help from Epetra/Tpetra, both of which have no such API in place.
  // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in
  // MPI_COMM_WORLD, which may lead to issues for logging.
  if (remapPartitions) {
    SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);

    DeterminePartitionPlacement(*A, *decomposition, numPartitions);
  }

  // ======================================================================================================
  // Construct importer
  // ======================================================================================================
  // At this point, the following is true:
  //  * Each processors owns 0 or 1 partitions
  //  * If a processor owns a partition, that partition number is equal to the processor rank
  //  * The decomposition vector contains the partitions ids that the corresponding GID belongs to

  ArrayRCP<const GO> decompEntries;
  if (decomposition->getLocalLength() > 0)
    decompEntries = decomposition->getData(0);

#ifdef HAVE_MUELU_DEBUG
  // Test range of partition ids
  int incorrectRank = -1;
  for (int i = 0; i < decompEntries.size(); i++)
    if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
      incorrectRank = myRank;
      break;
    }

  int incorrectGlobalRank = -1;
  MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
  TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank >- 1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number is that out-of-range");
#endif

  Array<GO> myGIDs;
  myGIDs.reserve(decomposition->getLocalLength());

  // Step 0: Construct mapping
  //    part number -> GIDs I own which belong to this part
  // NOTE: my own part GIDs are not part of the map
  typedef std::map<GO, Array<GO> > map_type;
  map_type sendMap;
  for (LO i = 0; i < decompEntries.size(); i++) {
    GO id  = decompEntries[i];
    GO GID = rowMap->getGlobalElement(i);

    if (id == myRank)
      myGIDs     .push_back(GID);
    else
      sendMap[id].push_back(GID);
  }
  // Drop the view of the decomposition data; it is not needed past this point
  decompEntries = Teuchos::null;

  if (IsPrint(Statistics2)) {
    GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
    MueLu_sumAll(comm,numLocalKept, numGlobalKept);
    GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
  }

  int numSend = sendMap.size(), numRecv;

  // Arrayify map keys
  Array<GO> myParts(numSend), myPart(1);
  int cnt = 0;
  myPart[0] = myRank;
  for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
    myParts[cnt++] = it->first;

  // Step 1: Find out how many processors send me data
  // partsIndexBase starts from zero, as the processors ids start from zero
  GO partsIndexBase = 0;
  RCP<Map>    partsIHave  = MapFactory   ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
  RCP<Map>    partsIOwn   = MapFactory   ::Build(lib,                                                 numProcs,  myPart(), partsIndexBase, comm);
  RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);

  RCP<GOVector> partsISend    = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
  RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
  if (numSend) {
    ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
    for (int i = 0; i < numSend; i++)
      partsISendData[i] = 1;
  }
  (numPartsIRecv->getDataNonConst(0))[0] = 0;

  // Each sender contributes 1 to the entry of the destination part; the ADD export yields the number of senders
  numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
  numRecv = (numPartsIRecv->getData(0))[0];

  // Step 2: Get my GIDs from everybody else
  MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
  int msgTag = 12345;  // TODO: use Comm::dup for all internal messaging

  // Post sends
  Array<MPI_Request> sendReqs(numSend);
  cnt = 0;
  for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
    // The destination of MPI_Isend is an int rank, so convert the part id to int (not GO)
    MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<int>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);

  map_type recvMap;
  size_t totalGIDs = myGIDs.size();
  for (int i = 0; i < numRecv; i++) {
    MPI_Status status;
    MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);

    // Get rank and number of elements from status
    int fromRank = status.MPI_SOURCE, count;
    MPI_Get_count(&status, MpiType, &count);

    recvMap[fromRank].resize(count);
    MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);

    totalGIDs += count;
  }

  // Do waits on send requests
  if (numSend) {
    Array<MPI_Status> sendStatuses(numSend);
    MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
  }

  // Merge GIDs
  myGIDs.reserve(totalGIDs);
  for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
    int offset = myGIDs.size(), len = it->second.size();
    if (len) {
      myGIDs.resize(offset + len);
      memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
    }
  }
  // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks
  // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
  std::sort(myGIDs.begin(), myGIDs.end());

  // Step 3: Construct importer
  RCP<Map>          newRowMap      = MapFactory   ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
  RCP<const Import> rowMapImporter;
  {
    SubFactoryMonitor m1(*this, "Import construction", currentLevel);
    rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
  }

  Set(currentLevel, "Importer", rowMapImporter);

  // ======================================================================================================
  // Print some data
  // ======================================================================================================
  if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
    // Print the grid of processors
    GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;

    char amActive = (myGIDs.size() ? 1 : 0);
    std::vector<char> areActive(numProcs, 0);
    MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);

    int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
    for (int proc = 0; proc < numProcs; proc += rowWidth) {
      for (int j = 0; j < rowWidth; j++)
        if (proc + j < numProcs)
          GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
        else
          GetOStream(Statistics2) << " ";

      GetOStream(Statistics2) << " " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
    }
  }

} // Build
// Builds the pre- and/or post-smoother for currentLevel, as selected by preOrPost.
//
// For each requested side with a non-null prototype, the smoother is taken from previously kept
// "PreSmoother data"/"PostSmoother data" if available on the level, otherwise copied from the prototype,
// then set up on currentLevel and stored as "PreSmoother"/"PostSmoother" (generated by this factory).
// When both sides are requested and share the same prototype, one smoother object serves both.
// If "keep smoother data" is set, the smoother objects are additionally stored as
// "PreSmoother data"/"PostSmoother data". Finally, the smoother parameter lists are recorded in
// sublists of this factory's parameter list.
void SmootherFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildSmoother(Level& currentLevel, PreOrPost const preOrPost) const {
  // SmootherFactory is quite tricky because of the fact that one of the smoother prototypes may be zero.
  // The challenge is that we have no way of knowing how user uses this factory. For instance, lets say
  // user wants to use s1 prototype as a presmoother, and s2 as a postsmoother. He could do:
  //   (a) create SmootherFactory(s1, s2), or
  //   (b) create SmootherFactory(s1, null) and SmootherFactory(null, s2)
  // It may also happen that somewhere somebody set presmoother factory = postsmoother factory = (a)
  // How do you do DeclareInput in this case? It could easily introduce a bug if a user does not check
  // whether presmoother = postsmoother. A buggy code could look like that:
  //   RCP<SmootherFactory> s = rcp(new SmootherFactory(s1,s2));
  //   level.Request("PreSmoother",  s.get());
  //   level.Request("PostSmoother", s.get());
  //   Get<RCP<SmootherBase> > pre  = Get<RCP<SmootherBase> >("PreSmoother",  s.get());
  //   Get<RCP<SmootherBase> > post = Get<RCP<SmootherBase> >("PostSmoother", s.get());
  // This code would call DeclareInput in request mode twice, but as the Build method generates both Pre and Post
  // smoothers, it would call DeclareInput in release mode only once, leaving requests.
  // This code has another problem if s2 = Teuchos::null. In that case, despite the request for PostSmoother, the factory
  // would not generate one, and second Get would throw. The real issue here is that given a Factory pointer
  // there is no way to be sure that this factory would generate any of "PreSmoother" or "PostSmoother", unless you are
  // able to cast it to SmootherFactory, do GetPrototypes and to check whether any of those is Teuchos::null.

  const Teuchos::ParameterList& pL = GetParameterList();

  RCP<SmootherPrototype> preSmoother, postSmoother;
  ParameterList preSmootherParams, postSmootherParams;

  if ((preOrPost & PRE) && !preSmootherPrototype_.is_null()) {
    if (currentLevel.IsAvailable("PreSmoother data", this))
      preSmoother = currentLevel.Get<RCP<SmootherPrototype> >("PreSmoother data", this);
    else
      preSmoother = preSmootherPrototype_->Copy();

    // Temporarily switch the verbose rank for the duration of Setup; restored below
    int oldRank = -1;
    if (!currentLevel.GetComm().is_null())
      oldRank = preSmoother->SetProcRankVerbose(currentLevel.GetComm()->getRank());

    preSmoother->Setup(currentLevel);
    preSmootherParams = preSmoother->GetParameterList();

    if (oldRank != -1)
      preSmoother->SetProcRankVerbose(oldRank);

    currentLevel.Set<RCP<SmootherBase> >("PreSmoother", preSmoother, this);

    if (pL.get<bool>("keep smoother data"))
      Set(currentLevel, "PreSmoother data", preSmoother);
  }

  if ((preOrPost & POST) && !postSmootherPrototype_.is_null()) {
    if (preOrPost == BOTH && preSmootherPrototype_ == postSmootherPrototype_) {
      // Simple reuse
      // Same prototypes for pre- and post-smoothers mean that we only need to call Setup only once
      postSmoother = preSmoother;

      //               else if (preOrPost == BOTH &&
      //                        preSmootherPrototype_ != Teuchos::null &&
      //                        preSmootherPrototype_->GetType() == postSmootherPrototype_->GetType()) {

      //               // More complex reuse case: need implementation of CopyParameters() and a smoothers smart enough to know when parameters affect the setup phase.

      //               // YES: post-smoother == pre-smoother
      //               // => copy the pre-smoother to avoid the setup phase of the post-smoother.
      //               postSmoother = preSmoother->Copy();
      //               // If the post-smoother parameters are different from
      //               // pre-smoother, the parameters stored in the post-smoother
      //               // prototype are copied in the new post-smoother object.
      //               postSmoother->CopyParameters(postSmootherPrototype_);
      //               // If parameters don't influence the Setup phase (it is the case
      //               // for Jacobi, Chebyshev...), PostSmoother is already setup. Nothing
      //               // more to do. In the case of ILU, parameters of the smoother
      //               // are in fact the parameters of the Setup phase. The call to
      //               // CopyParameters resets the smoother (only if parameters are
      //               // different) and we must call Setup() again.
      //               postSmoother->Setup(currentLevel);
      //               }

      //               // TODO: if CopyParameters do not exist, do setup twice.

    } else {
      if (currentLevel.IsAvailable("PostSmoother data", this)) {
        postSmoother = currentLevel.Get<RCP<SmootherPrototype> >("PostSmoother data", this);
      } else {
        // No reuse:
        //  - either we only do postsmoothing without any presmoothing
        //  - or our postsmoother is different from presmoother
        postSmoother = postSmootherPrototype_->Copy();
      }

      // NOTE(review): the pre-smoother branch passes the communicator rank here, while this branch passes
      // GetProcRankVerbose(); left unchanged - confirm whether the asymmetry is intended.
      int oldRank = -1;
      if (!currentLevel.GetComm().is_null())
        oldRank = postSmoother->SetProcRankVerbose(GetProcRankVerbose());

      postSmoother->Setup(currentLevel);
      postSmootherParams = postSmoother->GetParameterList();

      if (oldRank != -1)
        postSmoother->SetProcRankVerbose(oldRank);
    }

    currentLevel.Set<RCP<SmootherBase> >("PostSmoother", postSmoother, this);

    // Keep the *post*-smoother under "PostSmoother data". Storing preSmoother here (as was done before)
    // kept the wrong object whenever the two differ, and a null pointer when only POST is requested.
    if (pL.get<bool>("keep smoother data"))
      Set(currentLevel, "PostSmoother data", postSmoother);
  }

  // Record the parameters of the smoothers that were set up in this factory's own parameter list
  ParameterList& paramList = const_cast<ParameterList&>(this->GetParameterList());
  if (postSmoother == preSmoother && !preSmoother.is_null()) {
    paramList.sublist("smoother", false) = preSmoother->GetParameterList();

  } else {
    if (!preSmoother.is_null())
      paramList.sublist("presmoother", false) = preSmootherParams;

    if (!postSmoother.is_null())
      paramList.sublist("postsmoother", false) = postSmootherParams;
  }

} // Build()
// Transfers "Coordinates" from fineLevel to coarseLevel: the coordinate of each aggregate is the
// sum of the fine coordinates of the vertices this process wins, divided by the aggregate size.
// Does nothing beyond logging if "Coordinates" generated by this factory are already available on
// coarseLevel. Optionally writes coordinate files, controlled by the "write start"/"write end" parameters.
void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level & fineLevel, Level &coarseLevel) const {
  FactoryMonitor m(*this, "Build", coarseLevel);

  typedef Xpetra::MultiVector<double,LO,GO,NO>        xdMV;
  typedef Xpetra::MultiVectorFactory<double,LO,GO,NO> xdMVFactory;

  GetOStream(Runtime0) << "Transferring coordinates" << std::endl;

  // Reuse path: the coarse coordinates are already there
  if (coarseLevel.IsAvailable("Coordinates", this)) {
    GetOStream(Runtime0) << "Reusing coordinates" << std::endl;
    return;
  }

  RCP<Aggregates> aggregates = Get< RCP<Aggregates> > (fineLevel, "Aggregates");
  RCP<xdMV>       fineCoords = Get< RCP<xdMV> >(fineLevel, "Coordinates");
  RCP<const Map>  coarseMap  = Get< RCP<const Map> > (fineLevel, "CoarseMap");

  // coarseMap is being used to set up the domain map of tentative P, and therefore, the row map of Ac
  // Therefore, if we amalgamate coarseMap, logical nodes in the coordinates vector would correspond to
  // logical blocks in the matrix
  ArrayView<const GO> dofGIDs = coarseMap->getNodeElementList();

  // A strided map carries a fixed block size; any other map is treated as having block size 1
  RCP<const StridedMap> stridedCoarseMap = rcp_dynamic_cast<const StridedMap>(coarseMap);
  LO blkSize = 1;
  if (!stridedCoarseMap.is_null())
    blkSize = stridedCoarseMap->getFixedBlockSize();

  const GO     indexBase = coarseMap->getIndexBase();
  const size_t numNodes  = dofGIDs.size() / blkSize;

  // Amalgamate the map: one node GID per block, derived from the first DOF of the block
  Array<GO> nodeGIDs(numNodes);
  for (LO node = 0; node < Teuchos::as<LO>(numNodes); node++) {
    const GO firstDofGID = dofGIDs[node*blkSize];
    nodeGIDs[node] = (firstDofGID-indexBase)/blkSize + indexBase;
  }

  const size_t numVectors = fineCoords->getNumVectors();

  RCP<const Map> uniqueMap      = fineCoords->getMap();
  RCP<const Map> coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), nodeGIDs, indexBase, coarseMap->getComm());
  RCP<xdMV>      coarseCoords   = xdMVFactory::Build(coarseCoordMap, numVectors);

  // Create overlapped fine coordinates to reduce global communication
  RCP<xdMV> ghostedCoords = fineCoords;
  if (aggregates->AggregatesCrossProcessors()) {
    RCP<const Map>    nonUniqueMap = aggregates->GetMap();
    RCP<const Import> importer     = ImportFactory::Build(uniqueMap, nonUniqueMap);

    ghostedCoords = xdMVFactory::Build(nonUniqueMap, numVectors);
    ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
  }

  // Get some info about aggregates
  const int                 myPID        = uniqueMap->getComm()->getRank();
  const LO                  numAggs      = aggregates->GetNumAggregates();
  ArrayRCP<LO>              aggSizes     = aggregates->ComputeAggregateSizes(true,true);
  const ArrayRCP<const LO>  vertex2AggID = aggregates->GetVertex2AggId()->getData(0);
  const ArrayRCP<const LO>  procWinner   = aggregates->GetProcWinner()->getData(0);

  // Fill in coarse coordinates, one coordinate direction at a time
  for (size_t dim = 0; dim < numVectors; dim++) {
    ArrayRCP<const double> fineData   = ghostedCoords->getData(dim);
    ArrayRCP<double>       coarseData = coarseCoords->getDataNonConst(dim);

    // Accumulate the coordinates of the vertices owned by this process into their aggregate
    for (LO vertex = 0; vertex < vertex2AggID.size(); vertex++) {
      if (procWinner[vertex] != myPID)
        continue;
      coarseData[vertex2AggID[vertex]] += fineData[vertex];
    }

    // Turn the sums into averages
    for (LO agg = 0; agg < numAggs; agg++)
      coarseData[agg] /= aggSizes[agg];
  }

  Set<RCP<xdMV> >(coarseLevel, "Coordinates", coarseCoords);

  const ParameterList& pL = GetParameterList();
  const int writeStart = pL.get<int>("write start");
  const int writeEnd   = pL.get<int>("write end");

  // Fine coordinates are written only from the finest level (ID 0), and only when writing starts there
  const bool writeFine = (writeStart == 0) && (fineLevel.GetLevelID() == 0) && (writeStart <= writeEnd);
  if (writeFine) {
    std::ostringstream levelStr;
    levelStr << fineLevel.GetLevelID();
    const std::string fileName = "coordinates_before_rebalance_level_" + levelStr.str() + ".m";
    Xpetra::IO<double,LO,GO,NO>::Write(fileName,*fineCoords);
  }

  // Coarse coordinates are written when the coarse level ID lies in [writeStart, writeEnd]
  const bool writeCoarse = (writeStart <= coarseLevel.GetLevelID()) && (coarseLevel.GetLevelID() <= writeEnd);
  if (writeCoarse) {
    std::ostringstream levelStr;
    levelStr << coarseLevel.GetLevelID();
    const std::string fileName = "coordinates_before_rebalance_level_" + levelStr.str() + ".m";
    Xpetra::IO<double,LO,GO,NO>::Write(fileName,*coarseCoords);
  }
}
// Builds the prolongator "P" on coarseLevel by running Niterations steps of the CG-based
// minimization solver, starting from an initial guess P0 and subject to the constraint X.
// P0 and X are taken from kept data ("P0"/"Constraint0" generated by this factory) when available,
// otherwise from the regular "P"/"Constraint" inputs. The iteration count comes from
// "Reuse Niterations" when P0 is reused and from "Niterations" otherwise.
void EminPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildP(Level& fineLevel, Level& coarseLevel) const {
  FactoryMonitor m(*this, "Prolongator minimization", coarseLevel);

  const ParameterList & pL = GetParameterList();

  // Fine level operator
  RCP<Matrix> A = Get< RCP<Matrix> >(fineLevel, "A");

  // Initial guess: reuse kept data if present, construct it otherwise
  const bool haveKeptP0 = coarseLevel.IsAvailable("P0", this);

  RCP<Matrix> P0;
  int         Niterations;
  if (!haveKeptP0) {
    P0          = Get< RCP<Matrix> >(coarseLevel, "P");
    Niterations = pL.get<int>("Niterations");

  } else {
    P0          = coarseLevel.Get<RCP<Matrix> >("P0", this);
    Niterations = pL.get<int>("Reuse Niterations");
    GetOStream(Runtime0, 0) << "Reusing P0" << std::endl;
  }
  // NOTE: the main assumption here that P0 satisfies both constraints:
  //   - nonzero pattern
  //   - nullspace preservation

  // Constraint operator: same reuse-or-construct logic as for P0
  const bool haveKeptConstraint = coarseLevel.IsAvailable("Constraint0", this);

  RCP<Constraint> X;
  if (!haveKeptConstraint) {
    X = Get< RCP<Constraint> >(coarseLevel, "Constraint");

  } else {
    X = coarseLevel.Get<RCP<Constraint> >("Constraint0", this);
    GetOStream(Runtime0, 0) << "Reusing Constraint0" << std::endl;
  }

  GetOStream(Runtime0,0) << "Number of emin iterations = " << Niterations << std::endl;

  // Run the minimization; the solver fills in P
  RCP<Matrix> P;
  CGSolver EminSolver(Niterations);
  EminSolver.Iterate(*A, *X, *P0, P);

  Set(coarseLevel, "P", P);

  // NOTE: for both kept quantities below we must do Keep _before_ Set, as the Needs class only sets if
  //   a) data has been requested (which is not the case here), or
  //   b) data has some keep flag
  const bool keepP0 = pL.get<bool>("Keep P0");
  if (keepP0) {
    coarseLevel.Keep("P0", this);
    Set(coarseLevel, "P0", P);
  }

  const bool keepConstraint = pL.get<bool>("Keep Constraint0");
  if (keepConstraint) {
    coarseLevel.Keep("Constraint0", this);
    Set(coarseLevel, "Constraint0", X);
  }

  // Report statistics of the resulting prolongator
  RCP<ParameterList> params = rcp(new ParameterList());
  params->set("printLoadBalancingInfo", true);
  GetOStream(Statistics1,0) << Utils::PrintMatrixInfo(*P, "P", params);
}
// Provides the "Nullspace" multivector for currentLevel.
//
// On level 0, a user-given nullspace stored under nspName_ (with NoFactory) is used if available.
// Otherwise rigid body modes are generated from "Coordinates":
//   numPDEs_ == 1 : 1 vector  (constant)
//   numPDEs_ == 2 : 3 vectors (2 translations, 1 rotation)
//   numPDEs_ == 3 : 6 vectors (3 translations, 3 rotations)
// Rotations are taken about the mean value of each coordinate vector.
// On all other levels the nullspace is fetched from the factory registered for nspName_.
//
// Throws Exceptions::RuntimeError if modes have to be generated and numPDEs_ is not 1, 2 or 3.
void RigidBodyModeFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level &currentLevel) const {
  FactoryMonitor m(*this, "Rigid body mode factory", currentLevel);

  RCP<MultiVector> nullspace;

  if (currentLevel.GetLevelID() == 0) {
    if (currentLevel.IsAvailable(nspName_, NoFactory::get())) {
      nullspace = currentLevel.Get< RCP<MultiVector> >(nspName_, NoFactory::get());
      GetOStream(Runtime1, 0) << "Use user-given rigid body modes " << nspName_ << ": nullspace dimension=" << nullspace->getNumVectors() << " nullspace length=" << nullspace->getGlobalLength() << std::endl;

    } else {
      RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
      GetOStream(Runtime1, 0) << "Generating rigid body modes: dimension = " << numPDEs_ << std::endl;

      // Only 1, 2 or 3 PDEs per node are handled below. Fail with a clear message instead of
      // dereferencing a null nullspace pointer, which is what happened for any other value.
      TEUCHOS_TEST_FOR_EXCEPTION(numPDEs_ != 1 && numPDEs_ != 2 && numPDEs_ != 3, Exceptions::RuntimeError,
                                 "RigidBodyModeFactory::Build: unsupported number of PDEs (" << numPDEs_ << "); expected 1, 2 or 3.");

      RCP<const Map> xmap = A->getDomainMap();
      if (numPDEs_ == 1) {
        nullspace = MultiVectorFactory::Build(xmap, 1);
      } else if (numPDEs_ == 2) {
        nullspace = MultiVectorFactory::Build(xmap, 3);
      } else if (numPDEs_ == 3) {
        nullspace = MultiVectorFactory::Build(xmap, 6);
      }

      Scalar zero(0.0);
      nullspace -> putScalar(zero);

      RCP<MultiVector> Coords = Get< RCP<MultiVector> >(currentLevel,"Coordinates");

      ArrayRCP<Scalar> xnodes, ynodes, znodes;
      Scalar cx, cy, cz;
      ArrayRCP<Scalar> nsValues0, nsValues1, nsValues2, nsValues3, nsValues4, nsValues5;
      int nDOFs = xmap->getNodeNumElements();

      // NOTE(review): the loops below index the coordinate arrays with the DOF index j and write entries
      // j+1 (and j+2); this presumably assumes that nDOFs is a multiple of numPDEs_ and that the
      // coordinate vectors are at least nDOFs long - confirm against the callers.
      if (numPDEs_ == 1) {
        nsValues0 = nullspace->getDataNonConst(0);
        for (int j=0; j<nDOFs; j++) {
          // constant null space for scalar PDE
          nsValues0[j] = 1.0;
        }

      } else if (numPDEs_ == 2) {
        xnodes = Coords->getDataNonConst(0);
        ynodes = Coords->getDataNonConst(1);
        cx = Coords->getVector(0)->meanValue();
        cy = Coords->getVector(1)->meanValue();
        nsValues0 = nullspace->getDataNonConst(0);
        nsValues1 = nullspace->getDataNonConst(1);
        nsValues2 = nullspace->getDataNonConst(2);
        for (int j=0; j<nDOFs; j+=numPDEs_) {
          // translation
          nsValues0[j+0] = 1.0;
          nsValues1[j+1] = 1.0;
          // rotate around z-axis (x-y plane)
          nsValues2[j+0] = -(ynodes[j]-cy);
          nsValues2[j+1] =  (xnodes[j]-cx);
        }

      } else if (numPDEs_ == 3) {
        xnodes = Coords->getDataNonConst(0);
        ynodes = Coords->getDataNonConst(1);
        znodes = Coords->getDataNonConst(2);
        cx = Coords->getVector(0)->meanValue();
        cy = Coords->getVector(1)->meanValue();
        cz = Coords->getVector(2)->meanValue();
        nsValues0 = nullspace->getDataNonConst(0);
        nsValues1 = nullspace->getDataNonConst(1);
        nsValues2 = nullspace->getDataNonConst(2);
        nsValues3 = nullspace->getDataNonConst(3);
        nsValues4 = nullspace->getDataNonConst(4);
        nsValues5 = nullspace->getDataNonConst(5);
        for (int j=0; j<nDOFs; j+=numPDEs_) {
          // translation
          nsValues0[j+0] = 1.0;
          nsValues1[j+1] = 1.0;
          nsValues2[j+2] = 1.0;
          // rotate around z-axis (x-y plane)
          nsValues3[j+0] = -(ynodes[j]-cy);
          nsValues3[j+1] =  (xnodes[j]-cx);
          // rotate around x-axis (y-z plane)
          nsValues4[j+1] = -(znodes[j]-cz);
          nsValues4[j+2] =  (ynodes[j]-cy);
          // rotate around y-axis (x-z plane)
          nsValues5[j+0] =  (znodes[j]-cz);
          nsValues5[j+2] = -(xnodes[j]-cx);
        }
      }

    } // end if "Nullspace" not available

  } else {
    // Coarser levels: take the nullspace produced by the factory registered under nspName_
    nullspace = currentLevel.Get< RCP<MultiVector> >("Nullspace", GetFactory(nspName_).get());
  }

  Set(currentLevel, "Nullspace", nullspace);
}