void calculateStats(Type& minVal, Type& maxVal, double& avgVal, double& devVal,
                    const RCP<const Teuchos::Comm<int> >& comm, const Type& v) {
  int numProcs = comm->getSize();

  Type sumVal, sum2Val;
  MueLu_sumAll(comm, v,   sumVal);
  MueLu_sumAll(comm, v*v, sum2Val);
  MueLu_minAll(comm, v, minVal);
  MueLu_maxAll(comm, v, maxVal);

  avgVal = as<double>(sumVal) / numProcs;
  devVal = (numProcs != 1 ? sqrt((sum2Val - sumVal*avgVal)/(numProcs-1)) : 0);
}
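// ------------------------------------------------------------------------------------------------------
// Aside: the deviation above is the sample standard deviation recovered from the sum and the sum of
// squares, sqrt((sum(v^2) - sum(v)*avg) / (n-1)). Below is a minimal standalone sketch (plain C++, no
// Teuchos/MPI; the per-process values are made up for illustration) of the same arithmetic.
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  std::vector<double> v = {10, 12, 9, 14};   // hypothetical per-process values
  double sum = 0, sum2 = 0;
  for (double x : v) { sum += x; sum2 += x*x; }

  const int n = static_cast<int>(v.size());
  double avg = sum / n;
  double dev = (n != 1 ? std::sqrt((sum2 - sum*avg) / (n - 1)) : 0.0);

  std::cout << "avg = " << avg << ", dev = " << dev << std::endl;  // avg = 11.25, dev ~ 2.22
  return 0;
}
// ------------------------------------------------------------------------------------------------------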
std::string MHDRAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PrintLoadBalancingInfo(const Matrix & Ac, const std::string & msgTag) {
  std::stringstream ss(std::stringstream::out);

  // TODO: provide an option to skip this (to avoid global communication)
  // TODO: skip if nproc == 1

  // nonzero imbalance
  size_t numMyNnz = Ac.getNodeNumEntries();
  GO maxNnz, minNnz;
  RCP<const Teuchos::Comm<int> > comm = Ac.getRowMap()->getComm();
  MueLu_maxAll(comm, (GO)numMyNnz, maxNnz);
  // min nnz over all procs (disallow any processors with 0 nnz)
  MueLu_minAll(comm, (GO)((numMyNnz > 0) ? numMyNnz : maxNnz), minNnz);
  double imbalance = ((double) maxNnz) / minNnz;

  size_t numMyRows = Ac.getNodeNumRows();
  // Check whether Ac is spread over more than one process.
  GO numActiveProcesses = 0;
  MueLu_sumAll(comm, (GO)((numMyRows > 0) ? 1 : 0), numActiveProcesses);

  // min, max, and avg # rows per proc
  GO minNumRows, maxNumRows;
  double avgNumRows;
  MueLu_maxAll(comm, (GO)numMyRows, maxNumRows);
  MueLu_minAll(comm, (GO)((numMyRows > 0) ? numMyRows : maxNumRows), minNumRows);
  assert(numActiveProcesses > 0);
  avgNumRows = Ac.getGlobalNumRows() / numActiveProcesses;

  ss << msgTag << " # processes with rows = " << numActiveProcesses << std::endl;
  ss << msgTag << " min # rows per proc = " << minNumRows << ", max # rows per proc = " << maxNumRows << ", avg # rows per proc = " << avgNumRows << std::endl;
  ss << msgTag << " nonzero imbalance = " << imbalance << std::endl;

  return ss.str();
}
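// ------------------------------------------------------------------------------------------------------
// Aside: a standalone MPI sketch of the "nonzero imbalance" metric reported above, maxNnz / minNnz, where
// ranks holding zero entries contribute maxNnz instead of 0 so that they cannot drive the minimum (and
// hence the ratio) to a meaningless value. The per-rank counts here are made up for illustration.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  long long myNnz = 100 + 10*rank;            // hypothetical per-rank nonzero count
  long long maxNnz = 0, minNnz = 0;
  MPI_Allreduce(&myNnz, &maxNnz, 1, MPI_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);

  long long contrib = (myNnz > 0 ? myNnz : maxNnz);   // disallow ranks with 0 nnz
  MPI_Allreduce(&contrib, &minNnz, 1, MPI_LONG_LONG, MPI_MIN, MPI_COMM_WORLD);

  if (rank == 0)
    std::printf("nonzero imbalance = %g\n", (double)maxNnz / (double)minNnz);

  MPI_Finalize();
  return 0;
}
// ------------------------------------------------------------------------------------------------------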
void RAPFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::CheckRepairMainDiagonal(RCP<Matrix>& Ac) const {
  const Teuchos::ParameterList& pL = GetParameterList();
  bool repairZeroDiagonals = pL.get<bool>("RepairMainDiagonal");
  bool checkAc             = pL.get<bool>("CheckMainDiagonal");

  if (!checkAc && !repairZeroDiagonals)
    return;

  SC zero = Teuchos::ScalarTraits<SC>::zero(), one = Teuchos::ScalarTraits<SC>::one();

  Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::rcp(new Teuchos::ParameterList());
  p->set("DoOptimizeStorage", true);

  RCP<const Map> rowMap = Ac->getRowMap();
  RCP<Vector> diagVec = VectorFactory::Build(rowMap);
  Ac->getLocalDiagCopy(*diagVec);

  LO lZeroDiags = 0;
  Teuchos::ArrayRCP<Scalar> diagVal = diagVec->getDataNonConst(0);
  for (size_t i = 0; i < rowMap->getNodeNumElements(); i++) {
    if (diagVal[i] == zero) {
      lZeroDiags++;
    }
  }
  GO gZeroDiags;
  MueLu_sumAll(rowMap->getComm(), Teuchos::as<GO>(lZeroDiags), gZeroDiags);

  if (repairZeroDiagonals && gZeroDiags > 0) {
    // TAW: If Ac has empty rows, put a 1 on the diagonal of Ac. Be aware that Ac might have empty rows AND columns.
    // The columns might not exist in the column map at all.
    //
    // It would be nice to add the entries to the original matrix Ac. But then we would have to use
    // insertLocalValues. However, we cannot add new entries for local column indices that do not exist in the
    // column map of Ac (at least Epetra is not able to do this).
    //
    // Here we build a diagonal matrix that has ones on the diagonal for the rows where Ac has a zero diagonal
    // and no entries elsewhere. We have to build a new matrix to be able to use insertGlobalValues. Then we add
    // the original matrix Ac to our new diagonal matrix and use the result as the new (non-singular) matrix Ac.
    // This is very inefficient.
    //
    // If you know something better, please let me know.
    RCP<Matrix> fixDiagMatrix = Teuchos::null;
    fixDiagMatrix = MatrixFactory::Build(rowMap, 1);
    for (size_t r = 0; r < rowMap->getNodeNumElements(); r++) {
      if (diagVal[r] == zero) {
        GO grid = rowMap->getGlobalElement(r);
        Teuchos::ArrayRCP<GO> indout(1, grid);
        Teuchos::ArrayRCP<SC> valout(1, one);
        fixDiagMatrix->insertGlobalValues(grid, indout.view(0, 1), valout.view(0, 1));
      }
    }
    {
      Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete1"));
      Ac->fillComplete(p);
    }

    MueLu::Utils2<Scalar, LocalOrdinal, GlobalOrdinal, Node>::TwoMatrixAdd(*Ac, false, 1.0, *fixDiagMatrix, 1.0);

    if (Ac->IsView("stridedMaps"))
      fixDiagMatrix->CreateView("stridedMaps", Ac);

    Ac = Teuchos::null;   // free singular coarse level matrix
    Ac = fixDiagMatrix;   // set fixed non-singular coarse level matrix
  }

  // Call fillComplete with the optimized storage option set to true.
  // This is necessary for the new faster Epetra MM kernels.
  {
    Teuchos::TimeMonitor m1(*Teuchos::TimeMonitor::getNewTimer("CheckRepairMainDiagonal: fillComplete2"));
    Ac->fillComplete(p);
  }

  // print some output
  if (IsPrint(Warnings0))
    GetOStream(Warnings0) << "RAPFactory (WARNING): " << (repairZeroDiagonals ? "repaired " : "found ")
                          << gZeroDiags << " zeros on main diagonal of Ac." << std::endl;

#ifdef HAVE_MUELU_DEBUG // only for debugging
  // check whether Ac has been repaired...
  Ac->getLocalDiagCopy(*diagVec);
  Teuchos::ArrayRCP<Scalar> diagVal2 = diagVec->getDataNonConst(0);
  for (size_t r = 0; r < Ac->getRowMap()->getNodeNumElements(); r++) {
    if (diagVal2[r] == zero) {
      GetOStream(Errors, -1) << "Error: there are zeros left on diagonal after repair..." << std::endl;
      break;
    }
  }
#endif
}
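// ------------------------------------------------------------------------------------------------------
// Aside: a minimal standalone sketch (plain C++ with a toy map-based matrix, no Xpetra/Epetra) of the
// repair idea above: every row whose diagonal entry is missing or zero receives a 1 on the diagonal, so
// the resulting matrix is no longer structurally singular. The SparseMatrix type is purely illustrative.
#include <cstdio>
#include <map>

typedef std::map<int, std::map<int, double> > SparseMatrix;   // row -> (col -> value)

int repairZeroDiagonals(SparseMatrix& A, int numRows) {
  int repaired = 0;
  for (int r = 0; r < numRows; ++r) {
    double& d = A[r][r];           // creates a zero entry if the diagonal is missing
    if (d == 0.0) { d = 1.0; ++repaired; }
  }
  return repaired;
}

int main() {
  SparseMatrix A;
  A[0][0] =  2.0; A[0][1] = -1.0;
  A[1][0] = -1.0;                  // row 1 has no diagonal entry
  std::printf("repaired %d diagonal(s)\n", repairZeroDiagonals(A, 2));   // repaired 1 diagonal(s)
  return 0;
}
// ------------------------------------------------------------------------------------------------------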
void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);

  const Teuchos::ParameterList& pL = GetParameterList();
  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
  // TODO (JG): I don't really know if we want to do this.
  const int    startLevel          = pL.get<int>   ("repartition: start level");
  const LO     minRowsPerProcessor = pL.get<LO>    ("repartition: min rows per proc");
  const double nonzeroImbalance    = pL.get<double>("repartition: max imbalance");
  const bool   remapPartitions     = pL.get<bool>  ("repartition: remap parts");

  // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
  RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");

  // ======================================================================================================
  // Determine whether partitioning is needed
  // ======================================================================================================
  // NOTE: most tests include some global communication, which is why we currently only do tests until we make
  // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
  // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.

  // Test 1: skip repartitioning if the current level is less than the specified minimum level for repartitioning
  if (currentLevel.GetLevelID() < startLevel) {
    GetOStream(Statistics0) << "Repartitioning? NO:" <<
        "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) <<
        ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

    Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
    return;
  }

  RCP<const Map> rowMap = A->getRowMap();

  // NOTE: Teuchos::MpiComm::duplicate() calls MPI_Bcast inside, so this is
  // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it
  // does not matter.
  RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
  RCP<const Teuchos::Comm<int> > comm     = origComm->duplicate();

  // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
  // TODO: this global communication can be avoided if we store the information with the matrix (it is known when the matrix is created)
  // TODO: further improvements could be achieved when we use a subcommunicator for the active set. Then we only need to check its size
  {
    int numActiveProcesses = 0;
    MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses);

    if (numActiveProcesses == 1) {
      GetOStream(Statistics0) << "Repartitioning? NO:" <<
          "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }
  }

  bool test3 = false, test4 = false;
  std::string msg3, msg4;

  // Test 3: check whether the number of rows on any processor satisfies the minimum number of rows requirement
  // NOTE: Test 2 ensures that repartitioning is not done when there is only one processor (it may or may not satisfy Test 3)
  if (minRowsPerProcessor > 0) {
    LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
    LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0;
    MueLu_sumAll(comm, haveFewRows, numWithFewRows);
    MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

    // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
    // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
    // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
    if (numWithFewRows > 0)
      test3 = true;

    msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor);
  }

  // Test 4: check whether the imbalance in the number of nonzeros per processor is greater than the threshold
  if (!test3) {
    GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
    MueLu_maxAll(comm, numMyNnz, maxNnz);
    MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
    double imbalance = Teuchos::as<double>(maxNnz)/minNnz;

    if (imbalance > nonzeroImbalance)
      test4 = true;

    msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
  }

  if (!test3 && !test4) {
    GetOStream(Statistics0) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;

    Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
    return;
  }

  GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

  GO                    indexBase = rowMap->getIndexBase();
  Xpetra::UnderlyingLib lib       = rowMap->lib();
  int myRank   = comm->getRank();
  int numProcs = comm->getSize();

  RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
  TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
  RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

  // ======================================================================================================
  // Calculate number of partitions
  // ======================================================================================================
  // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
  // FIXME Should take into account nnz? Perhaps only when the user is using the min #nnz per row threshold.
  GO numPartitions;
  if (currentLevel.IsAvailable("number of partitions")) {
    numPartitions = currentLevel.Get<GO>("number of partitions");
    GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

  } else {
    if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) {
      // System is too small, migrate it to a single processor
      numPartitions = 1;

    } else {
      // Make sure that each processor has approximately minRowsPerProcessor
      numPartitions = A->getGlobalNumRows() / minRowsPerProcessor;
    }
    numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs));

    currentLevel.Set("number of partitions", numPartitions, NoFactory::get());
  }
  GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl;

  // ======================================================================================================
  // Construct decomposition vector
  // ======================================================================================================
  RCP<GOVector> decomposition;
  if (numPartitions == 1) {
    // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
    // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
    // in Zoltan[12]Interface).
    // TODO: We can probably skip more work in this case (like building all extra data structures)
    GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl;
    decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true);

  } else {
    decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

    if (decomposition.is_null()) {
      GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }
  }

  // ======================================================================================================
  // Remap if necessary
  // ======================================================================================================
  // From a user perspective, we want the user to not care about remapping, thinking of it as only a performance feature.
  // There are two problems, however.
  // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
  //     This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
  //     each partition is the same.
  // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
  //     Let us consider the following example. Let's assume that when we don't do remapping, processor 0 would have
  //     GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and
  //     processor 1 {0,1,2}. Now, when we run the repartitioning algorithm on the next level (say Zoltan1 RCB), it may
  //     depend on whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
  //     algorithm can resolve these differently. For instance, running
  //         mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
  //     with
  //         <ParameterList name="MueLu">
  //           <Parameter name="coarse: max size"               type="int"  value="1"/>
  //           <Parameter name="repartition: enable"            type="bool" value="true"/>
  //           <Parameter name="repartition: min rows per proc" type="int"  value="2"/>
  //           <ParameterList name="level 1">
  //             <Parameter name="repartition: remap parts"     type="bool" value="false/true"/>
  //           </ParameterList>
  //         </ParameterList>
  //     produces different repartitioning for level 2.
  //     This different repartitioning may then escalate into different aggregation for the next level.
  //
  // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
  // Fixing (2) is more complicated.
  // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
  // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
  // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
  // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
  // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
  // This rank permutation requires help from Epetra/Tpetra, neither of which has such an API in place.
  // One should also be concerned that if we had such an API in place, rank 0 in the subcommunicator may no longer be rank 0 in
  // MPI_COMM_WORLD, which may lead to issues for logging.
  if (remapPartitions) {
    SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);

    DeterminePartitionPlacement(*A, *decomposition, numPartitions);
  }

  // ======================================================================================================
  // Construct importer
  // ======================================================================================================
  // At this point, the following is true:
  //  * Each processor owns 0 or 1 partitions
  //  * If a processor owns a partition, that partition number is equal to the processor rank
  //  * The decomposition vector contains the partition ids that the corresponding GIDs belong to
  ArrayRCP<const GO> decompEntries;
  if (decomposition->getLocalLength() > 0)
    decompEntries = decomposition->getData(0);

#ifdef HAVE_MUELU_DEBUG
  // Test range of partition ids
  int incorrectRank = -1;
  for (int i = 0; i < decompEntries.size(); i++)
    if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
      incorrectRank = myRank;
      break;
    }

  int incorrectGlobalRank = -1;
  MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
  TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError,
                             "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number that is out of range");
#endif

  Array<GO> myGIDs;
  myGIDs.reserve(decomposition->getLocalLength());

  // Step 0: Construct mapping
  //    part number -> GIDs I own which belong to this part
  // NOTE: my own part GIDs are not part of the map
  typedef std::map<GO, Array<GO> > map_type;
  map_type sendMap;
  for (LO i = 0; i < decompEntries.size(); i++) {
    GO id  = decompEntries[i];
    GO GID = rowMap->getGlobalElement(i);

    if (id == myRank)
      myGIDs     .push_back(GID);
    else
      sendMap[id].push_back(GID);
  }
  decompEntries = Teuchos::null;

  if (IsPrint(Statistics2)) {
    GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
    MueLu_sumAll(comm, numLocalKept, numGlobalKept);
    GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows
                            << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
  }

  int numSend = sendMap.size(), numRecv;

  // Arrayify map keys
  Array<GO> myParts(numSend), myPart(1);
  int cnt = 0;
  myPart[0] = myRank;
  for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
    myParts[cnt++] = it->first;

  // Step 1: Find out how many processors send me data
  // partsIndexBase starts from zero, as the processor ids start from zero
  GO partsIndexBase = 0;
  RCP<Map>    partsIHave  = MapFactory   ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
  RCP<Map>    partsIOwn   = MapFactory   ::Build(lib, numProcs, myPart(), partsIndexBase, comm);
  RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);

  RCP<GOVector> partsISend    = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
  RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
  if (numSend) {
    ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
    for (int i = 0; i < numSend; i++)
      partsISendData[i] = 1;
  }
  (numPartsIRecv->getDataNonConst(0))[0] = 0;

  numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
  numRecv = (numPartsIRecv->getData(0))[0];

  // Step 2: Get my GIDs from everybody else
  MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
  int msgTag = 12345;  // TODO: use Comm::dup for all internal messaging

  // Post sends
  Array<MPI_Request> sendReqs(numSend);
  cnt = 0;
  for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
    MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);

  map_type recvMap;
  size_t totalGIDs = myGIDs.size();
  for (int i = 0; i < numRecv; i++) {
    MPI_Status status;
    MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);

    // Get rank and number of elements from status
    int fromRank = status.MPI_SOURCE, count;
    MPI_Get_count(&status, MpiType, &count);

    recvMap[fromRank].resize(count);
    MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);

    totalGIDs += count;
  }

  // Do waits on send requests
  if (numSend) {
    Array<MPI_Status> sendStatuses(numSend);
    MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
  }

  // Merge GIDs
  myGIDs.reserve(totalGIDs);
  for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
    int offset = myGIDs.size(), len = it->second.size();
    if (len) {
      myGIDs.resize(offset + len);
      memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
    }
  }
  // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that the original myGIDs and all received chunks
  // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
  std::sort(myGIDs.begin(), myGIDs.end());

  // Step 3: Construct importer
  RCP<Map>          newRowMap = MapFactory::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
  RCP<const Import> rowMapImporter;
  {
    SubFactoryMonitor m1(*this, "Import construction", currentLevel);
    rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
  }

  Set(currentLevel, "Importer", rowMapImporter);

  // ======================================================================================================
  // Print some data
  // ======================================================================================================
  if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
    // Print the grid of processors
    GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;

    char amActive = (myGIDs.size() ? 1 : 0);
    std::vector<char> areActive(numProcs, 0);
    MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);

    int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
    for (int proc = 0; proc < numProcs; proc += rowWidth) {
      for (int j = 0; j < rowWidth; j++)
        if (proc + j < numProcs)
          GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
        else
          GetOStream(Statistics2) << " ";

      GetOStream(Statistics2) << " " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
    }
  }
} // Build
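// ------------------------------------------------------------------------------------------------------
// Aside: a standalone MPI sketch of the probe/receive pattern used in Step 2 above. The receiver knows
// only how many messages to expect; each message length is discovered with MPI_Probe + MPI_Get_count
// before posting the matching MPI_Recv. The payloads and the choice of rank 0 as receiver are illustrative.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, numProcs;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
  const int msgTag = 12345;

  if (rank != 0) {
    std::vector<long long> gids(rank, rank);   // hypothetical payload whose length differs per rank
    MPI_Send(gids.data(), (int)gids.size(), MPI_LONG_LONG, 0, msgTag, MPI_COMM_WORLD);

  } else {
    for (int i = 0; i < numProcs - 1; i++) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, msgTag, MPI_COMM_WORLD, &status);

      int count;
      MPI_Get_count(&status, MPI_LONG_LONG, &count);

      std::vector<long long> buf(count);
      MPI_Recv(buf.data(), count, MPI_LONG_LONG, status.MPI_SOURCE, msgTag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::printf("received %d entries from rank %d\n", count, status.MPI_SOURCE);
    }
  }

  MPI_Finalize();
  return 0;
}
// ------------------------------------------------------------------------------------------------------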
void BrickAggregationFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);

  typedef Xpetra::MultiVector<double,LO,GO,NO> MultiVector_d;

  const ParameterList& pL = GetParameterList();

  RCP<MultiVector_d> coords = Get<RCP<MultiVector_d> >(currentLevel, "Coordinates");
  RCP<Matrix>        A      = Get< RCP<Matrix> >      (currentLevel, "A");
  RCP<const Map>     rowMap = A->getRowMap();
  RCP<const Map>     colMap = A->getColMap();

  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
  int numProcs = comm->getSize();
  int myRank   = comm->getRank();

  int numPoints = colMap->getNodeNumElements();

  bx_ = pL.get<int>("aggregation: brick x size");
  by_ = pL.get<int>("aggregation: brick y size");
  bz_ = pL.get<int>("aggregation: brick z size");

  if (numProcs > 1) {
    // TODO: deal with block size > 1 (see comments above)
    TEUCHOS_TEST_FOR_EXCEPTION(bx_ > 3 || by_ > 3 || bz_ > 3, Exceptions::RuntimeError, "Currently cannot deal with brick size > 3");
  }

  RCP<MultiVector_d> overlappedCoords = coords;
  RCP<const Import> importer = ImportFactory::Build(coords->getMap(), colMap);
  if (!importer.is_null()) {
    overlappedCoords = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(colMap, coords->getNumVectors());
    overlappedCoords->doImport(*coords, *importer, Xpetra::INSERT);
  }

  // Setup misc structures
  // Logically, we construct enough data to query topological information of a rectangular grid
  Setup(comm, overlappedCoords, colMap);

  GetOStream(Runtime0) << "Using brick size: " << bx_
                       << (nDim_ > 1 ? "x " + toString(by_) : "")
                       << (nDim_ > 2 ? "x " + toString(bz_) : "") << std::endl;

  // Construct aggregates
  RCP<Aggregates> aggregates = rcp(new Aggregates(colMap));
  aggregates->setObjectLabel("Brick");

  ArrayRCP<LO> vertex2AggId = aggregates->GetVertex2AggId()->getDataNonConst(0);
  ArrayRCP<LO> procWinner   = aggregates->GetProcWinner()  ->getDataNonConst(0);

  // In the first pass, we set a mapping from a vertex to an aggregate global id. We deal with a structured
  // rectangular mesh, therefore we know the structure of the aggregates. For each vertex we can tell exactly
  // which aggregate it belongs to.
  // If we determine that the aggregate does not belong to us (i.e. the root vertex does not belong to this
  // processor, or is outside and we lost "" arbitration), we record the global aggregate id in order to
  // fetch the local info from the processor owning the aggregate. This is required for aggregates, as it
  // uses the local aggregate ids of the owning processor.
  std::set<GO> myAggGIDs, remoteAggGIDs;
  for (LO LID = 0; LID < numPoints; LID++) {
    GO aggGID = getAggGID(LID);

    if ((revMap_.find(getRoot(LID)) != revMap_.end()) && rowMap->isNodeGlobalElement(colMap->getGlobalElement(revMap_[getRoot(LID)]))) {
      // Root of the brick aggregate containing GID (<- LID) belongs to us
      vertex2AggId[LID] = aggGID;
      myAggGIDs.insert(aggGID);

      if (isRoot(LID))
        aggregates->SetIsRoot(LID);

    } else {
      remoteAggGIDs.insert(aggGID);
    }
  }
  size_t numAggregates = myAggGIDs    .size();
  size_t numRemote     = remoteAggGIDs.size();
  aggregates->SetNumAggregates(numAggregates);

  std::map<GO,LO>  AggG2L;  // Map: Agg GID -> Agg LID (possibly on a different processor)
  std::map<GO,int> AggG2R;  // Map: Agg GID -> processor rank owning the aggregate

  Array<GO> myAggGIDsArray(numAggregates), remoteAggGIDsArray(numRemote);

  // Fill in the maps for aggregates that we own
  size_t ind = 0;
  for (typename std::set<GO>::const_iterator it = myAggGIDs.begin(); it != myAggGIDs.end(); it++) {
    AggG2L[*it] = ind;
    AggG2R[*it] = myRank;

    myAggGIDsArray[ind++] = *it;
  }

  // The map is a convenient way to fetch remote local indices from global indices.
  RCP<Map> aggMap = MapFactory::Build(rowMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
                                      myAggGIDsArray, 0, comm);

  ind = 0;
  for (typename std::set<GO>::const_iterator it = remoteAggGIDs.begin(); it != remoteAggGIDs.end(); it++)
    remoteAggGIDsArray[ind++] = *it;

  // Fetch the required aggregate local ids and ranks
  Array<int> remoteProcIDs(numRemote);
  Array<LO>  remoteLIDs   (numRemote);
  aggMap->getRemoteIndexList(remoteAggGIDsArray, remoteProcIDs, remoteLIDs);

  // Fill in the maps for aggregates that we don't own but which have some of our vertices
  for (size_t i = 0; i < numRemote; i++) {
    AggG2L[remoteAggGIDsArray[i]] = remoteLIDs   [i];
    AggG2R[remoteAggGIDsArray[i]] = remoteProcIDs[i];
  }

  // Remap aggregate GIDs to LIDs and set up owning processors
  for (LO LID = 0; LID < numPoints; LID++) {
    if (revMap_.find(getRoot(LID)) != revMap_.end() && rowMap->isNodeGlobalElement(colMap->getGlobalElement(revMap_[getRoot(LID)]))) {
      GO aggGID = vertex2AggId[LID];

      vertex2AggId[LID] = AggG2L[aggGID];
      procWinner  [LID] = AggG2R[aggGID];
    }
  }

  GO numGlobalRemote;
  MueLu_sumAll(comm, as<GO>(numRemote), numGlobalRemote);
  aggregates->AggregatesCrossProcessors(numGlobalRemote);

  Set(currentLevel, "Aggregates", aggregates);

  GetOStream(Statistics0) << aggregates->description() << std::endl;
}
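// ------------------------------------------------------------------------------------------------------
// Aside: a standalone sketch of the kind of structured "brick" numbering the comments above rely on: a
// vertex with integer grid coordinates (i,j,k) on an nx x ny mesh (per z-layer) is assigned the
// lexicographically ordered id of the bx x by x bz brick containing it. All names and sizes here are
// illustrative assumptions, not the factory's exact implementation (which derives grid indices from the
// coordinate multivector).
#include <cstdio>

long long brickAggId(int i, int j, int k, int nx, int ny, int bx, int by, int bz) {
  long long naggX = (nx + bx - 1) / bx;   // number of bricks in x
  long long naggY = (ny + by - 1) / by;   // number of bricks in y
  return (i / bx) + (j / by)*naggX + (long long)(k / bz)*naggX*naggY;
}

int main() {
  // 12 x 12 mesh, 3 x 3 x 1 bricks: vertex (4,7,0) falls into brick (1,2,0), i.e. aggregate id 9
  std::printf("agg id = %lld\n", brickAggId(4, 7, 0, 12, 12, 3, 3, 1));
  return 0;
}
// ------------------------------------------------------------------------------------------------------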
void UncoupledAggregationFactory_kokkos<LocalOrdinal, GlobalOrdinal, Node>::Build(Level &currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);

  ParameterList pL = GetParameterList();
  bDefinitionPhase_ = false;  // definition phase is finished, now all aggregation algorithm information is fixed

  if (pL.get<int>("aggregation: max agg size") == -1)
    pL.set("aggregation: max agg size", INT_MAX);

  // define aggregation algorithms
  RCP<const FactoryBase> graphFact = GetFactory("Graph");

  // TODO Can we keep different aggregation algorithms over more Build calls?
  algos_.clear();
  algos_.push_back(rcp(new PreserveDirichletAggregationAlgorithm_kokkos(graphFact)));
  if (pL.get<bool>("aggregation: allow user-specified singletons") == true)   algos_.push_back(rcp(new OnePtAggregationAlgorithm_kokkos  (graphFact)));
  if (pL.get<bool>("aggregation: enable phase 1" )                 == true)   algos_.push_back(rcp(new AggregationPhase1Algorithm_kokkos (graphFact)));
  if (pL.get<bool>("aggregation: enable phase 2a")                 == true)   algos_.push_back(rcp(new AggregationPhase2aAlgorithm_kokkos(graphFact)));
  if (pL.get<bool>("aggregation: enable phase 2b")                 == true)   algos_.push_back(rcp(new AggregationPhase2bAlgorithm_kokkos(graphFact)));
  if (pL.get<bool>("aggregation: enable phase 3" )                 == true)   algos_.push_back(rcp(new AggregationPhase3Algorithm_kokkos (graphFact)));

  std::string mapOnePtName = pL.get<std::string>("OnePt aggregate map name");
  RCP<Map> OnePtMap = Teuchos::null;
  if (mapOnePtName.length()) {
    std::string mapOnePtFactName = pL.get<std::string>("OnePt aggregate map factory");
    if (mapOnePtFactName == "" || mapOnePtFactName == "NoFactory") {
      OnePtMap = currentLevel.Get<RCP<Map> >(mapOnePtName, NoFactory::get());
    } else {
      RCP<const FactoryBase> mapOnePtFact = GetFactory(mapOnePtFactName);
      OnePtMap = currentLevel.Get<RCP<Map> >(mapOnePtName, mapOnePtFact.get());
    }
  }

  RCP<const LWGraph_kokkos> graph = Get< RCP<LWGraph_kokkos> >(currentLevel, "Graph");

  // Build
  RCP<Aggregates_kokkos> aggregates = rcp(new Aggregates_kokkos(*graph));
  aggregates->setObjectLabel("UC");

  const LO numRows = graph->GetNodeNumVertices();

  // construct aggStat information
  std::vector<unsigned> aggStat(numRows, READY);

  // TODO
  //ArrayRCP<const bool> dirichletBoundaryMap = graph->GetBoundaryNodeMap();
  ArrayRCP<const bool> dirichletBoundaryMap;
  if (dirichletBoundaryMap != Teuchos::null)
    for (LO i = 0; i < numRows; i++)
      if (dirichletBoundaryMap[i] == true)
        aggStat[i] = BOUNDARY;

  LO nDofsPerNode = Get<LO>(currentLevel, "DofsPerNode");
  GO indexBase = graph->GetDomainMap()->getIndexBase();
  if (OnePtMap != Teuchos::null) {
    for (LO i = 0; i < numRows; i++) {
      // reconstruct global row id (FIXME only works for contiguous maps)
      GO grid = (graph->GetDomainMap()->getGlobalElement(i)-indexBase) * nDofsPerNode + indexBase;

      for (LO kr = 0; kr < nDofsPerNode; kr++)
        if (OnePtMap->isNodeGlobalElement(grid + kr))
          aggStat[i] = ONEPT;
    }
  }

  const RCP<const Teuchos::Comm<int> > comm = graph->GetComm();
  GO numGlobalRows = 0;
  if (IsPrint(Statistics1))
    MueLu_sumAll(comm, as<GO>(numRows), numGlobalRows);

  LO numNonAggregatedNodes = numRows;
  GO numGlobalAggregatedPrev = 0, numGlobalAggsPrev = 0;
  for (size_t a = 0; a < algos_.size(); a++) {
    std::string phase = algos_[a]->description();
    SubFactoryMonitor sfm(*this, "Algo \"" + phase + "\"", currentLevel);

    int oldRank = algos_[a]->SetProcRankVerbose(this->GetProcRankVerbose());
    algos_[a]->BuildAggregates(pL, *graph, *aggregates, aggStat, numNonAggregatedNodes);
    algos_[a]->SetProcRankVerbose(oldRank);

    if (IsPrint(Statistics1)) {
      GO numLocalAggregated = numRows - numNonAggregatedNodes, numGlobalAggregated = 0;
      GO numLocalAggs       = aggregates->GetNumAggregates(),  numGlobalAggs       = 0;
      MueLu_sumAll(comm, numLocalAggregated, numGlobalAggregated);
      MueLu_sumAll(comm, numLocalAggs,       numGlobalAggs);

      double aggPercent = 100*as<double>(numGlobalAggregated)/as<double>(numGlobalRows);
      if (aggPercent > 99.99 && aggPercent < 100.00) {
        // Due to round off (for instance, for 140465733/140466897), we could
        // get a 100.00% display even though there are some remaining nodes. This
        // is bad from the user's point of view. It is much better to change
        // it to display 99.99%.
        aggPercent = 99.99;
      }
      GetOStream(Statistics1) << " aggregated : " << (numGlobalAggregated - numGlobalAggregatedPrev) << " (phase), " << std::fixed
                              << std::setprecision(2) << numGlobalAggregated << "/" << numGlobalRows << " [" << aggPercent << "%] (total)\n"
                              << " remaining : " << numGlobalRows - numGlobalAggregated << "\n"
                              << " aggregates : " << numGlobalAggs-numGlobalAggsPrev << " (phase), " << numGlobalAggs << " (total)" << std::endl;
      numGlobalAggregatedPrev = numGlobalAggregated;
      numGlobalAggsPrev       = numGlobalAggs;
    }
  }

  TEUCHOS_TEST_FOR_EXCEPTION(numNonAggregatedNodes, Exceptions::RuntimeError, "MueLu::UncoupledAggregationFactory::Build: Leftover nodes found! Error!");

  aggregates->AggregatesCrossProcessors(false);

  Set(currentLevel, "Aggregates", aggregates);

  GetOStream(Statistics1) << aggregates->description() << std::endl;
}
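// ------------------------------------------------------------------------------------------------------
// Aside: a standalone illustration of the rounding issue handled in the statistics output above. With two
// decimal places, 140465733/140466897 aggregated nodes would print as "100.00%" even though nodes remain,
// so the displayed percentage is capped at 99.99%.
#include <cstdio>

int main() {
  long long aggregated = 140465733, total = 140466897;
  double pct = 100.0 * (double)aggregated / (double)total;   // ~99.99917
  if (pct > 99.99 && pct < 100.00)
    pct = 99.99;                                             // avoid a misleading "100.00%"
  std::printf("%.2f%%\n", pct);                              // prints 99.99%
  return 0;
}
// ------------------------------------------------------------------------------------------------------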