int main(int argc, char* argv[]) {
  using namespace std;
  using namespace Teuchos;

  const int num_samples = 1;
  const int num_loops = 500000;
  const int size = 10;
  const int num_vectors = 3;

  TEUCHOS_TEST_FOR_EXCEPTION(num_loops * size != 5000000, std::logic_error,
                             "Work amount is not constant!");

  // Make all vectors in a contiguous block
  MyVector<double>* vector_array = new MyVector<double>[num_vectors * size];
  ArrayRCP< MyVector<double> > a = arcp< MyVector<double> >(vector_array, 0, size, false);
  ArrayRCP< MyVector<double> > b = arcp< MyVector<double> >(&vector_array[size], 0, size, false);
  ArrayRCP< MyVector<double> > c = arcp< MyVector<double> >(&vector_array[2*size], 0, size, false);

#ifdef HAVE_PHALANX_TVMET
  tvmet::Vector<double, 3>* tvmet_array = new tvmet::Vector<double, 3>[num_vectors * size];
  ArrayRCP< tvmet::Vector<double, 3> > d = arcp< tvmet::Vector<double, 3> >(tvmet_array, 0, size, false);
  ArrayRCP< tvmet::Vector<double, 3> > e = arcp< tvmet::Vector<double, 3> >(&tvmet_array[size], 0, size, false);
  ArrayRCP< tvmet::Vector<double, 3> > f = arcp< tvmet::Vector<double, 3> >(&tvmet_array[2*size], 0, size, false);
#endif

  // Each raw "vector" owns size * 3 doubles, so the regions must be offset by
  // 3*size (not size) to stay disjoint; otherwise a, b, and c alias each other
  // and the raw-loop timings are measured on corrupted data.
  double* raw_array = new double[num_vectors * size * 3];
  double* raw_a = raw_array;
  double* raw_b = &raw_array[3*size];
  double* raw_c = &raw_array[2*3*size];

  for (int i=0; i < a.size(); ++i) a[i] = 1.0;
  for (int i=0; i < b.size(); ++i) b[i] = 2.0;
  for (int i=0; i < c.size(); ++i) c[i] = 3.0;

#ifdef HAVE_PHALANX_TVMET
  for (int i=0; i < d.size(); ++i) d[i] = 1.0;
  for (int i=0; i < e.size(); ++i) e[i] = 2.0;
  for (int i=0; i < f.size(); ++i) f[i] = 3.0;
#endif

  for (int i=0; i < size; ++i) {
    int offset = i * 3;
    for (int j=0; j < 3; ++j) {
      raw_a[offset + j] = 1.0;
      raw_b[offset + j] = 2.0;
      raw_c[offset + j] = 3.0;
    }
  }

  RCP<Time> vector_time = TimeMonitor::getNewTimer("Vector Time");
  RCP<Time> update_time = TimeMonitor::getNewTimer("Update Time");
#ifdef HAVE_PHALANX_TVMET
  RCP<Time> tvmet_time = TimeMonitor::getNewTimer("TVMET Time");
#endif
  RCP<Time> raw_time = TimeMonitor::getNewTimer("Raw Time");
  RCP<Time> raw2_time = TimeMonitor::getNewTimer("Raw2 Time");

  for (int sample = 0; sample < num_samples; ++sample) {

    cout << "Vector" << endl;
    {
      TimeMonitor t(*vector_time);
      for (int i=0; i < num_loops; ++i)
        for (int j=0; j < c.size(); ++j)
          c[j] = a[j] * b[j];
    }

    cout << "Update" << endl;
    {
      TimeMonitor t(*update_time);
      for (int i=0; i < num_loops; ++i)
        for (int j=0; j < c.size(); ++j)
          c[j].update_multiply(a[j], b[j]);
    }

#ifdef HAVE_PHALANX_TVMET
    cout << "TVMET" << endl;
    {
      TimeMonitor t(*tvmet_time);
      for (int i=0; i < num_loops; ++i)
        for (int j=0; j < d.size(); ++j)
          f[j] = d[j] * e[j];
    }
#endif

    cout << "Raw" << endl;
    {
      TimeMonitor t(*raw_time);
      for (int i=0; i < num_loops; ++i) {
        for (int j=0; j < size; ++j) {
          int offset = j * 3;
          for (int k=0; k < 3; ++k)
            raw_c[offset + k] = raw_a[offset + k] * raw_b[offset + k];
        }
      }
    }

    cout << "Raw2" << endl;
    {
      TimeMonitor t(*raw2_time);
      const int raw_size = 3 * size; // 3 vector components
      for (int i=0; i < num_loops; ++i) {
        for (int j=0; j < raw_size; ++j) {
          raw_c[j] = raw_a[j] * raw_b[j];
        }
      }
    }

  } // end loop over samples

  TimeMonitor::summarize();

  // Normalize every timer by the raw-loop time, so f_raw == 1 by construction.
  double f_vector = vector_time->totalElapsedTime() / raw_time->totalElapsedTime();
  double f_update = update_time->totalElapsedTime() / raw_time->totalElapsedTime();
#ifdef HAVE_PHALANX_TVMET
  double f_tvmet = tvmet_time->totalElapsedTime() / raw_time->totalElapsedTime();
#endif
  double f_raw = raw_time->totalElapsedTime() / raw_time->totalElapsedTime();
  double f_raw2 = raw2_time->totalElapsedTime() / raw_time->totalElapsedTime();

  std::cout << "vector = " << f_vector << std::endl;
  std::cout << "update = " << f_update << std::endl;
#ifdef HAVE_PHALANX_TVMET
  std::cout << "tvmet = " << f_tvmet << std::endl;
#endif
  std::cout << "raw = " << f_raw << std::endl;
  std::cout << "raw2 = " << f_raw2 << std::endl;

  delete [] vector_array;
#ifdef HAVE_PHALANX_TVMET
  delete [] tvmet_array;
#endif
  delete [] raw_array;

  std::cout << "\nTest passed!\n" << std::endl;

  return 0;
}
void debug_assert_valid_ptr() const {
#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
  arcp_.access_private_node().assert_valid_ptr(*this);
#endif
}
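// The accessor above illustrates a common idiom: a validity check that compiles to
// nothing unless a bounds-check macro is defined. Below is a minimal standalone sketch
// of the same idiom using only the standard library; SpanView and MY_ARRAY_BOUNDSCHECK
// are hypothetical illustrations, not Teuchos code.
#include <cassert>
#include <cstddef>

#define MY_ARRAY_BOUNDSCHECK  // comment out to compile the checks away

template <typename T>
class SpanView {
  T* ptr_;
  std::size_t size_;
public:
  SpanView(T* p, std::size_t n) : ptr_(p), size_(n) {}
  void debug_assert_in_range(std::size_t i) const {
#ifdef MY_ARRAY_BOUNDSCHECK
    assert(ptr_ != 0 && i < size_);  // only built in checked configurations
#endif
  }
  T& operator[](std::size_t i) const {
    debug_assert_in_range(i);
    return ptr_[i];
  }
};

int main() {
  double data[3] = {1.0, 2.0, 3.0};
  SpanView<double> v(data, 3);
  return v[2] > 0 ? 0 : 1;  // v[3] would trip the assert in checked builds
}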
/*! \brief Return the object normed weight imbalance.
 *  \param imbalance on return is the object normed weight imbalance.
 *
 *  If there were no weights, this is the object count imbalance.
 *  If there was one weight, it is the imbalance with respect to that weight.
 */
void getNormedImbalance(scalar_t &imbalance) const {
  if (metrics_.size() > 1)
    imbalance = metrics_[1].getMaxImbalance();
  else
    imbalance = metrics_[0].getMaxImbalance();
}
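// A standalone sketch of the quantity such an accessor typically reports. It assumes
// (an assumption for illustration, not the Zoltan2 definition verbatim) that imbalance
// is the maximum part weight divided by the average part weight, so 1.0 means perfect
// balance.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

double maxImbalance(const std::vector<double>& partWeights) {
  double total = std::accumulate(partWeights.begin(), partWeights.end(), 0.0);
  double avg = total / partWeights.size();
  double mx = *std::max_element(partWeights.begin(), partWeights.end());
  return mx / avg;  // >= 1.0; equals 1.0 when all parts carry equal weight
}

int main() {
  std::vector<double> w(4, 10.0);  // per-part weights (made up)
  w[1] = 12.0; w[2] = 8.0;
  std::cout << "normed imbalance ~ " << maxImbalance(w) << "\n";  // prints 1.2
}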
void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
DeterminePartitionPlacement(const Matrix& A, GOVector& decomposition, GO numPartitions) const {
  RCP<const Map> rowMap = A.getRowMap();

  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm()->duplicate();
  int numProcs = comm->getSize();

  RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
  TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError,
                             "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
  RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

  const Teuchos::ParameterList& pL = GetParameterList();

  // maxLocal is a constant which determines the number of largest edges to exchange.
  // The idea is that we do not want to construct the full bipartite graph, but only a
  // subset of it, which requires less communication. By selecting the largest local
  // edges we hope to achieve similar results at a lower cost.
  const int maxLocal = pL.get<int>("repartition: remap num values");
  const int dataSize = 2*maxLocal;

  ArrayRCP<GO> decompEntries;
  if (decomposition.getLocalLength() > 0)
    decompEntries = decomposition.getDataNonConst(0);

  // Step 1: Sort local edges by weight
  // Each edge of the bipartite graph corresponds to a triplet (i, j, v) where
  //   i: processor id that has some piece of the part with part_id = j
  //   j: part id
  //   v: weight of the edge
  // We set edge weights to be the total number of nonzeros in rows on this processor
  // which correspond to this part_id. The idea is that when we redistribute the matrix,
  // this weight is a good approximation of the amount of data to move.
  // We use two maps: the original one, which maps the partition id of an edge to the
  // corresponding weight, and a reverse one, which is necessary to sort by weight.
  std::map<GO,GO> lEdges;
  for (LO i = 0; i < decompEntries.size(); i++)
    lEdges[decompEntries[i]] += A.getNumEntriesInLocalRow(i);

  // Reverse map, so that edges are sorted by weight.
  // This results in a multimap, as we may have edges with the same weight.
  std::multimap<GO,GO> revlEdges;
  for (typename std::map<GO,GO>::const_iterator it = lEdges.begin(); it != lEdges.end(); it++)
    revlEdges.insert(std::make_pair(it->second, it->first));

  // Both lData and gData are arrays of data which we communicate. The data is stored
  // in pairs, so that data[2*i+0] is the part index, and data[2*i+1] is the
  // corresponding edge weight. We do not store the processor id in the data, as we can
  // compute it from the offset in gData.
  Array<GO> lData(dataSize, -1), gData(numProcs * dataSize);
  int numEdges = 0;
  for (typename std::multimap<GO,GO>::reverse_iterator rit = revlEdges.rbegin();
       rit != revlEdges.rend() && numEdges < maxLocal; rit++) {
    lData[2*numEdges+0] = rit->second; // part id
    lData[2*numEdges+1] = rit->first;  // edge weight
    numEdges++;
  }

  // Step 2: Gather the heaviest local edges
  // Each processor contributes maxLocal edges by providing maxLocal pairs
  // <part id, weight>, which is of size dataSize.
  MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
  MPI_Allgather(static_cast<void*>(lData.getRawPtr()), dataSize, MpiType,
                static_cast<void*>(gData.getRawPtr()), dataSize, MpiType, *rawMpiComm);

  // Step 3: Construct mapping

  // Construct the set of triplets
  std::vector<Triplet<int,int> > gEdges(numProcs * maxLocal);
  size_t k = 0;
  for (LO i = 0; i < gData.size(); i += 2) {
    GO part   = gData[i+0];
    GO weight = gData[i+1];
    if (part != -1) {            // skip nonexistent edges
      gEdges[k].i = i/dataSize;  // determine the processor by its offset (since every processor sends the same amount)
      gEdges[k].j = part;
      gEdges[k].v = weight;
      k++;
    }
  }
  gEdges.resize(k);

  // Sort edges by weight
  // NOTE: compareTriplets is actually a reverse sort, so the edge weights are in decreasing order
  std::sort(gEdges.begin(), gEdges.end(), compareTriplets<int,int>);

  // Do matching
  std::map<int,int> match;
  std::vector<char> matchedRanks(numProcs, 0), matchedParts(numProcs, 0);
  int numMatched = 0;
  for (typename std::vector<Triplet<int,int> >::const_iterator it = gEdges.begin(); it != gEdges.end(); it++) {
    GO rank = it->i;
    GO part = it->j;
    if (matchedRanks[rank] == 0 && matchedParts[part] == 0) {
      matchedRanks[rank] = 1;
      matchedParts[part] = 1;
      match[part] = rank;
      numMatched++;
    }
  }
  GetOStream(Statistics0) << "Number of unassigned partitions before cleanup stage: "
                          << (numPartitions - numMatched) << " / " << numPartitions << std::endl;

  // Step 4: Assign unassigned partitions
  // We do that through a random matching for the remaining partitions. Not all part
  // numbers are valid, but valid parts are a subset of [0, numProcs). The reason it is
  // done this way is that we don't need any extra communication, as we don't need to
  // know which parts are valid.
  for (int part = 0, matcher = 0; part < numProcs; part++)
    if (match.count(part) == 0) {
      // Find the first non-matched rank
      while (matchedRanks[matcher])
        matcher++;
      match[part] = matcher++;
    }

  // Step 5: Permute entries in the decomposition vector
  for (LO i = 0; i < decompEntries.size(); i++)
    decompEntries[i] = match[decompEntries[i]];
}
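// A standalone, simplified sketch of the matching in Steps 3-4 above: sort
// (rank, part, weight) triplets by decreasing weight, greedily match each part to at
// most one rank, then hand any leftover parts to the first unmatched ranks. Plain
// std C++ only; the Edge type and function name are illustrative, not MueLu's.
#include <algorithm>
#include <cstddef>
#include <map>
#include <vector>

struct Edge { int rank; int part; long weight; };

static bool byWeightDesc(const Edge& a, const Edge& b) { return a.weight > b.weight; }

std::map<int,int> matchPartsToRanks(std::vector<Edge> edges, int numProcs) {
  std::sort(edges.begin(), edges.end(), byWeightDesc);
  std::map<int,int> match;  // part -> rank
  std::vector<char> rankUsed(numProcs, 0), partUsed(numProcs, 0);
  for (std::size_t k = 0; k < edges.size(); k++)
    if (!rankUsed[edges[k].rank] && !partUsed[edges[k].part]) {
      rankUsed[edges[k].rank] = partUsed[edges[k].part] = 1;
      match[edges[k].part] = edges[k].rank;
    }
  // Cleanup stage: any part without a match gets the first free rank.
  for (int part = 0, matcher = 0; part < numProcs; part++)
    if (match.count(part) == 0) {
      while (rankUsed[matcher]) matcher++;
      match[part] = matcher++;
    }
  return match;
}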
/*! \brief Return the graph metric values.
 *  \param values on return is the array of values.
 */
ArrayRCP<const GraphMetricValues<scalar_t> > getGraphMetrics() const {
  if (graphMetricsConst_.is_null())
    return graphMetrics_;
  return graphMetricsConst_;
}
void globalWeightedCutsMessagesHopsByPart( const RCP<const Environment> &env, const RCP<const Comm<int> > &comm, const RCP<const GraphModel<typename Adapter::base_adapter_t> > &graph, const ArrayView<const typename Adapter::part_t> &parts, typename Adapter::part_t &numParts, ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > > &metrics, ArrayRCP<typename Adapter::scalar_t> &globalSums, const RCP <const MachineRep> machine) { env->debug(DETAILED_STATUS, "Entering globalWeightedCutsMessagesHopsByPart"); ////////////////////////////////////////////////////////// // Initialize return values typedef typename Adapter::lno_t t_lno_t; typedef typename Adapter::gno_t t_gno_t; typedef typename Adapter::scalar_t t_scalar_t; typedef typename Adapter::part_t part_t; typedef typename Adapter::node_t t_node_t; typedef typename Zoltan2::GraphModel<typename Adapter::base_adapter_t>::input_t t_input_t; t_lno_t localNumVertices = graph->getLocalNumVertices(); t_gno_t globalNumVertices = graph->getGlobalNumVertices(); t_lno_t localNumEdges = graph->getLocalNumEdges(); ArrayView<const t_gno_t> Ids; ArrayView<t_input_t> v_wghts; graph->getVertexList(Ids, v_wghts); typedef GraphMetrics<t_scalar_t> mv_t; //get the edge ids, and weights ArrayView<const t_gno_t> edgeIds; ArrayView<const t_lno_t> offsets; ArrayView<t_input_t> e_wgts; graph->getEdgeList(edgeIds, offsets, e_wgts); std::vector <t_scalar_t> edge_weights; int numWeightPerEdge = graph->getNumWeightsPerEdge(); int numMetrics = 4; // "edge cuts", messages, hops, weighted hops if (numWeightPerEdge) numMetrics += numWeightPerEdge * 2; // "weight n", weighted hops per weight n // add some more metrics to the array typedef typename ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > >::size_type array_size_type; metrics.resize( metrics.size() + numMetrics ); for( array_size_type n = metrics.size() - numMetrics; n < metrics.size(); ++n ){ mv_t * newMetric = new mv_t; // allocate the new memory env->localMemoryAssertion(__FILE__,__LINE__,1,newMetric); // check errors metrics[n] = rcp( newMetric); // create the new members } array_size_type next = metrics.size() - numMetrics; // MDM - this is most likely temporary to preserve the format here - we are now filling a larger array so we may not have started at 0 std::vector <part_t> e_parts (localNumEdges); #ifdef HAVE_ZOLTAN2_MPI if (comm->getSize() > 1) { Zoltan_DD_Struct *dd = NULL; MPI_Comm mpicomm = Teuchos::getRawMpiComm(*comm); int size_gnot = Zoltan2::TPL_Traits<ZOLTAN_ID_PTR, t_gno_t>::NUM_ID; int debug_level = 0; Zoltan_DD_Create(&dd, mpicomm, size_gnot, 0, sizeof(part_t), localNumVertices, debug_level); ZOLTAN_ID_PTR ddnotneeded = NULL; // Local IDs not needed Zoltan_DD_Update( dd, (ZOLTAN_ID_PTR) Ids.getRawPtr(), ddnotneeded, (char *) &(parts[0]), NULL, int(localNumVertices)); Zoltan_DD_Find( dd, (ZOLTAN_ID_PTR) edgeIds.getRawPtr(), ddnotneeded, (char *)&(e_parts[0]), NULL, localNumEdges, NULL ); Zoltan_DD_Destroy(&dd); } else #endif { std::map<t_gno_t,t_lno_t> global_id_to_local_index; //else everything is local. //we need a globalid to local index conversion. //this does not exists till this point, so we need to create one. for (t_lno_t i = 0; i < localNumVertices; ++i){ //at the local index i, we have the global index Ids[i]. //so write i, to Ids[i] index of the vector. global_id_to_local_index[Ids[i]] = i; } for (t_lno_t i = 0; i < localNumEdges; ++i){ t_gno_t ei = edgeIds[i]; //ei is the global index of the neighbor one. 
part_t p = parts[global_id_to_local_index[ei]]; e_parts[i] = p; } } RCP<const Teuchos::Comm<int> > tcomm = comm; env->timerStart(MACRO_TIMERS, "Communication Graph Create"); { //get the vertices in each part in my part. std::vector <t_lno_t> part_begins(numParts, -1); std::vector <t_lno_t> part_nexts(localNumVertices, -1); //cluster vertices according to their parts. //create local part graph. for (t_lno_t i = 0; i < localNumVertices; ++i){ part_t ap = parts[i]; part_nexts[i] = part_begins[ap]; part_begins[ap] = i; } for (int weight_index = -1; weight_index < numWeightPerEdge ; ++weight_index){ //MD: these two should be part_t. //but we dont want to compile tpetra from the beginning. //This can be changed when directory is updated. typedef t_lno_t local_part_type; typedef t_gno_t global_part_type; typedef Tpetra::Map<local_part_type, global_part_type, t_node_t> map_t; Teuchos::RCP<const map_t> map = Teuchos::rcp (new map_t (numParts, 0, tcomm)); typedef Tpetra::CrsMatrix<t_scalar_t, local_part_type, global_part_type, t_node_t> tcrsMatrix_t; Teuchos::RCP<tcrsMatrix_t> tMatrix(new tcrsMatrix_t (map, 0)); std::vector <global_part_type> part_neighbors (numParts); std::vector <t_scalar_t> part_neighbor_weights(numParts, 0); std::vector <t_scalar_t> part_neighbor_weights_ordered(numParts); //coarsen for all vertices in my part in order with parts. for (global_part_type i = 0; i < (global_part_type) numParts; ++i){ part_t num_neighbor_parts = 0; t_lno_t v = part_begins[i]; //get part i, and first vertex in this part v. while (v != -1){ //now get the neightbors of v. for (t_lno_t j = offsets[v]; j < offsets[v+1]; ++j){ //get the part of the second vertex. part_t ep = e_parts[j]; t_scalar_t ew = 1; if (weight_index > -1){ ew = e_wgts[weight_index][j]; } //add it to my local part neighbors for part i. if (part_neighbor_weights[ep] < 0.00001){ part_neighbors[num_neighbor_parts++] = ep; } part_neighbor_weights[ep] += ew; } v = part_nexts[v]; } //now get the part list. for (t_lno_t j = 0; j < num_neighbor_parts; ++j){ part_t neighbor_part = part_neighbors[j]; part_neighbor_weights_ordered[j] = part_neighbor_weights[neighbor_part]; part_neighbor_weights[neighbor_part] = 0; } //insert it to tpetra crsmatrix. 
if (num_neighbor_parts > 0){ Teuchos::ArrayView<const global_part_type> destinations(&(part_neighbors[0]), num_neighbor_parts); Teuchos::ArrayView<const t_scalar_t> vals(&(part_neighbor_weights_ordered[0]), num_neighbor_parts); tMatrix->insertGlobalValues (i,destinations, vals); } } tMatrix->fillComplete (); local_part_type num_local_parts = map->getNodeNumElements(); Array<global_part_type> Indices; Array<t_scalar_t> Values; t_scalar_t max_edge_cut = 0; t_scalar_t total_edge_cut = 0; global_part_type max_message = 0; global_part_type total_message = 0; global_part_type total_hop_count = 0; t_scalar_t total_weighted_hop_count = 0; global_part_type max_hop_count = 0; t_scalar_t max_weighted_hop_count = 0; for (local_part_type i=0; i < num_local_parts; i++) { const global_part_type globalRow = map->getGlobalElement(i); size_t NumEntries = tMatrix->getNumEntriesInGlobalRow (globalRow); Indices.resize (NumEntries); Values.resize (NumEntries); tMatrix->getGlobalRowCopy (globalRow,Indices(),Values(),NumEntries); t_scalar_t part_edge_cut = 0; global_part_type part_messages = 0; for (size_t j=0; j < NumEntries; j++){ if (Indices[j] != globalRow){ part_edge_cut += Values[j]; part_messages += 1; typename MachineRep::machine_pcoord_t hop_count = 0; machine->getHopCount(globalRow, Indices[j], hop_count); global_part_type hop_counts = hop_count; t_scalar_t weighted_hop_counts = hop_count * Values[j]; total_hop_count += hop_counts; total_weighted_hop_count += weighted_hop_counts; if (hop_counts > max_hop_count ){ max_hop_count = hop_counts; } if (weighted_hop_counts > max_weighted_hop_count ){ max_weighted_hop_count = weighted_hop_counts; } } } if (part_edge_cut > max_edge_cut){ max_edge_cut = part_edge_cut; } total_edge_cut += part_edge_cut; if (part_messages > max_message){ max_message = part_messages; } total_message += part_messages; } t_scalar_t g_max_edge_cut = 0; t_scalar_t g_total_edge_cut = 0; global_part_type g_max_message = 0; global_part_type g_total_message = 0; global_part_type g_total_hop_count = 0; t_scalar_t g_total_weighted_hop_count = 0; global_part_type g_max_hop_count = 0; t_scalar_t g_max_weighted_hop_count = 0; try{ Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_MAX,1,&max_edge_cut,&g_max_edge_cut); Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_MAX,1,&max_message,&g_max_message); Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_MAX,1,&max_hop_count,&g_max_hop_count); Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_MAX,1,&max_weighted_hop_count,&g_max_weighted_hop_count); Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_SUM,1,&total_edge_cut,&g_total_edge_cut); Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_SUM,1,&total_message,&g_total_message); Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_SUM,1,&total_hop_count,&g_total_hop_count); Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_SUM,1,&total_weighted_hop_count,&g_total_weighted_hop_count); } Z2_THROW_OUTSIDE_ERROR(*env); if (weight_index == -1){ metrics[next]->setName("md edge cuts"); } else { std::ostringstream oss; oss << "md weight " << weight_index; metrics[next]->setName( oss.str()); } metrics[next]->setMetricValue("global maximum", g_max_edge_cut); metrics[next]->setMetricValue("global sum", g_total_edge_cut); next++; if (weight_index == -1){ metrics[next]->setName("message"); metrics[next]->setMetricValue("global maximum", g_max_message); metrics[next]->setMetricValue("global sum", g_total_message); next++; } if 
(weight_index == -1){ metrics[next]->setName("hops"); metrics[next]->setMetricValue("global maximum", g_max_hop_count); metrics[next]->setMetricValue("global sum", g_total_hop_count); next++; } std::ostringstream oss; oss << "weighted hops" << weight_index; metrics[next]->setName( oss.str()); metrics[next]->setMetricValue("global maximum", g_max_weighted_hop_count); metrics[next]->setMetricValue("global sum", g_total_weighted_hop_count); next++; } } env->timerStop(MACRO_TIMERS, "Communication Graph Create"); env->debug(DETAILED_STATUS, "Exiting globalWeightedCutsMessagesHopsByPart"); }
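// A standalone sketch of the core metric computed by the routine above: given a CSR
// graph and a part assignment, sum the weights of edges whose endpoints land in
// different parts (edge cut) and count distinct neighboring-part pairs (messages).
// Hops are omitted here because they require a machine coordinate model. Plain std
// C++, serial only; the data below are made up.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

int main() {
  // 4 vertices, CSR adjacency for edges 0-1, 0-2, 1-3, 2-3 (each edge stored twice)
  std::vector<int>    offsets(5);  offsets[0]=0; offsets[1]=2; offsets[2]=4; offsets[3]=6; offsets[4]=8;
  int adjArr[8] = {1, 2, 0, 3, 0, 3, 1, 2};
  std::vector<int>    adj(adjArr, adjArr + 8);
  std::vector<double> w(8, 1.0);                  // unit edge weights
  std::vector<int>    part(4, 0);  part[2] = 1;  part[3] = 1;

  double edgeCut = 0;
  std::set<std::pair<int,int> > msgs;             // (from part, to part) pairs
  for (int v = 0; v < 4; ++v)
    for (int e = offsets[v]; e < offsets[v+1]; ++e)
      if (part[adj[e]] != part[v]) {
        edgeCut += w[e];
        msgs.insert(std::make_pair(part[v], part[adj[e]]));
      }
  // Each cut edge is seen from both endpoints, so halve the sum.
  std::cout << "edge cut = " << edgeCut / 2 << ", messages = " << msgs.size() << "\n";
}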
int main(int argc, char *argv[]) { Teuchos::GlobalMPISession session(&argc, &argv); RCP<const Comm<int> > comm = Teuchos::DefaultComm<int>::getComm(); int rank = comm->getRank(); Teuchos::RCP<Teuchos::FancyOStream> outStream = Teuchos::VerboseObjectBase::getDefaultOStream(); Teuchos::EVerbosityLevel v=Teuchos::VERB_EXTREME; typedef Tpetra::CrsMatrix<zscalar_t,zlno_t,zgno_t,znode_t> tmatrix_t; typedef Tpetra::CrsGraph<zlno_t,zgno_t,znode_t> tgraph_t; typedef Tpetra::Vector<zscalar_t,zlno_t,zgno_t,znode_t> tvector_t; typedef Tpetra::MultiVector<zscalar_t,zlno_t,zgno_t,znode_t> tmvector_t; typedef Xpetra::CrsMatrix<zscalar_t,zlno_t,zgno_t,znode_t> xmatrix_t; typedef Xpetra::CrsGraph<zlno_t,zgno_t,znode_t> xgraph_t; typedef Xpetra::Vector<zscalar_t,zlno_t,zgno_t,znode_t> xvector_t; typedef Xpetra::MultiVector<zscalar_t,zlno_t,zgno_t,znode_t> xmvector_t; typedef Xpetra::TpetraMap<zlno_t,zgno_t,znode_t> xtmap_t; // Create object that can give us test Tpetra and Xpetra input. RCP<UserInputForTests> uinput; try{ uinput = rcp(new UserInputForTests(testDataFilePath,std::string("simple"), comm, true)); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("input ")+e.what(), 1); } ///////////////////////////////////////////////////////////////// // Tpetra::CrsMatrix // Tpetra::CrsGraph // Tpetra::Vector // Tpetra::MultiVector ///////////////////////////////////////////////////////////////// // XpetraTraits<Tpetra::CrsMatrix<zscalar_t, zlno_t, zgno_t, znode_t> > { RCP<tmatrix_t> M; try{ M = uinput->getUITpetraCrsMatrix(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getTpetraCrsMatrix ")+e.what(), 1); } if (rank== 0) std::cout << "Original Tpetra matrix " << M->getGlobalNumRows() << " x " << M->getGlobalNumCols() << std::endl; M->describe(*outStream,v); RCP<const xtmap_t> xmap(new xtmap_t(M->getRowMap())); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const tmatrix_t> newM; try{ newM = Zoltan2::XpetraTraits<tmatrix_t>::doMigration(*M, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<tmatrix_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Tpetra matrix" << std::endl; newM->describe(*outStream,v); } // XpetraTraits<Tpetra::CrsGraph<zscalar_t, zlno_t, zgno_t, znode_t> > { RCP<tgraph_t> G; try{ G = uinput->getUITpetraCrsGraph(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getTpetraCrsGraph ")+e.what(), 1); } if (rank== 0) std::cout << "Original Tpetra graph" << std::endl; G->describe(*outStream,v); RCP<const xtmap_t> xmap(new xtmap_t(G->getRowMap())); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const tgraph_t> newG; try{ newG = Zoltan2::XpetraTraits<tgraph_t>::doMigration(*G, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<tgraph_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Tpetra graph" << std::endl; newG->describe(*outStream,v); } // XpetraTraits<Tpetra::Vector<zscalar_t, zlno_t, zgno_t, znode_t>> { RCP<tvector_t> V; try{ V = rcp(new tvector_t(uinput->getUITpetraCrsGraph()->getRowMap(), 1)); V->randomize(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getTpetraVector")+e.what(), 1); } if (rank== 0) std::cout << "Original Tpetra vector" << std::endl; V->describe(*outStream,v); RCP<const xtmap_t> xmap(new xtmap_t(V->getMap())); 
ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const tvector_t> newV; try{ newV = Zoltan2::XpetraTraits<tvector_t>::doMigration(*V, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<tvector_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Tpetra vector" << std::endl; newV->describe(*outStream,v); } // XpetraTraits<Tpetra::MultiVector<zscalar_t, zlno_t, zgno_t, znode_t>> { RCP<tmvector_t> MV; try{ MV = rcp(new tmvector_t(uinput->getUITpetraCrsGraph()->getRowMap(), 3)); MV->randomize(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getTpetraMultiVector")+e.what(), 1); } if (rank== 0) std::cout << "Original Tpetra multivector" << std::endl; MV->describe(*outStream,v); RCP<const xtmap_t> xmap(new xtmap_t(MV->getMap())); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const tmvector_t> newMV; try{ newMV = Zoltan2::XpetraTraits<tmvector_t>::doMigration(*MV, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<tmvector_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Tpetra multivector" << std::endl; newMV->describe(*outStream,v); } ///////////////////////////////////////////////////////////////// // Xpetra::CrsMatrix // Xpetra::CrsGraph // Xpetra::Vector // Xpetra::MultiVector ///////////////////////////////////////////////////////////////// // XpetraTraits<Xpetra::CrsMatrix<zscalar_t, zlno_t, zgno_t, znode_t> > { RCP<xmatrix_t> M; try{ M = uinput->getUIXpetraCrsMatrix(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getXpetraCrsMatrix ")+e.what(), 1); } if (rank== 0) std::cout << "Original Xpetra matrix" << std::endl; M->describe(*outStream,v); ArrayRCP<zgno_t> newRowIds = roundRobinMap(M->getRowMap()); zgno_t localNumRows = newRowIds.size(); RCP<const xmatrix_t> newM; try{ newM = Zoltan2::XpetraTraits<xmatrix_t>::doMigration(*M, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<xmatrix_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Xpetra matrix" << std::endl; newM->describe(*outStream,v); } // XpetraTraits<Xpetra::CrsGraph<zscalar_t, zlno_t, zgno_t, znode_t> > { RCP<xgraph_t> G; try{ G = uinput->getUIXpetraCrsGraph(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getXpetraCrsGraph ")+e.what(), 1); } if (rank== 0) std::cout << "Original Xpetra graph" << std::endl; G->describe(*outStream,v); ArrayRCP<zgno_t> newRowIds = roundRobinMap(G->getRowMap()); zgno_t localNumRows = newRowIds.size(); RCP<const xgraph_t> newG; try{ newG = Zoltan2::XpetraTraits<xgraph_t>::doMigration(*G, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<xgraph_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Xpetra graph" << std::endl; newG->describe(*outStream,v); } // XpetraTraits<Xpetra::Vector<zscalar_t, zlno_t, zgno_t, znode_t>> { RCP<xvector_t> V; try{ RCP<tvector_t> tV = rcp(new tvector_t(uinput->getUITpetraCrsGraph()->getRowMap(), 1)); tV->randomize(); V = Zoltan2::XpetraTraits<tvector_t>::convertToXpetra(tV); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getXpetraVector")+e.what(), 1); } if (rank== 0) std::cout << "Original Xpetra vector" << std::endl; 
V->describe(*outStream,v); ArrayRCP<zgno_t> newRowIds = roundRobinMap(V->getMap()); zgno_t localNumRows = newRowIds.size(); RCP<const xvector_t> newV; try{ newV = Zoltan2::XpetraTraits<xvector_t>::doMigration(*V, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<xvector_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Xpetra vector" << std::endl; newV->describe(*outStream,v); } // XpetraTraits<Xpetra::MultiVector<zscalar_t, zlno_t, zgno_t, znode_t>> { RCP<xmvector_t> MV; try{ RCP<tmvector_t> tMV = rcp(new tmvector_t(uinput->getUITpetraCrsGraph()->getRowMap(), 3)); tMV->randomize(); MV = Zoltan2::XpetraTraits<tmvector_t>::convertToXpetra(tMV); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getXpetraMultiVector")+e.what(), 1); } if (rank== 0) std::cout << "Original Xpetra multivector" << std::endl; MV->describe(*outStream,v); ArrayRCP<zgno_t> newRowIds = roundRobinMap(MV->getMap()); zgno_t localNumRows = newRowIds.size(); RCP<const xmvector_t> newMV; try{ newMV = Zoltan2::XpetraTraits<xmvector_t>::doMigration(*MV, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<xmvector_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Xpetra multivector" << std::endl; newMV->describe(*outStream,v); } #ifdef HAVE_EPETRA_DATA_TYPES ///////////////////////////////////////////////////////////////// // Epetra_CrsMatrix // Epetra_CrsGraph // Epetra_Vector // Epetra_MultiVector ///////////////////////////////////////////////////////////////// typedef Epetra_CrsMatrix ematrix_t; typedef Epetra_CrsGraph egraph_t; typedef Epetra_Vector evector_t; typedef Epetra_MultiVector emvector_t; typedef Xpetra::EpetraMap xemap_t; typedef Epetra_BlockMap emap_t; // Create object that can give us test Epetra input. 
RCP<UserInputForTests> euinput; try{ euinput = rcp(new UserInputForTests(testDataFilePath,std::string("simple"), comm, true)); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("epetra input ")+e.what(), 1); } // XpetraTraits<Epetra_CrsMatrix> { RCP<ematrix_t> M; try{ M = euinput->getUIEpetraCrsMatrix(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getEpetraCrsMatrix ")+e.what(), 1); } if (rank== 0) std::cout << "Original Epetra matrix" << std::endl; M->Print(std::cout); RCP<const emap_t> emap = Teuchos::rcpFromRef(M->RowMap()); RCP<const xemap_t> xmap(new xemap_t(emap)); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const ematrix_t> newM; try{ newM = Zoltan2::XpetraTraits<ematrix_t>::doMigration(*M, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<ematrix_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Epetra matrix" << std::endl; newM->Print(std::cout); } // XpetraTraits<Epetra_CrsGraph> { RCP<egraph_t> G; try{ G = euinput->getUIEpetraCrsGraph(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getEpetraCrsGraph ")+e.what(), 1); } if (rank== 0) std::cout << "Original Epetra graph" << std::endl; G->Print(std::cout); RCP<const emap_t> emap = Teuchos::rcpFromRef(G->RowMap()); RCP<const xemap_t> xmap(new xemap_t(emap)); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const egraph_t> newG; try{ newG = Zoltan2::XpetraTraits<egraph_t>::doMigration(*G, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<egraph_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Epetra graph" << std::endl; newG->Print(std::cout); } // XpetraTraits<Epetra_Vector> { RCP<evector_t> V; try{ V = rcp(new Epetra_Vector(euinput->getUIEpetraCrsGraph()->RowMap())); V->Random(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getEpetraVector")+e.what(), 1); } if (rank== 0) std::cout << "Original Epetra vector" << std::endl; V->Print(std::cout); RCP<const emap_t> emap = Teuchos::rcpFromRef(V->Map()); RCP<const xemap_t> xmap(new xemap_t(emap)); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const evector_t> newV; try{ newV = Zoltan2::XpetraTraits<evector_t>::doMigration(*V, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<evector_t>::doMigration ")+e.what(), 1); } if (rank== 0) std::cout << "Migrated Epetra vector" << std::endl; newV->Print(std::cout); } // XpetraTraits<Epetra_MultiVector> { RCP<emvector_t> MV; try{ MV = rcp(new Epetra_MultiVector(euinput->getUIEpetraCrsGraph()->RowMap(),3)); MV->Random(); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string("getEpetraMultiVector")+e.what(), 1); } if (rank== 0) std::cout << "Original Epetra multivector" << std::endl; MV->Print(std::cout); RCP<const emap_t> emap = Teuchos::rcpFromRef(MV->Map()); RCP<const xemap_t> xmap(new xemap_t(emap)); ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap); zgno_t localNumRows = newRowIds.size(); RCP<const emvector_t> newMV; try{ newMV = Zoltan2::XpetraTraits<emvector_t>::doMigration(*MV, localNumRows, newRowIds.getRawPtr()); } catch(std::exception &e){ TEST_FAIL_AND_EXIT(*comm, 0, string(" Zoltan2::XpetraTraits<emvector_t>::doMigration ")+e.what(), 1); } if 
(rank== 0) std::cout << "Migrated Epetra multivector" << std::endl; newMV->Print(std::cout); } #endif // have epetra data types (int, int, double) ///////////////////////////////////////////////////////////////// // DONE ///////////////////////////////////////////////////////////////// if (rank==0) std::cout << "PASS" << std::endl; }
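// The tests above repeatedly call roundRobinMap() to produce a new row-GID list for
// migration. Below is a plausible standalone sketch of such a rule (an assumption for
// illustration, not the test's actual helper): global id g is owned by rank
// (g % numProcs), so each rank returns the GIDs it would own under that rule.
#include <vector>

std::vector<long> roundRobinRows(long globalNumRows, int numProcs, int myRank) {
  std::vector<long> myRows;
  for (long g = myRank; g < globalNumRows; g += numProcs)
    myRows.push_back(g);  // rank r owns GIDs r, r+numProcs, r+2*numProcs, ...
  return myRows;
}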
void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level & fineLevel, Level &coarseLevel) const { FactoryMonitor m(*this, "Build", coarseLevel); GetOStream(Runtime0, 0) << "Transferring coordinates" << std::endl; const ParameterList & pL = GetParameterList(); int writeStart = pL.get< int >("write start"); int writeEnd = pL.get< int >("write end"); RCP<Aggregates> aggregates = Get< RCP<Aggregates> > (fineLevel, "Aggregates"); RCP<MultiVector> fineCoords = Get< RCP<MultiVector> >(fineLevel, "Coordinates"); RCP<const Map> coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap"); // coarseMap is being used to set up the domain map of tentative P, and therefore, the row map of Ac // Therefore, if we amalgamate coarseMap, logical nodes in the coordinates vector would correspond to // logical blocks in the matrix ArrayView<const GO> elementAList = coarseMap->getNodeElementList(); LO blkSize = 1; if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null) blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize(); GO indexBase = coarseMap->getIndexBase(); size_t numElements = elementAList.size() / blkSize; Array<GO> elementList(numElements); // Amalgamate the map for (LO i = 0; i < Teuchos::as<LO>(numElements); i++) elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase; RCP<const Map> coarseCoordMap = MapFactory ::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), elementList, indexBase, coarseMap->getComm()); RCP<MultiVector> coarseCoords = MultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors()); // Maps RCP<const Map> uniqueMap = fineCoords->getMap(); RCP<const Map> nonUniqueMap = aggregates->GetMap(); // Create overlapped fine coordinates to reduce global communication RCP<const Import> importer = ImportFactory ::Build(uniqueMap, nonUniqueMap); RCP<MultiVector> ghostedCoords = MultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors()); ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT); // Get some info about aggregates int myPID = uniqueMap->getComm()->getRank(); LO numAggs = aggregates->GetNumAggregates(); ArrayRCP<LO> aggSizes = aggregates->ComputeAggregateSizes(); const ArrayRCP<const LO> vertex2AggID = aggregates->GetVertex2AggId()->getData(0); const ArrayRCP<const LO> procWinner = aggregates->GetProcWinner()->getData(0); // Fill in coarse coordinates for (size_t j = 0; j < fineCoords->getNumVectors(); j++) { ArrayRCP<const Scalar> fineCoordsData = ghostedCoords->getData(j); ArrayRCP<Scalar> coarseCoordsData = coarseCoords->getDataNonConst(j); for (LO lnode = 0; lnode < vertex2AggID.size(); lnode++) if (procWinner[lnode] == myPID) coarseCoordsData[vertex2AggID[lnode]] += fineCoordsData[lnode]; for (LO agg = 0; agg < numAggs; agg++) coarseCoordsData[agg] /= aggSizes[agg]; } Set<RCP<MultiVector> >(coarseLevel, "Coordinates", coarseCoords); if (writeStart == 0 && fineLevel.GetLevelID() == 0 && writeStart <= writeEnd) { std::ostringstream buf; buf << fineLevel.GetLevelID(); std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m"; Utils::Write(fileName,*fineCoords); } if (writeStart <= coarseLevel.GetLevelID() && coarseLevel.GetLevelID() <= writeEnd) { std::ostringstream buf; buf << coarseLevel.GetLevelID(); std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m"; Utils::Write(fileName,*coarseCoords); } } // Build
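// A standalone sketch of the coarse-coordinate fill in Build() above: each coarse
// point is the arithmetic mean of the fine coordinates of the nodes in its aggregate.
// Plain std C++, serial, one coordinate dimension; the factory does this per vector,
// with ghosting and a procWinner ownership test that are omitted here.
#include <cstddef>
#include <vector>

std::vector<double> coarsenCoords(const std::vector<double>& fineCoords,
                                  const std::vector<int>& vertex2AggId,
                                  int numAggs) {
  std::vector<double> coarse(numAggs, 0.0);
  std::vector<int> aggSize(numAggs, 0);
  for (std::size_t i = 0; i < fineCoords.size(); ++i) {
    coarse[vertex2AggId[i]] += fineCoords[i];  // accumulate into owning aggregate
    aggSize[vertex2AggId[i]] += 1;
  }
  for (int agg = 0; agg < numAggs; ++agg)
    coarse[agg] /= aggSize[agg];               // average; assumes no empty aggregates
  return coarse;
}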
void Ifpack2Smoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::SetupSchwarz(Level& currentLevel) {
  if (this->IsSetup() == true)
    this->GetOStream(Warnings0) << "MueLu::Ifpack2Smoother::Setup(): Setup() has already been called" << std::endl;

  // If we are doing "user" partitioning, we assume that what the user really wants to
  // do is make tiny little subdomains with one row assigned to each subdomain. The
  // rows used for these little subdomains correspond to those in the 2nd block row.
  // Then, if we overlap these mini-subdomains, we will do something that looks like
  // Vanka (grabbing all velocities associated with each pressure unknown). In
  // addition, we put all Dirichlet points into a little mini-domain.
  ParameterList& paramList = const_cast<ParameterList&>(this->GetParameterList());

  bool isBlockedMatrix = false;
  RCP<Matrix> merged2Mat;

  std::string sublistName = "subdomain solver parameters";
  if (paramList.isSublist(sublistName)) {
    ParameterList& subList = paramList.sublist(sublistName);

    std::string partName = "partitioner: type";
    if (subList.isParameter(partName) && subList.get<std::string>(partName) == "user") {
      isBlockedMatrix = true;

      RCP<BlockedCrsMatrix> bA = rcp_dynamic_cast<BlockedCrsMatrix>(A_);
      TEUCHOS_TEST_FOR_EXCEPTION(bA.is_null(), Exceptions::BadCast,
                                 "Matrix A must be of type BlockedCrsMatrix.");

      size_t numVels = bA->getMatrix(0,0)->getNodeNumRows();
      size_t numPres = bA->getMatrix(1,0)->getNodeNumRows();
      size_t numRows = A_->getNodeNumRows();

      ArrayRCP<LocalOrdinal> blockSeeds(numRows, Teuchos::OrdinalTraits<LocalOrdinal>::invalid());

      size_t numBlocks = 0;
      for (size_t rowOfB = numVels; rowOfB < numVels+numPres; ++rowOfB)
        blockSeeds[rowOfB] = numBlocks++;

      RCP<BlockedCrsMatrix> bA2 = rcp_dynamic_cast<BlockedCrsMatrix>(A_);
      TEUCHOS_TEST_FOR_EXCEPTION(bA2.is_null(), Exceptions::BadCast,
                                 "Matrix A must be of type BlockedCrsMatrix.");

      RCP<CrsMatrix> mergedMat = bA2->Merge();
      merged2Mat = rcp(new CrsMatrixWrap(mergedMat));

      // Add Dirichlet rows to the list of seeds
      ArrayRCP<const bool> boundaryNodes;
      boundaryNodes = Utilities::DetectDirichletRows(*merged2Mat, 0.0);
      bool haveBoundary = false;
      for (LO i = 0; i < boundaryNodes.size(); i++)
        if (boundaryNodes[i]) {
          // FIXME:
          // 1. would this [] not overlap with some entries set in the previous blockSeeds loop?
          // 2. do we need to distinguish between pressure and velocity Dirichlet b.c.?
          blockSeeds[i] = numBlocks;
          haveBoundary = true;
        }
      if (haveBoundary)
        numBlocks++;

      subList.set("partitioner: map",         blockSeeds);
      subList.set("partitioner: local parts", as<int>(numBlocks));
    }
  }

  RCP<const Tpetra::RowMatrix<SC, LO, GO, NO> > tpA;
  if (isBlockedMatrix == true) tpA = Utilities::Op2NonConstTpetraRow(merged2Mat);
  else                         tpA = Utilities::Op2NonConstTpetraRow(A_);

  prec_ = Ifpack2::Factory::create(type_, tpA, overlap_);
  SetPrecParameters();
  prec_->initialize();
  prec_->compute();
}
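// A standalone sketch of the seed map built above: velocity rows stay unassigned,
// every pressure row seeds its own one-row subdomain, and all Dirichlet rows share one
// extra block. Plain std C++; -1 stands in for OrdinalTraits::invalid(), and the
// function name is illustrative. Note it reproduces the FIXME behavior: a Dirichlet
// pressure row's seed is overwritten by the shared Dirichlet block.
#include <cstddef>
#include <vector>

std::vector<int> buildBlockSeeds(std::size_t numVels, std::size_t numPres,
                                 const std::vector<bool>& isDirichlet,
                                 int& numBlocks) {
  std::vector<int> seeds(numVels + numPres, -1);  // -1: not a seed
  numBlocks = 0;
  for (std::size_t r = numVels; r < numVels + numPres; ++r)
    seeds[r] = numBlocks++;                       // one block per pressure row
  bool haveBoundary = false;
  for (std::size_t r = 0; r < seeds.size(); ++r)
    if (isDirichlet[r]) { seeds[r] = numBlocks; haveBoundary = true; }
  if (haveBoundary) numBlocks++;                  // all Dirichlet rows in one block
  return seeds;
}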
size_t computeLocalEdgeList( const RCP<const Environment> &env, const RCP<const Comm<int> > &comm, size_t numLocalEdges, // local edges size_t numLocalGraphEdges, // edges in "local" graph RCP<const IdentifierMap<User> > &idMap, ArrayRCP<const typename InputTraits<User>::zgid_t> &allEdgeIds, // in ArrayRCP<const typename InputTraits<User>::gno_t> &allEdgeGnos, // in ArrayRCP<int> &allProcs, // in ArrayRCP<const typename InputTraits<User>::lno_t> &allOffs, // in ArrayRCP<StridedData<typename InputTraits<User>::lno_t, typename InputTraits<User>::scalar_t> > &allWeights,// in ArrayRCP<const typename InputTraits<User>::lno_t> &edgeLocalIds, // ArrayRCP<const typename InputTraits<User>::lno_t> &offsets, // out ArrayRCP<StridedData<typename InputTraits<User>::lno_t, typename InputTraits<User>::scalar_t> > &eWeights) // out { typedef typename InputTraits<User>::zgid_t zgid_t; typedef typename InputTraits<User>::gno_t gno_t; typedef typename InputTraits<User>::scalar_t scalar_t; typedef typename InputTraits<User>::lno_t lno_t; typedef StridedData<lno_t, scalar_t> input_t; int rank = comm->getRank(); bool gnosAreGids = idMap->gnosAreGids(); edgeLocalIds = ArrayRCP<const lno_t>(Teuchos::null); eWeights = ArrayRCP<input_t>(Teuchos::null); offsets = ArrayRCP<const lno_t>(Teuchos::null); if (numLocalGraphEdges == 0) { // Set the offsets array and return size_t allOffsSize = allOffs.size(); lno_t *offs = new lno_t [allOffsSize]; env->localMemoryAssertion(__FILE__, __LINE__, allOffsSize, offs); for (size_t i = 0; i < allOffsSize; i++) offs[i] = 0; offsets = arcp(offs, 0, allOffsSize, true); return 0; } if (numLocalGraphEdges == numLocalEdges){ // Entire graph is local. lno_t *lnos = new lno_t [numLocalGraphEdges]; env->localMemoryAssertion(__FILE__, __LINE__, numLocalGraphEdges, lnos); if (comm->getSize() == 1) { // With one rank, Can use gnos as local index. if (gnosAreGids) for (size_t i=0; i < numLocalEdges; i++) lnos[i] = allEdgeIds[i]; else for (size_t i=0; i < numLocalEdges; i++) lnos[i] = allEdgeGnos[i]; } else { ArrayRCP<gno_t> gnoArray; if (gnosAreGids){ ArrayRCP<const gno_t> gnosConst = arcp_reinterpret_cast<const gno_t>(allEdgeIds); gnoArray = arcp_const_cast<gno_t>(gnosConst); } else { gnoArray = arcp_const_cast<gno_t>(allEdgeGnos); } // Need to translate to gnos to local indexing ArrayView<lno_t> lnoView(lnos, numLocalGraphEdges); try { idMap->lnoTranslate(lnoView, gnoArray.view(0,numLocalGraphEdges), TRANSLATE_LIB_TO_APP); } Z2_FORWARD_EXCEPTIONS; } edgeLocalIds = arcp(lnos, 0, numLocalGraphEdges, true); offsets = allOffs; eWeights = allWeights; }
void ZoltanInterface<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& level) const { FactoryMonitor m(*this, "Build", level); RCP<Matrix> A = Get< RCP<Matrix> > (level, "A"); RCP<const Map> rowMap = A->getRowMap(); RCP<MultiVector> Coords = Get< RCP<MultiVector> >(level, "Coordinates"); size_t dim = Coords->getNumVectors(); GO numParts = level.Get<GO>("number of partitions"); if (numParts == 1) { // Running on one processor, so decomposition is the trivial one, all zeros. RCP<Xpetra::Vector<GO, LO, GO, NO> > decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, true); Set(level, "Partition", decomposition); return; } float zoltanVersion_; Zoltan_Initialize(0, NULL, &zoltanVersion_); RCP<const Teuchos::MpiComm<int> > dupMpiComm = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(rowMap->getComm()->duplicate()); RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > zoltanComm = dupMpiComm->getRawMpiComm(); RCP<Zoltan> zoltanObj_ = rcp(new Zoltan((*zoltanComm)())); //extract the underlying MPI_Comm handle and create a Zoltan object if (zoltanObj_ == Teuchos::null) throw Exceptions::RuntimeError("MueLu::Zoltan : Unable to create Zoltan data structure"); // Tell Zoltan what kind of local/global IDs we will use. // In our case, each GID is two ints and there are no local ids. // One can skip this step if the IDs are just single ints. int rv; if ((rv = zoltanObj_->Set_Param("num_gid_entries", "1")) != ZOLTAN_OK) throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_gid_entries' returned error code " + Teuchos::toString(rv)); if ((rv = zoltanObj_->Set_Param("num_lid_entries", "0") ) != ZOLTAN_OK) throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_lid_entries' returned error code " + Teuchos::toString(rv)); if ((rv = zoltanObj_->Set_Param("obj_weight_dim", "1") ) != ZOLTAN_OK) throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'obj_weight_dim' returned error code " + Teuchos::toString(rv)); if (GetVerbLevel() & Statistics1) zoltanObj_->Set_Param("debug_level", "1"); else zoltanObj_->Set_Param("debug_level", "0"); zoltanObj_->Set_Param("num_global_partitions", toString(numParts)); zoltanObj_->Set_Num_Obj_Fn(GetLocalNumberOfRows, (void *) &*A); zoltanObj_->Set_Obj_List_Fn(GetLocalNumberOfNonzeros, (void *) &*A); zoltanObj_->Set_Num_Geom_Fn(GetProblemDimension, (void *) &dim); zoltanObj_->Set_Geom_Multi_Fn(GetProblemGeometry, (void *) Coords.get()); // Data pointers that Zoltan requires. ZOLTAN_ID_PTR import_gids = NULL; // Global nums of objs to be imported ZOLTAN_ID_PTR import_lids = NULL; // Local indices to objs to be imported int *import_procs = NULL; // Proc IDs of procs owning objs to be imported. int *import_to_part = NULL; // Partition #s to which imported objs should be assigned. ZOLTAN_ID_PTR export_gids = NULL; // Global nums of objs to be exported ZOLTAN_ID_PTR export_lids = NULL; // local indices to objs to be exported int *export_procs = NULL; // Proc IDs of destination procs for objs to be exported. int *export_to_part = NULL; // Partition #s for objs to be exported. int num_imported; // Number of objs to be imported. int num_exported; // Number of objs to be exported. int newDecomp; // Flag indicating whether the decomposition has changed int num_gid_entries; // Number of array entries in a global ID. 
int num_lid_entries; { SubFactoryMonitor m1(*this, "Zoltan RCB", level); rv = zoltanObj_->LB_Partition(newDecomp, num_gid_entries, num_lid_entries, num_imported, import_gids, import_lids, import_procs, import_to_part, num_exported, export_gids, export_lids, export_procs, export_to_part); if (rv == ZOLTAN_FATAL) throw Exceptions::RuntimeError("Zoltan::LB_Partition() returned error code"); } // TODO check that A's row map is 1-1. Zoltan requires this. RCP<Xpetra::Vector<GO, LO, GO, NO> > decomposition; if (newDecomp) { decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, false); // Don't initialize, will be overwritten ArrayRCP<GO> decompEntries = decomposition->getDataNonConst(0); int mypid = rowMap->getComm()->getRank(); for (typename ArrayRCP<GO>::iterator i = decompEntries.begin(); i != decompEntries.end(); ++i) *i = mypid; LO blockSize = A->GetFixedBlockSize(); for (int i = 0; i < num_exported; ++i) { // We have assigned Zoltan gids to first row GID in the block // NOTE: Zoltan GIDs are different from GIDs in the Coordinates vector LO localEl = rowMap->getLocalElement(export_gids[i]); int partNum = export_to_part[i]; for (LO j = 0; j < blockSize; ++j) decompEntries[localEl + j] = partNum; } } Set(level, "Partition", decomposition); zoltanObj_->LB_Free_Part(&import_gids, &import_lids, &import_procs, &import_to_part); zoltanObj_->LB_Free_Part(&export_gids, &export_lids, &export_procs, &export_to_part); } //Build()
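// The Build() above registers Zoltan query callbacks such as GetLocalNumberOfRows.
// Below is a sketch of the shape such a callback takes, following Zoltan's documented
// ZOLTAN_NUM_OBJ_FN signature, int fn(void* data, int* ierr); the Matrix type and the
// body are illustrative assumptions, not MueLu's actual implementation.
static int GetLocalNumberOfRows(void* data, int* ierr) {
  if (data == NULL) { *ierr = ZOLTAN_FATAL; return 0; }
  Matrix* A = (Matrix*) data;        // the pointer handed to Set_Num_Obj_Fn above
  *ierr = ZOLTAN_OK;
  return (int) A->getNodeNumRows();  // number of objects (rows) owned locally
}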
// ******************************************************** int main(int argc, char *argv[]) { using namespace std; using namespace Teuchos; using namespace PHX; GlobalMPISession mpi_session(&argc, &argv); try { RCP<Time> total_time = TimeMonitor::getNewTimer("Total Run Time"); TimeMonitor tm(*total_time); // ********************************************************************* // Start of MDField Testing // ********************************************************************* { typedef MDField<double,Cell,Node>::size_type size_type; std::vector<size_type> dims(3); dims[0] = 10; dims[1] = 4; dims[2] = 3; RCP<DataLayout> quad_vector = rcp(new MDALayout<Cell,Quadrature,Dim>(dims[0],dims[1],dims[2])); int size = quad_vector->size(); TEUCHOS_TEST_FOR_EXCEPTION(size != dims[0]*dims[1]*dims[2], std::runtime_error, "Size mismatch on MDField!"); ArrayRCP<double> a_mem = arcp<double>(size); ArrayRCP<double> b_mem = arcp<double>(size); for (int i=0; i < a_mem.size(); ++i) a_mem[i] = static_cast<double>(i); for (int i=0; i < b_mem.size(); ++i) b_mem[i] = static_cast<double>(i); MDField<double,Cell,Point,Dim> a("density",quad_vector); MDField<double> b("density",quad_vector); a.setFieldData(a_mem); b.setFieldData(b_mem); simulated_intrepid_integrate(a); simulated_intrepid_integrate(b); // *********************** // Shards tests // *********************** ArrayRCP<double> c_mem = arcp<double>(size); ArrayRCP<double> d_mem = arcp<double>(size); for (int i=0; i < c_mem.size(); ++i) c_mem[i] = static_cast<double>(i); for (int i=0; i < d_mem.size(); ++i) d_mem[i] = static_cast<double>(i); shards::Array<double,shards::NaturalOrder,Cell,Node,Dim> c(c_mem.get(), dims[0], dims[1], dims[2]); size_type rank = dims.size(); const ArrayRCP<const shards::ArrayDimTag*> tags = arcp<const shards::ArrayDimTag*>(rank); tags[0] = &Cell::tag(); tags[1] = &Point::tag(); tags[2] = &Dim::tag(); shards::Array<double,shards::NaturalOrder> d(d_mem.get(),rank, &dims[0],tags.get()); simulated_intrepid_integrate(d); simulated_intrepid_integrate((const shards::Array<double,shards::NaturalOrder>&)(c)); } // ********************************************************************* // ********************************************************************* std::cout << "\nTest passed!\n" << std::endl; // ********************************************************************* // ********************************************************************* } catch (const std::exception& e) { std::cout << "************************************************" << endl; std::cout << "************************************************" << endl; std::cout << "Exception Caught!" << endl; std::cout << "Error message is below\n " << e.what() << endl; std::cout << "************************************************" << endl; } catch (...) { std::cout << "************************************************" << endl; std::cout << "************************************************" << endl; std::cout << "Unknown Exception Caught!" << endl; std::cout << "************************************************" << endl; } TimeMonitor::summarize(); return 0; }
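// A standalone sketch of the size/indexing arithmetic behind the MDField test above:
// a rank-3 natural-order (row-major) layout stores dims[0]*dims[1]*dims[2] entries,
// and an entry (cell, pt, d) maps to a flat offset as below. Plain std C++.
#include <cassert>

int main() {
  const int dims[3] = {10, 4, 3};                // cells, points, dimensions
  const int size = dims[0] * dims[1] * dims[2];  // 120, as checked in the test
  // Row-major flat index for entry (cell, pt, d):
  int cell = 9, pt = 3, d = 2;
  int flat = (cell * dims[1] + pt) * dims[2] + d;
  assert(flat == size - 1);                      // last entry of the block
  return 0;
}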
void LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::AggregateLeftovers(GraphBase const &graph, Aggregates &aggregates) const { Monitor m(*this, "AggregateLeftovers"); my_size_t nVertices = graph.GetNodeNumVertices(); int exp_nRows = aggregates.GetMap()->getNodeNumElements(); // Tentative fix... was previously exp_nRows = nVertices + graph.GetNodeNumGhost(); int myPid = graph.GetComm()->getRank(); my_size_t nAggregates = aggregates.GetNumAggregates(); int minNodesPerAggregate = GetMinNodesPerAggregate(); const RCP<const Map> nonUniqueMap = aggregates.GetMap(); //column map of underlying graph const RCP<const Map> uniqueMap = graph.GetDomainMap(); MueLu::CoupledAggregationCommHelper<LO,GO,NO,LMO> myWidget(uniqueMap, nonUniqueMap); //TODO JJH We want to skip this call RCP<Xpetra::Vector<double,LO,GO,NO> > distWeights = Xpetra::VectorFactory<double,LO,GO,NO>::Build(nonUniqueMap); // Aggregated vertices not "definitively" assigned to processors are // arbitrated by ArbitrateAndCommunicate(). There is some // additional logic to prevent losing root nodes in arbitration. { ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0); ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0); ArrayRCP<double> weights = distWeights->getDataNonConst(0); for (size_t i=0;i<nonUniqueMap->getNodeNumElements();i++) { if (procWinner[i] == MUELU_UNASSIGNED) { if (vertex2AggId[i] != MUELU_UNAGGREGATED) { weights[i] = 1.; if (aggregates.IsRoot(i)) weights[i] = 2.; } } } // views on distributed vectors are freed here. } //TODO JJH We want to skip this call myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true); // All tentatively assigned vertices are now definitive // Tentatively assign any vertex (ghost or local) which neighbors a root // to the aggregate associated with the root. { ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0); ArrayRCP<double> weights = distWeights->getDataNonConst(0); for (my_size_t i = 0; i < nVertices; i++) { if ( aggregates.IsRoot(i) && (procWinner[i] == myPid) ) { // neighOfINode is the neighbor node list of node 'i'. ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i); for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int colj = *it; if (vertex2AggId[colj] == MUELU_UNAGGREGATED) { weights[colj]= 1.; vertex2AggId[colj] = vertex2AggId[i]; } } } } // views on distributed vectors are freed here. } //TODO JJH We want to skip this call myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true); // All tentatively assigned vertices are now definitive // Record the number of aggregated vertices GO total_phase_one_aggregated = 0; { ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); GO phase_one_aggregated = 0; for (my_size_t i = 0; i < nVertices; i++) { if (vertex2AggId[i] != MUELU_UNAGGREGATED) phase_one_aggregated++; } sumAll(graph.GetComm(), phase_one_aggregated, total_phase_one_aggregated); GO local_nVertices = nVertices, total_nVertices = 0; sumAll(graph.GetComm(), local_nVertices, total_nVertices); /* Among unaggregated points, see if we can make a reasonable size */ /* aggregate out of it. We do this by looking at neighbors and seeing */ /* how many are unaggregated and on my processor. Loosely, */ /* base the number of new aggregates created on the percentage of */ /* unaggregated nodes. 
*/ ArrayRCP<double> weights = distWeights->getDataNonConst(0); double factor = 1.; factor = ((double) total_phase_one_aggregated)/((double)(total_nVertices + 1)); factor = pow(factor, GetPhase3AggCreation()); for (my_size_t i = 0; i < nVertices; i++) { if (vertex2AggId[i] == MUELU_UNAGGREGATED) { // neighOfINode is the neighbor node list of node 'iNode'. ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i); int rowi_N = neighOfINode.size(); int nonaggd_neighbors = 0; for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int colj = *it; if (vertex2AggId[colj] == MUELU_UNAGGREGATED && colj < nVertices) nonaggd_neighbors++; } if ( (nonaggd_neighbors > minNodesPerAggregate) && (((double) nonaggd_neighbors)/((double) rowi_N) > factor)) { vertex2AggId[i] = (nAggregates)++; for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int colj = *it; if (vertex2AggId[colj]==MUELU_UNAGGREGATED) { vertex2AggId[colj] = vertex2AggId[i]; if (colj < nVertices) weights[colj] = 2.; else weights[colj] = 1.; } } aggregates.SetIsRoot(i); weights[i] = 2.; } } } // for (i = 0; i < nVertices; i++) // views on distributed vectors are freed here. } //TODO JJH We want to skip this call myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true); //All tentatively assigned vertices are now definitive if (IsPrint(Statistics1)) { GO Nphase1_agg = nAggregates; GO total_aggs; sumAll(graph.GetComm(), Nphase1_agg, total_aggs); GetOStream(Statistics1, 0) << "Phase 1 - nodes aggregated = " << total_phase_one_aggregated << std::endl; GetOStream(Statistics1, 0) << "Phase 1 - total aggregates = " << total_aggs << std::endl; GO i = nAggregates - Nphase1_agg; { GO ii; sumAll(graph.GetComm(),i,ii); i = ii; } GetOStream(Statistics1, 0) << "Phase 3 - additional aggregates = " << i << std::endl; } // Determine vertices that are not shared by setting Temp to all ones // and doing NonUnique2NonUnique(..., ADD). This sums values of all // local copies associated with each Gid. Thus, sums > 1 are shared. // std::cout << "exp_nrows=" << exp_nRows << " (nVertices= " << nVertices << ", numGhost=" << graph.GetNodeNumGhost() << ")" << std::endl; // std::cout << "nonUniqueMap=" << nonUniqueMap->getNodeNumElements() << std::endl; RCP<Xpetra::Vector<double,LO,GO,NO> > temp_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap,false); //no need to zero out vector in ctor temp_->putScalar(1.); RCP<Xpetra::Vector<double,LO,GO,NO> > tempOutput_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap); myWidget.NonUnique2NonUnique(*temp_, *tempOutput_, Xpetra::ADD); std::vector<bool> gidNotShared(exp_nRows); { ArrayRCP<const double> tempOutput = tempOutput_->getData(0); for (int i = 0; i < exp_nRows; i++) { if (tempOutput[i] > 1.) gidNotShared[i] = false; else gidNotShared[i] = true; } } // Phase 4. double nAggregatesTarget; nAggregatesTarget = ((double) uniqueMap->getGlobalNumElements())* (((double) uniqueMap->getGlobalNumElements())/ ((double) graph.GetGlobalNumEdges())); GO nAggregatesLocal=nAggregates, nAggregatesGlobal; sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal); LO minNAggs; minAll(graph.GetComm(), nAggregates, minNAggs); LO maxNAggs; maxAll(graph.GetComm(), nAggregates, maxNAggs); // // Only do this phase if things look really bad. 
THIS // CODE IS PRETTY EXPERIMENTAL // #define MUELU_PHASE4BUCKETS 6 if ((nAggregatesGlobal < graph.GetComm()->getSize()) && (2.5*nAggregatesGlobal < nAggregatesTarget) && (minNAggs ==0) && (maxNAggs <= 1)) { // Modify seed of the random algorithm used by temp_->randomize() { typedef Teuchos::ScalarTraits<double> scalarTrait; // temp_ is of type double. scalarTrait::seedrandom(static_cast<unsigned int>(myPid*2 + (int) (11*scalarTrait::random()))); int k = (int)ceil( (10.*myPid)/graph.GetComm()->getSize()); for (int i = 0; i < k+7; i++) scalarTrait::random(); temp_->setSeed(static_cast<unsigned int>(scalarTrait::random())); } temp_->randomize(); ArrayRCP<double> temp = temp_->getDataNonConst(0); // build a list of candidate root nodes (vertices not adjacent // to aggregated vertices) my_size_t nCandidates = 0; global_size_t nCandidatesGlobal; ArrayRCP<LO> candidates = Teuchos::arcp<LO>(nVertices+1); double priorThreshold = 0.; for (int kkk = 0; kkk < MUELU_PHASE4BUCKETS; kkk++) { { ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0); ArrayView<const LO> vertex2AggIdView = vertex2AggId(); RootCandidates(nVertices, vertex2AggIdView, graph, candidates, nCandidates, nCandidatesGlobal); // views on distributed vectors are freed here. } double nTargetNewGuys = nAggregatesTarget - nAggregatesGlobal; double threshold = priorThreshold + (1. - priorThreshold)*nTargetNewGuys/(nCandidatesGlobal + .001); threshold = (threshold*(kkk+1.))/((double) MUELU_PHASE4BUCKETS); priorThreshold = threshold; { ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); ArrayRCP<double> weights = distWeights->getDataNonConst(0); for (int k = 0; k < nCandidates; k++ ) { int i = candidates[k]; if ((vertex2AggId[i] == MUELU_UNAGGREGATED) && (fabs(temp[i]) < threshold)) { // Note: priorThreshold <= fabs(temp[i]) <= 1 // neighOfINode is the neighbor node list of node 'iNode'. ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i); if (neighOfINode.size() > minNodesPerAggregate) { //TODO: check if this test is exactly was we want to do int count = 0; for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int Adjacent = *it; // This might not be true if someone close to i // is chosen as a root via fabs(temp[]) < Threshold if (vertex2AggId[Adjacent] == MUELU_UNAGGREGATED){ count++; vertex2AggId[Adjacent] = nAggregates; weights[Adjacent] = 1.; } } if (count >= minNodesPerAggregate) { vertex2AggId[i] = nAggregates++; weights[i] = 2.; aggregates.SetIsRoot(i); } else { // undo things for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int Adjacent = *it; if (vertex2AggId[Adjacent] == nAggregates){ vertex2AggId[Adjacent] = MUELU_UNAGGREGATED; weights[Adjacent] = 0.; } } } } } } // views on distributed vectors are freed here. } //TODO JJH We want to skip this call myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true); // All tentatively assigned vertices are now definitive nAggregatesLocal=nAggregates; sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal); // check that there are no aggregates sizes below minNodesPerAggregate aggregates.SetNumAggregates(nAggregates); RemoveSmallAggs(aggregates, minNodesPerAggregate, distWeights, myWidget); nAggregates = aggregates.GetNumAggregates(); } // one possibility } // Initialize things for Phase 5. 
This includes building the transpose // of the matrix ONLY for transposed rows that correspond to unaggregated // ghost vertices. Further, the transpose is only a local transpose. // Nonzero edges which exist on other processors are not represented. int observedNAgg=-1; //number of aggregates that contain vertices on this process { ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0); for(LO k = 0; k < vertex2AggId.size(); ++k ) if(vertex2AggId[k]>observedNAgg) observedNAgg=vertex2AggId[k]; observedNAgg++; } ArrayRCP<int> Mark = Teuchos::arcp<int>(exp_nRows+1); ArrayRCP<int> agg_incremented = Teuchos::arcp<int>(observedNAgg); ArrayRCP<int> SumOfMarks = Teuchos::arcp<int>(observedNAgg); for (int i = 0; i < exp_nRows; i++) Mark[i] = MUELU_DISTONE_VERTEX_WEIGHT; for (int i = 0; i < agg_incremented.size(); i++) agg_incremented[i] = 0; for (int i = 0; i < SumOfMarks.size(); i++) SumOfMarks[i] = 0; // Grab the transpose matrix graph for unaggregated ghost vertices. // a) count the number of nonzeros per row in the transpose std::vector<int> RowPtr(exp_nRows+1-nVertices); //{ ArrayRCP<const LO> vertex2AggIdCst = aggregates.GetVertex2AggId()->getData(0); for (int i = nVertices; i < exp_nRows; i++) RowPtr[i-nVertices] = 0; for (int i = 0; i < nVertices; i++) { // neighOfINode is the neighbor node list of node 'iNode'. ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i); for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int j = *it; if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){ RowPtr[j-nVertices]++; } } } // b) Convert RowPtr[i] to point to the first nnz spot in row i. int iSum = 0, iTemp; for (int i = nVertices; i < exp_nRows; i++) { iTemp = RowPtr[i-nVertices]; RowPtr[i-nVertices] = iSum; iSum += iTemp; } RowPtr[exp_nRows-nVertices] = iSum; std::vector<LO> cols(iSum+1); // c) Traverse matrix and insert entries in proper location. for (int i = 0; i < nVertices; i++) { // neighOfINode is the neighbor node list of node 'iNode'. ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i); for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int j = *it; if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){ cols[RowPtr[j-nVertices]++] = i; } } } // d) RowPtr[i] points to beginning of row i+1 so shift by one location. for (int i = exp_nRows; i > nVertices; i--) RowPtr[i-nVertices] = RowPtr[i-1-nVertices]; RowPtr[0] = 0; // views on distributed vectors are freed here. vertex2AggIdCst = Teuchos::null; //} int bestScoreCutoff; int thresholds[10] = {300,200,100,50,25,13,7,4,2,0}; // Stick unaggregated vertices into existing aggregates as described above. { int ncalls=0; for (int kk = 0; kk < 10; kk += 2) { bestScoreCutoff = thresholds[kk]; ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0); ArrayRCP<double> weights = distWeights->getDataNonConst(0); for (int i = 0; i < exp_nRows; i++) { if (vertex2AggId[i] == MUELU_UNAGGREGATED) { // neighOfINode is the neighbor node list of node 'iNode'. ArrayView<const LO> neighOfINode; // Grab neighboring vertices, which are either in the graph for local ids // or sit in the transposed fragment just constructed above for ghosts. 
if (i < nVertices) { neighOfINode = graph.getNeighborVertices(i); } else { LO *rowi_col = NULL, rowi_N; rowi_col = &(cols[RowPtr[i-nVertices]]); rowi_N = RowPtr[i+1-nVertices] - RowPtr[i-nVertices]; neighOfINode = ArrayView<const LO>(rowi_col, rowi_N); } for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int Adjacent = *it; int AdjacentAgg = vertex2AggId[Adjacent]; //Adjacent is aggregated and either I own the aggregate // or I could own the aggregate after arbitration. if ((AdjacentAgg != MUELU_UNAGGREGATED) && ((procWinner[Adjacent] == myPid) || (procWinner[Adjacent] == MUELU_UNASSIGNED))){ SumOfMarks[AdjacentAgg] += Mark[Adjacent]; } } int best_score = MUELU_NOSCORE; int best_agg = -1; int BestMark = -1; bool cannotLoseAllFriends=false; // Used to address possible loss of vertices in arbitration of shared nodes discussed above. (Initialized to false only to avoid a compiler warning). for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int Adjacent = *it; int AdjacentAgg = vertex2AggId[Adjacent]; //Adjacent is aggregated, has some value and no //other processor has definitively claimed him if ((AdjacentAgg != MUELU_UNAGGREGATED) && (SumOfMarks[AdjacentAgg] != 0) && ((procWinner[Adjacent] == myPid) || (procWinner[Adjacent] == MUELU_UNASSIGNED ))) { // first figure out the penalty associated with // AdjacentAgg having already been incremented // during this phase, then compute score. double penalty = (double) (INCR_SCALING*agg_incremented[AdjacentAgg]); if (penalty > MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg])) penalty = MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]); int score = SumOfMarks[AdjacentAgg]- ((int) floor(penalty)); if (score > best_score) { best_agg = AdjacentAgg; best_score = score; BestMark = Mark[Adjacent]; cannotLoseAllFriends = false; // This addresses the issue mentioned above by checking whether // Adjacent could be lost in arbitration. weight==0 means that // Adjacent was not set during this loop of Phase 5 (and so it // has already undergone arbitration). GidNotShared == true // obviously implies that Adjacent cannot be lost to arbitration if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true)) cannotLoseAllFriends = true; } // Another vertex within current best aggregate found. // We should have (best_score == score). We need to see // if we can improve BestMark and cannotLoseAllFriends. else if (best_agg == AdjacentAgg) { if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true)) cannotLoseAllFriends = true; if (Mark[Adjacent] > BestMark) BestMark = Mark[Adjacent]; } } } // Clean up for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int Adjacent = *it; int AdjacentAgg = vertex2AggId[Adjacent]; if (AdjacentAgg >= 0) SumOfMarks[AdjacentAgg] = 0; } // Tentatively assign vertex to best_agg. if ( (best_score >= bestScoreCutoff) && (cannotLoseAllFriends)) { TEUCHOS_TEST_FOR_EXCEPTION(best_agg == -1 || BestMark == -1, MueLu::Exceptions::RuntimeError, "MueLu::CoupledAggregationFactory internal error"); // should never happen vertex2AggId[i] = best_agg; weights[i] = best_score; agg_incremented[best_agg]++; Mark[i] = (int) ceil( ((double) BestMark)/2.); } } // views on distributed vectors are freed here. 
} vertex2AggId = Teuchos::null; procWinner = Teuchos::null; weights = Teuchos::null; ++ncalls; //TODO JJH We want to skip this call myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true); // All tentatively assigned vertices are now definitive } // if (graph.GetComm()->getRank()==0) // std::cout << "#calls to Arb&Comm=" << ncalls << std::endl; } // Phase 6: Aggregate remaining unaggregated vertices, trying at all costs // to avoid small aggregates. // One case where we can find ourselves in this situation // is if all vertices vk adjacent to v have already been // put in other processors' aggregates and v does not have // a direct connection to a local vertex in any of these // aggregates. int Nleftover = 0, Nsingle = 0; { ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); ArrayRCP<double> weights = distWeights->getDataNonConst(0); ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0); int count = 0; for (my_size_t i = 0; i < nVertices; i++) { if (vertex2AggId[i] == MUELU_UNAGGREGATED) { Nleftover++; // neighOfINode is the neighbor node list of node 'iNode'. ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i); // We don't want too small of an aggregate. So let's see if there is an // unaggregated neighbor that we can also put with this vertex vertex2AggId[i] = nAggregates; weights[i] = 1.; if (count == 0) aggregates.SetIsRoot(i); count++; for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) { int j = *it; if ((j != i)&&(vertex2AggId[j] == MUELU_UNAGGREGATED)&& (j < nVertices)) { vertex2AggId[j] = nAggregates; weights[j] = 1.; count++; } } if ( count >= minNodesPerAggregate) { nAggregates++; count = 0; } } } // We still have something smaller than minNodesPerAggregate when the loop exits with count != 0. if (count != 0) { #ifdef FIXME // Can stick small aggregate with 0th aggregate? if (nAggregates > 0) { for (my_size_t i = 0; i < nVertices; i++) { if ((vertex2AggId[i] == nAggregates) && (procWinner[i] == myPid)) { vertex2AggId[i] = 0; aggregates.SetIsRoot(i,false); } } } else { Nsingle++; nAggregates++; } #else // Can stick small aggregate with 0th aggregate? if (nAggregates > 0) { for (my_size_t i = 0; i < nVertices; i++) { // TW: This is not a real fix. This may produce ugly bad aggregates! // I removed the procWinner[i] == myPid check. it makes no sense to me since // it leaves vertex2AggId[i] == nAggregates -> crash in ComputeAggregateSizes(). // Maybe it's better to add the leftovers to the last generated agg on the current proc. // The best solution would be to add them to the "next"/nearest aggregate, which may be // on another processor if (vertex2AggId[i] == nAggregates) { vertex2AggId[i] = nAggregates-1; //0; aggregates.SetIsRoot(i,false); } } } else { Nsingle++; nAggregates++; } #endif } // views on distributed vectors are freed here. } //TODO JJH We want to skip this call myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, false); if (IsPrint(Statistics1)) { GO total_Nsingle=0; sumAll(graph.GetComm(), (GO)Nsingle, total_Nsingle); GO total_Nleftover=0; sumAll(graph.GetComm(), (GO)Nleftover, total_Nleftover); // GO total_aggs; sumAll(graph.GetComm(), (GO)nAggregates, total_aggs); // GetOStream(Statistics1, 0) << "Phase 6 - total aggregates = " << total_aggs << std::endl; GetOStream(Statistics1, 0) << "Phase 6 - leftovers = " << total_Nleftover << " and singletons = " << total_Nsingle << std::endl; } aggregates.SetNumAggregates(nAggregates); } //AggregateLeftovers
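// The phase-3 root test above (nonaggd_neighbors > minNodesPerAggregate and
// nonaggd_neighbors/rowi_N > factor) in isolation: a minimal sketch using
// plain std types. Names below are illustrative stand-ins, not MueLu
// identifiers, and the ghost-vertex filtering (colj < nVertices) is omitted.
#include <vector>

namespace sketch {
constexpr int kUnaggregated = -1; // stand-in for MUELU_UNAGGREGATED

// Accept a vertex as a new phase-3 root if enough of its neighbors are still
// unaggregated, both in absolute count and as a fraction of its row. The
// fraction threshold ("factor") grows with phase-1 coverage, so phase 3
// becomes pickier the better phase 1 did.
bool acceptAsPhase3Root(const std::vector<int>& vertex2AggId,
                        const std::vector<int>& neighbors,
                        int minNodesPerAggregate, double factor) {
  int nonaggd = 0;
  for (int col : neighbors)
    if (vertex2AggId[col] == kUnaggregated) ++nonaggd;
  return nonaggd > minNodesPerAggregate &&
         static_cast<double>(nonaggd) / neighbors.size() > factor;
}
} // namespace sketch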
void BraessSarazinSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Setup(Level& currentLevel) { FactoryMonitor m(*this, "Setup Smoother", currentLevel); if (SmootherPrototype::IsSetup() == true) this->GetOStream(Warnings0) << "MueLu::BraessSarazinSmoother::Setup(): Setup() has already been called"; // Extract blocked operator A from current level A_ = Factory::Get<RCP<Matrix> > (currentLevel, "A"); RCP<BlockedCrsMatrix> bA = rcp_dynamic_cast<BlockedCrsMatrix>(A_); TEUCHOS_TEST_FOR_EXCEPTION(bA.is_null(), Exceptions::BadCast, "MueLu::BraessSarazinSmoother::Setup: input matrix A is not of type BlockedCrsMatrix!"); // Store map extractors rangeMapExtractor_ = bA->getRangeMapExtractor(); domainMapExtractor_ = bA->getDomainMapExtractor(); // Store the blocks in local member variables A00_ = bA->getMatrix(0,0); A01_ = bA->getMatrix(0,1); A10_ = bA->getMatrix(1,0); A11_ = bA->getMatrix(1,1); const ParameterList& pL = Factory::GetParameterList(); SC omega = pL.get<SC>("Damping factor"); #if 0 // old code // Create the inverse of the diagonal of F D_ = VectorFactory::Build(A00_->getRowMap()); ArrayRCP<SC> diag; if (pL.get<bool>("lumping") == false) diag = Utilities::GetMatrixDiagonal (*A00_); else diag = Utilities::GetLumpedMatrixDiagonal(*A00_); SC one = Teuchos::ScalarTraits<SC>::one(); ArrayRCP<SC> Ddata = D_->getDataNonConst(0); for (GO row = 0; row < Ddata.size(); row++) Ddata[row] = one / (diag[row]*omega); #else // Create the inverse of the diagonal of F // TODO add safety check for zeros on diagonal of F! RCP<Vector> diagFVector = VectorFactory::Build(A00_->getRowMap()); if (pL.get<bool>("lumping") == false) { A00_->getLocalDiagCopy(*diagFVector); // extract diagonal of F } else { diagFVector = Utilities::GetLumpedMatrixDiagonal(A00_); } diagFVector->scale(omega); D_ = Utilities::GetInverse(diagFVector); #endif // Set the Smoother // carefully switch to the SubFactoryManagers (defined by the users) { SetFactoryManager currentSFM(rcpFromRef(currentLevel), FactManager_); smoo_ = currentLevel.Get<RCP<SmootherBase> >("PreSmoother", FactManager_->GetFactory("Smoother").get()); S_ = currentLevel.Get<RCP<Matrix> > ("A", FactManager_->GetFactory("A").get()); } SmootherPrototype::IsSetup(true); }
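// Orientation note (the Apply() that consumes these members lies outside this
// excerpt): with D_ = (omega * diag(A00))^{-1} built above and S_ the
// (approximate) Schur complement delivered by the sub-factory's "A", a
// textbook Braess-Sarazin sweep for the blocked system
//
//   [ A00 A01 ] [x0]   [b0]
//   [ A10 A11 ] [x1] = [b1]
//
// proceeds, schematically, as
//
//   r = b - A*x                          // blocked residual
//   solve S * dx1 = A10*(D_*r0) - r1     // smoo_ smooths this Schur solve,
//                                        // with S ~ A10*D_*A01 - A11
//   dx0 = D_*(r0 - A01*dx1)              // cheap velocity update
//   x0 += dx0; x1 += dx1
//
// This is a hedged sketch of the standard algorithm, not a verbatim copy of
// the MueLu implementation; sign conventions for S vary between codes.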
int main(int argc, char *argv[]) { #if defined(HAVE_MUELU_EPETRA) && defined(HAVE_MUELU_EPETRAEXT) typedef double Scalar; typedef int LocalOrdinal; typedef int GlobalOrdinal; typedef LocalOrdinal LO; typedef GlobalOrdinal GO; typedef Xpetra::EpetraNode Node; #include "MueLu_UseShortNames.hpp" using Teuchos::RCP; using Teuchos::rcp; using namespace MueLuTests; using namespace Teuchos; oblackholestream blackhole; GlobalMPISession mpiSession(&argc,&argv,&blackhole); bool success = false; bool verbose = true; try { // default parameters std::string xmlFile = "myXML.xml"; // Note: use --help to list available options. CommandLineProcessor clp(false); clp.setOption("xml", &xmlFile, "xml file with solver parameters for a 2x2 blocked NS example"); switch (clp.parse(argc,argv)) { case CommandLineProcessor::PARSE_HELP_PRINTED: return EXIT_SUCCESS; break; case CommandLineProcessor::PARSE_ERROR: case CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE; break; case CommandLineProcessor::PARSE_SUCCESSFUL: break; } RCP<const Comm<int> > comm = DefaultComm<int>::getComm(); RCP<FancyOStream> out = fancyOStream(rcpFromRef(std::cout)); out->setOutputToRootOnly(0); *out << MueLu::MemUtils::PrintMemoryUsage() << std::endl; // Timing Time myTime("global"); TimeMonitor MM(myTime); GO maxCoarseSize=1; //FIXME clp doesn't like long long int int globalNumDofs = 1500; // used for the maps int nDofsPerNode = 3; // used for generating the fine level null-space // build strided maps // striding information: 2 velocity dofs and 1 pressure dof = 3 dofs per node std::vector<size_t> stridingInfo; stridingInfo.push_back(2); stridingInfo.push_back(1); /////////////////////////////////////// build strided maps // build strided maps: // xstridedfullmap: full map (velocity and pressure dof gids), contiguous // xstridedvelmap: only velocity dof gid maps (i.e. 0,1,3,4,6,7...) // xstridedpremap: only pressure dof gid maps (i.e. 2,5,8,...) 
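// Illustration of the striding above (not part of the test): with
// stridingInfo = {2,1}, i.e. nDofsPerNode = 3, node n owns the gids
//   velocity (block 0): 3*n, 3*n + 1
//   pressure (block 1): 3*n + 2
// which reproduces the patterns quoted in the comments: velocity gids
// 0,1,3,4,6,7,... and pressure gids 2,5,8,...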
Xpetra::UnderlyingLib lib = Xpetra::UseEpetra; RCP<const StridedMap> xstridedfullmap = StridedMapFactory::Build(lib,globalNumDofs,0,stridingInfo,comm,-1); RCP<const StridedMap> xstridedvelmap = StridedMapFactory::Build(xstridedfullmap,0); RCP<const StridedMap> xstridedpremap = StridedMapFactory::Build(xstridedfullmap,1); /////////////////////////////////////// transform Xpetra::Map objects to Epetra // this is needed for AztecOO const RCP<const Epetra_Map> fullmap = rcpFromRef(Xpetra::toEpetra(*xstridedfullmap)); RCP<const Epetra_Map> velmap = rcpFromRef(Xpetra::toEpetra(*xstridedvelmap)); RCP<const Epetra_Map> premap = rcpFromRef(Xpetra::toEpetra(*xstridedpremap)); /////////////////////////////////////// import problem matrix and RHS from files (-> Epetra) // read in problem Epetra_CrsMatrix * ptrA = 0; Epetra_Vector * ptrf = 0; Epetra_MultiVector* ptrNS = 0; *out << "Reading matrix market file" << std::endl; EpetraExt::MatrixMarketFileToCrsMatrix("A_re1000_5932.txt",*fullmap,*fullmap,*fullmap,ptrA); EpetraExt::MatrixMarketFileToVector("b_re1000_5932.txt",*fullmap,ptrf); RCP<Epetra_CrsMatrix> epA = rcp(ptrA); RCP<Epetra_Vector> epv = rcp(ptrf); RCP<Epetra_MultiVector> epNS = rcp(ptrNS); /////////////////////////////////////// split system into 2x2 block system *out << "Split matrix into 2x2 block matrix" << std::endl; // split fullA into A11,..., A22 RCP<Epetra_CrsMatrix> A11; RCP<Epetra_CrsMatrix> A12; RCP<Epetra_CrsMatrix> A21; RCP<Epetra_CrsMatrix> A22; if(SplitMatrix2x2(epA,*velmap,*premap,A11,A12,A21,A22)==false) *out << "Problem with splitting matrix"<< std::endl; /////////////////////////////////////// transform Epetra objects to Xpetra (needed for MueLu) // build Xpetra objects from Epetra_CrsMatrix objects RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA11 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A11)); RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA12 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A12)); RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA21 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A21)); RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA22 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A22)); /////////////////////////////////////// generate MapExtractor object std::vector<RCP<const Xpetra::Map<LO,GO,Node> > > xmaps; xmaps.push_back(xstridedvelmap); xmaps.push_back(xstridedpremap); RCP<const Xpetra::MapExtractor<Scalar,LO,GO,Node> > map_extractor = Xpetra::MapExtractorFactory<Scalar,LO,GO,Node>::Build(xstridedfullmap,xmaps); /////////////////////////////////////// build blocked transfer operator // using the map extractor RCP<Xpetra::BlockedCrsMatrix<Scalar,LO,GO,Node> > bOp = rcp(new Xpetra::BlockedCrsMatrix<Scalar,LO,GO,Node>(map_extractor,map_extractor,10)); bOp->setMatrix(0,0,xA11); bOp->setMatrix(0,1,xA12); bOp->setMatrix(1,0,xA21); bOp->setMatrix(1,1,xA22); bOp->fillComplete(); //////////////////////////////////////// prepare setup ParameterListInterpreter mueLuFactory(xmlFile, *comm); RCP<Hierarchy> H = mueLuFactory.CreateHierarchy(); H->setDefaultVerbLevel(VERB_HIGH); H->SetMaxCoarseSize(maxCoarseSize); RCP<MueLu::Level> Finest = H->GetLevel(0); Finest->setDefaultVerbLevel(VERB_HIGH); Finest->Set("A", rcp_dynamic_cast<Matrix>(bOp)); ////////////////////////////////////////// prepare null space for A11 RCP<MultiVector> nullspace11 = MultiVectorFactory::Build(xstridedvelmap, 2); // this is a 2D standard null space for (int i=0; i<nDofsPerNode-1; ++i) { ArrayRCP<Scalar> nsValues = nullspace11->getDataNonConst(i); int numBlocks = nsValues.size() / (nDofsPerNode - 1); for 
(int j=0; j< numBlocks; ++j) { nsValues[j*(nDofsPerNode - 1) + i] = 1.0; } } Finest->Set("Nullspace1",nullspace11); ////////////////////////////////////////// prepare null space for A22 RCP<MultiVector> nullspace22 = MultiVectorFactory::Build(xstridedpremap, 1); // this is a 2D standard null space ArrayRCP<Scalar> nsValues22 = nullspace22->getDataNonConst(0); for (int j=0; j< nsValues22.size(); ++j) { nsValues22[j] = 1.0; } Finest->Set("Nullspace2",nullspace22); /////////////////////////////////// BEGIN setup mueLuFactory.SetupHierarchy(*H); ///////////////////////////////////// END setup *out << std::endl; RCP<MultiVector> xLsg = MultiVectorFactory::Build(xstridedfullmap,1); // Use AMG directly as an iterative method { xLsg->putScalar( (SC) 0.0); // Epetra_Vector -> Xpetra::Vector RCP<Vector> xRhs = rcp(new Xpetra::EpetraVectorT<int,Node>(epv)); // calculate initial (absolute) residual (x_0 = 0, so r_0 = b) Array<ScalarTraits<SC>::magnitudeType> norms(1); xRhs->norm2(norms); *out << "||r_0|| = " << norms[0] << std::endl; // apply 100 multigrid iterations H->Iterate(*xRhs,*xLsg,100); // calculate and print residual RCP<MultiVector> xTmp = MultiVectorFactory::Build(xstridedfullmap,1); bOp->apply(*xLsg,*xTmp,NO_TRANS,(SC)1.0,(SC)0.0); xRhs->update((SC)-1.0,*xTmp,(SC)1.0); xRhs->norm2(norms); *out << "||r|| = " << norms[0] << std::endl; } // TODO: don't forget to add Aztec as prerequisite in CMakeLists.txt! // // Solve Ax = b using AMG as a preconditioner in AztecOO // { RCP<Epetra_Vector> X = rcp(new Epetra_Vector(epv->Map())); X->PutScalar(0.0); Epetra_LinearProblem epetraProblem(epA.get(), X.get(), epv.get()); AztecOO aztecSolver(epetraProblem); aztecSolver.SetAztecOption(AZ_solver, AZ_gmres); MueLu::EpetraOperator aztecPrec(H); aztecSolver.SetPrecOperator(&aztecPrec); int maxIts = 50; double tol = 1e-8; aztecSolver.Iterate(maxIts, tol); } success = true; } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); return ( success ? EXIT_SUCCESS : EXIT_FAILURE ); #else std::cout << "Epetra (and/or EpetraExt) are not available. Skip test." << std::endl; return EXIT_SUCCESS; #endif }
EpetraCrsMatrixT<EpetraGlobalOrdinal>::EpetraCrsMatrixT(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype, const Teuchos::RCP< Teuchos::ParameterList > &plist) : isFillResumed_(false) { Teuchos::Array<int> numEntriesPerRowToAlloc(NumEntriesPerRowToAlloc.begin(), NumEntriesPerRowToAlloc.end()); // convert array of "size_t" to array of "int" mtx_ = Teuchos::rcp(new Epetra_CrsMatrix(Copy, toEpetra(rowMap), toEpetra(colMap), numEntriesPerRowToAlloc.getRawPtr(), toEpetra(pftype))); }
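// The constructor above narrows the size_t per-row allocation sizes to int
// because Epetra_CrsMatrix only accepts int. The same idiom in isolation, as
// a hedged sketch (toIntArray is an illustrative helper, not Xpetra API):
#include <Teuchos_Array.hpp>
#include <Teuchos_ArrayRCP.hpp>

// Narrowing copy; entries above INT_MAX would overflow, but such per-row
// counts cannot be represented on the Epetra side anyway.
inline Teuchos::Array<int> toIntArray(const Teuchos::ArrayRCP<const size_t>& in) {
  return Teuchos::Array<int>(in.begin(), in.end());
}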
/* this file is automatically generated - do not edit (see script/tpetra.py) */ #include "Xpetra_TpetraConfigDefs.hpp" #include "Tpetra_CrsMatrix.hpp" #include "Xpetra_CrsMatrix.hpp" #include "Xpetra_TpetraMap.hpp" #include "Xpetra_TpetraMultiVector.hpp" #include "Xpetra_TpetraVector.hpp" #include "Xpetra_TpetraCrsGraph.hpp" //#include "Xpetra_TpetraRowMatrix.hpp" #include "Xpetra_Exceptions.hpp" namespace Xpetra { // TODO: move that elsewhere // template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalMatOps> // const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> toTpetraCrsMatrix(const Xpetra::DistObject<char, LocalOrdinal, GlobalOrdinal, Node> &); // template <class Scalar, class LocalOrdinal = int, class GlobalOrdinal = LocalOrdinal, class Node = Kokkos::DefaultNode::DefaultNodeType, class LocalMatOps = typename Kokkos::DefaultKernels<Scalar,LocalOrdinal,Node>::SparseOps> class TpetraCrsMatrix : public CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps>//, public TpetraRowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> { // The following typedef are used by the XPETRA_DYNAMIC_CAST() macro. typedef TpetraCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> TpetraCrsMatrixClass; typedef TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> TpetraVectorClass; typedef TpetraImport<LocalOrdinal,GlobalOrdinal,Node> TpetraImportClass; typedef TpetraExport<LocalOrdinal,GlobalOrdinal,Node> TpetraExportClass; public: //! @name Constructor/Destructor Methods //@{ //! Constructor specifying fixed number of entries for each row. TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, size_t maxNumEntriesPerRow, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null) : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), maxNumEntriesPerRow, toTpetra(pftype), params))) { } //! Constructor specifying (possibly different) number of entries in each row. TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null) : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), NumEntriesPerRowToAlloc, toTpetra(pftype), params))) { } //! Constructor specifying column Map and fixed number of entries for each row. TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, size_t maxNumEntriesPerRow, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null) : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), toTpetra(colMap), maxNumEntriesPerRow, toTpetra(pftype), params))) { } //! Constructor specifying column Map and number of entries in each row. 
TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null) : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), toTpetra(colMap), NumEntriesPerRowToAlloc, toTpetra(pftype), params))) { } //! Constructor specifying a previously constructed graph. TpetraCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node, LocalMatOps > > &graph, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null) : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(graph), params))) { } //! Constructor for a fused import TpetraCrsMatrix(const Teuchos::RCP<const CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& sourceMatrix, const Import<LocalOrdinal,GlobalOrdinal,Node> & importer, const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap = Teuchos::null, const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap = Teuchos::null, const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null) { typedef Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MyTpetraCrsMatrix; XPETRA_DYNAMIC_CAST(const TpetraCrsMatrixClass, *sourceMatrix, tSourceMatrix, "Xpetra::TpetraCrsMatrix constructor only accepts Xpetra::TpetraCrsMatrix as the input argument.");//TODO: remove and use toTpetra() RCP< const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> > v = tSourceMatrix.getTpetra_CrsMatrix(); RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myDomainMap = domainMap!=Teuchos::null ? toTpetra(domainMap) : Teuchos::null; RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myRangeMap = rangeMap!=Teuchos::null ? toTpetra(rangeMap) : Teuchos::null; mtx_=Tpetra::importAndFillCompleteCrsMatrix<MyTpetraCrsMatrix>(tSourceMatrix.getTpetra_CrsMatrix(),toTpetra(importer),myDomainMap,myRangeMap,params); bool restrictComm=false; if(!params.is_null()) restrictComm = params->get("Restrict Communicator",restrictComm); if(restrictComm && mtx_->getRowMap().is_null()) mtx_=Teuchos::null; } //! Constructor for a fused export TpetraCrsMatrix(const Teuchos::RCP<const CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& sourceMatrix, const Export<LocalOrdinal,GlobalOrdinal,Node> & exporter, const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap = Teuchos::null, const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap = Teuchos::null, const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null) { typedef Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MyTpetraCrsMatrix; XPETRA_DYNAMIC_CAST(const TpetraCrsMatrixClass, *sourceMatrix, tSourceMatrix, "Xpetra::TpetraCrsMatrix constructor only accepts Xpetra::TpetraCrsMatrix as the input argument.");//TODO: remove and use toTpetra() RCP< const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> > v = tSourceMatrix.getTpetra_CrsMatrix(); RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myDomainMap = domainMap!=Teuchos::null ? toTpetra(domainMap) : Teuchos::null; RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myRangeMap = rangeMap!=Teuchos::null ? 
toTpetra(rangeMap) : Teuchos::null; mtx_=Tpetra::exportAndFillCompleteCrsMatrix<MyTpetraCrsMatrix>(tSourceMatrix.getTpetra_CrsMatrix(),toTpetra(exporter),myDomainMap,myRangeMap,params); } //! Destructor. virtual ~TpetraCrsMatrix() { } //@} //! @name Insertion/Removal Methods //@{ //! Insert matrix entries, using global IDs. void insertGlobalValues(GlobalOrdinal globalRow, const ArrayView< const GlobalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::insertGlobalValues"); mtx_->insertGlobalValues(globalRow, cols, vals); } //! Insert matrix entries, using local IDs. void insertLocalValues(LocalOrdinal localRow, const ArrayView< const LocalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::insertLocalValues"); mtx_->insertLocalValues(localRow, cols, vals); } //! Replace matrix entries, using global IDs. void replaceGlobalValues(GlobalOrdinal globalRow, const ArrayView< const GlobalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::replaceGlobalValues"); mtx_->replaceGlobalValues(globalRow, cols, vals); } //! Replace matrix entries, using local IDs. void replaceLocalValues(LocalOrdinal localRow, const ArrayView< const LocalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::replaceLocalValues"); mtx_->replaceLocalValues(localRow, cols, vals); } //! Set all matrix entries equal to scalarThis. void setAllToScalar(const Scalar &alpha) { XPETRA_MONITOR("TpetraCrsMatrix::setAllToScalar"); mtx_->setAllToScalar(alpha); } //! Scale the current values of a matrix, this = alpha*this. void scale(const Scalar &alpha) { XPETRA_MONITOR("TpetraCrsMatrix::scale"); mtx_->scale(alpha); } //! Allocates and returns ArrayRCPs of the Crs arrays --- This is an Xpetra-only routine. //** \warning This is an expert-only routine and should not be called from user code. */ void allocateAllValues(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind, ArrayRCP<Scalar> & values) { XPETRA_MONITOR("TpetraCrsMatrix::allocateAllValues"); rowptr.resize(getNodeNumRows()+1); colind.resize(numNonZeros); values.resize(numNonZeros);}
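// Usage note for allocateAllValues() above, schematic only (the matching
// setAllValues()/fillComplete() calls lie outside this excerpt): an expert
// caller obtains the three raw CSR arrays and fills them directly, e.g.
//
//   Teuchos::ArrayRCP<size_t> rowptr;
//   Teuchos::ArrayRCP<LocalOrdinal> colind;
//   Teuchos::ArrayRCP<Scalar> values;
//   A.allocateAllValues(nnz, rowptr, colind, values);
//   // rowptr holds getNodeNumRows()+1 entries; colind/values hold nnz each
//
// bypassing entry-by-entry insertGlobalValues(), as the warning says.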
//! @name Constructor/Destructor //@{ AMGXOperator(const Teuchos::RCP<Tpetra::CrsMatrix<SC,LO,GO,NO> > &inA, Teuchos::ParameterList ¶mListIn) { RCP<const Teuchos::Comm<int> > comm = inA->getRowMap()->getComm(); int numProcs = comm->getSize(); int myRank = comm->getRank(); RCP<Teuchos::Time> amgxTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: initialize"); amgxTimer->start(); // Initialize AMGX_SAFE_CALL(AMGX_initialize()); AMGX_SAFE_CALL(AMGX_initialize_plugins()); /*system*/ //AMGX_SAFE_CALL(AMGX_register_print_callback(&print_callback)); AMGX_SAFE_CALL(AMGX_install_signal_handler()); Teuchos::ParameterList configs = paramListIn.sublist("amgx:params", true); if (configs.isParameter("json file")) { AMGX_SAFE_CALL(AMGX_config_create_from_file(&Config_, (const char *) &configs.get<std::string>("json file")[0])); } else { std::ostringstream oss; oss << ""; ParameterList::ConstIterator itr; for (itr = configs.begin(); itr != configs.end(); ++itr) { const std::string& name = configs.name(itr); const ParameterEntry& entry = configs.entry(itr); oss << name << "=" << filterValueToString(entry) << ", "; } oss << "\0"; std::string configString = oss.str(); if (configString == "") { //print msg that using defaults //GetOStream(Warnings0) << "Warning: No configuration parameters specified, using default AMGX configuration parameters. \n"; } AMGX_SAFE_CALL(AMGX_config_create(&Config_, configString.c_str())); } // TODO: we probably need to add "exception_handling=1" to the parameter list // to switch on internal error handling (with no need for AMGX_SAFE_CALL) #define NEW_COMM #ifdef NEW_COMM // NOTE: MPI communicator used in AMGX_resources_create must exist in the scope of AMGX_matrix_comm_from_maps_one_ring // FIXME: fix for serial comm RCP<const Teuchos::MpiComm<int> > tmpic = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm->duplicate()); TEUCHOS_TEST_FOR_EXCEPTION(tmpic.is_null(), Exceptions::RuntimeError, "Communicator is not MpiComm"); RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm(); MPI_Comm mpiComm = *rawMpiComm; #endif // Construct AMGX resources if (numProcs == 1) { AMGX_resources_create_simple(&Resources_, Config_); } else { int numGPUDevices; cudaGetDeviceCount(&numGPUDevices); int device[] = {(comm->getRank() % numGPUDevices)}; AMGX_config_add_parameters(&Config_, "communicator=MPI"); #ifdef NEW_COMM AMGX_resources_create(&Resources_, Config_, &mpiComm, 1/* number of GPU devices utilized by this rank */, device); #else AMGX_resources_create(&Resources_, Config_, MPI_COMM_WORLD, 1/* number of GPU devices utilized by this rank */, device); #endif } AMGX_Mode mode = AMGX_mode_dDDI; AMGX_solver_create(&Solver_, Resources_, mode, Config_); AMGX_matrix_create(&A_, Resources_, mode); AMGX_vector_create(&X_, Resources_, mode); AMGX_vector_create(&Y_, Resources_, mode); amgxTimer->stop(); amgxTimer->incrementNumCalls(); std::vector<int> amgx2muelu; // Construct AMGX communication pattern if (numProcs > 1) { RCP<const Tpetra::Import<LO,GO> > importer = inA->getCrsGraph()->getImporter(); TEUCHOS_TEST_FOR_EXCEPTION(importer.is_null(), MueLu::Exceptions::RuntimeError, "The matrix A has no Import object."); Tpetra::Distributor distributor = importer->getDistributor(); Array<int> sendRanks = distributor.getImagesTo(); Array<int> recvRanks = distributor.getImagesFrom(); std::sort(sendRanks.begin(), sendRanks.end()); std::sort(recvRanks.begin(), recvRanks.end()); bool match = true; if (sendRanks.size() != recvRanks.size()) { match = false; } else { for 
(int i = 0; i < sendRanks.size(); i++) { if (recvRanks[i] != sendRanks[i]) { match = false; break; } } } TEUCHOS_TEST_FOR_EXCEPTION(!match, MueLu::Exceptions::RuntimeError, "AMGX requires that the processors that we send to and receive from are the same. " "This is not the case: we send to {" << sendRanks << "} and receive from {" << recvRanks << "}"); int num_neighbors = sendRanks.size(); // does not include the calling process const int* neighbors = &sendRanks[0]; // Later on, we'll have to organize the send and recv data by PIDs, // i.e., a vector V of vectors, where V[i] is PID i's vector of data. // Hence we need to be able to quickly look up an array index // associated with each PID. Tpetra::Details::HashTable<int,int> hashTable(3*num_neighbors); for (int i = 0; i < num_neighbors; i++) hashTable.add(neighbors[i], i); // Get some information out ArrayView<const int> exportLIDs = importer->getExportLIDs(); ArrayView<const int> exportPIDs = importer->getExportPIDs(); Array<int> importPIDs; Tpetra::Import_Util::getPids(*importer, importPIDs, true/* make local -1 */); // Construct the reordering for AMGX as in AMGX_matrix_upload_all documentation RCP<const Map> rowMap = inA->getRowMap(); RCP<const Map> colMap = inA->getColMap(); int N = rowMap->getNodeNumElements(), Nc = colMap->getNodeNumElements(); muelu2amgx_.resize(Nc, -1); int numUniqExports = 0; for (int i = 0; i < exportLIDs.size(); i++) if (muelu2amgx_[exportLIDs[i]] == -1) { numUniqExports++; muelu2amgx_[exportLIDs[i]] = -2; } int localOffset = 0, exportOffset = N - numUniqExports; // Go through exported LIDs and put them at the end of LIDs for (int i = 0; i < exportLIDs.size(); i++) if (muelu2amgx_[exportLIDs[i]] < 0) // exportLIDs are not unique muelu2amgx_[exportLIDs[i]] = exportOffset++; // Go through all non-export LIDs, and put them at the beginning of LIDs for (int i = 0; i < N; i++) if (muelu2amgx_[i] == -1) muelu2amgx_[i] = localOffset++; // Go through the tail (imported LIDs), and order those by neighbors int importOffset = N; for (int k = 0; k < num_neighbors; k++) for (int i = 0; i < importPIDs.size(); i++) if (importPIDs[i] != -1 && hashTable.get(importPIDs[i]) == k) muelu2amgx_[i] = importOffset++; amgx2muelu.resize(muelu2amgx_.size()); for (int i = 0; i < muelu2amgx_.size(); i++) amgx2muelu[muelu2amgx_[i]] = i; // Construct send arrays std::vector<std::vector<int> > sendDatas (num_neighbors); std::vector<int> send_sizes(num_neighbors, 0); for (int i = 0; i < exportPIDs.size(); i++) { int index = hashTable.get(exportPIDs[i]); sendDatas [index].push_back(muelu2amgx_[exportLIDs[i]]); send_sizes[index]++; } // FIXME: sendDatas must be sorted (based on GIDs) std::vector<const int*> send_maps(num_neighbors); for (int i = 0; i < num_neighbors; i++) send_maps[i] = &(sendDatas[i][0]); // Debugging printMaps(comm, sendDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "send_map_vector"); // Construct recv arrays std::vector<std::vector<int> > recvDatas (num_neighbors); std::vector<int> recv_sizes(num_neighbors, 0); for (int i = 0; i < importPIDs.size(); i++) if (importPIDs[i] != -1) { int index = hashTable.get(importPIDs[i]); recvDatas [index].push_back(muelu2amgx_[i]); recv_sizes[index]++; } // FIXME: recvDatas must be sorted (based on GIDs) std::vector<const int*> recv_maps(num_neighbors); for (int i = 0; i < num_neighbors; i++) recv_maps[i] = &(recvDatas[i][0]); // Debugging printMaps(comm, recvDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "recv_map_vector"); 
AMGX_SAFE_CALL(AMGX_matrix_comm_from_maps_one_ring(A_, 1, num_neighbors, neighbors, &send_sizes[0], &send_maps[0], &recv_sizes[0], &recv_maps[0])); AMGX_vector_bind(X_, A_); AMGX_vector_bind(Y_, A_); } RCP<Teuchos::Time> matrixTransformTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transform matrix"); matrixTransformTimer->start(); ArrayRCP<const size_t> ia_s; ArrayRCP<const int> ja; ArrayRCP<const double> a; inA->getAllValues(ia_s, ja, a); ArrayRCP<int> ia(ia_s.size()); for (int i = 0; i < ia.size(); i++) ia[i] = Teuchos::as<int>(ia_s[i]); N_ = inA->getNodeNumRows(); int nnz = inA->getNodeNumEntries(); matrixTransformTimer->stop(); matrixTransformTimer->incrementNumCalls(); // Upload matrix // TODO Do we need to pin memory here through AMGX_pin_memory? RCP<Teuchos::Time> matrixTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer matrix CPU->GPU"); matrixTimer->start(); if (numProcs == 1) { AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia[0], &ja[0], &a[0], NULL); } else { // Transform the matrix std::vector<int> ia_new(ia.size()); std::vector<int> ja_new(ja.size()); std::vector<double> a_new (a.size()); ia_new[0] = 0; for (int i = 0; i < N_; i++) { int oldRow = amgx2muelu[i]; ia_new[i+1] = ia_new[i] + (ia[oldRow+1] - ia[oldRow]); for (int j = ia[oldRow]; j < ia[oldRow+1]; j++) { int offset = j - ia[oldRow]; ja_new[ia_new[i] + offset] = muelu2amgx_[ja[j]]; a_new [ia_new[i] + offset] = a[j]; } // Do bubble sort on two arrays // NOTE: There are multiple possible optimizations here (even of bubble sort) bool swapped; do { swapped = false; for (int j = ia_new[i]; j < ia_new[i+1]-1; j++) if (ja_new[j] > ja_new[j+1]) { std::swap(ja_new[j], ja_new[j+1]); std::swap(a_new [j], a_new [j+1]); swapped = true; } } while (swapped == true); } AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia_new[0], &ja_new[0], &a_new[0], NULL); } matrixTimer->stop(); matrixTimer->incrementNumCalls(); domainMap_ = inA->getDomainMap(); rangeMap_ = inA->getRangeMap(); RCP<Teuchos::Time> realSetupTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: real setup"); realSetupTimer->start(); AMGX_solver_setup(Solver_, A_); realSetupTimer->stop(); realSetupTimer->incrementNumCalls(); vectorTimer1_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vectors CPU->GPU"); vectorTimer2_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vector GPU->CPU"); }
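// The per-row bubble sort above keeps (ja_new, a_new) pairs ordered by column
// after the permutation. An equivalent alternative, sketched here with
// std::sort over an index permutation (illustrative only; the bubble sort in
// the code above is what actually ships):
#include <algorithm>
#include <numeric>
#include <vector>

// Sort one CSR row [begin, end) of (ja, a) by ascending column index.
void sortRowByColumn(std::vector<int>& ja, std::vector<double>& a,
                     int begin, int end) {
  std::vector<int> p(end - begin);
  std::iota(p.begin(), p.end(), 0);
  std::sort(p.begin(), p.end(),
            [&](int x, int y) { return ja[begin + x] < ja[begin + y]; });
  std::vector<int>    js(p.size());
  std::vector<double> as(p.size());
  for (std::size_t k = 0; k < p.size(); ++k) {
    js[k] = ja[begin + p[k]];
    as[k] = a[begin + p[k]];
  }
  std::copy(js.begin(), js.end(), ja.begin() + begin);
  std::copy(as.begin(), as.end(), a.begin() + begin);
}
// Would be called as sortRowByColumn(ja_new, a_new, ia_new[i], ia_new[i+1])
// once per row i.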
void globalWeightedCutsByPart( const RCP<const Environment> &env, const RCP<const Comm<int> > &comm, const RCP<const GraphModel<typename Adapter::base_adapter_t> > &graph, const ArrayView<const typename Adapter::part_t> &part, typename Adapter::part_t &numParts, ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > > &metrics, ArrayRCP<typename Adapter::scalar_t> &globalSums) { env->debug(DETAILED_STATUS, "Entering globalWeightedCutsByPart"); ////////////////////////////////////////////////////////// // Initialize return values numParts = 0; int ewgtDim = graph->getNumWeightsPerEdge(); int numMetrics = 1; // "edge cuts" if (ewgtDim) numMetrics += ewgtDim; // "weight n" typedef typename Adapter::scalar_t scalar_t; typedef typename Adapter::gno_t gno_t; typedef typename Adapter::lno_t lno_t; typedef typename Adapter::node_t node_t; typedef typename Adapter::part_t part_t; typedef StridedData<lno_t, scalar_t> input_t; typedef GraphMetrics<scalar_t> mv_t; typedef Tpetra::CrsMatrix<part_t,lno_t,gno_t,node_t> sparse_matrix_type; typedef Tpetra::Vector<part_t,lno_t,gno_t,node_t> vector_t; typedef Tpetra::Map<lno_t, gno_t, node_t> map_type; typedef Tpetra::global_size_t GST; const GST INVALID = Teuchos::OrdinalTraits<GST>::invalid (); using Teuchos::as; // add some more metrics to the array typedef typename ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > >::size_type array_size_type; metrics.resize( metrics.size() + numMetrics ); for( array_size_type n = metrics.size() - numMetrics; n < metrics.size(); ++n ) { mv_t * newMetric = new mv_t; // allocate the new memory env->localMemoryAssertion(__FILE__,__LINE__,1,newMetric); // check errors metrics[n] = rcp( newMetric); // create the new members } array_size_type next = metrics.size() - numMetrics; // MDM - this is most likely temporary to preserve the format here - we are now filling a larger array so we may not have started at 0 ////////////////////////////////////////////////////////// // Figure out the global number of parts in use. // Verify the number of edge weights is the same everywhere. lno_t localNumObj = part.size(); part_t localNum[2], globalNum[2]; localNum[0] = static_cast<part_t>(ewgtDim); localNum[1] = 0; for (lno_t i=0; i < localNumObj; i++) if (part[i] > localNum[1]) localNum[1] = part[i]; try{ reduceAll<int, part_t>(*comm, Teuchos::REDUCE_MAX, 2, localNum, globalNum); } Z2_THROW_OUTSIDE_ERROR(*env) env->globalBugAssertion(__FILE__,__LINE__, "inconsistent number of edge weights", globalNum[0] == localNum[0], DEBUG_MODE_ASSERTION, comm); part_t nparts = globalNum[1] + 1; part_t globalSumSize = nparts * numMetrics; scalar_t * sumBuf = new scalar_t [globalSumSize]; env->localMemoryAssertion(__FILE__, __LINE__, globalSumSize, sumBuf); globalSums = arcp(sumBuf, 0, globalSumSize); ////////////////////////////////////////////////////////// // Calculate the local totals by part. 
scalar_t *localBuf = new scalar_t [globalSumSize]; env->localMemoryAssertion(__FILE__,__LINE__,globalSumSize,localBuf); memset(localBuf, 0, sizeof(scalar_t) * globalSumSize); scalar_t *cut = localBuf; // # of cuts ArrayView<const gno_t> Ids; ArrayView<input_t> vwgts; graph->getVertexList(Ids, vwgts); // fill the views; the returned vertex count is unused here ArrayView<const gno_t> edgeIds; ArrayView<const lno_t> offsets; ArrayView<input_t> wgts; graph->getEdgeList(edgeIds, offsets, wgts); // fill the views; the returned edge count is unused here // ************************************************************************** // *************************** BUILD MAP FOR ADJS *************************** // ************************************************************************** RCP<const map_type> vertexMapG; // Build a list of the global vertex ids... gno_t min = std::numeric_limits<gno_t>::max(); size_t maxcols = 0; for (lno_t i = 0; i < localNumObj; ++i) { if (Ids[i] < min) min = Ids[i]; size_t ncols = offsets[i+1] - offsets[i]; if (ncols > maxcols) maxcols = ncols; } gno_t gmin; Teuchos::reduceAll<int, gno_t>(*comm,Teuchos::REDUCE_MIN,1,&min,&gmin); //Generate Map for vertex vertexMapG = rcp(new map_type(INVALID, Ids, gmin, comm)); // ************************************************************************** // ************************** BUILD GRAPH FOR ADJS ************************** // ************************************************************************** //MD:Zoltan Directory could be used instead of adjMatrix. RCP<sparse_matrix_type> adjsMatrix; // Construct Tpetra::CrsGraph objects. adjsMatrix = rcp (new sparse_matrix_type (vertexMapG, 0)); Array<part_t> justOneA(maxcols, 1); for (lno_t localElement=0; localElement<localNumObj; ++localElement){ // Insert all columns for global row Ids[localElement] size_t ncols = offsets[localElement+1] - offsets[localElement]; adjsMatrix->insertGlobalValues(Ids[localElement], edgeIds(offsets[localElement], ncols), justOneA(0, ncols)); } //Fill-complete adjs Graph adjsMatrix->fillComplete (); // Compute part RCP<vector_t> scaleVec = Teuchos::rcp( new vector_t(vertexMapG,false) ); for (lno_t localElement=0; localElement<localNumObj; ++localElement) { scaleVec->replaceLocalValue(localElement,part[localElement]); } // Postmultiply adjsMatrix by part adjsMatrix->rightScale(*scaleVec); Array<gno_t> Indices; Array<part_t> Values; for (lno_t i=0; i < localNumObj; i++) { const gno_t globalRow = Ids[i]; size_t NumEntries = adjsMatrix->getNumEntriesInGlobalRow (globalRow); Indices.resize (NumEntries); Values.resize (NumEntries); adjsMatrix->getGlobalRowCopy (globalRow,Indices(),Values(),NumEntries); for (size_t j=0; j < NumEntries; j++) if (part[i] != Values[j]) cut[part[i]]++; } if (numMetrics > 1) { scalar_t *wgt = localBuf + nparts; // weight 0 // This code assumes the solution has the part ordered the // same way as the user input. (Bug 5891 is resolved.) for (int edim = 0; edim < ewgtDim; edim++){ for (lno_t i=0; i < localNumObj; i++) { const gno_t globalRow = Ids[i]; size_t NumEntries = adjsMatrix->getNumEntriesInGlobalRow (globalRow); Indices.resize (NumEntries); Values.resize (NumEntries); adjsMatrix->getGlobalRowCopy (globalRow,Indices(),Values(),NumEntries); for (size_t j=0; j < NumEntries; j++) if (part[i] != Values[j]) wgt[part[i]] += wgts[edim][offsets[i] + j]; } wgt += nparts; // individual weights } } ////////////////////////////////////////////////////////// // Obtain global totals by part. 
try{ reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM, globalSumSize, localBuf, sumBuf); } Z2_THROW_OUTSIDE_ERROR(*env); delete [] localBuf; ////////////////////////////////////////////////////////// // Global max and sum over all parts cut = sumBuf; // # of cuts scalar_t max=0, sum=0; ArrayView<scalar_t> cutVec(cut, nparts); getStridedStats<scalar_t>(cutVec, 1, 0, max, sum); metrics[next]->setName("edge cuts"); metrics[next]->setMetricValue("global maximum", max); metrics[next]->setMetricValue("global sum", sum); next++; if (numMetrics > 1){ scalar_t *wgt = sumBuf + nparts; // weight 0 for (int edim=0; edim < ewgtDim; edim++){ ArrayView<scalar_t> fromVec(wgt, nparts); getStridedStats<scalar_t>(fromVec, 1, 0, max, sum); std::ostringstream oss; oss << "weight " << edim; metrics[next]->setName(oss.str()); metrics[next]->setMetricValue("global maximum", max); metrics[next]->setMetricValue("global sum", sum); next++; wgt += nparts; // individual weights } } numParts = nparts; env->debug(DETAILED_STATUS, "Exiting globalWeightedCutsByPart"); }
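// Layout note for the reduction above: sumBuf (and localBuf before it) holds
// numMetrics consecutive blocks of nparts entries each:
//   sumBuf[0 .. nparts)          number of cut edges per part ("edge cuts")
//   sumBuf[nparts .. 2*nparts)   cut weight per part for edge weight dim 0
//   ...                          one further block per weight dimension
// getStridedStats(view, 1, 0, max, sum) then reports the maximum and the sum
// over the parts of a single block, which is what is stored in the metrics.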
void AggregationPhase2aAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const { Monitor m(*this, "BuildAggregates"); LO minNodesPerAggregate = params.get<LO>("aggregation: min agg size"); LO maxNodesPerAggregate = params.get<LO>("aggregation: max agg size"); const LO numRows = graph.GetNodeNumVertices(); const int myRank = graph.GetComm()->getRank(); ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0); ArrayRCP<LO> procWinner = aggregates.GetProcWinner() ->getDataNonConst(0); LO numLocalAggregates = aggregates.GetNumAggregates(); LO numLocalNodes = procWinner.size(); LO numLocalAggregated = numLocalNodes - numNonAggregatedNodes; const double aggFactor = 0.5; double factor = as<double>(numLocalAggregated)/(numLocalNodes+1); factor = pow(factor, aggFactor); int aggIndex = -1; size_t aggSize = 0; std::vector<int> aggList(graph.getNodeMaxNumRowEntries()); for (LO rootCandidate = 0; rootCandidate < numRows; rootCandidate++) { if (aggStat[rootCandidate] != READY) continue; aggSize = 0; ArrayView<const LocalOrdinal> neighOfINode = graph.getNeighborVertices(rootCandidate); LO numNeighbors = 0; for (int j = 0; j < neighOfINode.size(); j++) { LO neigh = neighOfINode[j]; if (neigh != rootCandidate) { if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == READY) { // If the aggregate size does not exceed the max size, add the node to the tentative aggregate // NOTE: We do not exit the loop over all neighbours since we still have // to count all aggregated neighbour nodes for the aggregation criteria // NOTE: We check here for the maximum aggregation size. If we did it below // with all the other checks, too-big aggregates would not be accepted at all. if (aggSize < as<size_t>(maxNodesPerAggregate)) aggList[aggSize++] = neigh; } numNeighbors++; } } // NOTE: ML uses a hardcoded value 3 instead of MinNodesPerAggregate if (aggSize > as<size_t>(minNodesPerAggregate) && aggSize > factor*numNeighbors) { // Accept new aggregate // rootCandidate becomes the root of the newly formed aggregate aggregates.SetIsRoot(rootCandidate); aggIndex = numLocalAggregates++; for (size_t k = 0; k < aggSize; k++) { aggStat [aggList[k]] = AGGREGATED; vertex2AggId[aggList[k]] = aggIndex; procWinner [aggList[k]] = myRank; } numNonAggregatedNodes -= aggSize; } } // update aggregate object aggregates.SetNumAggregates(numLocalAggregates); }
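// Worked example of the acceptance test above (illustrative numbers): with
// numLocalNodes = 100 and numLocalAggregated = 60,
//   factor = (60/101)^0.5 ~= 0.77,
// so a root candidate with numNeighbors = 8 needs aggSize > 0.77*8 ~= 6.2,
// i.e. at least 7 of its neighbors must be local READY vertices collected in
// aggList, in addition to aggSize exceeding "aggregation: min agg size".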
void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const { FactoryMonitor m(*this, "Build", currentLevel); const Teuchos::ParameterList & pL = GetParameterList(); // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation. // TODO (JG): I don't really know if we want to do this. const int startLevel = pL.get<int> ("repartition: start level"); const LO minRowsPerProcessor = pL.get<LO> ("repartition: min rows per proc"); const double nonzeroImbalance = pL.get<double>("repartition: max imbalance"); const bool remapPartitions = pL.get<bool> ("repartition: remap parts"); // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types. RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A"); // ====================================================================================================== // Determine whether partitioning is needed // ====================================================================================================== // NOTE: most tests include some global communication, which is why we currently only do tests until we make // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to // rebalance an operator. So, it would probably be beneficial to do and report *all* tests. // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning if (currentLevel.GetLevelID() < startLevel) { GetOStream(Statistics0) << "Repartitioning? NO:" << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } RCP<const Map> rowMap = A->getRowMap(); // NOTE: Teuchos::MPIComm::duplicate() calls MPI_Bcast inside, so this is // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it // does not matter. RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm(); RCP<const Teuchos::Comm<int> > comm = origComm->duplicate(); // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created) // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size { int numActiveProcesses = 0; MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses); if (numActiveProcesses == 1) { GetOStream(Statistics0) << "Repartitioning? NO:" << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } } bool test3 = false, test4 = false; std::string msg3, msg4; // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement // NOTE: Test2 ensures that repartitioning is not done when there is only one processor (it may or may not satisfy Test3) if (minRowsPerProcessor > 0) { LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max(); LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0; MueLu_sumAll(comm, haveFewRows, numWithFewRows); MueLu_minAll(comm, (numMyRows > 0 ? 
numMyRows : LOMAX), minNumRows); // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements. // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement. if (numWithFewRows > 0) test3 = true; msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor); } // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold if (!test3) { GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries()); MueLu_maxAll(comm, numMyNnz, maxNnz); MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors double imbalance = Teuchos::as<double>(maxNnz)/minNnz; if (imbalance > nonzeroImbalance) test4 = true; msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance); } if (!test3 && !test4) { GetOStream(Statistics0) << "Repartitioning? NO:" << msg3 + msg4 << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl; GO indexBase = rowMap->getIndexBase(); Xpetra::UnderlyingLib lib = rowMap->lib(); int myRank = comm->getRank(); int numProcs = comm->getSize(); RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm); TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object."); RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm(); // ====================================================================================================== // Calculate number of partitions // ====================================================================================================== // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML) // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold. GO numPartitions; if (currentLevel.IsAvailable("number of partitions")) { numPartitions = currentLevel.Get<GO>("number of partitions"); GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl; } else { if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) { // System is too small, migrate it to a single processor numPartitions = 1; } else { // Make sure that each processor has approximately minRowsPerProcessor numPartitions = A->getGlobalNumRows() / minRowsPerProcessor; } numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs)); currentLevel.Set("number of partitions", numPartitions, NoFactory::get()); } GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl; // ====================================================================================================== // Construct decomposition vector // ====================================================================================================== RCP<GOVector> decomposition; if (numPartitions == 1) { // Trivial case: decomposition is the trivial one, all zeros. 
We skip the call to Zoltan_Interface // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut // in Zoltan[12]Interface). // TODO: We can probably skip more work in this case (like building all extra data structures) GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl; decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true); } else { decomposition = Get<RCP<GOVector> >(currentLevel, "Partition"); if (decomposition.is_null()) { GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl; Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null); return; } } // ====================================================================================================== // Remap if necessary // ====================================================================================================== // From a user perspective, we want the user to not care about remapping, thinking of it as only a performance feature. // There are two problems, however. // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings. // This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for // each partition is the same. // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level. // Let us consider the following example. Let's assume that when we don't do remapping, processor 0 would have // GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and // processor 1 {0,1,2}. Now, when we run the repartitioning algorithm on the next level (say Zoltan1 RCB), it may // be dependent on whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking // algorithm can resolve these differently. For instance, running // mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12 // with // <ParameterList name="MueLu"> // <Parameter name="coarse: max size" type="int" value="1"/> // <Parameter name="repartition: enable" type="bool" value="true"/> // <Parameter name="repartition: min rows per proc" type="int" value="2"/> // <ParameterList name="level 1"> // <Parameter name="repartition: remap parts" type="bool" value="false/true"/> // </ParameterList> // </ParameterList> // produces different repartitioning for level 2. // This different repartitioning may then escalate into different aggregation for the next level. // // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector. // Fixing (2) is more complicated. // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu, // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing // the same order of GIDs. To achieve that, the newly created subcommunicator must conform to the order. For // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1 // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1. // This rank permutation requires help from Epetra/Tpetra, both of which have no such API in place. 
// One should also be concerned that if we had such an API in place, rank 0 in the subcommunicator may no longer be rank 0 in // MPI_COMM_WORLD, which may lead to issues for logging. if (remapPartitions) { SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel); DeterminePartitionPlacement(*A, *decomposition, numPartitions); } // ====================================================================================================== // Construct importer // ====================================================================================================== // At this point, the following is true: // * Each processor owns 0 or 1 partitions // * If a processor owns a partition, that partition number is equal to the processor rank // * The decomposition vector contains the partition ids that the corresponding GID belongs to ArrayRCP<const GO> decompEntries; if (decomposition->getLocalLength() > 0) decompEntries = decomposition->getData(0); #ifdef HAVE_MUELU_DEBUG // Test range of partition ids int incorrectRank = -1; for (int i = 0; i < decompEntries.size(); i++) if (decompEntries[i] >= numProcs || decompEntries[i] < 0) { incorrectRank = myRank; break; } int incorrectGlobalRank = -1; MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank); TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number that is out-of-range"); #endif Array<GO> myGIDs; myGIDs.reserve(decomposition->getLocalLength()); // Step 0: Construct mapping // part number -> GIDs I own which belong to this part // NOTE: my own part GIDs are not part of the map typedef std::map<GO, Array<GO> > map_type; map_type sendMap; for (LO i = 0; i < decompEntries.size(); i++) { GO id = decompEntries[i]; GO GID = rowMap->getGlobalElement(i); if (id == myRank) myGIDs .push_back(GID); else sendMap[id].push_back(GID); } decompEntries = Teuchos::null; if (IsPrint(Statistics2)) { GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows(); MueLu_sumAll(comm,numLocalKept, numGlobalKept); GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl; } int numSend = sendMap.size(), numRecv; // Arrayify map keys Array<GO> myParts(numSend), myPart(1); int cnt = 0; myPart[0] = myRank; for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++) myParts[cnt++] = it->first; // Step 1: Find out how many processors send me data // partsIndexBase starts from zero, as the processor ids start from zero GO partsIndexBase = 0; RCP<Map> partsIHave = MapFactory ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm); RCP<Map> partsIOwn = MapFactory ::Build(lib, numProcs, myPart(), partsIndexBase, comm); RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn); RCP<GOVector> partsISend = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave); RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn); if (numSend) { ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0); for (int i = 0; i < numSend; i++) partsISendData[i] = 1; } (numPartsIRecv->getDataNonConst(0))[0] = 0; numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD); numRecv = (numPartsIRecv->getData(0))[0]; // Step 2: Get my GIDs from everybody else MPI_Datatype MpiType = MpiTypeTraits<GO>::getType(); int msgTag = 12345; //
TODO: use Comm::dup for all internal messaging // Post sends Array<MPI_Request> sendReqs(numSend); cnt = 0; for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++) MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]); map_type recvMap; size_t totalGIDs = myGIDs.size(); for (int i = 0; i < numRecv; i++) { MPI_Status status; MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status); // Get rank and number of elements from status int fromRank = status.MPI_SOURCE, count; MPI_Get_count(&status, MpiType, &count); recvMap[fromRank].resize(count); MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status); totalGIDs += count; } // Do waits on send requests if (numSend) { Array<MPI_Status> sendStatuses(numSend); MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr()); } // Merge GIDs myGIDs.reserve(totalGIDs); for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) { int offset = myGIDs.size(), len = it->second.size(); if (len) { myGIDs.resize(offset + len); memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO)); } } // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation. std::sort(myGIDs.begin(), myGIDs.end()); // Step 3: Construct importer RCP<Map> newRowMap = MapFactory ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm); RCP<const Import> rowMapImporter; { SubFactoryMonitor m1(*this, "Import construction", currentLevel); rowMapImporter = ImportFactory::Build(rowMap, newRowMap); } Set(currentLevel, "Importer", rowMapImporter); // ====================================================================================================== // Print some data // ====================================================================================================== if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) { // Print the grid of processors GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl; char amActive = (myGIDs.size() ? 1 : 0); std::vector<char> areActive(numProcs, 0); MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm); int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100); for (int proc = 0; proc < numProcs; proc += rowWidth) { for (int j = 0; j < rowWidth; j++) if (proc + j < numProcs) GetOStream(Statistics2) << (areActive[proc + j] ? "+" : "."); else GetOStream(Statistics2) << " "; GetOStream(Statistics2) << " " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl; } } } // Build
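// ------------------------------------------------------------------------------------------------------
// Editor's note: Step 2 above uses the classic probe-then-receive idiom for variable-length messages:
// senders post MPI_Isend, and the receiver calls MPI_Probe with MPI_ANY_SOURCE, then MPI_Get_count, so
// it can size the buffer before MPI_Recv. A minimal standalone sketch of that idiom follows (plain MPI,
// illustrative payload and tag; not MueLu code).
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);
  int rank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  const int tag = 12345;                                // mirrors msgTag above

  std::vector<int> payload(rank + 1, rank);             // rank r sends r+1 copies of r
  MPI_Request req;
  MPI_Isend(payload.data(), (int)payload.size(), MPI_INT, 0, tag, MPI_COMM_WORLD, &req);

  if (rank == 0) {
    for (int i = 0; i < nprocs; i++) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &status);  // learn sender and size first
      int from = status.MPI_SOURCE, count;
      MPI_Get_count(&status, MPI_INT, &count);
      std::vector<int> buf(count);                              // allocate only after probing
      MPI_Recv(buf.data(), count, MPI_INT, from, tag, MPI_COMM_WORLD, &status);
      std::printf("rank 0 received %d ints from rank %d\n", count, from);
    }
  }
  MPI_Wait(&req, MPI_STATUS_IGNORE);
  MPI_Finalize();
  return 0;
}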
int main(int argc, char *argv[]) { #include <MueLu_UseShortNames.hpp> using Teuchos::RCP; using Teuchos::rcp; using Teuchos::ArrayRCP; using Teuchos::TimeMonitor; using Teuchos::ParameterList; // ========================================================================= // MPI initialization using Teuchos // ========================================================================= Teuchos::GlobalMPISession mpiSession(&argc, &argv, NULL); RCP< const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm(); // ========================================================================= // Convenient definitions // ========================================================================= typedef Teuchos::ScalarTraits<SC> STS; SC zero = STS::zero(), one = STS::one(); // ========================================================================= // Parameters initialization // ========================================================================= Teuchos::CommandLineProcessor clp(false); GO nx = 100, ny = 100, nz = 100; Galeri::Xpetra::Parameters<GO> galeriParameters(clp, nx, ny, nz, "Laplace2D"); // manage parameters of the test case Xpetra::Parameters xpetraParameters(clp); // manage parameters of Xpetra std::string xmlFileName = "scalingTest.xml"; clp.setOption("xml", &xmlFileName, "read parameters from a file [default = 'scalingTest.xml']"); bool printTimings = true; clp.setOption("timings", "notimings", &printTimings, "print timings to screen"); int writeMatricesOPT = -2; clp.setOption("write", &writeMatricesOPT, "write matrices to file (-1 means all; i>=0 means level i)"); std::string dsolveType = "cg", solveType; clp.setOption("solver", &dsolveType, "solve type: (none | cg | gmres | standalone)"); double dtol = 1e-12, tol; clp.setOption("tol", &dtol, "solver convergence tolerance"); std::string mapFile; clp.setOption("map", &mapFile, "map data file"); std::string matrixFile; clp.setOption("matrix", &matrixFile, "matrix data file"); std::string coordFile; clp.setOption("coords", &coordFile, "coordinates data file"); int numRebuilds = 0; clp.setOption("rebuild", &numRebuilds, "#times to rebuild hierarchy"); int maxIts = 200; clp.setOption("its", &maxIts, "maximum number of solver iterations"); bool scaleResidualHistory = true; clp.setOption("scale", "noscale", &scaleResidualHistory, "scaled Krylov residual history"); switch (clp.parse(argc, argv)) { case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED: return EXIT_SUCCESS; case Teuchos::CommandLineProcessor::PARSE_ERROR: case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE; case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: break; } Xpetra::UnderlyingLib lib = xpetraParameters.GetLib(); ParameterList paramList; Teuchos::updateParametersFromXmlFileAndBroadcast(xmlFileName, Teuchos::Ptr<ParameterList>(&paramList), *comm); bool isDriver = paramList.isSublist("Run1"); if (isDriver) { // update galeriParameters with the values from the XML file ParameterList& realParams = galeriParameters.GetParameterList(); for (ParameterList::ConstIterator it = realParams.begin(); it != realParams.end(); it++) { const std::string& name = realParams.name(it); if (paramList.isParameter(name)) realParams.setEntry(name, paramList.getEntry(name)); } } // Retrieve matrix parameters (they may have been changed on the command line) // [for instance, if we changed matrix type from 2D to 3D we need to update nz] ParameterList galeriList = galeriParameters.GetParameterList(); //
========================================================================= // Problem construction // ========================================================================= std::ostringstream galeriStream; comm->barrier(); RCP<TimeMonitor> globalTimeMonitor = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: S - Global Time"))); RCP<TimeMonitor> tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 1 - Matrix Build"))); RCP<Matrix> A; RCP<const Map> map; RCP<MultiVector> coordinates; RCP<MultiVector> nullspace; if (matrixFile.empty()) { galeriStream << "========================================================\n" << xpetraParameters << galeriParameters; // Galeri will attempt to create a square-as-possible distribution of subdomains di, e.g., // d1 d2 d3 // d4 d5 d6 // d7 d8 d9 // d10 d11 d12 // A perfect distribution is only possible when the #processors is a perfect square. // This *will* result in a "strip" distribution if the #processors is a prime number or if the factors are very different in // size. For example, np=14 will give a 7-by-2 distribution. // If you don't want Galeri to do this, specify mx or my on the galeriList. std::string matrixType = galeriParameters.GetMatrixType(); // Create map and coordinates // In the future, we hope to be able to first create a Galeri problem, and then request map and coordinates from it // At the moment, however, things are fragile as we hope that the Problem uses the same map and coordinates inside if (matrixType == "Laplace1D") { map = Galeri::Xpetra::CreateMap<LO, GO, Node>(xpetraParameters.GetLib(), "Cartesian1D", comm, galeriList); coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("1D", map, galeriList); } else if (matrixType == "Laplace2D" || matrixType == "Star2D" || matrixType == "BigStar2D" || matrixType == "Elasticity2D") { map = Galeri::Xpetra::CreateMap<LO, GO, Node>(xpetraParameters.GetLib(), "Cartesian2D", comm, galeriList); coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("2D", map, galeriList); } else if (matrixType == "Laplace3D" || matrixType == "Brick3D" || matrixType == "Elasticity3D") { map = Galeri::Xpetra::CreateMap<LO, GO, Node>(xpetraParameters.GetLib(), "Cartesian3D", comm, galeriList); coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("3D", map, galeriList); } // Expand map to do multiple DOF per node for block problems if (matrixType == "Elasticity2D") map = Xpetra::MapFactory<LO,GO,Node>::Build(map, 2); if (matrixType == "Elasticity3D") map = Xpetra::MapFactory<LO,GO,Node>::Build(map, 3); galeriStream << "Processor subdomains in x direction: " << galeriList.get<int>("mx") << std::endl << "Processor subdomains in y direction: " << galeriList.get<int>("my") << std::endl << "Processor subdomains in z direction: " << galeriList.get<int>("mz") << std::endl << "========================================================" << std::endl; if (matrixType == "Elasticity2D" || matrixType == "Elasticity3D") { // Our default test case for elasticity: all boundaries of a square/cube have Neumann b.c.
except the left, which has Dirichlet galeriList.set("right boundary" , "Neumann"); galeriList.set("bottom boundary", "Neumann"); galeriList.set("top boundary" , "Neumann"); galeriList.set("front boundary" , "Neumann"); galeriList.set("back boundary" , "Neumann"); } RCP<Galeri::Xpetra::Problem<Map,CrsMatrixWrap,MultiVector> > Pr = Galeri::Xpetra::BuildProblem<SC,LO,GO,Map,CrsMatrixWrap,MultiVector>(galeriParameters.GetMatrixType(), map, galeriList); A = Pr->BuildMatrix(); if (matrixType == "Elasticity2D" || matrixType == "Elasticity3D") { nullspace = Pr->BuildNullspace(); A->SetFixedBlockSize((galeriParameters.GetMatrixType() == "Elasticity2D") ? 2 : 3); } } else { if (!mapFile.empty()) map = Utils2::ReadMap(mapFile, xpetraParameters.GetLib(), comm); comm->barrier(); if (lib == Xpetra::UseEpetra) { A = Utils::Read(matrixFile, map); } else { // Tpetra matrix reader is still broken, so instead we read in // a matrix in a binary format and then redistribute it const bool binaryFormat = true; A = Utils::Read(matrixFile, lib, comm, binaryFormat); RCP<Matrix> newMatrix = MatrixFactory::Build(map, 1); RCP<Import> importer = ImportFactory::Build(A->getRowMap(), map); newMatrix->doImport(*A, *importer, Xpetra::INSERT); newMatrix->fillComplete(); A.swap(newMatrix); } comm->barrier(); if (!coordFile.empty()) coordinates = Utils2::ReadMultiVector(coordFile, map); } comm->barrier(); tm = Teuchos::null; galeriStream << "Galeri complete.\n========================================================" << std::endl; int numReruns = 1; if (paramList.isParameter("number of reruns")) numReruns = paramList.get<int>("number of reruns"); const bool mustAlreadyExist = true; for (int rerunCount = 1; rerunCount <= numReruns; rerunCount++) { ParameterList mueluList, runList; bool stop = false; if (isDriver) { runList = paramList.sublist("Run1", mustAlreadyExist); mueluList = runList .sublist("MueLu", mustAlreadyExist); } else { mueluList = paramList; stop = true; } if (nullspace.is_null()) { int blkSize = 1; if (mueluList.isSublist("Matrix")) { // Factory style parameter list const Teuchos::ParameterList& operatorList = paramList.sublist("Matrix"); if (operatorList.isParameter("PDE equations")) blkSize = operatorList.get<int>("PDE equations"); } else if (paramList.isParameter("number of equations")) { // Easy style parameter list blkSize = paramList.get<int>("number of equations"); } nullspace = MultiVectorFactory::Build(map, blkSize); for (int i = 0; i < blkSize; i++) { RCP<const Map> domainMap = A->getDomainMap(); GO indexBase = domainMap->getIndexBase(); ArrayRCP<SC> nsData = nullspace->getDataNonConst(i); for (int j = 0; j < nsData.size(); j++) { GO GID = domainMap->getGlobalElement(j) - indexBase; if ((GID-i) % blkSize == 0) nsData[j] = Teuchos::ScalarTraits<SC>::one(); } } } int runCount = 1; do { A->SetMaxEigenvalueEstimate(-one); solveType = dsolveType; tol = dtol; int savedOut = -1; FILE* openedOut = NULL; if (isDriver) { if (runList.isParameter("filename")) { // Redirect all output into a file. We have to redirect all output, // including printf's, therefore we cannot simply replace C++ cout // buffers, and have to use heavy machinery (dup2) std::string filename = runList.get<std::string>("filename"); if (numReruns > 1) filename += "_run" + MueLu::toString(rerunCount); filename += (lib == Xpetra::UseEpetra ?
".epetra" : ".tpetra"); savedOut = dup(STDOUT_FILENO); openedOut = fopen(filename.c_str(), "w"); dup2(fileno(openedOut), STDOUT_FILENO); } if (runList.isParameter("solver")) solveType = runList.get<std::string>("solver"); if (runList.isParameter("tol")) tol = runList.get<double> ("tol"); } // Instead of checking each time for rank, create a rank 0 stream RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout)); Teuchos::FancyOStream& out = *fancy; out.setOutputToRootOnly(0); out << galeriStream.str(); // ========================================================================= // Preconditioner construction // ========================================================================= comm->barrier(); tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 1.5 - MueLu read XML"))); RCP<HierarchyManager> mueLuFactory = rcp(new ParameterListInterpreter(mueluList)); comm->barrier(); tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 2 - MueLu Setup"))); RCP<Hierarchy> H; for (int i = 0; i <= numRebuilds; i++) { A->SetMaxEigenvalueEstimate(-one); H = mueLuFactory->CreateHierarchy(); H->GetLevel(0)->Set("A", A); H->GetLevel(0)->Set("Nullspace", nullspace); if (!coordinates.is_null()) H->GetLevel(0)->Set("Coordinates", coordinates); mueLuFactory->SetupHierarchy(*H); } comm->barrier(); tm = Teuchos::null; // ========================================================================= // System solution (Ax = b) // ========================================================================= comm->barrier(); tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 3 - LHS and RHS initialization"))); RCP<Vector> X = VectorFactory::Build(map); RCP<Vector> B = VectorFactory::Build(map); { // we set seed for reproducibility Utils::SetRandomSeed(*comm); X->randomize(); A->apply(*X, *B, Teuchos::NO_TRANS, one, zero); Teuchos::Array<STS::magnitudeType> norms(1); B->norm2(norms); B->scale(one/norms[0]); X->putScalar(zero); } tm = Teuchos::null; if (writeMatricesOPT > -2) { tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 3.5 - Matrix output"))); H->Write(writeMatricesOPT, writeMatricesOPT); tm = Teuchos::null; } comm->barrier(); if (solveType == "none") { // Do not perform a solve } else if (solveType == "standalone") { tm = rcp (new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 4 - Fixed Point Solve"))); H->IsPreconditioner(false); H->Iterate(*B, *X, maxIts); } else if (solveType == "cg" || solveType == "gmres") { #ifdef HAVE_MUELU_BELOS tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 5 - Belos Solve"))); // Operator and Multivector type that will be used with Belos typedef MultiVector MV; typedef Belos::OperatorT<MV> OP; H->IsPreconditioner(true); // Define Operator and Preconditioner Teuchos::RCP<OP> belosOp = Teuchos::rcp(new Belos::XpetraOp<SC, LO, GO, NO, LMO>(A)); // Turns a Xpetra::Matrix object into a Belos operator Teuchos::RCP<OP> belosPrec = Teuchos::rcp(new Belos::MueLuOp <SC, LO, GO, NO, LMO>(H)); // Turns a MueLu::Hierarchy object into a Belos operator // Construct a Belos LinearProblem object RCP< Belos::LinearProblem<SC, MV, OP> > belosProblem = rcp(new Belos::LinearProblem<SC, MV, OP>(belosOp, X, B)); belosProblem->setRightPrec(belosPrec); bool set = belosProblem->setProblem(); if (set == false) { out << "\nERROR: Belos::LinearProblem failed to set up correctly!" 
<< std::endl; return EXIT_FAILURE; } // Belos parameter list Teuchos::ParameterList belosList; belosList.set("Maximum Iterations", maxIts); // Maximum number of iterations allowed belosList.set("Convergence Tolerance", tol); // Relative convergence tolerance requested belosList.set("Verbosity", Belos::Errors + Belos::Warnings + Belos::StatusTestDetails); belosList.set("Output Frequency", 1); belosList.set("Output Style", Belos::Brief); if (!scaleResidualHistory) belosList.set("Implicit Residual Scaling", "None"); // Create an iterative solver manager RCP< Belos::SolverManager<SC, MV, OP> > solver; if (solveType == "cg") { solver = rcp(new Belos::PseudoBlockCGSolMgr <SC, MV, OP>(belosProblem, rcp(&belosList, false))); } else if (solveType == "gmres") { solver = rcp(new Belos::BlockGmresSolMgr<SC, MV, OP>(belosProblem, rcp(&belosList, false))); } // Perform solve Belos::ReturnType ret = Belos::Unconverged; try { ret = solver->solve(); // Get the number of iterations for this solve. out << "Number of iterations performed for this solve: " << solver->getNumIters() << std::endl; } catch(...) { out << std::endl << "ERROR: Belos threw an error! " << std::endl; } // Check convergence if (ret != Belos::Converged) out << std::endl << "ERROR: Belos did not converge! " << std::endl; else out << std::endl << "SUCCESS: Belos converged!" << std::endl; #endif //ifdef HAVE_MUELU_BELOS } else { throw MueLu::Exceptions::RuntimeError("Unknown solver type: \"" + solveType + "\""); } comm->barrier(); tm = Teuchos::null; globalTimeMonitor = Teuchos::null; if (printTimings) TimeMonitor::summarize(A->getRowMap()->getComm().ptr(), std::cout, false, true, false, Teuchos::Union); TimeMonitor::clearCounters(); if (isDriver) { if (openedOut != NULL) { dup2(savedOut, STDOUT_FILENO); fclose(openedOut); openedOut = NULL; } try { runList = paramList.sublist("Run" + MueLu::toString(++runCount), mustAlreadyExist); mueluList = runList .sublist("MueLu", mustAlreadyExist); } catch (const std::exception&) { stop = true; } } } while (stop == false); } return 0; } //main
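// ------------------------------------------------------------------------------------------------------
// Editor's note: the driver above redirects output at the file-descriptor level (dup/dup2) precisely so
// that printf-style output and C++ streams both land in the file. A self-contained sketch of the same
// dance (POSIX only; the filename is illustrative):
#include <cstdio>
#include <iostream>
#include <unistd.h>

int main() {
  int savedOut = dup(STDOUT_FILENO);        // remember where stdout currently goes
  FILE* f = std::fopen("run1.log", "w");
  dup2(fileno(f), STDOUT_FILENO);           // stdout now points at the file

  std::printf("printf goes to the file\n");
  std::cout << "so does std::cout" << std::endl;
  std::fflush(stdout);                      // flush before switching back

  dup2(savedOut, STDOUT_FILENO);            // restore the original stdout
  close(savedOut);
  std::fclose(f);
  std::cout << "back on the terminal" << std::endl;
  return 0;
}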
void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::RandomReorder(ArrayRCP<LO> list) const { //TODO: replace int int n = list.size(); for(int i = 0; i < n-1; i++) std::swap(list[i], list[RandomOrdinal(i,n-1)]); }
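// ------------------------------------------------------------------------------------------------------
// Editor's note: RandomReorder is a forward Fisher-Yates shuffle: slot i receives a uniformly chosen
// element from positions [i, n-1], so every permutation is equally likely. A sketch of the same
// algorithm with standard-library pieces (std::mt19937 standing in for MueLu's RandomOrdinal):
#include <numeric>
#include <random>
#include <utility>
#include <vector>

void randomReorder(std::vector<int>& list, std::mt19937& gen) {
  const int n = static_cast<int>(list.size());
  for (int i = 0; i < n - 1; i++) {
    std::uniform_int_distribution<int> pick(i, n - 1);  // inclusive bounds, as in RandomOrdinal(i, n-1)
    std::swap(list[i], list[pick(gen)]);
  }
}

int main() {
  std::vector<int> ids(10);
  std::iota(ids.begin(), ids.end(), 0);  // 0..9
  std::mt19937 gen(42);
  randomReorder(ids, gen);
  return 0;
}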
int main(int argc, char *argv[]) { Teuchos::GlobalMPISession session(&argc, &argv); RCP<const Comm<int> > comm = Teuchos::DefaultComm<int>::getComm(); int nprocs = comm->getSize(); int rank = comm->getRank(); int fail=0, gfail=0; double epsilon = 10e-6; //////////////// // Arrays to hold part Ids and part Sizes for each weight int numIdsPerProc = 10; int maxNumWeights = 3; int maxNumPartSizes = nprocs; int *lengths = new int [maxNumWeights]; part_t **idLists = new part_t * [maxNumWeights]; scalar_t **sizeLists = new scalar_t * [maxNumWeights]; for (int w=0; w < maxNumWeights; w++){ idLists[w] = new part_t [maxNumPartSizes]; sizeLists[w] = new scalar_t [maxNumPartSizes]; } ///////////// // A default environment RCP<const Zoltan2::Environment> env = rcp(new Zoltan2::Environment); ///////////// // A simple identifier map. gno_t *myGids = new gno_t [numIdsPerProc]; for (int i=0, x=rank*numIdsPerProc; i < numIdsPerProc; i++){ myGids[i] = x++; } ArrayRCP<const gno_t> gidArray(myGids, 0, numIdsPerProc, true); RCP<const Zoltan2::IdentifierMap<user_t> > idMap = rcp(new Zoltan2::IdentifierMap<user_t>(env, comm, gidArray)); ///////////// // TEST: // One weight, one part per proc. // Some part sizes are 2 and some are 1. int numGlobalParts = nprocs; int nWeights = 1; ArrayRCP<ArrayRCP<part_t> > ids; ArrayRCP<ArrayRCP<scalar_t> > sizes; memset(lengths, 0, sizeof(int) * maxNumWeights); lengths[0] = 1; // We give a size for 1 part. idLists[0][0] = rank; // The part is my part. sizeLists[0][0] = rank%2 + 1.0; // The size is 1.0 or 2.0 makeArrays(1, lengths, idLists, sizeLists, ids, sizes); // Normalized part size for every part, for checking later on scalar_t *normalizedPartSizes = new scalar_t [numGlobalParts]; scalar_t sumSizes=0; for (int i=0; i < numGlobalParts; i++){ normalizedPartSizes[i] = 1.0; if (i % 2) normalizedPartSizes[i] = 2.0; sumSizes += normalizedPartSizes[i]; } for (int i=0; i < numGlobalParts; i++) normalizedPartSizes[i] /= sumSizes; ///////////// // Create a solution object with part size information, and check it. 
RCP<Zoltan2::PartitioningSolution<idInput_t> > solution; try{ solution = rcp(new Zoltan2::PartitioningSolution<idInput_t>( env, // application environment info comm, // problem communicator idMap, // problem identifiers (global Ids, local Ids) nWeights, // number of weights ids.view(0,nWeights), // part ids sizes.view(0,nWeights))); // part sizes } catch (std::exception &e){ fail=1; } TEST_FAIL_AND_EXIT(*comm, fail==0, "constructor call 1", 1); // Test the Solution queries that are used by algorithms if (solution->getTargetGlobalNumberOfParts() != size_t(numGlobalParts)) fail=2; if (!fail && solution->getLocalNumberOfParts() != 1) fail=3; if (!fail && !solution->oneToOnePartDistribution()) fail=4; if (!fail && solution->getPartDistribution() != NULL) fail=5; if (!fail && solution->getProcDistribution() != NULL) fail=6; if (!fail && ((nprocs>1 && solution->criteriaHasUniformPartSizes(0)) || (nprocs==1 && !solution->criteriaHasUniformPartSizes(0))) ) fail=8; if (!fail){ for (int partId=0; !fail && partId < numGlobalParts; partId++){ scalar_t psize = solution->getCriteriaPartSize(0, partId); if ( psize < normalizedPartSizes[partId] - epsilon || psize > normalizedPartSizes[partId] + epsilon ) fail=9; } } delete [] normalizedPartSizes; gfail = globalFail(comm, fail); if (gfail){ printFailureCode(comm, fail); // exits after printing "FAIL" } // Test the Solution set method that is called by algorithms part_t *partAssignments = new part_t [numIdsPerProc]; for (int i=0; i < numIdsPerProc; i++){ partAssignments[i] = myGids[i] % numGlobalParts; // round robin } ArrayRCP<part_t> partList = arcp(partAssignments, 0, numIdsPerProc); try{ solution->setParts(gidArray, partList, true); } catch (std::exception &e){ fail=10; } gfail = globalFail(comm, fail); if (gfail){ printFailureCode(comm, fail); // exits after printing "FAIL" } // Test the Solution get methods that may be called by users // or migration functions. if (solution->getLocalNumberOfIds() != size_t(numIdsPerProc)) fail = 11; if (!fail){ const gno_t *gids = solution->getIdList(); for (int i=0; !fail && i < numIdsPerProc; i++){ if (gids[i] != myGids[i]) fail = 12; } } if (!fail){ const part_t *parts = solution->getPartList(); for (int i=0; !fail && i < numIdsPerProc; i++){ if (parts[i] != myGids[i] % numGlobalParts) fail = 13; } } gfail = globalFail(comm, fail); if (gfail){ printFailureCode(comm, fail); // exits after printing "FAIL" } if (rank==0) std::cout << "PASS" << std::endl; /////////////////////////////////////////////////////////////////// // TODO: ///////////// // Create a solution object without part size information, and check it. ///////////// // Test multiple weights. ///////////// // Test multiple parts per process. ///////////// // Specify a list of parts of size 0. (The rest should be uniform.) delete [] lengths; for (int w=0; w < maxNumWeights; w++){ delete [] idLists[w]; delete [] sizeLists[w]; } delete [] idLists; delete [] sizeLists; }
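// ------------------------------------------------------------------------------------------------------
// Editor's note: the expected values checked above are the requested relative sizes (1.0 or 2.0 per
// part) normalized to sum to one, which is the form getCriteriaPartSize reports. The normalization in
// isolation (illustrative data):
#include <vector>

std::vector<double> normalizePartSizes(std::vector<double> sizes) {
  double sum = 0.0;
  for (double s : sizes) sum += s;     // total of the relative sizes
  for (double& s : sizes) s /= sum;    // each entry becomes a fraction of the whole
  return sizes;
}

int main() {
  std::vector<double> norm = normalizePartSizes({1.0, 2.0, 1.0});
  return (norm[1] == 0.5) ? 0 : 1;     // {0.25, 0.5, 0.25}
}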
TEUCHOS_UNIT_TEST(Aggregates, UncoupledPhase3) { out << "version: " << MueLu::Version() << std::endl; RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(36); RCP<const Map> rowmap = A->getRowMap(); RCP<AmalgamationInfo> amalgInfo; RCP<Aggregates> aggregates = gimmeUncoupledAggregates(A, amalgInfo,false,false,false,true); GO numAggs = aggregates->GetNumAggregates(); RCP<const Teuchos::Comm<int> > comm = TestHelpers::Parameters::getDefaultComm(); TEST_EQUALITY(aggregates->AggregatesCrossProcessors(),false); ArrayRCP<LO> aggSizes = Teuchos::ArrayRCP<LO>(numAggs); ArrayRCP<LO> aggStart; ArrayRCP<GO> aggToRowMap; amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap); for (LO i = 0; i < numAggs; ++i) aggSizes[i] = aggStart[i+1] - aggStart[i]; bool foundAggNotSize2=false; for (int i=0; i<aggSizes.size(); ++i) if (aggSizes[i] != 2) { foundAggNotSize2=true; break; } switch (comm->getSize()) { case 1 : TEST_EQUALITY(numAggs, 18); TEST_EQUALITY(foundAggNotSize2, false); break; case 2: TEST_EQUALITY(numAggs, 9); TEST_EQUALITY(foundAggNotSize2, false); break; case 3: TEST_EQUALITY(numAggs, 6); TEST_EQUALITY(foundAggNotSize2, false); break; case 4: TEST_EQUALITY(numAggs, 4); TEST_EQUALITY(foundAggNotSize2, true); break; default: std::string msg = "Only 1-4 MPI processes are supported."; //throw(MueLu::Exceptions::NotImplemented(msg)); out << msg << std::endl; break; } //ArrayRCP< ArrayRCP<GO> > aggToRowMap(numAggs); int root = out.getOutputToRootOnly(); out.setOutputToRootOnly(-1); for (int j=0; j<comm->getSize(); ++j) { if (comm->getRank() == j) { out << "++ pid " << j << " ++" << std::endl; out << " num local DOFs = " << rowmap->getNodeNumElements() << std::endl; for (int i=0; i< numAggs; ++i) { out << " aggregate " << i << ": "; for (int k=aggStart[i]; k< aggStart[i+1]; ++k) out << aggToRowMap[k] << " "; out << std::endl; } } comm->barrier(); } out.setOutputToRootOnly(root); } //UncoupledPhase3
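// ------------------------------------------------------------------------------------------------------
// Editor's note: UnamalgamateAggregates hands back the aggregates in a CSR-like layout: aggStart holds
// numAggs+1 offsets into aggToRowMap, so aggregate i owns rows aggToRowMap[aggStart[i]] ..
// aggToRowMap[aggStart[i+1]-1] and its size is the difference of adjacent offsets, exactly as the test
// computes. A tiny sketch of that layout with plain vectors (illustrative data):
#include <iostream>
#include <vector>

int main() {
  std::vector<int> aggStart    = {0, 2, 4, 6};        // numAggs+1 offsets
  std::vector<int> aggToRowMap = {0, 1, 2, 3, 4, 5};  // row ids, grouped by aggregate
  const int numAggs = static_cast<int>(aggStart.size()) - 1;
  for (int i = 0; i < numAggs; i++) {
    std::cout << "aggregate " << i << " (size " << aggStart[i + 1] - aggStart[i] << "):";
    for (int k = aggStart[i]; k < aggStart[i + 1]; k++)
      std::cout << " " << aggToRowMap[k];
    std::cout << std::endl;
  }
  return 0;
}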
/*! \brief Create a mesh of approximately the desired size. * * We want 3 dimensions close to equal in length. */ const RCP<tMVector_t> getMeshCoordinates( const RCP<const Teuchos::Comm<int> > & comm, zgno_t numGlobalCoords) { int rank = comm->getRank(); int nprocs = comm->getSize(); double k = log(numGlobalCoords) / 3; double xdimf = exp(k) + 0.5; ssize_t xdim = static_cast<ssize_t>(floor(xdimf)); ssize_t ydim = xdim; ssize_t zdim = numGlobalCoords / (xdim*ydim); ssize_t num=xdim*ydim*zdim; ssize_t diff = numGlobalCoords - num; ssize_t newdiff = 0; while (diff > 0){ if (zdim > xdim && zdim > ydim){ zdim++; newdiff = diff - (xdim*ydim); if (newdiff < 0) if (diff < -newdiff) zdim--; } else if (ydim > xdim && ydim > zdim){ ydim++; newdiff = diff - (xdim*zdim); if (newdiff < 0) if (diff < -newdiff) ydim--; } else{ xdim++; newdiff = diff - (ydim*zdim); if (newdiff < 0) if (diff < -newdiff) xdim--; } diff = newdiff; } num=xdim*ydim*zdim; diff = numGlobalCoords - num; if (diff < 0) diff /= -numGlobalCoords; else diff /= numGlobalCoords; if (rank == 0){ if (diff > .01) cout << "Warning: Difference " << diff*100 << " percent" << endl; cout << "Mesh size: " << xdim << "x" << ydim << "x" << zdim << ", " << num << " vertices." << endl; } // Divide coordinates. ssize_t numLocalCoords = num / nprocs; ssize_t leftOver = num % nprocs; ssize_t gid0 = 0; if (rank <= leftOver) gid0 = zgno_t(rank) * (numLocalCoords+1); else gid0 = (leftOver * (numLocalCoords+1)) + ((zgno_t(rank) - leftOver) * numLocalCoords); if (rank < leftOver) numLocalCoords++; ssize_t gid1 = gid0 + numLocalCoords; zgno_t *ids = new zgno_t [numLocalCoords]; if (!ids) throw bad_alloc(); ArrayRCP<zgno_t> idArray(ids, 0, numLocalCoords, true); for (ssize_t i=gid0; i < gid1; i++) *ids++ = zgno_t(i); RCP<const tMap_t> idMap = rcp( new tMap_t(num, idArray.view(0, numLocalCoords), 0, comm)); // Create a Tpetra::MultiVector of coordinates. zscalar_t *x = new zscalar_t [numLocalCoords*3]; if (!x) throw bad_alloc(); ArrayRCP<zscalar_t> coordArray(x, 0, numLocalCoords*3, true); zscalar_t *y = x + numLocalCoords; zscalar_t *z = y + numLocalCoords; zgno_t xStart = 0; zgno_t yStart = 0; zgno_t xyPlane = xdim*ydim; zgno_t zStart = gid0 / xyPlane; zgno_t rem = gid0 % xyPlane; if (rem > 0){ yStart = rem / xdim; xStart = rem % xdim; } zlno_t next = 0; for (zscalar_t zval=zStart; next < numLocalCoords && zval < zdim; zval++){ for (zscalar_t yval=yStart; next < numLocalCoords && yval < ydim; yval++){ for (zscalar_t xval=xStart; next < numLocalCoords && xval < xdim; xval++){ x[next] = xval; y[next] = yval; z[next] = zval; next++; } xStart = 0; } yStart = 0; } ArrayView<const zscalar_t> xArray(x, numLocalCoords); ArrayView<const zscalar_t> yArray(y, numLocalCoords); ArrayView<const zscalar_t> zArray(z, numLocalCoords); ArrayRCP<ArrayView<const zscalar_t> > coordinates = arcp(new ArrayView<const zscalar_t> [3], 0, 3); coordinates[0] = xArray; coordinates[1] = yArray; coordinates[2] = zArray; ArrayRCP<const ArrayView<const zscalar_t> > constCoords = coordinates.getConst(); RCP<tMVector_t> meshCoords = rcp(new tMVector_t( idMap, constCoords.view(0,3), 3)); return meshCoords; }
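// ------------------------------------------------------------------------------------------------------
// Editor's note: the GID assignment inside getMeshCoordinates is a standard block distribution: the
// first (num % nprocs) ranks receive one extra coordinate, and gid0 accounts for those longer blocks.
// The arithmetic in isolation (illustrative sizes):
#include <cstdio>

int main() {
  const long num = 23, nprocs = 4;
  const long base = num / nprocs, leftOver = num % nprocs;
  for (long rank = 0; rank < nprocs; rank++) {
    long numLocal = base + (rank < leftOver ? 1 : 0);
    long gid0 = (rank < leftOver) ? rank * (base + 1)
                                  : leftOver * (base + 1) + (rank - leftOver) * base;
    std::printf("rank %ld: gids [%ld, %ld)\n", rank, gid0, gid0 + numLocal);
  }
  return 0;
}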
/*! \brief Return the metric values. * \param values on return is the array of values. */ ArrayRCP<const MetricValues<scalar_t> > getMetrics() const{ //BDD return metricsConst_; if(metricsConst_.is_null()) return metrics_; return metricsConst_; }
ArrayView<const T>::ArrayView( const ArrayRCP<const T> &arcp ) : ptr_(arcp.getRawPtr()), size_(arcp.size()), arcp_(arcp) {}
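// ------------------------------------------------------------------------------------------------------
// Editor's note: this constructor is what backs call sites like myGIDs() and myParts() in the
// repartitioning code above: invoking operator() on an ArrayRCP yields a non-owning ArrayView over the
// same data (with the originating ArrayRCP remembered, as the member initializer shows). A short usage
// sketch, assuming Teuchos is available:
#include <Teuchos_ArrayRCP.hpp>
#include <Teuchos_ArrayView.hpp>

int main() {
  Teuchos::ArrayRCP<double> owner = Teuchos::arcp<double>(5);  // owning, reference-counted
  for (int i = 0; i < owner.size(); i++) owner[i] = i;
  Teuchos::ArrayView<double> view = owner();                   // non-owning view via operator()
  double sum = 0.0;
  for (int i = 0; i < view.size(); i++) sum += view[i];
  return (sum == 10.0) ? 0 : 1;
}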
/*! \brief Print all the metrics */ void printMetrics(std::ostream &os) const { Zoltan2::printMetrics<scalar_t, part_t>(os, targetGlobalParts_, numGlobalParts_, numNonEmpty_, metrics_.view(0, metrics_.size())); }
Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Vector( const RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> > &map, const ArrayRCP<Scalar> &view, EPrivateComputeViewConstructor /* dummy */) : MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>(map,view,view.size(),1,COMPUTE_VIEW_CONSTRUCTOR) { }