int main(int argc, char* argv[])
{

  using namespace std;
  using namespace Teuchos;

  const int num_samples = 1;
  const int num_loops = 500000;
  const int size = 10;
  const int num_vectors = 3;

  TEST_FOR_EXCEPTION(num_loops * size != 5000000, std::logic_error,
		     "Work amount is not constant!");

  // Make all vectors in a contiguous block

  MyVector<double>* vector_array = new MyVector<double>[num_vectors * size];
  ArrayRCP< MyVector<double> > a = 
    arcp< MyVector<double> >(vector_array, 0, size, false);
  ArrayRCP< MyVector<double> > b = 
    arcp< MyVector<double> >(&vector_array[size], 0, size, false);
  ArrayRCP< MyVector<double> > c = 
    arcp< MyVector<double> >(&vector_array[2*size], 0, size, false);

#ifdef HAVE_PHALANX_TVMET
  tvmet::Vector<double, 3>* tvmet_array = 
    new tvmet::Vector<double, 3>[num_vectors * size];
  ArrayRCP< tvmet::Vector<double, 3> > d = 
    arcp< tvmet::Vector<double, 3> >(tvmet_array, 0, size, false);
  ArrayRCP< tvmet::Vector<double, 3> > e = 
    arcp< tvmet::Vector<double, 3> >(&tvmet_array[size], 0, size, false);
  ArrayRCP< tvmet::Vector<double, 3> > f = 
    arcp< tvmet::Vector<double, 3> >(&tvmet_array[2*size], 0, size, false);
#endif

  double* raw_array = new double[num_vectors * size * 3];

  double* raw_a = raw_array;
  double* raw_b = &raw_array[size * 3];      // each raw block spans size * 3 doubles
  double* raw_c = &raw_array[2 * size * 3];  // so the three blocks must not overlap

  for (int i=0; i < a.size(); ++i)
    a[i] = 1.0;
  for (int i=0; i < b.size(); ++i)
    b[i] = 2.0;
  for (int i=0; i < c.size(); ++i)
    c[i] = 3.0;
#ifdef HAVE_PHALANX_TVMET
  for (int i=0; i < d.size(); ++i)
    d[i] = 1.0;
  for (int i=0; i < e.size(); ++i)
    e[i] = 2.0;
  for (int i=0; i < f.size(); ++i)
    f[i] = 3.0;
#endif
  for (int i=0; i < size; ++i) {
    int offset = i * 3;
    for (int j=0; j < 3; ++j) {
      raw_a[offset + j] = 1.0;
      raw_b[offset + j] = 2.0;
      raw_c[offset + j] = 3.0;
    }
  }

  RCP<Time> vector_time = TimeMonitor::getNewTimer("Vector Time");
  RCP<Time> update_time = TimeMonitor::getNewTimer("Update Time");
#ifdef HAVE_PHALANX_TVMET
  RCP<Time> tvmet_time = TimeMonitor::getNewTimer("TVMET Time");
#endif
  RCP<Time> raw_time = TimeMonitor::getNewTimer("Raw Time");
  RCP<Time> raw2_time = TimeMonitor::getNewTimer("Raw2 Time");

  for (int sample = 0; sample < num_samples; ++sample) {
    
    cout << "Vector" << endl;
    {
      TimeMonitor t(*vector_time);
      for (int i=0; i < num_loops; ++i)
	for (int j=0; j < c.size(); ++j)
	  c[j] = a[j] * b[j];
    } 
    
    cout << "Update" << endl;
    {
      TimeMonitor t(*update_time);
      for (int i=0; i < num_loops; ++i)
	for (int j=0; j < c.size(); ++j)
	  c[j].update_multiply(a[j], b[j]);
    }
    
#ifdef HAVE_PHALANX_TVMET
    cout << "TVMET" << endl;
    {
      TimeMonitor t(*tvmet_time);
      for (int i=0; i < num_loops; ++i)
	for (int j=0; j < d.size(); ++j)
	  f[j] = d[j] * e[j];
    }
#endif
    
    cout << "Raw" << endl;
    {
      TimeMonitor t(*raw_time);
      for (int i=0; i < num_loops; ++i) {
	for (int j=0; j < size; ++j) {
	  int offset = j * 3;
	  for (int k=0; k < 3; ++k)
	    raw_c[offset + k] = raw_a[offset + k] * raw_b[offset + k];
	}
      }
    }
    
    cout << "Raw2" << endl;
    {
      TimeMonitor t(*raw2_time);
      const int raw_size = 3 * size;  // 3 vector components
      for (int i=0; i < num_loops; ++i) {
	for (int j=0; j < raw_size; ++j) {
	  raw_c[j] = raw_a[j] * raw_b[j];
	}
      }
    }
    
  } // end loop over samples

  TimeMonitor::summarize();
  
  double f_vector = vector_time->totalElapsedTime() / raw_time->totalElapsedTime();
  double f_update = update_time->totalElapsedTime() / raw_time->totalElapsedTime();
#ifdef HAVE_PHALANX_TVMET
  double f_tvmet = tvmet_time->totalElapsedTime() / raw_time->totalElapsedTime();
#endif
  double f_raw = raw_time->totalElapsedTime() / raw_time->totalElapsedTime();

  double f_raw2 = raw2_time->totalElapsedTime() / raw_time->totalElapsedTime();

  std::cout << "vector = " << f_vector << std::endl;
  std::cout << "update = " << f_update << std::endl;
#ifdef HAVE_PHALANX_TVMET
  std::cout << "tvmet  = " << f_tvmet << std::endl;
#endif
  std::cout << "raw    = " << f_raw << std::endl;
  std::cout << "raw2   = " << f_raw2 << std::endl;

  delete [] vector_array;
#ifdef HAVE_PHALANX_TVMET
  delete [] tvmet_array;
#endif
  delete [] raw_array;

  std::cout << "\nTest passed!\n" << std::endl; 
    
  return 0;
}
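The MyVector class exercised by the timing loops above is not shown in this snippet. Below is a minimal sketch of the interface the benchmark assumes (fixed length 3, scalar assignment, element-wise multiply, and update_multiply); the actual Phalanx test header may differ.

// Hypothetical sketch only -- the real Phalanx MyVector may differ.
template <typename T>
class MyVector {
  T val[3];  // fixed length 3, matching the raw-array loops above
public:
  MyVector() {}
  MyVector& operator=(const T& s)                       // a[i] = 1.0;
  { val[0] = s; val[1] = s; val[2] = s; return *this; }
  MyVector operator*(const MyVector& rhs) const         // c[j] = a[j] * b[j];
  {
    MyVector tmp;
    for (int k = 0; k < 3; ++k) tmp.val[k] = val[k] * rhs.val[k];
    return tmp;
  }
  void update_multiply(const MyVector& a, const MyVector& b)  // in-place, avoids the temporary
  { for (int k = 0; k < 3; ++k) val[k] = a.val[k] * b.val[k]; }
};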
Example 2
  void debug_assert_valid_ptr() const
    {
#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
      arcp_.access_private_node().assert_valid_ptr(*this);
#endif
    }
 /*! \brief Return the object normed weight imbalance.
  *  \param imbalance on return is the object normed weight imbalance.
  *  If there were no weights, this is the object count imbalance.
  *  If there was one weight, it is the imbalance with respect to that weight.
  */
 void getNormedImbalance(scalar_t &imbalance) const{
   if (metrics_.size() > 1)
     imbalance = metrics_[1].getMaxImbalance();
   else 
     imbalance = metrics_[0].getMaxImbalance();
 }
  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
  DeterminePartitionPlacement(const Matrix& A, GOVector& decomposition, GO numPartitions) const {
    RCP<const Map> rowMap = A.getRowMap();

    RCP<const Teuchos::Comm<int> > comm = rowMap->getComm()->duplicate();
    int numProcs = comm->getSize();

    RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
    TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

    const Teuchos::ParameterList& pL = GetParameterList();

    // maxLocal is a constant which determines the number of largest edges being exchanged.
    // The idea is that we do not want to construct the full bipartite graph, but simply a subset of
    // it, which requires less communication. By selecting largest local edges we hope to achieve
    // similar results but at a lower cost.
    const int maxLocal = pL.get<int>("repartition: remap num values");
    const int dataSize = 2*maxLocal;

    ArrayRCP<GO> decompEntries;
    if (decomposition.getLocalLength() > 0)
      decompEntries = decomposition.getDataNonConst(0);

    // Step 1: Sort local edges by weight
    // Each edge of a bipartite graph corresponds to a triplet (i, j, v) where
    //   i: processor id that has some piece of part with part_id = j
    //   j: part id
    //   v: weight of the edge
    // We set edge weights to be the total number of nonzeros in rows on this processor which
    // correspond to this part_id. The idea is that when we redistribute the matrix, this weight
    // is a good approximation of the amount of data to move.
    // We use two maps: the original one maps the partition id of an edge to the corresponding weight,
    // and a reverse one, which is needed to sort the edges by weight.
    std::map<GO,GO> lEdges;
    for (LO i = 0; i < decompEntries.size(); i++)
      lEdges[decompEntries[i]] += A.getNumEntriesInLocalRow(i);

    // Reverse map, so that edges are sorted by weight.
    // This results in a multimap, as we may have edges with the same weight
    std::multimap<GO,GO> revlEdges;
    for (typename std::map<GO,GO>::const_iterator it = lEdges.begin(); it != lEdges.end(); it++)
      revlEdges.insert(std::make_pair(it->second, it->first));

    // Both lData and gData are arrays of data which we communicate. The data is stored
    // in pairs, so that data[2*i+0] is the part index, and data[2*i+1] is the corresponding edge weight.
    // We do not store the processor id in the data, as we can compute it from the offset in gData.
    Array<GO> lData(dataSize, -1), gData(numProcs * dataSize);
    int numEdges = 0;
    for (typename std::multimap<GO,GO>::reverse_iterator rit = revlEdges.rbegin(); rit != revlEdges.rend() && numEdges < maxLocal; rit++) {
      lData[2*numEdges+0] = rit->second; // part id
      lData[2*numEdges+1] = rit->first;  // edge weight
      numEdges++;
    }

    // Step 2: Gather most edges
    // Each processor contributes maxLocal edges by providing maxLocal pairs <part id, weight>, which is of size dataSize
    MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
    MPI_Allgather(static_cast<void*>(lData.getRawPtr()), dataSize, MpiType, static_cast<void*>(gData.getRawPtr()), dataSize, MpiType, *rawMpiComm);

    // Step 3: Construct mapping

    // Construct the set of triplets
    std::vector<Triplet<int,int> > gEdges(numProcs * maxLocal);
    size_t k = 0;
    for (LO i = 0; i < gData.size(); i += 2) {
      GO part   = gData[i+0];
      GO weight = gData[i+1];
      if (part != -1) {                     // skip nonexistent edges
        gEdges[k].i = i/dataSize;           // determine the processor by its offset (since every processor sends the same amount)
        gEdges[k].j = part;
        gEdges[k].v = weight;
        k++;
      }
    }
    gEdges.resize(k);

    // Sort edges by weight
    // NOTE: compareTriplets is actually a reverse sort, so the edge weights are in decreasing order
    std::sort(gEdges.begin(), gEdges.end(), compareTriplets<int,int>);

    // Do matching
    std::map<int,int> match;
    std::vector<char> matchedRanks(numProcs, 0), matchedParts(numProcs, 0);
    int numMatched = 0;
    for (typename std::vector<Triplet<int,int> >::const_iterator it = gEdges.begin(); it != gEdges.end(); it++) {
      GO rank = it->i;
      GO part = it->j;
      if (matchedRanks[rank] == 0 && matchedParts[part] == 0) {
        matchedRanks[rank] = 1;
        matchedParts[part] = 1;
        match[part] = rank;
        numMatched++;
      }
    }
    GetOStream(Statistics0) << "Number of unassigned partitions before cleanup stage: " << (numPartitions - numMatched) << " / " << numPartitions << std::endl;

    // Step 4: Assign unassigned partitions
    // We do that through random matching for remaining partitions. Not all part numbers are valid, but valid parts are a subset of [0, numProcs).
    // The reason it is done this way is that we don't need any extra communication, as we don't need to know which parts are valid.
    for (int part = 0, matcher = 0; part < numProcs; part++)
      if (match.count(part) == 0) {
        // Find first non-matched rank
        while (matchedRanks[matcher])
          matcher++;

        match[part] = matcher++;
      }

    // Step 5: Permute entries in the decomposition vector
    for (LO i = 0; i < decompEntries.size(); i++)
      decompEntries[i] = match[decompEntries[i]];
  }
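The Triplet struct and the compareTriplets comparator used above are defined elsewhere in MueLu. A minimal sketch consistent with the usage here (members i, j, v and a comparator that orders edges by decreasing weight); the actual definitions may differ.

// Hypothetical sketch; the real MueLu helpers may differ.
template <typename T1, typename T2>
struct Triplet {
  T1 i;   // processor rank that owns a piece of the part
  T1 j;   // part id
  T2 v;   // edge weight (number of local nonzeros for that part)
};

// "Reverse" comparison: std::sort with this predicate orders the edges by
// decreasing weight, so the greedy matching above picks heavy edges first.
template <typename T1, typename T2>
static bool compareTriplets(const Triplet<T1,T2>& a, const Triplet<T1,T2>& b) {
  return (a.v > b.v);
}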
 /*! \brief Return the graph metric values.
  *  \param values on return is the array of values.
  */
 ArrayRCP<const GraphMetricValues<scalar_t> > getGraphMetrics() const{
     if(graphMetricsConst_.is_null()) return graphMetrics_;
     return graphMetricsConst_;
 }
void globalWeightedCutsMessagesHopsByPart(
    const RCP<const Environment> &env,
    const RCP<const Comm<int> > &comm,
    const RCP<const GraphModel<typename Adapter::base_adapter_t> > &graph,
    const ArrayView<const typename Adapter::part_t> &parts,
    typename Adapter::part_t &numParts,
    ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > > &metrics,
    ArrayRCP<typename Adapter::scalar_t> &globalSums,
    const RCP <const MachineRep> machine)
{
  env->debug(DETAILED_STATUS, "Entering globalWeightedCutsMessagesHopsByPart");
  //////////////////////////////////////////////////////////
  // Initialize return values

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::part_t part_t;
  typedef typename Adapter::node_t t_node_t;


  typedef typename Zoltan2::GraphModel<typename Adapter::base_adapter_t>::input_t t_input_t;

  t_lno_t localNumVertices = graph->getLocalNumVertices();
  t_gno_t globalNumVertices = graph->getGlobalNumVertices();
  t_lno_t localNumEdges = graph->getLocalNumEdges();

  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  graph->getVertexList(Ids, v_wghts);

  typedef GraphMetrics<t_scalar_t> mv_t;

  //get the edge ids, and weights
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_lno_t> offsets;
  ArrayView<t_input_t> e_wgts;
  graph->getEdgeList(edgeIds, offsets, e_wgts);


  std::vector <t_scalar_t> edge_weights;
  int numWeightPerEdge = graph->getNumWeightsPerEdge();

  int numMetrics = 4;                   // "edge cuts", messages, hops, weighted hops
  if (numWeightPerEdge) numMetrics += numWeightPerEdge * 2;   // "weight n", weighted hops per weight n

  // add some more metrics to the array
  typedef typename ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > >::size_type array_size_type;
  metrics.resize( metrics.size() + numMetrics );

  for( array_size_type n = metrics.size() - numMetrics; n < metrics.size(); ++n ){
    mv_t * newMetric = new mv_t;                  // allocate the new memory
    env->localMemoryAssertion(__FILE__,__LINE__,1,newMetric);   // check errors
    metrics[n] = rcp( newMetric);         // create the new members
  }
  array_size_type next = metrics.size() - numMetrics; // MDM - this is most likely temporary to preserve the format here - we are now filling a larger array so we may not have started at 0

  std::vector <part_t> e_parts (localNumEdges);
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1)
  {
    Zoltan_DD_Struct *dd = NULL;

    MPI_Comm mpicomm = Teuchos::getRawMpiComm(*comm);
    int size_gnot = Zoltan2::TPL_Traits<ZOLTAN_ID_PTR, t_gno_t>::NUM_ID;

    int debug_level = 0;
    Zoltan_DD_Create(&dd, mpicomm,
        size_gnot, 0,
        sizeof(part_t), localNumVertices, debug_level);

    ZOLTAN_ID_PTR ddnotneeded = NULL;  // Local IDs not needed
    Zoltan_DD_Update(
        dd,
        (ZOLTAN_ID_PTR) Ids.getRawPtr(),
        ddnotneeded,
        (char *) &(parts[0]),
        NULL,
        int(localNumVertices));

    Zoltan_DD_Find(
        dd,
        (ZOLTAN_ID_PTR) edgeIds.getRawPtr(),
        ddnotneeded,
        (char *)&(e_parts[0]),
        NULL,
        localNumEdges,
        NULL
        );
    Zoltan_DD_Destroy(&dd);
  } else
#endif
  {

    std::map<t_gno_t,t_lno_t> global_id_to_local_index;

    //else everything is local.
    //we need a global id to local index conversion.
    //this does not exist at this point, so we need to create one.
    for (t_lno_t i = 0; i < localNumVertices; ++i){
      //at the local index i, we have the global index Ids[i],
      //so store i under the key Ids[i] in the map.
      global_id_to_local_index[Ids[i]] = i;
    }

    for (t_lno_t i = 0; i < localNumEdges; ++i){
      t_gno_t ei = edgeIds[i];
      //ei is the global index of the neighbor vertex.
      part_t p = parts[global_id_to_local_index[ei]];
      e_parts[i] = p;
    }
  }

  RCP<const Teuchos::Comm<int> > tcomm = comm;

  env->timerStart(MACRO_TIMERS, "Communication Graph Create");
  {
    //get the vertices in each part that are on my process.
    std::vector <t_lno_t> part_begins(numParts, -1);
    std::vector <t_lno_t> part_nexts(localNumVertices, -1);

    //cluster vertices according to their parts.
    //create local part graph.
    for (t_lno_t i = 0; i < localNumVertices; ++i){
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }


    for (int weight_index = -1; weight_index < numWeightPerEdge ; ++weight_index){

      //MD: these two should be part_t,
      //but we don't want to recompile Tpetra from scratch.
      //This can be changed when the directory is updated.
      typedef t_lno_t local_part_type;
      typedef t_gno_t global_part_type;

      typedef Tpetra::Map<local_part_type, global_part_type, t_node_t> map_t;
      Teuchos::RCP<const map_t> map = Teuchos::rcp (new map_t (numParts, 0, tcomm));

      typedef Tpetra::CrsMatrix<t_scalar_t, local_part_type, global_part_type, t_node_t> tcrsMatrix_t;
      Teuchos::RCP<tcrsMatrix_t> tMatrix(new tcrsMatrix_t (map, 0));


      std::vector <global_part_type> part_neighbors (numParts);

      std::vector <t_scalar_t> part_neighbor_weights(numParts, 0);
      std::vector <t_scalar_t> part_neighbor_weights_ordered(numParts);

      //coarsen over all vertices on my process, ordered by part.
      for (global_part_type i = 0; i < (global_part_type) numParts; ++i){
        part_t num_neighbor_parts = 0;
        t_lno_t v = part_begins[i];
        //get part i, and first vertex in this part v.
        while (v != -1){
          //now get the neighbors of v.
          for (t_lno_t j = offsets[v]; j < offsets[v+1]; ++j){
            //get the part of the second vertex.
            part_t ep = e_parts[j];

            t_scalar_t ew = 1;
            if (weight_index > -1){
              ew = e_wgts[weight_index][j];
            }
            //add it to my local part neighbors for part i.
            if (part_neighbor_weights[ep] < 0.00001){
              part_neighbors[num_neighbor_parts++] = ep;
            }
            part_neighbor_weights[ep] += ew;
          }
          v = part_nexts[v];
        }

        //now get the part list.
        for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
          part_t neighbor_part = part_neighbors[j];
          part_neighbor_weights_ordered[j] = part_neighbor_weights[neighbor_part];
          part_neighbor_weights[neighbor_part] = 0;
        }

        //insert it into the Tpetra CrsMatrix.
        if (num_neighbor_parts > 0){
          Teuchos::ArrayView<const global_part_type> destinations(&(part_neighbors[0]), num_neighbor_parts);
          Teuchos::ArrayView<const t_scalar_t> vals(&(part_neighbor_weights_ordered[0]), num_neighbor_parts);
          tMatrix->insertGlobalValues (i,destinations, vals);
        }
      }
      tMatrix->fillComplete ();
      local_part_type num_local_parts = map->getNodeNumElements();

      Array<global_part_type> Indices;
      Array<t_scalar_t> Values;

      t_scalar_t max_edge_cut = 0;
      t_scalar_t total_edge_cut = 0;
      global_part_type max_message = 0;
      global_part_type total_message = 0;

      global_part_type total_hop_count = 0;
      t_scalar_t total_weighted_hop_count = 0;
      global_part_type max_hop_count = 0;
      t_scalar_t max_weighted_hop_count = 0;

      for (local_part_type i=0; i < num_local_parts; i++) {

        const global_part_type globalRow = map->getGlobalElement(i);
        size_t NumEntries = tMatrix->getNumEntriesInGlobalRow (globalRow);
        Indices.resize (NumEntries);
        Values.resize (NumEntries);
        tMatrix->getGlobalRowCopy (globalRow,Indices(),Values(),NumEntries);

        t_scalar_t part_edge_cut = 0;
        global_part_type part_messages = 0;

        for (size_t j=0; j < NumEntries; j++){
          if (Indices[j] != globalRow){
            part_edge_cut += Values[j];
            part_messages += 1;

            typename MachineRep::machine_pcoord_t hop_count = 0;
            machine->getHopCount(globalRow, Indices[j], hop_count);

            global_part_type hop_counts = hop_count;
            t_scalar_t weighted_hop_counts = hop_count * Values[j];

            total_hop_count += hop_counts;
            total_weighted_hop_count += weighted_hop_counts;

            if (hop_counts > max_hop_count ){
              max_hop_count = hop_counts;
            }
            if (weighted_hop_counts > max_weighted_hop_count ){
              max_weighted_hop_count = weighted_hop_counts;
            }
          }
        }
        if (part_edge_cut > max_edge_cut){
          max_edge_cut = part_edge_cut;
        }
        total_edge_cut += part_edge_cut;

        if (part_messages > max_message){
          max_message = part_messages;
        }
        total_message += part_messages;

      }
      t_scalar_t g_max_edge_cut = 0;
      t_scalar_t g_total_edge_cut = 0;
      global_part_type g_max_message = 0;
      global_part_type g_total_message = 0;



      global_part_type g_total_hop_count = 0;
      t_scalar_t g_total_weighted_hop_count = 0;
      global_part_type g_max_hop_count = 0;
      t_scalar_t g_max_weighted_hop_count = 0;

      try{

        Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_MAX,1,&max_edge_cut,&g_max_edge_cut);
        Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_MAX,1,&max_message,&g_max_message);

        Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_MAX,1,&max_hop_count,&g_max_hop_count);
        Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_MAX,1,&max_weighted_hop_count,&g_max_weighted_hop_count);

        Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_SUM,1,&total_edge_cut,&g_total_edge_cut);
        Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_SUM,1,&total_message,&g_total_message);

        Teuchos::reduceAll<int, global_part_type>(*comm,Teuchos::REDUCE_SUM,1,&total_hop_count,&g_total_hop_count);
        Teuchos::reduceAll<int, t_scalar_t>(*comm,Teuchos::REDUCE_SUM,1,&total_weighted_hop_count,&g_total_weighted_hop_count);

      }
      Z2_THROW_OUTSIDE_ERROR(*env);


      if (weight_index == -1){
        metrics[next]->setName("md edge cuts");
      }
      else {
        std::ostringstream oss;
        oss << "md weight " << weight_index;
        metrics[next]->setName( oss.str());
      }

      metrics[next]->setMetricValue("global maximum", g_max_edge_cut);
      metrics[next]->setMetricValue("global sum", g_total_edge_cut);
      next++;

      if (weight_index == -1){
        metrics[next]->setName("message");
        metrics[next]->setMetricValue("global maximum", g_max_message);
        metrics[next]->setMetricValue("global sum", g_total_message);
        next++;
      }


      if (weight_index == -1){
        metrics[next]->setName("hops");
        metrics[next]->setMetricValue("global maximum", g_max_hop_count);
        metrics[next]->setMetricValue("global sum", g_total_hop_count);
        next++;
      }

      std::ostringstream oss;
      oss << "weighted hops" << weight_index;
      metrics[next]->setName( oss.str());
      metrics[next]->setMetricValue("global maximum", g_max_weighted_hop_count);
      metrics[next]->setMetricValue("global sum", g_total_weighted_hop_count);
      next++;

    }
  }
  env->timerStop(MACRO_TIMERS, "Communication Graph Create");

  env->debug(DETAILED_STATUS, "Exiting globalWeightedCutsMessagesHopsByPart");
}
Example 7
int main(int argc, char *argv[])
{
  Teuchos::GlobalMPISession session(&argc, &argv);
  RCP<const Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();
  int rank = comm->getRank();

  Teuchos::RCP<Teuchos::FancyOStream> outStream = 
    Teuchos::VerboseObjectBase::getDefaultOStream();
  Teuchos::EVerbosityLevel v=Teuchos::VERB_EXTREME;

  typedef Tpetra::CrsMatrix<zscalar_t,zlno_t,zgno_t,znode_t> tmatrix_t;
  typedef Tpetra::CrsGraph<zlno_t,zgno_t,znode_t> tgraph_t;
  typedef Tpetra::Vector<zscalar_t,zlno_t,zgno_t,znode_t> tvector_t;
  typedef Tpetra::MultiVector<zscalar_t,zlno_t,zgno_t,znode_t> tmvector_t;
  typedef Xpetra::CrsMatrix<zscalar_t,zlno_t,zgno_t,znode_t> xmatrix_t;
  typedef Xpetra::CrsGraph<zlno_t,zgno_t,znode_t> xgraph_t;
  typedef Xpetra::Vector<zscalar_t,zlno_t,zgno_t,znode_t> xvector_t;
  typedef Xpetra::MultiVector<zscalar_t,zlno_t,zgno_t,znode_t> xmvector_t;
  typedef Xpetra::TpetraMap<zlno_t,zgno_t,znode_t> xtmap_t;

  // Create object that can give us test Tpetra and Xpetra input.

  RCP<UserInputForTests> uinput;

  try{
    uinput = 
      rcp(new UserInputForTests(testDataFilePath,std::string("simple"), comm, true));
  }
  catch(std::exception &e){
    TEST_FAIL_AND_EXIT(*comm, 0, string("input ")+e.what(), 1);
  }

  /////////////////////////////////////////////////////////////////
  //   Tpetra::CrsMatrix
  //   Tpetra::CrsGraph
  //   Tpetra::Vector
  //   Tpetra::MultiVector
  /////////////////////////////////////////////////////////////////

  // XpetraTraits<Tpetra::CrsMatrix<zscalar_t, zlno_t, zgno_t, znode_t> > 
  {
    RCP<tmatrix_t> M;
  
    try{
      M = uinput->getUITpetraCrsMatrix();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getTpetraCrsMatrix ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Tpetra matrix " << M->getGlobalNumRows()
        << " x " << M->getGlobalNumCols() << std::endl;

    M->describe(*outStream,v);

    RCP<const xtmap_t> xmap(new xtmap_t(M->getRowMap()));

    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const tmatrix_t> newM;
    try{
      newM = Zoltan2::XpetraTraits<tmatrix_t>::doMigration(*M,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<tmatrix_t>::doMigration ")+e.what(), 1);
    }

    if (rank== 0)
      std::cout << "Migrated Tpetra matrix" << std::endl;
  
    newM->describe(*outStream,v);
  }

  // XpetraTraits<Tpetra::CrsGraph<zscalar_t, zlno_t, zgno_t, znode_t> > 
  {
    RCP<tgraph_t> G;
  
    try{
      G = uinput->getUITpetraCrsGraph();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getTpetraCrsGraph ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Tpetra graph" << std::endl;
  
    G->describe(*outStream,v);
  
    RCP<const xtmap_t> xmap(new xtmap_t(G->getRowMap()));
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const tgraph_t> newG;
    try{
      newG = Zoltan2::XpetraTraits<tgraph_t>::doMigration(*G,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<tgraph_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Tpetra graph" << std::endl;
  
    newG->describe(*outStream,v);
  }

  // XpetraTraits<Tpetra::Vector<zscalar_t, zlno_t, zgno_t, znode_t>> 
  {
    RCP<tvector_t> V;
  
    try{
      V = rcp(new tvector_t(uinput->getUITpetraCrsGraph()->getRowMap(),  1));
      V->randomize();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getTpetraVector")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Tpetra vector" << std::endl;
  
    V->describe(*outStream,v);
  
    RCP<const xtmap_t> xmap(new xtmap_t(V->getMap()));
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const tvector_t> newV;
    try{
      newV = Zoltan2::XpetraTraits<tvector_t>::doMigration(*V,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<tvector_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Tpetra vector" << std::endl;
  
    newV->describe(*outStream,v);
  }

  // XpetraTraits<Tpetra::MultiVector<zscalar_t, zlno_t, zgno_t, znode_t>> 
  {
    RCP<tmvector_t> MV;
  
    try{
      MV = rcp(new tmvector_t(uinput->getUITpetraCrsGraph()->getRowMap(), 3));
      MV->randomize();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getTpetraMultiVector")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Tpetra multivector" << std::endl;
  
    MV->describe(*outStream,v);
  
    RCP<const xtmap_t> xmap(new xtmap_t(MV->getMap()));
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const tmvector_t> newMV;
    try{
      newMV = Zoltan2::XpetraTraits<tmvector_t>::doMigration(*MV,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<tmvector_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Tpetra multivector" << std::endl;
  
    newMV->describe(*outStream,v);
  }

  /////////////////////////////////////////////////////////////////
  //   Xpetra::CrsMatrix
  //   Xpetra::CrsGraph
  //   Xpetra::Vector
  //   Xpetra::MultiVector
  /////////////////////////////////////////////////////////////////

  // XpetraTraits<Xpetra::CrsMatrix<zscalar_t, zlno_t, zgno_t, znode_t> > 
  {
    RCP<xmatrix_t> M;
  
    try{
      M = uinput->getUIXpetraCrsMatrix();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getXpetraCrsMatrix ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Xpetra matrix" << std::endl;
  
    M->describe(*outStream,v);
  
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(M->getRowMap());
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const xmatrix_t> newM;
    try{
      newM = Zoltan2::XpetraTraits<xmatrix_t>::doMigration(*M,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<xmatrix_t>::doMigration ")+e.what(), 1);
    }

    if (rank== 0)
      std::cout << "Migrated Xpetra matrix" << std::endl;
  
    newM->describe(*outStream,v);
  }

  // XpetraTraits<Xpetra::CrsGraph<zscalar_t, zlno_t, zgno_t, znode_t> > 
  {
    RCP<xgraph_t> G;
  
    try{
      G = uinput->getUIXpetraCrsGraph();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getXpetraCrsGraph ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Xpetra graph" << std::endl;
  
    G->describe(*outStream,v);
  
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(G->getRowMap());
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const xgraph_t> newG;
    try{
      newG = Zoltan2::XpetraTraits<xgraph_t>::doMigration(*G,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<xgraph_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Xpetra graph" << std::endl;
  
    newG->describe(*outStream,v);
  }

  // XpetraTraits<Xpetra::Vector<zscalar_t, zlno_t, zgno_t, znode_t>> 
  {
    RCP<xvector_t> V;
  
    try{
      RCP<tvector_t> tV = 
          rcp(new tvector_t(uinput->getUITpetraCrsGraph()->getRowMap(),  1));
      tV->randomize();
      V = Zoltan2::XpetraTraits<tvector_t>::convertToXpetra(tV);
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getXpetraVector")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Xpetra vector" << std::endl;
  
    V->describe(*outStream,v);
  
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(V->getMap());
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const xvector_t> newV;
    try{
      newV = Zoltan2::XpetraTraits<xvector_t>::doMigration(*V,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<xvector_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Xpetra vector" << std::endl;
  
    newV->describe(*outStream,v);
  }

  // XpetraTraits<Xpetra::MultiVector<zscalar_t, zlno_t, zgno_t, znode_t>> 
  {
    RCP<xmvector_t> MV;
  
    try{
      RCP<tmvector_t> tMV =
          rcp(new tmvector_t(uinput->getUITpetraCrsGraph()->getRowMap(), 3));
      tMV->randomize();
      MV = Zoltan2::XpetraTraits<tmvector_t>::convertToXpetra(tMV);
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getXpetraMultiVector")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Xpetra multivector" << std::endl;
  
    MV->describe(*outStream,v);
  
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(MV->getMap());
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const xmvector_t> newMV;
    try{
      newMV = Zoltan2::XpetraTraits<xmvector_t>::doMigration(*MV,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<xmvector_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Xpetra multivector" << std::endl;
  
    newMV->describe(*outStream,v);
  }

#ifdef HAVE_EPETRA_DATA_TYPES
  /////////////////////////////////////////////////////////////////
  //   Epetra_CrsMatrix
  //   Epetra_CrsGraph
  //   Epetra_Vector
  //   Epetra_MultiVector
  /////////////////////////////////////////////////////////////////

  typedef Epetra_CrsMatrix ematrix_t;
  typedef Epetra_CrsGraph egraph_t;
  typedef Epetra_Vector evector_t;
  typedef Epetra_MultiVector emvector_t;
  typedef Xpetra::EpetraMap xemap_t;
  typedef Epetra_BlockMap emap_t;

  // Create object that can give us test Epetra input.

  RCP<UserInputForTests> euinput;

  try{
    euinput = 
      rcp(new UserInputForTests(testDataFilePath,std::string("simple"), comm, true));
  }
  catch(std::exception &e){
    TEST_FAIL_AND_EXIT(*comm, 0, string("epetra input ")+e.what(), 1);
  }

  // XpetraTraits<Epetra_CrsMatrix> 
  {
    RCP<ematrix_t> M;
  
    try{
      M = euinput->getUIEpetraCrsMatrix();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getEpetraCrsMatrix ")+e.what(), 1);
    }

    if (rank== 0)
      std::cout << "Original Epetra matrix" << std::endl;
  
    M->Print(std::cout);
  
    RCP<const emap_t> emap = Teuchos::rcpFromRef(M->RowMap());
    RCP<const xemap_t> xmap(new xemap_t(emap));

    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const ematrix_t> newM;
    try{
      newM = Zoltan2::XpetraTraits<ematrix_t>::doMigration(*M,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<ematrix_t>::doMigration ")+e.what(), 1);
    }

    if (rank== 0)
      std::cout << "Migrated Epetra matrix" << std::endl;
  
    newM->Print(std::cout);
  }

  // XpetraTraits<Epetra_CrsGraph> 
  {
    RCP<egraph_t> G;
  
    try{
      G = euinput->getUIEpetraCrsGraph();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getEpetraCrsGraph ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Epetra graph" << std::endl;
  
    G->Print(std::cout);
  
    RCP<const emap_t> emap = Teuchos::rcpFromRef(G->RowMap());
    RCP<const xemap_t> xmap(new xemap_t(emap));
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const egraph_t> newG;
    try{
      newG = Zoltan2::XpetraTraits<egraph_t>::doMigration(*G,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<egraph_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Epetra graph" << std::endl;
  
    newG->Print(std::cout);
  }

  // XpetraTraits<Epetra_Vector>
  {
    RCP<evector_t> V;
  
    try{
      V = rcp(new Epetra_Vector(euinput->getUIEpetraCrsGraph()->RowMap()));
      V->Random();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getEpetraVector")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Epetra vector" << std::endl;
  
    V->Print(std::cout);
  
    RCP<const emap_t> emap = Teuchos::rcpFromRef(V->Map());
    RCP<const xemap_t> xmap(new xemap_t(emap));
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const evector_t> newV;
    try{
      newV = Zoltan2::XpetraTraits<evector_t>::doMigration(*V,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<evector_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Epetra vector" << std::endl;
  
    newV->Print(std::cout);
  }

  // XpetraTraits<Epetra_MultiVector>
  {
    RCP<emvector_t> MV;
  
    try{
      MV =
        rcp(new Epetra_MultiVector(euinput->getUIEpetraCrsGraph()->RowMap(),3));
      MV->Random();
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string("getEpetraMultiVector")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Original Epetra multivector" << std::endl;
  
    MV->Print(std::cout);
  
    RCP<const emap_t> emap = Teuchos::rcpFromRef(MV->Map());
    RCP<const xemap_t> xmap(new xemap_t(emap));
    ArrayRCP<zgno_t> newRowIds = roundRobinMap(xmap);
  
    zgno_t localNumRows = newRowIds.size();
  
    RCP<const emvector_t> newMV;
    try{
      newMV = Zoltan2::XpetraTraits<emvector_t>::doMigration(*MV,
        localNumRows, newRowIds.getRawPtr());
    }
    catch(std::exception &e){
      TEST_FAIL_AND_EXIT(*comm, 0, 
        string(" Zoltan2::XpetraTraits<emvector_t>::doMigration ")+e.what(), 1);
    }
  
    if (rank== 0)
      std::cout << "Migrated Epetra multivector" << std::endl;
  
    newMV->Print(std::cout);
  }
#endif   // have epetra data types (int, int, double)

  /////////////////////////////////////////////////////////////////
  // DONE
  /////////////////////////////////////////////////////////////////

  if (rank==0)
    std::cout << "PASS" << std::endl;
}
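The roundRobinMap() helper called throughout this test is not included in the snippet. Below is a minimal sketch, under the assumption that it simply reassigns the global ids of the given map to processes in round-robin order; the actual Zoltan2 test helper may differ.

// Hypothetical sketch of roundRobinMap(); the real test helper may differ.
// Global id g (counted from the map's minimum) is sent to process g % nprocs.
template <typename map_t>
ArrayRCP<zgno_t> roundRobinMap(const RCP<const map_t> &m)
{
  const RCP<const Teuchos::Comm<int> > comm = m->getComm();
  int proc   = comm->getRank();
  int nprocs = comm->getSize();
  zgno_t base = m->getMinAllGlobalIndex();
  zgno_t nGlobalRows = static_cast<zgno_t>(m->getGlobalNumElements());

  // Count this process's share, then fill in the new row gids.
  zgno_t num = 0;
  for (zgno_t g = 0; g < nGlobalRows; ++g)
    if (g % nprocs == proc) ++num;

  ArrayRCP<zgno_t> newRowIds = arcp<zgno_t>(num);
  for (zgno_t g = 0, next = 0; g < nGlobalRows; ++g)
    if (g % nprocs == proc) newRowIds[next++] = base + g;

  return newRowIds;
}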
  void CoordinatesTransferFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level & fineLevel, Level &coarseLevel) const {
    FactoryMonitor m(*this, "Build", coarseLevel);

    GetOStream(Runtime0, 0) << "Transferring coordinates" << std::endl;

    const ParameterList  & pL = GetParameterList();
    int                 writeStart = pL.get< int >("write start");
    int                 writeEnd   = pL.get< int >("write end");

    RCP<Aggregates>     aggregates = Get< RCP<Aggregates> > (fineLevel, "Aggregates");
    RCP<MultiVector>    fineCoords = Get< RCP<MultiVector> >(fineLevel, "Coordinates");
    RCP<const Map>      coarseMap  = Get< RCP<const Map> >  (fineLevel, "CoarseMap");

    // coarseMap is used to set up the domain map of the tentative P, and therefore the row map of Ac.
    // Hence, if we amalgamate coarseMap, logical nodes in the coordinates vector correspond to
    // logical blocks in the matrix.

    ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
    LO                  blkSize      = 1;
    if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
      blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();

    GO                  indexBase    = coarseMap->getIndexBase();
    size_t              numElements  = elementAList.size() / blkSize;
    Array<GO>           elementList(numElements);

    // Amalgamate the map
    for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
      elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;

    RCP<const Map> coarseCoordMap = MapFactory        ::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), elementList, indexBase, coarseMap->getComm());
    RCP<MultiVector> coarseCoords = MultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());

    // Maps
    RCP<const Map> uniqueMap    = fineCoords->getMap();
    RCP<const Map> nonUniqueMap = aggregates->GetMap();

    // Create overlapped fine coordinates to reduce global communication
    RCP<const Import>     importer = ImportFactory     ::Build(uniqueMap, nonUniqueMap);
    RCP<MultiVector> ghostedCoords = MultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
    ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);

    // Get some info about aggregates
    int                         myPID        = uniqueMap->getComm()->getRank();
    LO                          numAggs      = aggregates->GetNumAggregates();
    ArrayRCP<LO>                aggSizes     = aggregates->ComputeAggregateSizes();
    const ArrayRCP<const LO>    vertex2AggID = aggregates->GetVertex2AggId()->getData(0);
    const ArrayRCP<const LO>    procWinner   = aggregates->GetProcWinner()->getData(0);

    // Fill in coarse coordinates
    for (size_t j = 0; j < fineCoords->getNumVectors(); j++) {
      ArrayRCP<const Scalar> fineCoordsData = ghostedCoords->getData(j);
      ArrayRCP<Scalar>     coarseCoordsData = coarseCoords->getDataNonConst(j);

      for (LO lnode = 0; lnode < vertex2AggID.size(); lnode++)
        if (procWinner[lnode] == myPID)
          coarseCoordsData[vertex2AggID[lnode]] += fineCoordsData[lnode];

      for (LO agg = 0; agg < numAggs; agg++)
        coarseCoordsData[agg] /= aggSizes[agg];
    }

    Set<RCP<MultiVector> >(coarseLevel, "Coordinates", coarseCoords);
    if (writeStart == 0 && fineLevel.GetLevelID() == 0 && writeStart <= writeEnd) {
      std::ostringstream buf;
      buf << fineLevel.GetLevelID();
      std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m";
      Utils::Write(fileName,*fineCoords);
    }
    if (writeStart <= coarseLevel.GetLevelID() && coarseLevel.GetLevelID() <= writeEnd) {
      std::ostringstream buf;
      buf << coarseLevel.GetLevelID();
      std::string fileName = "coordinates_before_rebalance_level_" + buf.str() + ".m";
      Utils::Write(fileName,*coarseCoords);
    }

  } // Build
  void Ifpack2Smoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::SetupSchwarz(Level& currentLevel) {
    if (this->IsSetup() == true)
      this->GetOStream(Warnings0) << "MueLu::Ifpack2Smoother::Setup(): Setup() has already been called" << std::endl;

    // If we are doing "user" partitioning, we assume that what the user
    // really wants to do is make tiny little subdomains with one row
    // asssigned to each subdomain. The rows used for these little
    // subdomains correspond to those in the 2nd block row.  Then,
    // if we overlap these mini-subdomains, we will do something that
    // looks like Vanka (grabbing all velocities associated with each
    // each pressure unknown). In addition, we put all Dirichlet points
    // as a little mini-domain.
    ParameterList& paramList = const_cast<ParameterList&>(this->GetParameterList());

    bool isBlockedMatrix = false;
    RCP<Matrix> merged2Mat;

    std::string sublistName = "subdomain solver parameters";
    if (paramList.isSublist(sublistName)) {
      ParameterList& subList = paramList.sublist(sublistName);

      std::string partName = "partitioner: type";
      if (subList.isParameter(partName) && subList.get<std::string>(partName) == "user") {
        isBlockedMatrix = true;

        RCP<BlockedCrsMatrix> bA = rcp_dynamic_cast<BlockedCrsMatrix>(A_);
        TEUCHOS_TEST_FOR_EXCEPTION(bA.is_null(), Exceptions::BadCast,
                                   "Matrix A must be of type BlockedCrsMatrix.");

        size_t numVels = bA->getMatrix(0,0)->getNodeNumRows();
        size_t numPres = bA->getMatrix(1,0)->getNodeNumRows();
        size_t numRows = A_->getNodeNumRows();

        ArrayRCP<LocalOrdinal> blockSeeds(numRows, Teuchos::OrdinalTraits<LocalOrdinal>::invalid());

        size_t numBlocks = 0;
        for (size_t rowOfB = numVels; rowOfB < numVels+numPres; ++rowOfB)
          blockSeeds[rowOfB] = numBlocks++;

        RCP<BlockedCrsMatrix> bA2 = rcp_dynamic_cast<BlockedCrsMatrix>(A_);
        TEUCHOS_TEST_FOR_EXCEPTION(bA2.is_null(), Exceptions::BadCast,
                                   "Matrix A must be of type BlockedCrsMatrix.");

        RCP<CrsMatrix> mergedMat = bA2->Merge();
        merged2Mat = rcp(new CrsMatrixWrap(mergedMat));

        // Add Dirichlet rows to the list of seeds
        ArrayRCP<const bool> boundaryNodes;
        boundaryNodes = Utilities::DetectDirichletRows(*merged2Mat, 0.0);
        bool haveBoundary = false;
        for (LO i = 0; i < boundaryNodes.size(); i++)
          if (boundaryNodes[i]) {
            // FIXME:
            // 1. wouldn't this entry overlap with one set in the previous blockSeeds loop?
            // 2. do we need to distinguish between pressure and velocity Dirichlet b.c.?
            blockSeeds[i] = numBlocks;
            haveBoundary = true;
          }
        if (haveBoundary)
          numBlocks++;

        subList.set("partitioner: map",         blockSeeds);
        subList.set("partitioner: local parts", as<int>(numBlocks));
      }
    }

    RCP<const Tpetra::RowMatrix<SC, LO, GO, NO> > tpA;
    if (isBlockedMatrix == true) tpA = Utilities::Op2NonConstTpetraRow(merged2Mat);
    else                         tpA = Utilities::Op2NonConstTpetraRow(A_);

    prec_ = Ifpack2::Factory::create(type_, tpA, overlap_);
    SetPrecParameters();
    prec_->initialize();
    prec_->compute();
  }
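For context, a caller requesting the "user" partitioning handled above only needs to set the partitioner type in the subdomain solver sublist. A minimal illustrative sketch follows; the sublist and parameter names are the ones queried in SetupSchwarz() above, everything else is an assumption.

// Illustrative only. The sublist/parameter names match those read above;
// the surrounding setup is an assumption, not MueLu's actual interface.
#include <Teuchos_ParameterList.hpp>

Teuchos::ParameterList makeSchwarzUserPartitionParams()
{
  Teuchos::ParameterList smootherParams;
  Teuchos::ParameterList& subList = smootherParams.sublist("subdomain solver parameters");
  subList.set("partitioner: type", std::string("user"));  // triggers the one-row-per-subdomain setup above
  // "partitioner: map" and "partitioner: local parts" are filled in by SetupSchwarz()
  // itself from the blocked matrix, so the caller does not provide them.
  return smootherParams;
}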
Example 10
size_t computeLocalEdgeList(
  const RCP<const Environment> &env, const RCP<const Comm<int> > &comm,
  size_t numLocalEdges,           // local edges
  size_t numLocalGraphEdges,      // edges in "local" graph
  RCP<const IdentifierMap<User> > &idMap,
  ArrayRCP<const typename InputTraits<User>::zgid_t> &allEdgeIds, // in
  ArrayRCP<const typename InputTraits<User>::gno_t> &allEdgeGnos, // in
  ArrayRCP<int> &allProcs,                                 // in
  ArrayRCP<const typename InputTraits<User>::lno_t> &allOffs,    // in
  ArrayRCP<StridedData<typename InputTraits<User>::lno_t,
                       typename InputTraits<User>::scalar_t> > &allWeights,// in
  ArrayRCP<const typename InputTraits<User>::lno_t> &edgeLocalIds, //
  ArrayRCP<const typename InputTraits<User>::lno_t> &offsets,      // out
  ArrayRCP<StridedData<typename InputTraits<User>::lno_t,
    typename InputTraits<User>::scalar_t> > &eWeights)             // out
{
  typedef typename InputTraits<User>::zgid_t zgid_t;
  typedef typename InputTraits<User>::gno_t gno_t;
  typedef typename InputTraits<User>::scalar_t scalar_t;
  typedef typename InputTraits<User>::lno_t lno_t;
  typedef StridedData<lno_t, scalar_t> input_t;
  int rank = comm->getRank();

  bool gnosAreGids = idMap->gnosAreGids();

  edgeLocalIds = ArrayRCP<const lno_t>(Teuchos::null);
  eWeights = ArrayRCP<input_t>(Teuchos::null);
  offsets = ArrayRCP<const lno_t>(Teuchos::null);

  if (numLocalGraphEdges == 0) {
    // Set the offsets array and return
    size_t allOffsSize = allOffs.size();
    lno_t *offs = new lno_t [allOffsSize];
    env->localMemoryAssertion(__FILE__, __LINE__, allOffsSize, offs);
    for (size_t i = 0; i < allOffsSize; i++) offs[i] = 0;
    offsets = arcp(offs, 0, allOffsSize, true);
    return 0;
  }

  if (numLocalGraphEdges == numLocalEdges){

    // Entire graph is local.

    lno_t *lnos = new lno_t [numLocalGraphEdges];
    env->localMemoryAssertion(__FILE__, __LINE__, numLocalGraphEdges, lnos);
    if (comm->getSize() == 1) {
      // With one rank, we can use gnos as local indices.
      if (gnosAreGids)
        for (size_t i=0; i < numLocalEdges; i++) lnos[i] = allEdgeIds[i];
      else
        for (size_t i=0; i < numLocalEdges; i++) lnos[i] = allEdgeGnos[i];
    }
    else {
      ArrayRCP<gno_t> gnoArray;

      if (gnosAreGids){
        ArrayRCP<const gno_t> gnosConst =
                 arcp_reinterpret_cast<const gno_t>(allEdgeIds);
        gnoArray = arcp_const_cast<gno_t>(gnosConst);
      }
      else {
        gnoArray = arcp_const_cast<gno_t>(allEdgeGnos);
      }

      // Need to translate the gnos to local indexing
      ArrayView<lno_t> lnoView(lnos, numLocalGraphEdges);
      try {
        idMap->lnoTranslate(lnoView,
                            gnoArray.view(0,numLocalGraphEdges),
                            TRANSLATE_LIB_TO_APP);
      }
      Z2_FORWARD_EXCEPTIONS;
    }
    edgeLocalIds = arcp(lnos, 0, numLocalGraphEdges, true);
    offsets = allOffs;
    eWeights = allWeights;

  }
  void ZoltanInterface<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Build(Level& level) const {
    FactoryMonitor m(*this, "Build", level);

    RCP<Matrix>      A        = Get< RCP<Matrix> >     (level, "A");
    RCP<const Map>   rowMap   = A->getRowMap();

    RCP<MultiVector> Coords   = Get< RCP<MultiVector> >(level, "Coordinates");
    size_t           dim      = Coords->getNumVectors();

    GO               numParts = level.Get<GO>("number of partitions");

    if (numParts == 1) {
      // Only one partition is requested, so the decomposition is trivial: all zeros.
      RCP<Xpetra::Vector<GO, LO, GO, NO> > decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, true);
      Set(level, "Partition", decomposition);
      return;
    }

    float zoltanVersion_;
    Zoltan_Initialize(0, NULL, &zoltanVersion_);

    RCP<const Teuchos::MpiComm<int> >            dupMpiComm = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(rowMap->getComm()->duplicate());
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > zoltanComm = dupMpiComm->getRawMpiComm();

    RCP<Zoltan> zoltanObj_ = rcp(new Zoltan((*zoltanComm)()));  //extract the underlying MPI_Comm handle and create a Zoltan object
    if (zoltanObj_ == Teuchos::null)
      throw Exceptions::RuntimeError("MueLu::Zoltan : Unable to create Zoltan data structure");

    // Tell Zoltan what kind of local/global IDs we will use.
    // In our case, each GID is a single int and there are no local ids.
    // One can skip this step if the IDs are just single ints.
    int rv;
    if ((rv = zoltanObj_->Set_Param("num_gid_entries", "1")) != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_gid_entries' returned error code " + Teuchos::toString(rv));
    if ((rv = zoltanObj_->Set_Param("num_lid_entries", "0") ) != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'num_lid_entries' returned error code " + Teuchos::toString(rv));
    if ((rv = zoltanObj_->Set_Param("obj_weight_dim", "1") ) != ZOLTAN_OK)
      throw Exceptions::RuntimeError("MueLu::Zoltan::Setup : setting parameter 'obj_weight_dim' returned error code "  + Teuchos::toString(rv));

    if (GetVerbLevel() & Statistics1) zoltanObj_->Set_Param("debug_level", "1");
    else                              zoltanObj_->Set_Param("debug_level", "0");

    zoltanObj_->Set_Param("num_global_partitions", toString(numParts));

    zoltanObj_->Set_Num_Obj_Fn(GetLocalNumberOfRows,      (void *) &*A);
    zoltanObj_->Set_Obj_List_Fn(GetLocalNumberOfNonzeros, (void *) &*A);
    zoltanObj_->Set_Num_Geom_Fn(GetProblemDimension,      (void *) &dim);
    zoltanObj_->Set_Geom_Multi_Fn(GetProblemGeometry,     (void *) Coords.get());

    // Data pointers that Zoltan requires.
    ZOLTAN_ID_PTR import_gids = NULL;  // Global nums of objs to be imported
    ZOLTAN_ID_PTR import_lids = NULL;  // Local indices to objs to be imported
    int   *import_procs       = NULL;  // Proc IDs of procs owning objs to be imported.
    int   *import_to_part     = NULL;  // Partition #s to which imported objs should be assigned.
    ZOLTAN_ID_PTR export_gids = NULL;  // Global nums of objs to be exported
    ZOLTAN_ID_PTR export_lids = NULL;  // local indices to objs to be exported
    int   *export_procs       = NULL;  // Proc IDs of destination procs for objs to be exported.
    int   *export_to_part     = NULL;  // Partition #s for objs to be exported.
    int   num_imported;                // Number of objs to be imported.
    int   num_exported;                // Number of objs to be exported.
    int   newDecomp;                   // Flag indicating whether the decomposition has changed
    int   num_gid_entries;             // Number of array entries in a global ID.
    int   num_lid_entries;

    {
      SubFactoryMonitor m1(*this, "Zoltan RCB", level);
      rv = zoltanObj_->LB_Partition(newDecomp, num_gid_entries, num_lid_entries,
                                    num_imported, import_gids, import_lids, import_procs, import_to_part,
                                    num_exported, export_gids, export_lids, export_procs, export_to_part);
      if (rv == ZOLTAN_FATAL)
        throw Exceptions::RuntimeError("Zoltan::LB_Partition() returned error code");
    }

    // TODO check that A's row map is 1-1.  Zoltan requires this.

    RCP<Xpetra::Vector<GO, LO, GO, NO> > decomposition;
    if (newDecomp) {
      decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(rowMap, false); // Don't initialize, will be overwritten
      ArrayRCP<GO> decompEntries = decomposition->getDataNonConst(0);

      int mypid = rowMap->getComm()->getRank();
      for (typename ArrayRCP<GO>::iterator i = decompEntries.begin(); i != decompEntries.end(); ++i)
        *i = mypid;

      LO blockSize = A->GetFixedBlockSize();
      for (int i = 0; i < num_exported; ++i) {
        // We have assigned the Zoltan GID to the first row GID in each block.
        // NOTE: Zoltan GIDs are different from the GIDs in the Coordinates vector
        LO  localEl = rowMap->getLocalElement(export_gids[i]);
        int partNum = export_to_part[i];
        for (LO j = 0; j < blockSize; ++j)
          decompEntries[localEl + j] = partNum;
      }
    }

    Set(level, "Partition", decomposition);

    zoltanObj_->LB_Free_Part(&import_gids, &import_lids, &import_procs, &import_to_part);
    zoltanObj_->LB_Free_Part(&export_gids, &export_lids, &export_procs, &export_to_part);

  } //Build()
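The Zoltan query functions registered above (GetLocalNumberOfRows, GetProblemDimension, and so on) are implemented elsewhere in MueLu. Two hedged sketches follow, written against the standard ZOLTAN_NUM_OBJ_FN / ZOLTAN_NUM_GEOM_FN signatures; the real implementations may differ (for example by accounting for the fixed block size).

// Hypothetical sketches; the actual MueLu callbacks may differ.
// Zoltan calls these with the "data" pointer that was registered above.
static int GetLocalNumberOfRows(void* data, int* ierr) {
  Matrix* A = static_cast<Matrix*>(data);        // registered as (void *) &*A
  *ierr = ZOLTAN_OK;
  return Teuchos::as<int>(A->getNodeNumRows());  // number of local objects (rows)
}

static int GetProblemDimension(void* data, int* ierr) {
  size_t* dim = static_cast<size_t*>(data);      // registered as (void *) &dim
  *ierr = ZOLTAN_OK;
  return Teuchos::as<int>(*dim);                 // spatial dimension of the coordinates
}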
// ********************************************************
int main(int argc, char *argv[]) 
{
  using namespace std;
  using namespace Teuchos;
  using namespace PHX;
  
  GlobalMPISession mpi_session(&argc, &argv);

  try {
    
    RCP<Time> total_time = TimeMonitor::getNewTimer("Total Run Time");
    TimeMonitor tm(*total_time);

    // *********************************************************************
    // Start of MDField Testing
    // *********************************************************************
    {

      typedef MDField<double,Cell,Node>::size_type size_type;

      std::vector<size_type> dims(3);
      dims[0] = 10;
      dims[1] = 4;
      dims[2] = 3;

      RCP<DataLayout> quad_vector = 
	rcp(new MDALayout<Cell,Quadrature,Dim>(dims[0],dims[1],dims[2]));
      
      int size = quad_vector->size();

      TEUCHOS_TEST_FOR_EXCEPTION(size != dims[0]*dims[1]*dims[2], std::runtime_error, 
			 "Size mismatch on MDField!");

      ArrayRCP<double> a_mem = arcp<double>(size);
      ArrayRCP<double> b_mem = arcp<double>(size);

      for (int i=0; i < a_mem.size(); ++i)
	a_mem[i] = static_cast<double>(i);

      for (int i=0; i < b_mem.size(); ++i)
	b_mem[i] = static_cast<double>(i);

      MDField<double,Cell,Point,Dim> a("density",quad_vector);
      MDField<double> b("density",quad_vector);

      a.setFieldData(a_mem);
      b.setFieldData(b_mem);

      simulated_intrepid_integrate(a);     
      simulated_intrepid_integrate(b);     

      // ***********************
      // Shards tests
      // ***********************

      ArrayRCP<double> c_mem = arcp<double>(size);
      ArrayRCP<double> d_mem = arcp<double>(size);

      for (int i=0; i < c_mem.size(); ++i)
	c_mem[i] = static_cast<double>(i);

      for (int i=0; i < d_mem.size(); ++i)
	d_mem[i] = static_cast<double>(i);

      shards::Array<double,shards::NaturalOrder,Cell,Node,Dim> c(c_mem.get(),
								 dims[0], 
								 dims[1],
								 dims[2]);

      size_type rank = dims.size();

      const ArrayRCP<const shards::ArrayDimTag*> tags = 
	arcp<const shards::ArrayDimTag*>(rank);
      tags[0] = &Cell::tag();
      tags[1] = &Point::tag();
      tags[2] = &Dim::tag();
      
      shards::Array<double,shards::NaturalOrder> d(d_mem.get(),rank,
						   &dims[0],tags.get());
      
      simulated_intrepid_integrate(d); 
      simulated_intrepid_integrate((const shards::Array<double,shards::NaturalOrder>&)(c));    

    }

    // *********************************************************************
    // *********************************************************************
    std::cout << "\nTest passed!\n" << std::endl; 
    // *********************************************************************
    // *********************************************************************

  }
  catch (const std::exception& e) {
    std::cout << "************************************************" << endl;
    std::cout << "************************************************" << endl;
    std::cout << "Exception Caught!" << endl;
    std::cout << "Error message is below\n " << e.what() << endl;
    std::cout << "************************************************" << endl;
  }
  catch (...) {
    std::cout << "************************************************" << endl;
    std::cout << "************************************************" << endl;
    std::cout << "Unknown Exception Caught!" << endl;
    std::cout << "************************************************" << endl;
  }

  TimeMonitor::summarize();
    
  return 0;
}
  void LeftoverAggregationAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::AggregateLeftovers(GraphBase const &graph, Aggregates &aggregates) const {
    Monitor m(*this, "AggregateLeftovers");

    my_size_t nVertices = graph.GetNodeNumVertices();
    int exp_nRows    = aggregates.GetMap()->getNodeNumElements(); // Tentative fix... was previously exp_nRows = nVertices + graph.GetNodeNumGhost();
    int myPid        = graph.GetComm()->getRank();
    my_size_t nAggregates  = aggregates.GetNumAggregates();

    int minNodesPerAggregate = GetMinNodesPerAggregate();

    const RCP<const Map> nonUniqueMap = aggregates.GetMap(); //column map of underlying graph
    const RCP<const Map> uniqueMap    = graph.GetDomainMap();

    MueLu::CoupledAggregationCommHelper<LO,GO,NO,LMO> myWidget(uniqueMap, nonUniqueMap);

    //TODO JJH We want to skip this call
    RCP<Xpetra::Vector<double,LO,GO,NO> > distWeights = Xpetra::VectorFactory<double,LO,GO,NO>::Build(nonUniqueMap);

    // Aggregated vertices not "definitively" assigned to processors are
    // arbitrated by ArbitrateAndCommunicate(). There is some
    // additional logic to prevent losing root nodes in arbitration.
    {
      ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>    weights     = distWeights->getDataNonConst(0);

      for (size_t i=0;i<nonUniqueMap->getNodeNumElements();i++) {
        if (procWinner[i] == MUELU_UNASSIGNED) {
          if (vertex2AggId[i] != MUELU_UNAGGREGATED) {
            weights[i] = 1.;
            if (aggregates.IsRoot(i)) weights[i] = 2.;
          }
        }
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Tentatively assign any vertex (ghost or local) which neighbors a root
    // to the aggregate associated with the root.
    {
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      ArrayRCP<double>   weights      = distWeights->getDataNonConst(0);

      for (my_size_t i = 0; i < nVertices; i++) {
        if ( aggregates.IsRoot(i) && (procWinner[i] == myPid) ) {

          // neighOfINode is the neighbor node list of node 'i'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int colj = *it;
            if (vertex2AggId[colj] == MUELU_UNAGGREGATED) {
              weights[colj]= 1.;
              vertex2AggId[colj] = vertex2AggId[i];
            }
          }
        }
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    // All tentatively assigned vertices are now definitive

    // Record the number of aggregated vertices
    GO total_phase_one_aggregated = 0;
    {
      ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);

      GO phase_one_aggregated = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] != MUELU_UNAGGREGATED)
          phase_one_aggregated++;
      }

      sumAll(graph.GetComm(), phase_one_aggregated, total_phase_one_aggregated);

      GO local_nVertices = nVertices, total_nVertices = 0;
      sumAll(graph.GetComm(), local_nVertices, total_nVertices);

      /* Among unaggregated points, see if we can make a reasonable size    */
      /* aggregate out of it. We do this by looking at neighbors and seeing */
      /* how many are unaggregated and on my processor. Loosely,            */
      /* base the number of new aggregates created on the percentage of     */
      /* unaggregated nodes.                                                */

      ArrayRCP<double>    weights      = distWeights->getDataNonConst(0);

      double factor = 1.;
      factor = ((double) total_phase_one_aggregated)/((double)(total_nVertices + 1));
      factor = pow(factor, GetPhase3AggCreation());

      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED)
          {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);
            int rowi_N = neighOfINode.size();

            int nonaggd_neighbors = 0;
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int colj = *it;
              if (vertex2AggId[colj] == MUELU_UNAGGREGATED && colj < nVertices)
                nonaggd_neighbors++;
            }
            if (  (nonaggd_neighbors > minNodesPerAggregate) &&
                  (((double) nonaggd_neighbors)/((double) rowi_N) > factor))
              {
                vertex2AggId[i] = (nAggregates)++;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int colj = *it;
                  if (vertex2AggId[colj]==MUELU_UNAGGREGATED) {
                    vertex2AggId[colj] = vertex2AggId[i];
                    if (colj < nVertices) weights[colj] = 2.;
                    else                  weights[colj] = 1.;
                  }
                }
                aggregates.SetIsRoot(i);
                weights[i] = 2.;
              }
          }
      } // for (i = 0; i < nVertices; i++)

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
    //All tentatively assigned vertices are now definitive

    if (IsPrint(Statistics1)) {
      GO Nphase1_agg = nAggregates;
      GO total_aggs;

      sumAll(graph.GetComm(), Nphase1_agg, total_aggs);

      GetOStream(Statistics1, 0) << "Phase 1 - nodes aggregated = " << total_phase_one_aggregated << std::endl;
      GetOStream(Statistics1, 0) << "Phase 1 - total aggregates = " << total_aggs << std::endl;

      GO i = nAggregates - Nphase1_agg;
      { GO ii; sumAll(graph.GetComm(),i,ii); i = ii; }
      GetOStream(Statistics1, 0) << "Phase 3 - additional aggregates = " << i << std::endl;
    }

    // Determine vertices that are not shared by setting Temp to all ones
    // and doing NonUnique2NonUnique(..., ADD). This sums values of all
    // local copies associated with each Gid. Thus, sums > 1 are shared.

    //         std::cout << "exp_nrows=" << exp_nRows << " (nVertices= " << nVertices << ", numGhost=" << graph.GetNodeNumGhost() << ")" << std::endl;
    //         std::cout << "nonUniqueMap=" << nonUniqueMap->getNodeNumElements() << std::endl;

    RCP<Xpetra::Vector<double,LO,GO,NO> > temp_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap,false); //no need to zero out vector in ctor
    temp_->putScalar(1.);

    RCP<Xpetra::Vector<double,LO,GO,NO> > tempOutput_ = Xpetra::VectorFactory<double,LO,GO,NO> ::Build(nonUniqueMap);

    myWidget.NonUnique2NonUnique(*temp_, *tempOutput_, Xpetra::ADD);

    std::vector<bool> gidNotShared(exp_nRows);
    {
      ArrayRCP<const double> tempOutput = tempOutput_->getData(0);
      for (int i = 0; i < exp_nRows; i++) {
        if (tempOutput[i] > 1.)
          gidNotShared[i] = false;
        else
          gidNotShared[i] = true;
      }
    }
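    // gidNotShared[i] == true means this process holds the only copy of GID i, so
    // vertex i cannot be lost in later arbitration (this is used again in Phase 5).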

    // Phase 4.
    double nAggregatesTarget;
    nAggregatesTarget = ((double)  uniqueMap->getGlobalNumElements())* (((double) uniqueMap->getGlobalNumElements())/ ((double) graph.GetGlobalNumEdges()));
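    // Heuristic: the target count corresponds to an average aggregate size of roughly
    // nEdges/nVertices (the mean vertex degree), i.e. about nVertices^2/nEdges aggregates.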

    GO nAggregatesLocal=nAggregates, nAggregatesGlobal; sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

    LO minNAggs; minAll(graph.GetComm(), nAggregates, minNAggs);
    LO maxNAggs; maxAll(graph.GetComm(), nAggregates, maxNAggs);

    //
    // Only do this phase if things look really bad. THIS
    // CODE IS PRETTY EXPERIMENTAL
    //
#define MUELU_PHASE4BUCKETS 6
    if ((nAggregatesGlobal < graph.GetComm()->getSize()) &&
        (2.5*nAggregatesGlobal < nAggregatesTarget) &&
        (minNAggs ==0) && (maxNAggs <= 1)) {

      // Modify seed of the random algorithm used by temp_->randomize()
      {
        typedef Teuchos::ScalarTraits<double> scalarTrait; // temp_ is of type double.
        scalarTrait::seedrandom(static_cast<unsigned int>(myPid*2 + (int) (11*scalarTrait::random())));
        int k = (int)ceil( (10.*myPid)/graph.GetComm()->getSize());
        for (int i = 0; i < k+7; i++) scalarTrait::random();
        temp_->setSeed(static_cast<unsigned int>(scalarTrait::random()));
      }

      temp_->randomize();

      ArrayRCP<double> temp = temp_->getDataNonConst(0);

      // build a list of candidate root nodes (vertices not adjacent
      // to aggregated vertices)

      my_size_t nCandidates = 0;
      global_size_t nCandidatesGlobal;

      ArrayRCP<LO> candidates = Teuchos::arcp<LO>(nVertices+1);

      double priorThreshold = 0.;
      for (int kkk = 0; kkk < MUELU_PHASE4BUCKETS; kkk++) {

        {
          ArrayRCP<const LO> vertex2AggId = aggregates.GetVertex2AggId()->getData(0);
          ArrayView<const LO> vertex2AggIdView = vertex2AggId();
          RootCandidates(nVertices, vertex2AggIdView, graph, candidates, nCandidates, nCandidatesGlobal);
          // views on distributed vectors are freed here.
        }

        double nTargetNewGuys =  nAggregatesTarget - nAggregatesGlobal;
        double threshold      =  priorThreshold + (1. - priorThreshold)*nTargetNewGuys/(nCandidatesGlobal + .001);

        threshold = (threshold*(kkk+1.))/((double) MUELU_PHASE4BUCKETS);
        priorThreshold = threshold;
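        // The acceptance threshold is ramped up bucket by bucket (scaled by
        // (kkk+1)/MUELU_PHASE4BUCKETS), so later passes admit progressively more candidates.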

        {
          ArrayRCP<LO>     vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
          ArrayRCP<double> weights      = distWeights->getDataNonConst(0);

          for (int k = 0; k < nCandidates; k++ ) {
            int i = candidates[k];
            if ((vertex2AggId[i] == MUELU_UNAGGREGATED) && (fabs(temp[i])  < threshold)) {
              // Note: priorThreshold <= fabs(temp[i]) <= 1

              // neighOfINode is the neighbor node list of node 'iNode'.
              ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

              if (neighOfINode.size() > minNodesPerAggregate) { //TODO: check if this test is exactly what we want to do
                int count = 0;
                for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                  int Adjacent    = *it;
                  // This might not be true if someone close to i
                  // is chosen as a root via fabs(temp[]) < Threshold
                  if (vertex2AggId[Adjacent] == MUELU_UNAGGREGATED){
                    count++;
                    vertex2AggId[Adjacent] = nAggregates;
                    weights[Adjacent] = 1.;
                  }
                }
                if (count >= minNodesPerAggregate) {
                  vertex2AggId[i] = nAggregates++;
                  weights[i] = 2.;
                  aggregates.SetIsRoot(i);
                }
                else { // undo things
                  for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
                    int Adjacent    = *it;
                    if (vertex2AggId[Adjacent] == nAggregates){
                      vertex2AggId[Adjacent] = MUELU_UNAGGREGATED;
                      weights[Adjacent] = 0.;
                    }
                  }
                }
              }
            }
          }
          // views on distributed vectors are freed here.
        }
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
        nAggregatesLocal=nAggregates;
        sumAll(graph.GetComm(), nAggregatesLocal, nAggregatesGlobal);

        // check that there are no aggregate sizes below minNodesPerAggregate

        aggregates.SetNumAggregates(nAggregates);

        RemoveSmallAggs(aggregates, minNodesPerAggregate, distWeights, myWidget);

        nAggregates = aggregates.GetNumAggregates();
      }   // one possibility
    }

    // Initialize things for Phase 5. This includes building the transpose
    // of the matrix ONLY for transposed rows that correspond to unaggregated
    // ghost vertices. Further, the transpose is only a local transpose.
    // Nonzero edges which exist on other processors are not represented.


    int observedNAgg=-1; //number of aggregates that contain vertices on this process

    {
      ArrayRCP<LO>       vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<const LO> procWinner   = aggregates.GetProcWinner()->getData(0);
      for(LO k = 0; k < vertex2AggId.size(); ++k )
        if(vertex2AggId[k]>observedNAgg)
          observedNAgg=vertex2AggId[k];
      observedNAgg++;
    }

    ArrayRCP<int> Mark = Teuchos::arcp<int>(exp_nRows+1);
    ArrayRCP<int> agg_incremented = Teuchos::arcp<int>(observedNAgg);
    ArrayRCP<int> SumOfMarks = Teuchos::arcp<int>(observedNAgg);

    for (int i = 0; i < exp_nRows; i++)   Mark[i] = MUELU_DISTONE_VERTEX_WEIGHT;
    for (int i = 0; i < agg_incremented.size(); i++) agg_incremented[i] = 0;
    for (int i = 0; i < SumOfMarks.size(); i++) SumOfMarks[i] = 0;

    // Grab the transpose matrix graph for unaggregated ghost vertices.
    //     a) count the number of nonzeros per row in the transpose
    std::vector<int> RowPtr(exp_nRows+1-nVertices);
    //{
    ArrayRCP<const LO> vertex2AggIdCst = aggregates.GetVertex2AggId()->getData(0);

    for (int i = nVertices; i < exp_nRows;  i++) RowPtr[i-nVertices] = 0;
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          RowPtr[j-nVertices]++;
        }
      }
    }

    //     b) Convert RowPtr[i] to point to the first nnz spot in row i.

    int iSum = 0, iTemp;
    for (int i = nVertices; i < exp_nRows;  i++) {
      iTemp = RowPtr[i-nVertices];
      RowPtr[i-nVertices] = iSum;
      iSum += iTemp;
    }
    RowPtr[exp_nRows-nVertices] = iSum;
    std::vector<LO> cols(iSum+1);

    //     c) Traverse matrix and insert entries in proper location.
    for (int i = 0; i < nVertices;  i++) {

      // neighOfINode is the neighbor node list of node 'iNode'.
      ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

      for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
        int j = *it;
        if ( (j >= nVertices) && (vertex2AggIdCst[j] == MUELU_UNAGGREGATED)){
          cols[RowPtr[j-nVertices]++] = i;
        }
      }
    }

    //     d) RowPtr[i] points to beginning of row i+1 so shift by one location.
    for (int i = exp_nRows; i > nVertices;  i--)
      RowPtr[i-nVertices] = RowPtr[i-1-nVertices];
    RowPtr[0] = 0;

    // views on distributed vectors are freed here.
    vertex2AggIdCst = Teuchos::null;
    //}

    int bestScoreCutoff;
    int thresholds[10] = {300,200,100,50,25,13,7,4,2,0};
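    // The loop below steps through thresholds[] two at a time, so the score cutoffs
    // actually used are 300, 100, 25, 7 and 2, from strictest to most permissive.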

    // Stick unaggregated vertices into existing aggregates as described above.

    {
      int ncalls=0;

      for (int kk = 0; kk < 10; kk += 2) {
        bestScoreCutoff = thresholds[kk];

        ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
        ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);
        ArrayRCP<double> weights       = distWeights->getDataNonConst(0);

        for (int i = 0; i < exp_nRows; i++) {

          if (vertex2AggId[i] == MUELU_UNAGGREGATED) {

            // neighOfINode is the neighbor node list of node 'iNode'.
            ArrayView<const LO> neighOfINode;

            // Grab the neighboring vertices, which either come from the graph (for local ids)
            // or sit in the transposed fragment just constructed above (for ghosts).
            if (i < nVertices) {
              neighOfINode = graph.getNeighborVertices(i);
            }
            else {
              LO *rowi_col = NULL, rowi_N;
              rowi_col = &(cols[RowPtr[i-nVertices]]);
              rowi_N   = RowPtr[i+1-nVertices] - RowPtr[i-nVertices];

              neighOfINode = ArrayView<const LO>(rowi_col, rowi_N);
            }
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];

              //Adjacent is aggregated and either I own the aggregate
              // or I could own the aggregate after arbitration.
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED))){
                SumOfMarks[AdjacentAgg] += Mark[Adjacent];
              }
            }
            int best_score = MUELU_NOSCORE;
            int best_agg = -1;
            int BestMark = -1;
            bool cannotLoseAllFriends=false; // Used to address possible loss of vertices in arbitration of shared nodes discussed above. (Initialized to false only to avoid a compiler warning).

            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              // Adjacent is aggregated, its aggregate has a nonzero mark sum, and no
              // other processor has definitively claimed it.
              if ((AdjacentAgg != MUELU_UNAGGREGATED) &&
                  (SumOfMarks[AdjacentAgg] != 0) &&
                  ((procWinner[Adjacent] == myPid) ||
                   (procWinner[Adjacent] == MUELU_UNASSIGNED ))) {

                // first figure out the penalty associated with
                // AdjacentAgg having already been incremented
                // during this phase, then compute score.

                double penalty = (double) (INCR_SCALING*agg_incremented[AdjacentAgg]);
                if (penalty > MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]))
                  penalty = MUELU_PENALTYFACTOR*((double)SumOfMarks[AdjacentAgg]);
                int score = SumOfMarks[AdjacentAgg]- ((int) floor(penalty));

                if (score > best_score) {
                  best_agg             = AdjacentAgg;
                  best_score           = score;
                  BestMark             = Mark[Adjacent];
                  cannotLoseAllFriends = false;

                  // This addresses the issue mentioned above by checking whether
                  // Adjacent could be lost in arbitration. weight==0 means that
                  // Adjacent was not set during this loop of Phase 5 (and so it
                  // has already undergone arbitration). GidNotShared == true
                  // obviously implies that Adjacent cannot be lost to arbitration
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                }
                // Another vertex within current best aggregate found.
                // We should have (best_score == score). We need to see
                // if we can improve BestMark and cannotLoseAllFriends.
                else if (best_agg == AdjacentAgg) {
                  if ((weights[Adjacent]== 0.) || (gidNotShared[Adjacent] == true))
                    cannotLoseAllFriends = true;
                  if (Mark[Adjacent] > BestMark) BestMark = Mark[Adjacent];
                }
              }
            }
            // Clean up
            for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
              int Adjacent    = *it;
              int AdjacentAgg = vertex2AggId[Adjacent];
              if (AdjacentAgg >= 0) SumOfMarks[AdjacentAgg] = 0;
            }
            // Tentatively assign vertex to best_agg.
            if ( (best_score >= bestScoreCutoff) && (cannotLoseAllFriends)) {

              TEUCHOS_TEST_FOR_EXCEPTION(best_agg == -1 || BestMark == -1, MueLu::Exceptions::RuntimeError, "MueLu::CoupledAggregationFactory internal error"); // should never happen

              vertex2AggId[i] = best_agg;
              weights[i] = best_score;
              agg_incremented[best_agg]++;
              Mark[i] = (int) ceil(   ((double) BestMark)/2.);
            }
          }

          // views on distributed vectors are freed here.
        }

        vertex2AggId = Teuchos::null;
        procWinner   = Teuchos::null;
        weights      = Teuchos::null;

        ++ncalls;
        //TODO JJH We want to skip this call
        myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, true);
        // All tentatively assigned vertices are now definitive
      }

      //       if (graph.GetComm()->getRank()==0)
      //         std::cout << "#calls to Arb&Comm=" << ncalls << std::endl;
    }

    // Phase 6: Aggregate remaining unaggregated vertices, trying at all costs
    //          to avoid small aggregates.
    //          One case where we can find ourselves in this situation
    //          is if all vertices vk adjacent to v have already been
    //          put in other processor's aggregates and v does not have
    //          a direct connection to a local vertex in any of these
    //          aggregates.

    int Nleftover = 0, Nsingle = 0;
    {

      ArrayRCP<LO> vertex2AggId     = aggregates.GetVertex2AggId()->getDataNonConst(0);
      ArrayRCP<double> weights       = distWeights->getDataNonConst(0);
      ArrayRCP<const LO> procWinner = aggregates.GetProcWinner()->getData(0);

      int count = 0;
      for (my_size_t i = 0; i < nVertices; i++) {
        if (vertex2AggId[i] == MUELU_UNAGGREGATED) {
          Nleftover++;

          // neighOfINode is the neighbor node list of node 'iNode'.
          ArrayView<const LO> neighOfINode = graph.getNeighborVertices(i);

          // We don't want too small of an aggregate, so let's see if there is an
          // unaggregated neighbor that we can also put with this vertex.

          vertex2AggId[i] = nAggregates;
          weights[i] = 1.;
          if (count == 0) aggregates.SetIsRoot(i);
          count++;
          for (typename ArrayView<const LO>::const_iterator it = neighOfINode.begin(); it != neighOfINode.end(); ++it) {
            int j = *it;
            if ((j != i)&&(vertex2AggId[j] == MUELU_UNAGGREGATED)&&
                (j < nVertices)) {
              vertex2AggId[j] = nAggregates;
              weights[j] = 1.;
              count++;
            }
          }
          if ( count >= minNodesPerAggregate) {
            nAggregates++;
            count = 0;
          }
        }
      }

      // If count != 0, the last aggregate built above is still under minNodesPerAggregate.
      if (count != 0) {
#ifdef FIXME
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            if ((vertex2AggId[i] == nAggregates) && (procWinner[i] == myPid)) {
              vertex2AggId[i] = 0;
              aggregates.SetIsRoot(i,false);
            }
          }
        }
        else {
          Nsingle++;
          nAggregates++;
        }
#else
        // Can stick small aggregate with 0th aggregate?
        if (nAggregates > 0) {
          for (my_size_t i = 0; i < nVertices; i++) {
            // TW: This is not a real fix. This may produce ugly bad aggregates!
            // I removed the procWinner[i] == myPid check. it makes no sense to me since
            // it leaves vertex2AggId[i] == nAggregates -> crash in ComputeAggregateSizes().
            // Maybe it's better to add the leftovers to the last generated agg on the current proc.
            // The best solution would be to add them to the "next"/nearest aggregate, which may be
            // on another processor.
            if (vertex2AggId[i] == nAggregates) {
              vertex2AggId[i] = nAggregates-1; //0;
              aggregates.SetIsRoot(i,false);
            }
          }
        }
        else {
          Nsingle++;
          nAggregates++;
        }
#endif
      }

      // views on distributed vectors are freed here.
    }

    //TODO JJH We want to skip this call
    myWidget.ArbitrateAndCommunicate(*distWeights, aggregates, false);

    if (IsPrint(Statistics1)) {
      GO total_Nsingle=0;   sumAll(graph.GetComm(), (GO)Nsingle,     total_Nsingle);
      GO total_Nleftover=0; sumAll(graph.GetComm(), (GO)Nleftover,   total_Nleftover);
      // GO total_aggs;        sumAll(graph.GetComm(), (GO)nAggregates, total_aggs);
      // GetOStream(Statistics1, 0) << "Phase 6 - total aggregates = " << total_aggs << std::endl;
      GetOStream(Statistics1, 0) << "Phase 6 - leftovers = " << total_Nleftover << " and singletons = " << total_Nsingle << std::endl;
    }

    aggregates.SetNumAggregates(nAggregates);

  } //AggregateLeftovers
  void BraessSarazinSmoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Setup(Level& currentLevel) {
    FactoryMonitor m(*this, "Setup Smoother", currentLevel);

    if (SmootherPrototype::IsSetup() == true)
      this->GetOStream(Warnings0) << "MueLu::BraessSarazinSmoother::Setup(): Setup() has already been called";

    // Extract blocked operator A from current level
    A_ = Factory::Get<RCP<Matrix> > (currentLevel, "A");
    RCP<BlockedCrsMatrix> bA = rcp_dynamic_cast<BlockedCrsMatrix>(A_);
    TEUCHOS_TEST_FOR_EXCEPTION(bA.is_null(), Exceptions::BadCast,
                               "MueLu::BraessSarazinSmoother::Setup: input matrix A is not of type BlockedCrsMatrix! error.");

    // Store map extractors
    rangeMapExtractor_  = bA->getRangeMapExtractor();
    domainMapExtractor_ = bA->getDomainMapExtractor();

    // Store the blocks in local member variables
    A00_ = bA->getMatrix(0,0);
    A01_ = bA->getMatrix(0,1);
    A10_ = bA->getMatrix(1,0);
    A11_ = bA->getMatrix(1,1);

    const ParameterList& pL = Factory::GetParameterList();
    SC omega = pL.get<SC>("Damping factor");

#if 0 // old code
    // Create the inverse of the diagonal of F
    D_ = VectorFactory::Build(A00_->getRowMap());

    ArrayRCP<SC> diag;
    if (pL.get<bool>("lumping") == false)
      diag = Utilities::GetMatrixDiagonal      (*A00_);
    else
      diag = Utilities::GetLumpedMatrixDiagonal(*A00_);

    SC one = Teuchos::ScalarTraits<SC>::one();

    ArrayRCP<SC> Ddata = D_->getDataNonConst(0);
    for (GO row = 0; row < Ddata.size(); row++)
      Ddata[row] = one / (diag[row]*omega);
#else
    // Create the inverse of the diagonal of F
    // TODO add safety check for zeros on diagonal of F!
    RCP<Vector> diagFVector = VectorFactory::Build(A00_->getRowMap());
    if (pL.get<bool>("lumping") == false) {
      A00_->getLocalDiagCopy(*diagFVector);       // extract diagonal of F
    } else {
      diagFVector = Utilities::GetLumpedMatrixDiagonal(A00_);
    }
    diagFVector->scale(omega);
    D_ = Utilities::GetInverse(diagFVector);
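    // D_ now holds 1/(omega * diag(F)) (or its lumped variant), the scaled diagonal
    // approximation presumably applied in the subsequent Braess-Sarazin correction step.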
#endif

    // Set the Smoother
    // carefully switch to the SubFactoryManagers (defined by the users)
    {
      SetFactoryManager currentSFM(rcpFromRef(currentLevel), FactManager_);
      smoo_ = currentLevel.Get<RCP<SmootherBase> >("PreSmoother", FactManager_->GetFactory("Smoother").get());
      S_    = currentLevel.Get<RCP<Matrix> >      ("A",           FactManager_->GetFactory("A").get());
    }

    SmootherPrototype::IsSetup(true);
  }
int main(int argc, char *argv[]) {
#if defined(HAVE_MUELU_EPETRA) && defined(HAVE_MUELU_EPETRAEXT)
  typedef double Scalar;
  typedef int LocalOrdinal;
  typedef int GlobalOrdinal;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Xpetra::EpetraNode Node;
#include "MueLu_UseShortNames.hpp"

  using Teuchos::RCP;
  using Teuchos::rcp;
  using namespace MueLuTests;
  using namespace Teuchos;

  oblackholestream blackhole;
  GlobalMPISession mpiSession(&argc,&argv,&blackhole);

  bool success = false;
  bool verbose = true;
  try {
    // default parameters
    std::string xmlFile = "myXML.xml";

    // Note: use --help to list available options.
    CommandLineProcessor clp(false);
    clp.setOption("xml", &xmlFile, "xml file with solver parameters for a 2x2 blocked NS example");

    switch (clp.parse(argc,argv)) {
      case CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS; break;
      case CommandLineProcessor::PARSE_ERROR:
      case CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE; break;
      case CommandLineProcessor::PARSE_SUCCESSFUL:                               break;
    }

    RCP<const Comm<int> > comm = DefaultComm<int>::getComm();
    RCP<FancyOStream> out = fancyOStream(rcpFromRef(std::cout));
    out->setOutputToRootOnly(0);
    *out << MueLu::MemUtils::PrintMemoryUsage() << std::endl;

    // Timing
    Time myTime("global");
    TimeMonitor MM(myTime);

    GO maxCoarseSize=1; //FIXME clp doesn't like long long int

    int globalNumDofs = 1500;  // used for the maps
    int nDofsPerNode = 3;            // used for generating the fine level null-space

    // build strided maps
    // striding information: 2 velocity dofs and 1 pressure dof = 3 dofs per node
    std::vector<size_t> stridingInfo;
    stridingInfo.push_back(2);
    stridingInfo.push_back(1);

    /////////////////////////////////////// build strided maps
    // build strided maps:
    // xstridedfullmap: full map (velocity and pressure dof gids), continuous
    // xstridedvelmap: only velocity dof gid maps (i.e. 0,1,3,4,6,7...)
    // xstridedpremap: only pressure dof gid maps (i.e. 2,5,8,...)
    Xpetra::UnderlyingLib lib = Xpetra::UseEpetra;
    RCP<const StridedMap> xstridedfullmap = StridedMapFactory::Build(lib,globalNumDofs,0,stridingInfo,comm,-1);
    RCP<const StridedMap> xstridedvelmap  = StridedMapFactory::Build(xstridedfullmap,0);
    RCP<const StridedMap> xstridedpremap  = StridedMapFactory::Build(xstridedfullmap,1);

    /////////////////////////////////////// transform Xpetra::Map objects to Epetra
    // this is needed for AztecOO
    const RCP<const Epetra_Map> fullmap = rcpFromRef(Xpetra::toEpetra(*xstridedfullmap));
    RCP<const Epetra_Map>       velmap  = rcpFromRef(Xpetra::toEpetra(*xstridedvelmap));
    RCP<const Epetra_Map>       premap  = rcpFromRef(Xpetra::toEpetra(*xstridedpremap));

    /////////////////////////////////////// import problem matrix and RHS from files (-> Epetra)

    // read in problem
    Epetra_CrsMatrix * ptrA = 0;
    Epetra_Vector * ptrf = 0;
    Epetra_MultiVector* ptrNS = 0;

    *out << "Reading matrix market file" << std::endl;
    EpetraExt::MatrixMarketFileToCrsMatrix("A_re1000_5932.txt",*fullmap,*fullmap,*fullmap,ptrA);
    EpetraExt::MatrixMarketFileToVector("b_re1000_5932.txt",*fullmap,ptrf);
    RCP<Epetra_CrsMatrix> epA = rcp(ptrA);
    RCP<Epetra_Vector> epv = rcp(ptrf);
    RCP<Epetra_MultiVector> epNS = rcp(ptrNS);

    /////////////////////////////////////// split system into 2x2 block system

    *out << "Split matrix into 2x2 block matrix" << std::endl;

    // split fullA into A11,..., A22
    RCP<Epetra_CrsMatrix> A11;
    RCP<Epetra_CrsMatrix> A12;
    RCP<Epetra_CrsMatrix> A21;
    RCP<Epetra_CrsMatrix> A22;

    if(SplitMatrix2x2(epA,*velmap,*premap,A11,A12,A21,A22)==false)
      *out << "Problem with splitting matrix"<< std::endl;

    /////////////////////////////////////// transform Epetra objects to Xpetra (needed for MueLu)

    // build Xpetra objects from Epetra_CrsMatrix objects
    RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA11 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A11));
    RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA12 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A12));
    RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA21 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A21));
    RCP<Xpetra::CrsMatrix<Scalar,LO,GO,Node> > xA22 = rcp(new Xpetra::EpetraCrsMatrixT<GO,Node>(A22));

    /////////////////////////////////////// generate MapExtractor object

    std::vector<RCP<const Xpetra::Map<LO,GO,Node> > > xmaps;
    xmaps.push_back(xstridedvelmap);
    xmaps.push_back(xstridedpremap);

    RCP<const Xpetra::MapExtractor<Scalar,LO,GO,Node> > map_extractor = Xpetra::MapExtractorFactory<Scalar,LO,GO,Node>::Build(xstridedfullmap,xmaps);

    /////////////////////////////////////// build blocked transfer operator
    // using the map extractor
    RCP<Xpetra::BlockedCrsMatrix<Scalar,LO,GO,Node> > bOp = rcp(new Xpetra::BlockedCrsMatrix<Scalar,LO,GO,Node>(map_extractor,map_extractor,10));
    bOp->setMatrix(0,0,xA11);
    bOp->setMatrix(0,1,xA12);
    bOp->setMatrix(1,0,xA21);
    bOp->setMatrix(1,1,xA22);

    bOp->fillComplete();

    //////////////////////////////////////// prepare setup
    ParameterListInterpreter mueLuFactory(xmlFile, *comm);


    RCP<Hierarchy> H = mueLuFactory.CreateHierarchy();
    H->setDefaultVerbLevel(VERB_HIGH);
    H->SetMaxCoarseSize(maxCoarseSize);

    RCP<MueLu::Level> Finest = H->GetLevel(0);
    Finest->setDefaultVerbLevel(VERB_HIGH);
    Finest->Set("A",           rcp_dynamic_cast<Matrix>(bOp));


    ////////////////////////////////////////// prepare null space for A11
    RCP<MultiVector> nullspace11 = MultiVectorFactory::Build(xstridedvelmap, 2);  // this is a 2D standard null space
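    // The loop below builds two constant translational modes: vector i gets a 1.0 at the
    // i-th velocity dof of every node (stride nDofsPerNode-1 within the velocity map).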

    for (int i=0; i<nDofsPerNode-1; ++i) {
      ArrayRCP<Scalar> nsValues = nullspace11->getDataNonConst(i);
      int numBlocks = nsValues.size() / (nDofsPerNode - 1);
      for (int j=0; j< numBlocks; ++j) {
        nsValues[j*(nDofsPerNode - 1) + i] = 1.0;
      }
    }

    Finest->Set("Nullspace1",nullspace11);

    ////////////////////////////////////////// prepare null space for A22
    RCP<MultiVector> nullspace22 = MultiVectorFactory::Build(xstridedpremap, 1);  // constant null space for the pressure block
    ArrayRCP<Scalar> nsValues22 = nullspace22->getDataNonConst(0);
    for (int j=0; j< nsValues22.size(); ++j) {
      nsValues22[j] = 1.0;
    }

    Finest->Set("Nullspace2",nullspace22);

    /////////////////////////////////// BEGIN setup

    mueLuFactory.SetupHierarchy(*H);

    ///////////////////////////////////// END setup

    *out << std::endl;

    RCP<MultiVector> xLsg = MultiVectorFactory::Build(xstridedfullmap,1);

    // Use AMG directly as an iterative method
    {
      xLsg->putScalar( (SC) 0.0);

      // Epetra_Vector -> Xpetra::Vector
      RCP<Vector> xRhs = rcp(new Xpetra::EpetraVectorT<int,Node>(epv));

      // calculate initial (absolute) residual; with a zero initial guess this is just ||b||
      Array<ScalarTraits<SC>::magnitudeType> norms(1);
      xRhs->norm2(norms);
      *out << "||r_0|| = " << norms[0] << std::endl;

      // apply up to 100 multigrid iterations
      H->Iterate(*xRhs,*xLsg,100);

      // calculate and print residual
      RCP<MultiVector> xTmp = MultiVectorFactory::Build(xstridedfullmap,1);
      bOp->apply(*xLsg,*xTmp,NO_TRANS,(SC)1.0,(SC)0.0);
      xRhs->update((SC)-1.0,*xTmp,(SC)1.0);
      xRhs->norm2(norms);
      *out << "||r|| = " << norms[0] << std::endl;

    }

    // TODO: don't forget to add Aztec as prerequisite in CMakeLists.txt!
    //
    // Solve Ax = b using AMG as a preconditioner in AztecOO
    //
    {
      RCP<Epetra_Vector> X = rcp(new Epetra_Vector(epv->Map()));
      X->PutScalar(0.0);
      Epetra_LinearProblem epetraProblem(epA.get(), X.get(), epv.get());

      AztecOO aztecSolver(epetraProblem);
      aztecSolver.SetAztecOption(AZ_solver, AZ_gmres);

      MueLu::EpetraOperator aztecPrec(H);
      aztecSolver.SetPrecOperator(&aztecPrec);

      int maxIts = 50;
      double tol = 1e-8;

      aztecSolver.Iterate(maxIts, tol);
    }

    success = true;
  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  return ( success ? EXIT_SUCCESS : EXIT_FAILURE );
#else
  std::cout << "Epetra (and/or EpetraExt) are not available. Skip test." << std::endl;
  return EXIT_SUCCESS;
#endif
}
 EpetraCrsMatrixT<EpetraGlobalOrdinal>::EpetraCrsMatrixT(const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype, const Teuchos::RCP< Teuchos::ParameterList > &plist)
   : isFillResumed_(false)
 {
   Teuchos::Array<int> numEntriesPerRowToAlloc(NumEntriesPerRowToAlloc.begin(), NumEntriesPerRowToAlloc.end()); // convert array of "size_t" to array of "int"
   mtx_ = Teuchos::rcp(new Epetra_CrsMatrix(Copy, toEpetra(rowMap), toEpetra(colMap), numEntriesPerRowToAlloc.getRawPtr(), toEpetra(pftype)));
 }
/* this file is automatically generated - do not edit (see script/tpetra.py) */

#include "Xpetra_TpetraConfigDefs.hpp"

#include "Tpetra_CrsMatrix.hpp"

#include "Xpetra_CrsMatrix.hpp"
#include "Xpetra_TpetraMap.hpp"
#include "Xpetra_TpetraMultiVector.hpp"
#include "Xpetra_TpetraVector.hpp"
#include "Xpetra_TpetraCrsGraph.hpp"
//#include "Xpetra_TpetraRowMatrix.hpp"
#include "Xpetra_Exceptions.hpp"

namespace Xpetra {

  // TODO: move that elsewhere
  // template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalMatOps>
  // const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> toTpetraCrsMatrix(const Xpetra::DistObject<char, LocalOrdinal, GlobalOrdinal, Node> &);
  //

  template <class Scalar, class LocalOrdinal = int, class GlobalOrdinal = LocalOrdinal, class Node = Kokkos::DefaultNode::DefaultNodeType, class LocalMatOps = typename Kokkos::DefaultKernels<Scalar,LocalOrdinal,Node>::SparseOps>
  class TpetraCrsMatrix
    : public CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps>//, public TpetraRowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>
  {

    // The following typedefs are used by the XPETRA_DYNAMIC_CAST() macro.
    typedef TpetraCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> TpetraCrsMatrixClass;
    typedef TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> TpetraVectorClass;
    typedef TpetraImport<LocalOrdinal,GlobalOrdinal,Node> TpetraImportClass;
    typedef TpetraExport<LocalOrdinal,GlobalOrdinal,Node> TpetraExportClass;

  public:

    //! @name Constructor/Destructor Methods
    //@{

    //! Constructor specifying fixed number of entries for each row.
    TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, size_t maxNumEntriesPerRow, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
      : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), maxNumEntriesPerRow, toTpetra(pftype), params))) {  }

    //! Constructor specifying (possibly different) number of entries in each row.
    TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
      : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), NumEntriesPerRowToAlloc, toTpetra(pftype), params))) {  }

    //! Constructor specifying column Map and fixed number of entries for each row.
    TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, size_t maxNumEntriesPerRow, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
      : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), toTpetra(colMap), maxNumEntriesPerRow, toTpetra(pftype), params))) {  }

    //! Constructor specifying column Map and number of entries in each row.
    TpetraCrsMatrix(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rowMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap, const ArrayRCP< const size_t > &NumEntriesPerRowToAlloc, ProfileType pftype=DynamicProfile, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
      : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(rowMap), toTpetra(colMap), NumEntriesPerRowToAlloc, toTpetra(pftype), params))) {  }

    //! Constructor specifying a previously constructed graph.
    TpetraCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node, LocalMatOps > > &graph, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
      : mtx_(Teuchos::rcp(new Tpetra::CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps >(toTpetra(graph), params))) {  }



    //! Constructor for a fused import
    TpetraCrsMatrix(const Teuchos::RCP<const CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& sourceMatrix,
		    const Import<LocalOrdinal,GlobalOrdinal,Node> & importer,
		    const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap = Teuchos::null,
		    const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap = Teuchos::null,
		    const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null)
    {
      typedef Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MyTpetraCrsMatrix;
      XPETRA_DYNAMIC_CAST(const TpetraCrsMatrixClass, *sourceMatrix, tSourceMatrix, "Xpetra::TpetraCrsMatrix constructor only accepts Xpetra::TpetraCrsMatrix as the input argument.");//TODO: remove and use toTpetra()
      RCP< const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> > v = tSourceMatrix.getTpetra_CrsMatrix();

      RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myDomainMap = domainMap!=Teuchos::null ? toTpetra(domainMap) : Teuchos::null;
      RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myRangeMap  = rangeMap!=Teuchos::null  ? toTpetra(rangeMap)  : Teuchos::null;
      mtx_=Tpetra::importAndFillCompleteCrsMatrix<MyTpetraCrsMatrix>(tSourceMatrix.getTpetra_CrsMatrix(),toTpetra(importer),myDomainMap,myRangeMap,params);
      bool restrictComm=false;
      if(!params.is_null()) restrictComm = params->get("Restrict Communicator",restrictComm);
      if(restrictComm && mtx_->getRowMap().is_null()) mtx_=Teuchos::null;

    }

    //! Constructor for a fused export
    TpetraCrsMatrix(const Teuchos::RCP<const CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& sourceMatrix,
		    const Export<LocalOrdinal,GlobalOrdinal,Node> & exporter,
		    const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap = Teuchos::null,
		    const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap = Teuchos::null,
		    const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null)
    {
      typedef Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MyTpetraCrsMatrix;
      XPETRA_DYNAMIC_CAST(const TpetraCrsMatrixClass, *sourceMatrix, tSourceMatrix, "Xpetra::TpetraCrsMatrix constructor only accepts Xpetra::TpetraCrsMatrix as the input argument.");//TODO: remove and use toTpetra()
      RCP< const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> > v = tSourceMatrix.getTpetra_CrsMatrix();

      RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myDomainMap = domainMap!=Teuchos::null ? toTpetra(domainMap) : Teuchos::null;
      RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > myRangeMap  = rangeMap!=Teuchos::null  ? toTpetra(rangeMap)  : Teuchos::null;
      mtx_=Tpetra::exportAndFillCompleteCrsMatrix<MyTpetraCrsMatrix>(tSourceMatrix.getTpetra_CrsMatrix(),toTpetra(exporter),myDomainMap,myRangeMap,params);

    }

    //! Destructor.
    virtual ~TpetraCrsMatrix() {  }

    //@}

    //! @name Insertion/Removal Methods
    //@{

    //! Insert matrix entries, using global IDs.
    void insertGlobalValues(GlobalOrdinal globalRow, const ArrayView< const GlobalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::insertGlobalValues"); mtx_->insertGlobalValues(globalRow, cols, vals); }

    //! Insert matrix entries, using local IDs.
    void insertLocalValues(LocalOrdinal localRow, const ArrayView< const LocalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::insertLocalValues"); mtx_->insertLocalValues(localRow, cols, vals); }

    //! Replace matrix entries, using global IDs.
    void replaceGlobalValues(GlobalOrdinal globalRow, const ArrayView< const GlobalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::replaceGlobalValues"); mtx_->replaceGlobalValues(globalRow, cols, vals); }

    //! Replace matrix entries, using local IDs.
    void replaceLocalValues(LocalOrdinal localRow, const ArrayView< const LocalOrdinal > &cols, const ArrayView< const Scalar > &vals) { XPETRA_MONITOR("TpetraCrsMatrix::replaceLocalValues"); mtx_->replaceLocalValues(localRow, cols, vals); }

    //! Set all matrix entries equal to scalarThis.
    void setAllToScalar(const Scalar &alpha) { XPETRA_MONITOR("TpetraCrsMatrix::setAllToScalar"); mtx_->setAllToScalar(alpha); }

    //! Scale the current values of a matrix, this = alpha*this.
    void scale(const Scalar &alpha) { XPETRA_MONITOR("TpetraCrsMatrix::scale"); mtx_->scale(alpha); }

    //! Allocates and returns ArrayRCPs of the Crs arrays --- This is an Xpetra-only routine.
    //! \warning This is an expert-only routine and should not be called from user code.
    void allocateAllValues(size_t numNonZeros,ArrayRCP<size_t> & rowptr, ArrayRCP<LocalOrdinal> & colind, ArrayRCP<Scalar> & values)
    { XPETRA_MONITOR("TpetraCrsMatrix::allocateAllValues"); rowptr.resize(getNodeNumRows()+1); colind.resize(numNonZeros); values.resize(numNonZeros);}
    //! @name Constructor/Destructor
    //@{
    AMGXOperator(const Teuchos::RCP<Tpetra::CrsMatrix<SC,LO,GO,NO> > &inA, Teuchos::ParameterList &paramListIn) {
      RCP<const Teuchos::Comm<int> > comm = inA->getRowMap()->getComm();
      int numProcs = comm->getSize();
      int myRank   = comm->getRank();

      RCP<Teuchos::Time> amgxTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: initialize");
      amgxTimer->start();
      // Initialize
      AMGX_SAFE_CALL(AMGX_initialize());
      AMGX_SAFE_CALL(AMGX_initialize_plugins());

      /*system*/
      //AMGX_SAFE_CALL(AMGX_register_print_callback(&print_callback));
      AMGX_SAFE_CALL(AMGX_install_signal_handler());
      Teuchos::ParameterList configs = paramListIn.sublist("amgx:params", true);
      if (configs.isParameter("json file")) {
        AMGX_SAFE_CALL(AMGX_config_create_from_file(&Config_, (const char *) &configs.get<std::string>("json file")[0]));
      } else {
        std::ostringstream oss;
        oss << "";
        ParameterList::ConstIterator itr;
        for (itr = configs.begin(); itr != configs.end(); ++itr) {
          const std::string&    name  = configs.name(itr);
          const ParameterEntry& entry = configs.entry(itr);
          oss << name << "=" << filterValueToString(entry) << ", ";
        }
        oss << "\0";
        std::string configString = oss.str();
        if (configString == "") {
          //print msg that using defaults
          //GetOStream(Warnings0) << "Warning: No configuration parameters specified, using default AMGX configuration parameters. \n";
        }
        AMGX_SAFE_CALL(AMGX_config_create(&Config_, configString.c_str()));
      }

      // TODO: we probably need to add "exception_handling=1" to the parameter list
      // to switch on internal error handling (with no need for AMGX_SAFE_CALL)

#define NEW_COMM
#ifdef NEW_COMM
      // NOTE: MPI communicator used in AMGX_resources_create must exist in the scope of AMGX_matrix_comm_from_maps_one_ring
      // FIXME: fix for serial comm
      RCP<const Teuchos::MpiComm<int> > tmpic = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm->duplicate());
      TEUCHOS_TEST_FOR_EXCEPTION(tmpic.is_null(), Exceptions::RuntimeError, "Communicator is not MpiComm");

      RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
      MPI_Comm mpiComm = *rawMpiComm;
#endif

      // Construct AMGX resources
      if (numProcs == 1) {
        AMGX_resources_create_simple(&Resources_, Config_);

      } else {
        int numGPUDevices;
        cudaGetDeviceCount(&numGPUDevices);
        int device[] = {(comm->getRank() % numGPUDevices)};

        AMGX_config_add_parameters(&Config_, "communicator=MPI");
#ifdef NEW_COMM
        AMGX_resources_create(&Resources_, Config_, &mpiComm, 1/* number of GPU devices utilized by this rank */, device);
#else
        AMGX_resources_create(&Resources_, Config_, MPI_COMM_WORLD, 1/* number of GPU devices utilized by this rank */, device);
#endif
      }

      AMGX_Mode mode = AMGX_mode_dDDI;
      AMGX_solver_create(&Solver_, Resources_, mode,  Config_);
      AMGX_matrix_create(&A_,      Resources_, mode);
      AMGX_vector_create(&X_,      Resources_, mode);
      AMGX_vector_create(&Y_,      Resources_, mode);

      amgxTimer->stop();
      amgxTimer->incrementNumCalls();

      std::vector<int> amgx2muelu;

      // Construct AMGX communication pattern
      if (numProcs > 1) {
        RCP<const Tpetra::Import<LO,GO> > importer = inA->getCrsGraph()->getImporter();

        TEUCHOS_TEST_FOR_EXCEPTION(importer.is_null(), MueLu::Exceptions::RuntimeError, "The matrix A has no Import object.");

        Tpetra::Distributor distributor = importer->getDistributor();

        Array<int> sendRanks = distributor.getImagesTo();
        Array<int> recvRanks = distributor.getImagesFrom();

        std::sort(sendRanks.begin(), sendRanks.end());
        std::sort(recvRanks.begin(), recvRanks.end());

        bool match = true;
        if (sendRanks.size() != recvRanks.size()) {
          match = false;
        } else {
          for (int i = 0; i < sendRanks.size(); i++) {
            if (recvRanks[i] != sendRanks[i]) {
              match = false;
              break;
            }
          }
        }
        TEUCHOS_TEST_FOR_EXCEPTION(!match, MueLu::Exceptions::RuntimeError, "AMGX requires that the processors that we send to and receive from are the same. "
                                   "This is not the case: we send to {" << sendRanks << "} and receive from {" << recvRanks << "}");

        int        num_neighbors = sendRanks.size();  // does not include the calling process
        const int* neighbors     = &sendRanks[0];

        // Later on, we'll have to organize the send and recv data by PIDs,
        // i.e, a vector V of vectors, where V[i] is PID i's vector of data.
        // Hence we need to be able to quickly look up  an array index
        // associated with each PID.
        Tpetra::Details::HashTable<int,int> hashTable(3*num_neighbors);
        for (int i = 0; i < num_neighbors; i++)
          hashTable.add(neighbors[i], i);

        // Get some information out
        ArrayView<const int> exportLIDs = importer->getExportLIDs();
        ArrayView<const int> exportPIDs = importer->getExportPIDs();
        Array<int> importPIDs;
        Tpetra::Import_Util::getPids(*importer, importPIDs, true/* make local -1 */);

        // Construct the reordering for AMGX as in AMGX_matrix_upload_all documentation
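        // The resulting local ordering is: interior (non-exported) rows first, then rows
        // exported to neighbors, then ghost entries grouped by owning neighbor rank.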
        RCP<const Map> rowMap = inA->getRowMap();
        RCP<const Map> colMap = inA->getColMap();

        int N = rowMap->getNodeNumElements(), Nc = colMap->getNodeNumElements();
        muelu2amgx_.resize(Nc, -1);

        int numUniqExports = 0;
        for (int i = 0; i < exportLIDs.size(); i++)
          if (muelu2amgx_[exportLIDs[i]] == -1) {
            numUniqExports++;
            muelu2amgx_[exportLIDs[i]] = -2;
          }

        int localOffset = 0, exportOffset = N - numUniqExports;
        // Go through exported LIDs and put them at the end of LIDs
        for (int i = 0; i < exportLIDs.size(); i++)
          if (muelu2amgx_[exportLIDs[i]] < 0) // exportLIDs are not unique
            muelu2amgx_[exportLIDs[i]] = exportOffset++;
        // Go through all non-export LIDs, and put them at the beginning of LIDs
        for (int i = 0; i < N; i++)
          if (muelu2amgx_[i] == -1)
            muelu2amgx_[i] = localOffset++;
        // Go through the tail (imported LIDs), and order those by neighbors
        int importOffset = N;
        for (int k = 0; k < num_neighbors; k++)
          for (int i = 0; i < importPIDs.size(); i++)
            if (importPIDs[i] != -1 && hashTable.get(importPIDs[i]) == k)
              muelu2amgx_[i] = importOffset++;

        amgx2muelu.resize(muelu2amgx_.size());
        for (int i = 0; i < muelu2amgx_.size(); i++)
          amgx2muelu[muelu2amgx_[i]] = i;

        // Construct send arrays
        std::vector<std::vector<int> > sendDatas (num_neighbors);
        std::vector<int>               send_sizes(num_neighbors, 0);
        for (int i = 0; i < exportPIDs.size(); i++) {
          int index = hashTable.get(exportPIDs[i]);
          sendDatas [index].push_back(muelu2amgx_[exportLIDs[i]]);
          send_sizes[index]++;
        }
        // FIXME: sendDatas must be sorted (based on GIDs)

        std::vector<const int*> send_maps(num_neighbors);
        for (int i = 0; i < num_neighbors; i++)
          send_maps[i] = &(sendDatas[i][0]);

        // Debugging
        printMaps(comm, sendDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "send_map_vector");

        // Construct recv arrays
        std::vector<std::vector<int> > recvDatas (num_neighbors);
        std::vector<int>               recv_sizes(num_neighbors, 0);
        for (int i = 0; i < importPIDs.size(); i++)
          if (importPIDs[i] != -1) {
            int index = hashTable.get(importPIDs[i]);
            recvDatas [index].push_back(muelu2amgx_[i]);
            recv_sizes[index]++;
          }
        // FIXME: recvDatas must be sorted (based on GIDs)

        std::vector<const int*> recv_maps(num_neighbors);
        for (int i = 0; i < num_neighbors; i++)
          recv_maps[i] = &(recvDatas[i][0]);

        // Debugging
        printMaps(comm, recvDatas, amgx2muelu, neighbors, *importer->getTargetMap(), "recv_map_vector");

        AMGX_SAFE_CALL(AMGX_matrix_comm_from_maps_one_ring(A_, 1, num_neighbors, neighbors, &send_sizes[0], &send_maps[0], &recv_sizes[0], &recv_maps[0]));

        AMGX_vector_bind(X_, A_);
        AMGX_vector_bind(Y_, A_);
      }

      RCP<Teuchos::Time> matrixTransformTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transform matrix");
      matrixTransformTimer->start();

      ArrayRCP<const size_t> ia_s;
      ArrayRCP<const int>    ja;
      ArrayRCP<const double> a;
      inA->getAllValues(ia_s, ja, a);

      ArrayRCP<int> ia(ia_s.size());
      for (int i = 0; i < ia.size(); i++)
        ia[i] = Teuchos::as<int>(ia_s[i]);

      N_      = inA->getNodeNumRows();
      int nnz = inA->getNodeNumEntries();

      matrixTransformTimer->stop();
      matrixTransformTimer->incrementNumCalls();


      // Upload matrix
      // TODO Do we need to pin memory here through AMGX_pin_memory?
      RCP<Teuchos::Time> matrixTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer matrix  CPU->GPU");
      matrixTimer->start();
      if (numProcs == 1) {
        AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia[0], &ja[0], &a[0], NULL);

      } else {
        // Transform the matrix
        std::vector<int>    ia_new(ia.size());
        std::vector<int>    ja_new(ja.size());
        std::vector<double> a_new (a.size());

        ia_new[0] = 0;
        for (int i = 0; i < N_; i++) {
          int oldRow = amgx2muelu[i];

          ia_new[i+1] = ia_new[i] + (ia[oldRow+1] - ia[oldRow]);

          for (int j = ia[oldRow]; j < ia[oldRow+1]; j++) {
            int offset = j - ia[oldRow];
            ja_new[ia_new[i] + offset] = muelu2amgx_[ja[j]];
            a_new [ia_new[i] + offset] = a[j];
          }
          // Do bubble sort on two arrays
          // NOTE: There are multiple possible optimizations here (even of bubble sort)
          bool swapped;
          do {
            swapped = false;

            for (int j = ia_new[i]; j < ia_new[i+1]-1; j++)
              if (ja_new[j] > ja_new[j+1]) {
                std::swap(ja_new[j], ja_new[j+1]);
                std::swap(a_new [j], a_new [j+1]);
                swapped = true;
              }
          } while (swapped == true);
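          // NOTE (hedged, untested sketch): an alternative to the bubble sort above would be to
          // sort an index permutation for this row and gather both arrays through it, e.g.
          // (C++11 lambda, needs <numeric> and <algorithm>):
          //   std::vector<int> p(ia_new[i+1] - ia_new[i]);
          //   std::iota(p.begin(), p.end(), ia_new[i]);
          //   std::sort(p.begin(), p.end(), [&](int x, int y) { return ja_new[x] < ja_new[y]; });
          //   std::vector<int>    ja_tmp(p.size());
          //   std::vector<double> a_tmp (p.size());
          //   for (size_t q = 0; q < p.size(); q++) { ja_tmp[q] = ja_new[p[q]]; a_tmp[q] = a_new[p[q]]; }
          //   std::copy(ja_tmp.begin(), ja_tmp.end(), ja_new.begin() + ia_new[i]);
          //   std::copy(a_tmp.begin(),  a_tmp.end(),  a_new.begin()  + ia_new[i]);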
        }

        AMGX_matrix_upload_all(A_, N_, nnz, 1, 1, &ia_new[0], &ja_new[0], &a_new[0], NULL);
      }
      matrixTimer->stop();
      matrixTimer->incrementNumCalls();

      domainMap_ = inA->getDomainMap();
      rangeMap_  = inA->getRangeMap();

      RCP<Teuchos::Time> realSetupTimer = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: real setup");
      realSetupTimer->start();
      AMGX_solver_setup(Solver_, A_);
      realSetupTimer->stop();
      realSetupTimer->incrementNumCalls();

      vectorTimer1_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vectors CPU->GPU");
      vectorTimer2_ = Teuchos::TimeMonitor::getNewTimer("MueLu: AMGX: transfer vector  GPU->CPU");
    }
  template <typename Adapter>
  void globalWeightedCutsByPart(
    const RCP<const Environment> &env,
    const RCP<const Comm<int> > &comm,
    const RCP<const GraphModel<typename Adapter::base_adapter_t> > &graph,
    const ArrayView<const typename Adapter::part_t> &part,
    typename Adapter::part_t &numParts,
    ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > > &metrics,
    ArrayRCP<typename Adapter::scalar_t> &globalSums)
{
  env->debug(DETAILED_STATUS, "Entering globalWeightedCutsByPart");
  //////////////////////////////////////////////////////////
  // Initialize return values

  numParts = 0;

  int ewgtDim = graph->getNumWeightsPerEdge();

  int numMetrics = 1;                   // "edge cuts"
  if (ewgtDim) numMetrics += ewgtDim;   // "weight n"

  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::gno_t gno_t;
  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::node_t node_t;
  typedef typename Adapter::part_t part_t;
  typedef StridedData<lno_t, scalar_t> input_t;

  typedef GraphMetrics<scalar_t> mv_t;
  typedef Tpetra::CrsMatrix<part_t,lno_t,gno_t,node_t>  sparse_matrix_type;
  typedef Tpetra::Vector<part_t,lno_t,gno_t,node_t>     vector_t;
  typedef Tpetra::Map<lno_t, gno_t, node_t>                map_type;
  typedef Tpetra::global_size_t GST;
  const GST INVALID = Teuchos::OrdinalTraits<GST>::invalid ();

  using Teuchos::as;

  // add some more metrics to the array
  typedef typename ArrayRCP<RCP<BaseClassMetrics<typename Adapter::scalar_t> > >::size_type array_size_type;
  metrics.resize( metrics.size() + numMetrics );

  for (array_size_type n = metrics.size() - numMetrics; n < metrics.size(); ++n) {
    mv_t * newMetric = new mv_t;                                  // allocate the new memory
    env->localMemoryAssertion(__FILE__, __LINE__, 1, newMetric);  // check errors
    metrics[n] = rcp(newMetric);                                  // create the new members
  }
  array_size_type next = metrics.size() - numMetrics; // MDM - most likely temporary, to preserve the existing format: we are now filling a larger array, so the first new entry may not be at index 0


  //////////////////////////////////////////////////////////
  // Figure out the global number of parts in use.
  // Verify number of vertex weights is the same everywhere.

  lno_t localNumObj = part.size();
  part_t localNum[2], globalNum[2];
  localNum[0] = static_cast<part_t>(ewgtDim);
  localNum[1] = 0;

  for (lno_t i=0; i < localNumObj; i++)
    if (part[i] > localNum[1]) localNum[1] = part[i];

  try{
    reduceAll<int, part_t>(*comm, Teuchos::REDUCE_MAX, 2,
      localNum, globalNum);
  }
  Z2_THROW_OUTSIDE_ERROR(*env)

  env->globalBugAssertion(__FILE__,__LINE__,
    "inconsistent number of edge weights",
    globalNum[0] == localNum[0], DEBUG_MODE_ASSERTION, comm);

  part_t nparts = globalNum[1] + 1;

  part_t globalSumSize = nparts * numMetrics;
  scalar_t * sumBuf = new scalar_t [globalSumSize];
  env->localMemoryAssertion(__FILE__, __LINE__, globalSumSize, sumBuf);
  globalSums = arcp(sumBuf, 0, globalSumSize);

  //////////////////////////////////////////////////////////
  // Calculate the local totals by part.

  scalar_t *localBuf = new scalar_t [globalSumSize];
  env->localMemoryAssertion(__FILE__,__LINE__,globalSumSize,localBuf);
  memset(localBuf, 0, sizeof(scalar_t) * globalSumSize);

  scalar_t *cut = localBuf;              // # of cuts

  ArrayView<const gno_t> Ids;
  ArrayView<input_t> vwgts;
  //size_t nv =
  graph->getVertexList(Ids, vwgts);

  ArrayView<const gno_t> edgeIds;
  ArrayView<const lno_t> offsets;
  ArrayView<input_t> wgts;
  //size_t numLocalEdges =
  graph->getEdgeList(edgeIds, offsets, wgts);
  // **************************************************************************
  // *************************** BUILD MAP FOR ADJS ***************************
  // **************************************************************************

  RCP<const map_type> vertexMapG;

  // Build a list of the global vertex ids...
  gno_t min = std::numeric_limits<gno_t>::max();
  size_t maxcols = 0;
  for (lno_t i = 0; i < localNumObj; ++i) {
    if (Ids[i] < min) min = Ids[i];
    size_t ncols = offsets[i+1] - offsets[i];
    if (ncols > maxcols) maxcols = ncols;
  }

  gno_t gmin;
  Teuchos::reduceAll<int, gno_t>(*comm,Teuchos::REDUCE_MIN,1,&min,&gmin);

  //Generate Map for vertex
  vertexMapG = rcp(new map_type(INVALID, Ids, gmin, comm));

  // **************************************************************************
  // ************************** BUILD GRAPH FOR ADJS **************************
  // **************************************************************************

  //MD:Zoltan Directory could be used instead of adjMatrix.

  RCP<sparse_matrix_type> adjsMatrix;

  // Construct the Tpetra::CrsMatrix holding the adjacencies.
  adjsMatrix = rcp (new sparse_matrix_type (vertexMapG, 0));

  Array<part_t> justOneA(maxcols, 1);

  for (lno_t localElement=0; localElement<localNumObj; ++localElement){
    // Insert all columns for global row Ids[localElement]
    size_t ncols = offsets[localElement+1] - offsets[localElement];
    adjsMatrix->insertGlobalValues(Ids[localElement],
                                   edgeIds(offsets[localElement], ncols),
                                   justOneA(0, ncols));
  }

  // Fill-complete the adjacency matrix
  adjsMatrix->fillComplete ();

  // Build a vector holding each local vertex's part assignment
  RCP<vector_t> scaleVec = Teuchos::rcp( new vector_t(vertexMapG,false) );
  for (lno_t localElement=0; localElement<localNumObj; ++localElement) {
    scaleVec->replaceLocalValue(localElement,part[localElement]);
  }

  // Postmultiply adjsMatrix by part
  adjsMatrix->rightScale(*scaleVec);
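  // After rightScale, the stored value in row i, column j equals part[j] (all entries were
  // inserted as 1), so comparing it against part[i] below identifies cut edges.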
  Array<gno_t> Indices;
  Array<part_t> Values;

  for (lno_t i=0; i < localNumObj; i++) {
    const gno_t globalRow = Ids[i];
    size_t NumEntries = adjsMatrix->getNumEntriesInGlobalRow (globalRow);
    Indices.resize (NumEntries);
    Values.resize (NumEntries);
    adjsMatrix->getGlobalRowCopy (globalRow,Indices(),Values(),NumEntries);

    for (size_t j=0; j < NumEntries; j++)
      if (part[i] != Values[j])
        cut[part[i]]++;
  }

  if (numMetrics > 1) {

    scalar_t *wgt = localBuf + nparts; // weight 0

    // This code assumes the solution has the part ordered the
    // same way as the user input.  (Bug 5891 is resolved.)
    for (int edim = 0; edim < ewgtDim; edim++){
      for (lno_t i=0; i < localNumObj; i++) {
        const gno_t globalRow = Ids[i];
        size_t NumEntries = adjsMatrix->getNumEntriesInGlobalRow (globalRow);
        Indices.resize (NumEntries);
        Values.resize (NumEntries);
        adjsMatrix->getGlobalRowCopy (globalRow,Indices(),Values(),NumEntries);

        for (size_t j=0; j < NumEntries; j++)
          if (part[i] != Values[j])
            wgt[part[i]] += wgts[edim][offsets[i] + j];
      }
      wgt += nparts;         // individual weights
    }
  }

  //////////////////////////////////////////////////////////
  // Obtain global totals by part.

  try{
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM, globalSumSize,
      localBuf, sumBuf);
  }
  Z2_THROW_OUTSIDE_ERROR(*env);

  delete [] localBuf;

  //////////////////////////////////////////////////////////
  // Global max and sum over all parts

  cut = sumBuf;                     // # of cuts
  scalar_t max=0, sum=0;

  ArrayView<scalar_t> cutVec(cut, nparts);
  getStridedStats<scalar_t>(cutVec, 1, 0, max, sum);

  metrics[next]->setName("edge cuts");
  metrics[next]->setMetricValue("global maximum", max);
  metrics[next]->setMetricValue("global sum", sum);

  next++;

  if (numMetrics > 1){
    scalar_t *wgt = sumBuf + nparts;        // weight 0

    for (int edim=0; edim < ewgtDim; edim++){
      ArrayView<scalar_t> fromVec(wgt, nparts);
      getStridedStats<scalar_t>(fromVec, 1, 0, max, sum);

      std::ostringstream oss;
      oss << "weight " << edim;

      metrics[next]->setName(oss.str());
      metrics[next]->setMetricValue("global maximum", max);
      metrics[next]->setMetricValue("global sum", sum);

      next++;
      wgt += nparts;       // individual weights
    }
  }

  numParts = nparts;

  env->debug(DETAILED_STATUS, "Exiting globalWeightedCutsByPart");
}
  template <class LocalOrdinal, class GlobalOrdinal, class Node, class LocalMatOps>
  void AggregationPhase2aAlgorithm<LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::BuildAggregates(const ParameterList& params, const GraphBase& graph, Aggregates& aggregates, std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes) const {
    Monitor m(*this, "BuildAggregates");

    LO minNodesPerAggregate = params.get<LO>("aggregation: min agg size");
    LO maxNodesPerAggregate = params.get<LO>("aggregation: max agg size");

    const LO  numRows = graph.GetNodeNumVertices();
    const int myRank  = graph.GetComm()->getRank();

    ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
    ArrayRCP<LO> procWinner   = aggregates.GetProcWinner()  ->getDataNonConst(0);

    LO numLocalAggregates = aggregates.GetNumAggregates();

    LO numLocalNodes      = procWinner.size();
    LO numLocalAggregated = numLocalNodes - numNonAggregatedNodes;

    const double aggFactor = 0.5;
    double       factor    = as<double>(numLocalAggregated)/(numLocalNodes+1);
    factor = pow(factor, aggFactor);
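    // factor = sqrt(fraction of local nodes already aggregated); below, a tentative aggregate is
    // accepted only if it covers more than factor*numNeighbors of the root candidate's neighbours,
    // so the test is stricter when more nodes have already been aggregated by earlier phases.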

    int              aggIndex = -1;
    size_t           aggSize  =  0;
    std::vector<int> aggList(graph.getNodeMaxNumRowEntries());

    for (LO rootCandidate = 0; rootCandidate < numRows; rootCandidate++) {
      if (aggStat[rootCandidate] != READY)
        continue;

      aggSize = 0;

      ArrayView<const LocalOrdinal> neighOfINode = graph.getNeighborVertices(rootCandidate);

      LO numNeighbors = 0;
      for (int j = 0; j < neighOfINode.size(); j++) {
        LO neigh = neighOfINode[j];

        if (neigh != rootCandidate) {
          if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == READY) {
            // If aggregate size does not exceed max size, add node to the tentative aggregate
            // NOTE: We do not exit the loop over all neighbours, since we still have to
            //       count all aggregated neighbour nodes for the aggregation criterion
            // NOTE: We check the maximum aggregate size here. If we checked it below, together
            //       with the other criteria, aggregates that are too big would never be accepted.
            if (aggSize < as<size_t>(maxNodesPerAggregate))
              aggList[aggSize++] = neigh;
          }

          numNeighbors++;
        }
      }

      // NOTE: ML uses a hardcoded value 3 instead of MinNodesPerAggregate
      if (aggSize > as<size_t>(minNodesPerAggregate) &&
          aggSize > factor*numNeighbors) {
        // Accept new aggregate
        // rootCandidate becomes the root of the newly formed aggregate
        aggregates.SetIsRoot(rootCandidate);
        aggIndex = numLocalAggregates++;

        for (size_t k = 0; k < aggSize; k++) {
          aggStat     [aggList[k]] = AGGREGATED;
          vertex2AggId[aggList[k]] = aggIndex;
          procWinner  [aggList[k]] = myRank;
        }

        numNonAggregatedNodes -= aggSize;
      }
    }

    // update aggregate object
    aggregates.SetNumAggregates(numLocalAggregates);
  }
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
    FactoryMonitor m(*this, "Build", currentLevel);

    const Teuchos::ParameterList & pL = GetParameterList();
    // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
    // TODO (JG): I don't really know if we want to do this.
    const int    startLevel          = pL.get<int>   ("repartition: start level");
    const LO     minRowsPerProcessor = pL.get<LO>    ("repartition: min rows per proc");
    const double nonzeroImbalance    = pL.get<double>("repartition: max imbalance");
    const bool   remapPartitions     = pL.get<bool>  ("repartition: remap parts");

    // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
    RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");

    // ======================================================================================================
    // Determine whether partitioning is needed
    // ======================================================================================================
    // NOTE: most tests include some global communication, which is why we currently only do tests until we make
    // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
    // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.

    // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
    if (currentLevel.GetLevelID() < startLevel) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" <<
          "\n  current level = " << Teuchos::toString(currentLevel.GetLevelID()) <<
          ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    RCP<const Map> rowMap = A->getRowMap();

    // NOTE: Teuchos::MPIComm::duplicate() calls MPI_Bcast inside, so this is
    // a synchronization point. However, as we do MueLu_sumAll afterwards anyway, it
    // does not matter.
    RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
    RCP<const Teuchos::Comm<int> > comm     = origComm->duplicate();

    // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
    // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
    // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
    {
      int numActiveProcesses = 0;
      MueLu_sumAll(comm, Teuchos::as<int>((A->getNodeNumRows() > 0) ? 1 : 0), numActiveProcesses);

      if (numActiveProcesses == 1) {
        GetOStream(Statistics0) << "Repartitioning?  NO:" <<
            "\n  # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    bool test3 = false, test4 = false;
    std::string msg3, msg4;

    // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
    // NOTE: Test2 ensures that repartitioning is not done when there is only one processor (it may or may not satisfy Test3)
    if (minRowsPerProcessor > 0) {
      LO numMyRows = Teuchos::as<LO>(A->getNodeNumRows()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
      LO haveFewRows = (numMyRows < minRowsPerProcessor ? 1 : 0), numWithFewRows = 0;
      MueLu_sumAll(comm, haveFewRows, numWithFewRows);
      MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

      // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
      // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
      // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
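      // A hedged sketch of that variant, following the description above (repartition only when
      // the count of under-populated processes exceeds the threshold); note this is not what the
      // code below currently does, and numProcs is only obtained further down, so comm->getSize()
      // is used here instead:
      //   if (numWithFewRows > 0.2 * comm->getSize())
      //     test3 = true;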
      if (numWithFewRows > 0)
        test3 = true;

      msg3 = "\n  min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcessor);
    }

    // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
    if (!test3) {
      GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getNodeNumEntries());
      MueLu_maxAll(comm, numMyNnz,                           maxNnz);
      MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
      double imbalance = Teuchos::as<double>(maxNnz)/minNnz;

      if (imbalance > nonzeroImbalance)
        test4 = true;

      msg4 = "\n  nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
    }

    if (!test3 && !test4) {
      GetOStream(Statistics0) << "Repartitioning?  NO:" << msg3 + msg4 << std::endl;

      Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
      return;
    }

    GetOStream(Statistics0) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

    GO                     indexBase = rowMap->getIndexBase();
    Xpetra::UnderlyingLib  lib       = rowMap->lib();
    int myRank   = comm->getRank();
    int numProcs = comm->getSize();

    RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
    TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();

    // ======================================================================================================
    // Calculate number of partitions
    // ======================================================================================================
    // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
    // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
    GO numPartitions;
    if (currentLevel.IsAvailable("number of partitions")) {
      numPartitions = currentLevel.Get<GO>("number of partitions");
      GetOStream(Warnings0) << "Using user-provided \"number of partitions\", the performance is unknown" << std::endl;

    } else {
      if (Teuchos::as<GO>(A->getGlobalNumRows()) < minRowsPerProcessor) {
        // System is too small, migrate it to a single processor
        numPartitions = 1;

      } else {
        // Make sure that each processor has approximately minRowsPerProcessor
        numPartitions = A->getGlobalNumRows() / minRowsPerProcessor;
      }
      numPartitions = std::min(numPartitions, Teuchos::as<GO>(numProcs));

      currentLevel.Set("number of partitions", numPartitions, NoFactory::get());
    }
    GetOStream(Statistics0) << "Number of partitions to use = " << numPartitions << std::endl;

    // ======================================================================================================
    // Construct decomposition vector
    // ======================================================================================================
    RCP<GOVector> decomposition;
    if (numPartitions == 1) {
      // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
      // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
      // in Zoltan[12]Interface).
      // TODO: We can probably skip more work in this case (like building all extra data structures)
      GetOStream(Warnings0) << "Only one partition: Skip call to the repartitioner." << std::endl;
      decomposition = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(A->getRowMap(), true);

    } else {
      decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");

      if (decomposition.is_null()) {
        GetOStream(Warnings0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
        Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
        return;
      }
    }

    // ======================================================================================================
    // Remap if necessary
    // ======================================================================================================
    // From a user perspective, we want user to not care about remapping, thinking of it as only a performance feature.
    // There are two problems, however.
    // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
    //     This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
    //     each partition is the same.
    // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
    //     Let us consider the following example. Assume that without remapping processor 0 would have
    //     GIDs {0,1,2} and processor 1 GIDs {3,4,5}, while with remapping processor 0 would contain {3,4,5} and
    //     processor 1 {0,1,2}. Now, when we run the repartitioning algorithm on the next level (say Zoltan1 RCB), the
    //     result may depend on whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
    //     algorithm can resolve these differently. For instance, running
    //         mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
    //     with
    //         <ParameterList name="MueLu">
    //           <Parameter name="coarse: max size"                type="int"      value="1"/>
    //           <Parameter name="repartition: enable"             type="bool"     value="true"/>
    //           <Parameter name="repartition: min rows per proc"  type="int"      value="2"/>
    //           <ParameterList name="level 1">
    //             <Parameter name="repartition: remap parts"      type="bool"     value="false/true"/>
    //           </ParameterList>
    //         </ParameterList>
    //     produces different repartitioning for level 2.
    //     This different repartitioning may then escalate into different aggregation for the next level.
    //
    // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
    // Fixing (2) is more complicated.
    // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
    // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
    //     the same order of GIDs. To achieve that, the newly created subcommunicator must conform to that order. For
    // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
    // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
    //     This rank permutation requires help from Epetra/Tpetra, neither of which has such an API in place.
    // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in
    // MPI_COMM_WORLD, which may lead to issues for logging.
    if (remapPartitions) {
      SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);

      DeterminePartitionPlacement(*A, *decomposition, numPartitions);
    }

    // ======================================================================================================
    // Construct importer
    // ======================================================================================================
    // At this point, the following is true:
    //  * Each processors owns 0 or 1 partitions
    //  * If a processor owns a partition, that partition number is equal to the processor rank
    //  * The decomposition vector contains the partitions ids that the corresponding GID belongs to

    ArrayRCP<const GO> decompEntries;
    if (decomposition->getLocalLength() > 0)
      decompEntries = decomposition->getData(0);

#ifdef HAVE_MUELU_DEBUG
    // Test range of partition ids
    int incorrectRank = -1;
    for (int i = 0; i < decompEntries.size(); i++)
      if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
        incorrectRank = myRank;
        break;
      }

    int incorrectGlobalRank = -1;
    MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
    TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number that is out of range");
#endif

    Array<GO> myGIDs;
    myGIDs.reserve(decomposition->getLocalLength());

    // Step 0: Construct mapping
    //    part number -> GIDs I own which belong to this part
    // NOTE: my own part GIDs are not part of the map
    typedef std::map<GO, Array<GO> > map_type;
    map_type sendMap;
    for (LO i = 0; i < decompEntries.size(); i++) {
      GO id  = decompEntries[i];
      GO GID = rowMap->getGlobalElement(i);

      if (id == myRank)
        myGIDs     .push_back(GID);
      else
        sendMap[id].push_back(GID);
    }
    decompEntries = Teuchos::null;

    if (IsPrint(Statistics2)) {
      GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
      MueLu_sumAll(comm,numLocalKept, numGlobalKept);
      GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
    }

    int numSend = sendMap.size(), numRecv;

    // Arrayify map keys
    Array<GO> myParts(numSend), myPart(1);
    int cnt = 0;
    myPart[0] = myRank;
    for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
      myParts[cnt++] = it->first;

    // Step 1: Find out how many processors send me data
    // partsIndexBase starts from zero, as the processors ids start from zero
    GO partsIndexBase = 0;
    RCP<Map>    partsIHave  = MapFactory   ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
    RCP<Map>    partsIOwn   = MapFactory   ::Build(lib,                                                 numProcs,  myPart(), partsIndexBase, comm);
    RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);

    RCP<GOVector> partsISend    = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
    RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
    if (numSend) {
      ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
      for (int i = 0; i < numSend; i++)
        partsISendData[i] = 1;
    }
    (numPartsIRecv->getDataNonConst(0))[0] = 0;

    numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
    numRecv = (numPartsIRecv->getData(0))[0];

    // Step 2: Get my GIDs from everybody else
    MPI_Datatype MpiType = MpiTypeTraits<GO>::getType();
    int msgTag = 12345;  // TODO: use Comm::dup for all internal messaging

    // Post sends
    Array<MPI_Request> sendReqs(numSend);
    cnt = 0;
    for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
      MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);

    map_type recvMap;
    size_t totalGIDs = myGIDs.size();
    for (int i = 0; i < numRecv; i++) {
      MPI_Status status;
      MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);

      // Get rank and number of elements from status
      int fromRank = status.MPI_SOURCE, count;
      MPI_Get_count(&status, MpiType, &count);

      recvMap[fromRank].resize(count);
      MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);

      totalGIDs += count;
    }

    // Do waits on send requests
    if (numSend) {
      Array<MPI_Status> sendStatuses(numSend);
      MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
    }

    // Merge GIDs
    myGIDs.reserve(totalGIDs);
    for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
      int offset = myGIDs.size(), len = it->second.size();
      if (len) {
        myGIDs.resize(offset + len);
        memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
      }
    }
    // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks
    // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
    std::sort(myGIDs.begin(), myGIDs.end());
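    // A hedged, untested sketch of that merge-based idea: record the size before appending each
    // received chunk inside the loop above and merge it in, instead of the full std::sort, e.g.
    //   int oldSize = myGIDs.size();
    //   /* append it->second as above */
    //   std::inplace_merge(myGIDs.begin(), myGIDs.begin() + oldSize, myGIDs.end());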

    // Step 3: Construct importer
    RCP<Map>          newRowMap      = MapFactory   ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
    RCP<const Import> rowMapImporter;
    {
      SubFactoryMonitor m1(*this, "Import construction", currentLevel);
      rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
    }

    Set(currentLevel, "Importer", rowMapImporter);

    // ======================================================================================================
    // Print some data
    // ======================================================================================================
    if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
      // Print the grid of processors
      GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;

      char amActive = (myGIDs.size() ? 1 : 0);
      std::vector<char> areActive(numProcs, 0);
      MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);

      int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
      for (int proc = 0; proc < numProcs; proc += rowWidth) {
        for (int j = 0; j < rowWidth; j++)
          if (proc + j < numProcs)
            GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
          else
            GetOStream(Statistics2) << " ";

        GetOStream(Statistics2) << "      " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
      }
    }

  } // Build
Example n. 22
int main(int argc, char *argv[]) {
#include <MueLu_UseShortNames.hpp>

  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::ArrayRCP;
  using Teuchos::TimeMonitor;
  using Teuchos::ParameterList;

  // =========================================================================
  // MPI initialization using Teuchos
  // =========================================================================
  Teuchos::GlobalMPISession mpiSession(&argc, &argv, NULL);
  RCP< const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();

  // =========================================================================
  // Convenient definitions
  // =========================================================================
  typedef Teuchos::ScalarTraits<SC> STS;
  SC zero = STS::zero(), one = STS::one();

  // =========================================================================
  // Parameters initialization
  // =========================================================================
  Teuchos::CommandLineProcessor clp(false);

  GO nx = 100, ny = 100, nz = 100;
  Galeri::Xpetra::Parameters<GO> galeriParameters(clp, nx, ny, nz, "Laplace2D"); // manage parameters of the test case
  Xpetra::Parameters             xpetraParameters(clp);                          // manage parameters of Xpetra

  std::string xmlFileName       = "scalingTest.xml"; clp.setOption("xml",                   &xmlFileName,      "read parameters from a file [default = 'scalingTest.xml']");
  bool        printTimings      = true;              clp.setOption("timings", "notimings",  &printTimings,     "print timings to screen");
  int         writeMatricesOPT  = -2;                clp.setOption("write",                 &writeMatricesOPT, "write matrices to file (-1 means all; i>=0 means level i)");
  std::string dsolveType        = "cg", solveType;   clp.setOption("solver",                &dsolveType,       "solve type: (none | cg | gmres | standalone)");
  double      dtol              = 1e-12, tol;        clp.setOption("tol",                   &dtol,             "solver convergence tolerance");

  std::string mapFile;                               clp.setOption("map",                   &mapFile,          "map data file");
  std::string matrixFile;                            clp.setOption("matrix",                &matrixFile,       "matrix data file");
  std::string coordFile;                             clp.setOption("coords",                &coordFile,        "coordinates data file");
  int         numRebuilds       = 0;                 clp.setOption("rebuild",               &numRebuilds,      "#times to rebuild hierarchy");
  int         maxIts            = 200;               clp.setOption("its",                   &maxIts,           "maximum number of solver iterations");
  bool        scaleResidualHistory = true;              clp.setOption("scale", "noscale",  &scaleResidualHistory, "scaled Krylov residual history");

  switch (clp.parse(argc, argv)) {
    case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS;
    case Teuchos::CommandLineProcessor::PARSE_ERROR:
    case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE;
    case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:          break;
  }

  Xpetra::UnderlyingLib lib = xpetraParameters.GetLib();

  ParameterList paramList;
  Teuchos::updateParametersFromXmlFileAndBroadcast(xmlFileName, Teuchos::Ptr<ParameterList>(&paramList), *comm);
  bool isDriver = paramList.isSublist("Run1");
  if (isDriver) {
    // update galeriParameters with the values from the XML file
    ParameterList& realParams = galeriParameters.GetParameterList();

    for (ParameterList::ConstIterator it = realParams.begin(); it != realParams.end(); it++) {
      const std::string& name = realParams.name(it);
      if (paramList.isParameter(name))
        realParams.setEntry(name, paramList.getEntry(name));
    }
  }

  // Retrieve matrix parameters (they may have been changed on the command line)
  // [for instance, if we changed matrix type from 2D to 3D we need to update nz]
  ParameterList galeriList = galeriParameters.GetParameterList();

  // =========================================================================
  // Problem construction
  // =========================================================================
  std::ostringstream galeriStream;
  comm->barrier();
  RCP<TimeMonitor> globalTimeMonitor = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: S - Global Time")));
  RCP<TimeMonitor> tm                = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 1 - Matrix Build")));

  RCP<Matrix>      A;
  RCP<const Map>   map;
  RCP<MultiVector> coordinates;
  RCP<MultiVector> nullspace;
  if (matrixFile.empty()) {
    galeriStream << "========================================================\n" << xpetraParameters << galeriParameters;

    // Galeri will attempt to create a square-as-possible distribution of subdomains di, e.g.,
    //                                 d1  d2  d3
    //                                 d4  d5  d6
    //                                 d7  d8  d9
    //                                 d10 d11 d12
    // A perfect distribution is only possible when the #processors is a perfect square.
    // This *will* result in "strip" distribution if the #processors is a prime number or if the factors are very different in
    // size. For example, np=14 will give a 7-by-2 distribution.
    // If you don't want Galeri to do this, specify mx or my on the galeriList.
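    // For instance (illustrative values only), forcing the 7-by-2 layout mentioned above for
    // np=14 could be done with:
    //   galeriList.set("mx", 7);
    //   galeriList.set("my", 2);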
    std::string matrixType = galeriParameters.GetMatrixType();

    // Create map and coordinates
    // In the future, we hope to be able to first create a Galeri problem, and then request map and coordinates from it
    // At the moment, however, things are fragile as we hope that the Problem uses same map and coordinates inside
    if (matrixType == "Laplace1D") {
      map = Galeri::Xpetra::CreateMap<LO, GO, Node>(xpetraParameters.GetLib(), "Cartesian1D", comm, galeriList);
      coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("1D", map, galeriList);

    } else if (matrixType == "Laplace2D" || matrixType == "Star2D" ||
               matrixType == "BigStar2D" || matrixType == "Elasticity2D") {
      map = Galeri::Xpetra::CreateMap<LO, GO, Node>(xpetraParameters.GetLib(), "Cartesian2D", comm, galeriList);
      coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("2D", map, galeriList);

    } else if (matrixType == "Laplace3D" || matrixType == "Brick3D" || matrixType == "Elasticity3D") {
      map = Galeri::Xpetra::CreateMap<LO, GO, Node>(xpetraParameters.GetLib(), "Cartesian3D", comm, galeriList);
      coordinates = Galeri::Xpetra::Utils::CreateCartesianCoordinates<SC,LO,GO,Map,MultiVector>("3D", map, galeriList);
    }

    // Expand map to do multiple DOF per node for block problems
    if (matrixType == "Elasticity2D")
      map = Xpetra::MapFactory<LO,GO,Node>::Build(map, 2);
    if (matrixType == "Elasticity3D")
      map = Xpetra::MapFactory<LO,GO,Node>::Build(map, 3);

    galeriStream << "Processor subdomains in x direction: " << galeriList.get<int>("mx") << std::endl
                 << "Processor subdomains in y direction: " << galeriList.get<int>("my") << std::endl
                 << "Processor subdomains in z direction: " << galeriList.get<int>("mz") << std::endl
                 << "========================================================" << std::endl;

    if (matrixType == "Elasticity2D" || matrixType == "Elasticity3D") {
      // Our default test case for elasticity: all boundaries of a square/cube have Neumann b.c. except left which has Dirichlet
      galeriList.set("right boundary" , "Neumann");
      galeriList.set("bottom boundary", "Neumann");
      galeriList.set("top boundary"   , "Neumann");
      galeriList.set("front boundary" , "Neumann");
      galeriList.set("back boundary"  , "Neumann");
    }

    RCP<Galeri::Xpetra::Problem<Map,CrsMatrixWrap,MultiVector> > Pr =
        Galeri::Xpetra::BuildProblem<SC,LO,GO,Map,CrsMatrixWrap,MultiVector>(galeriParameters.GetMatrixType(), map, galeriList);
    A = Pr->BuildMatrix();

    if (matrixType == "Elasticity2D" ||
        matrixType == "Elasticity3D") {
      nullspace = Pr->BuildNullspace();
      A->SetFixedBlockSize((galeriParameters.GetMatrixType() == "Elasticity2D") ? 2 : 3);
    }

  } else {
    if (!mapFile.empty())
      map = Utils2::ReadMap(mapFile, xpetraParameters.GetLib(), comm);
    comm->barrier();

    if (lib == Xpetra::UseEpetra) {
      A = Utils::Read(matrixFile, map);

    } else {
      // Tpetra matrix reader is still broken, so instead we read in
      // a matrix in a binary format and then redistribute it
      const bool binaryFormat = true;
      A = Utils::Read(matrixFile, lib, comm, binaryFormat);

      RCP<Matrix> newMatrix = MatrixFactory::Build(map, 1);
      RCP<Import> importer  = ImportFactory::Build(A->getRowMap(), map);
      newMatrix->doImport(*A, *importer, Xpetra::INSERT);
      newMatrix->fillComplete();

      A.swap(newMatrix);
    }

    comm->barrier();

    if (!coordFile.empty())
      coordinates = Utils2::ReadMultiVector(coordFile, map);
  }

  comm->barrier();
  tm = Teuchos::null;

  galeriStream << "Galeri complete.\n========================================================" << std::endl;

  int numReruns = 1;
  if (paramList.isParameter("number of reruns"))
    numReruns = paramList.get<int>("number of reruns");

  const bool mustAlreadyExist = true;
  for (int rerunCount = 1; rerunCount <= numReruns; rerunCount++) {
    ParameterList mueluList, runList;

    bool stop = false;
    if (isDriver) {
      runList   = paramList.sublist("Run1",  mustAlreadyExist);
      mueluList = runList  .sublist("MueLu", mustAlreadyExist);
    } else {
      mueluList = paramList;
      stop = true;
    }

    if (nullspace.is_null()) {
      int blkSize = 1;
      if (mueluList.isSublist("Matrix")) {
        // Factory style parameter list
        const Teuchos::ParameterList& operatorList = paramList.sublist("Matrix");
        if (operatorList.isParameter("PDE equations"))
          blkSize = operatorList.get<int>("PDE equations");

      } else if (paramList.isParameter("number of equations")) {
        // Easy style parameter list
        blkSize = paramList.get<int>("number of equations");
      }

      nullspace = MultiVectorFactory::Build(map, blkSize);
      for (int i = 0; i < blkSize; i++) {
        RCP<const Map> domainMap = A->getDomainMap();
        GO             indexBase = domainMap->getIndexBase();

        ArrayRCP<SC> nsData = nullspace->getDataNonConst(i);
        for (int j = 0; j < nsData.size(); j++) {
          GO GID = domainMap->getGlobalElement(j) - indexBase;

          if ((GID-i) % blkSize == 0)
            nsData[j] = Teuchos::ScalarTraits<SC>::one();
        }
      }
    }

    int runCount = 1;
    do {
      A->SetMaxEigenvalueEstimate(-one);

      solveType = dsolveType;
      tol       = dtol;

      int   savedOut  = -1;
      FILE* openedOut = NULL;
      if (isDriver) {
        if (runList.isParameter("filename")) {
          // Redirect all output into a file. We have to redirect all output,
          // including printf's, therefore we cannot simply replace C++ cout
          // buffers, and have to use heavier machinery (dup2)
          std::string filename = runList.get<std::string>("filename");
          if (numReruns > 1)
            filename += "_run" + MueLu::toString(rerunCount);
          filename += (lib == Xpetra::UseEpetra ? ".epetra" : ".tpetra");

          savedOut  = dup(STDOUT_FILENO);
          openedOut = fopen(filename.c_str(), "w");
          dup2(fileno(openedOut), STDOUT_FILENO);
        }
        if (runList.isParameter("solver")) solveType = runList.get<std::string>("solver");
        if (runList.isParameter("tol"))    tol       = runList.get<double>     ("tol");
      }

      // Instead of checking each time for rank, create a rank 0 stream
      RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
      Teuchos::FancyOStream& out = *fancy;
      out.setOutputToRootOnly(0);

      out << galeriStream.str();

      // =========================================================================
      // Preconditioner construction
      // =========================================================================
      comm->barrier();
      tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 1.5 - MueLu read XML")));

      RCP<HierarchyManager> mueLuFactory = rcp(new ParameterListInterpreter(mueluList));

      comm->barrier();
      tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 2 - MueLu Setup")));

      RCP<Hierarchy> H;
      for (int i = 0; i <= numRebuilds; i++) {
        A->SetMaxEigenvalueEstimate(-one);

        H = mueLuFactory->CreateHierarchy();
        H->GetLevel(0)->Set("A",           A);
        H->GetLevel(0)->Set("Nullspace",   nullspace);
        if (!coordinates.is_null())
          H->GetLevel(0)->Set("Coordinates", coordinates);
        mueLuFactory->SetupHierarchy(*H);
      }

      comm->barrier();
      tm = Teuchos::null;

      // =========================================================================
      // System solution (Ax = b)
      // =========================================================================
      comm->barrier();
      tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 3 - LHS and RHS initialization")));

      RCP<Vector> X = VectorFactory::Build(map);
      RCP<Vector> B = VectorFactory::Build(map);

      {
        // we set seed for reproducibility
        Utils::SetRandomSeed(*comm);
        X->randomize();
        A->apply(*X, *B, Teuchos::NO_TRANS, one, zero);

        Teuchos::Array<STS::magnitudeType> norms(1);
        B->norm2(norms);
        B->scale(one/norms[0]);
        X->putScalar(zero);
      }
      tm = Teuchos::null;

      if (writeMatricesOPT > -2) {
        tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 3.5 - Matrix output")));
        H->Write(writeMatricesOPT, writeMatricesOPT);
        tm = Teuchos::null;
      }

      comm->barrier();
      if (solveType == "none") {
        // Do not perform a solve

      } else if (solveType == "standalone") {
        tm = rcp (new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 4 - Fixed Point Solve")));

        H->IsPreconditioner(false);
        H->Iterate(*B, *X, maxIts);

      } else if (solveType == "cg" || solveType == "gmres") {
#ifdef HAVE_MUELU_BELOS
        tm = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("ScalingTest: 5 - Belos Solve")));

        // Operator and Multivector type that will be used with Belos
        typedef MultiVector          MV;
        typedef Belos::OperatorT<MV> OP;

        H->IsPreconditioner(true);

        // Define Operator and Preconditioner
        Teuchos::RCP<OP> belosOp   = Teuchos::rcp(new Belos::XpetraOp<SC, LO, GO, NO, LMO>(A)); // Turns a Xpetra::Matrix object into a Belos operator
        Teuchos::RCP<OP> belosPrec = Teuchos::rcp(new Belos::MueLuOp <SC, LO, GO, NO, LMO>(H)); // Turns a MueLu::Hierarchy object into a Belos operator

        // Construct a Belos LinearProblem object
        RCP< Belos::LinearProblem<SC, MV, OP> > belosProblem = rcp(new Belos::LinearProblem<SC, MV, OP>(belosOp, X, B));
        belosProblem->setRightPrec(belosPrec);

        bool set = belosProblem->setProblem();
        if (set == false) {
          out << "\nERROR:  Belos::LinearProblem failed to set up correctly!" << std::endl;
          return EXIT_FAILURE;
        }

        // Belos parameter list
        Teuchos::ParameterList belosList;
        belosList.set("Maximum Iterations",    maxIts); // Maximum number of iterations allowed
        belosList.set("Convergence Tolerance", tol);    // Relative convergence tolerance requested
        belosList.set("Verbosity",             Belos::Errors + Belos::Warnings + Belos::StatusTestDetails);
        belosList.set("Output Frequency",      1);
        belosList.set("Output Style",          Belos::Brief);
        if (!scaleResidualHistory) 
          belosList.set("Implicit Residual Scaling", "None");

        // Create an iterative solver manager
        RCP< Belos::SolverManager<SC, MV, OP> > solver;
        if (solveType == "cg") {
          solver = rcp(new Belos::PseudoBlockCGSolMgr   <SC, MV, OP>(belosProblem, rcp(&belosList, false)));
        } else if (solveType == "gmres") {
          solver = rcp(new Belos::BlockGmresSolMgr<SC, MV, OP>(belosProblem, rcp(&belosList, false)));
        }

        // Perform solve
        Belos::ReturnType ret = Belos::Unconverged;
        try {
          ret = solver->solve();

          // Get the number of iterations for this solve.
          out << "Number of iterations performed for this solve: " << solver->getNumIters() << std::endl;

        } catch(...) {
          out << std::endl << "ERROR:  Belos threw an error! " << std::endl;
        }

        // Check convergence
        if (ret != Belos::Converged)
          out << std::endl << "ERROR:  Belos did not converge! " << std::endl;
        else
          out << std::endl << "SUCCESS:  Belos converged!" << std::endl;
#endif //ifdef HAVE_MUELU_BELOS
      } else {
        throw MueLu::Exceptions::RuntimeError("Unknown solver type: \"" + solveType + "\"");
      }
      comm->barrier();
      tm = Teuchos::null;
      globalTimeMonitor = Teuchos::null;

      if (printTimings)
        TimeMonitor::summarize(A->getRowMap()->getComm().ptr(), std::cout, false, true, false, Teuchos::Union);

      TimeMonitor::clearCounters();

      if (isDriver) {
        if (openedOut != NULL) {
          dup2(savedOut, STDOUT_FILENO);
          fclose(openedOut);
          openedOut = NULL;
        }
        try {
          runList   = paramList.sublist("Run" + MueLu::toString(++runCount), mustAlreadyExist);
          mueluList = runList  .sublist("MueLu", mustAlreadyExist);
        } catch (std::exception&) {
          stop = true;
        }
      }

    } while (stop == false);
  }


  return 0;
} //main
  template <class LocalOrdinal, class GlobalOrdinal, class Node>
  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::RandomReorder(ArrayRCP<LO> list) const {
    // TODO: replace int
    int n = list.size();
    for (int i = 0; i < n-1; i++)
      std::swap(list[i], list[RandomOrdinal(i, n-1)]);
  }
Example n. 24
int main(int argc, char *argv[])
{
  Teuchos::GlobalMPISession session(&argc, &argv);
  RCP<const Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();
  int nprocs = comm->getSize();
  int rank = comm->getRank();
  int fail=0, gfail=0;
  double epsilon = 10e-6;

  ////////////////
  // Arrays to hold part Ids and part Sizes for each weight

  int numIdsPerProc = 10;
  int maxNumWeights = 3;
  int maxNumPartSizes = nprocs;
  int *lengths = new int [maxNumWeights];
  part_t **idLists = new part_t * [maxNumWeights];
  scalar_t **sizeLists = new scalar_t * [maxNumWeights];

  for (int w=0; w < maxNumWeights; w++){
    idLists[w] = new part_t [maxNumPartSizes];
    sizeLists[w] = new scalar_t [maxNumPartSizes];
  }

  /////////////
  // A default environment
  RCP<const Zoltan2::Environment> env = rcp(new Zoltan2::Environment);

  /////////////
  // A simple identifier map.

  gno_t *myGids = new gno_t [numIdsPerProc];
  for (int i=0, x=rank*numIdsPerProc; i < numIdsPerProc; i++){
    myGids[i] = x++;
  }

  ArrayRCP<const gno_t> gidArray(myGids, 0, numIdsPerProc, true);

  RCP<const Zoltan2::IdentifierMap<user_t> > idMap = 
    rcp(new Zoltan2::IdentifierMap<user_t>(env, comm, gidArray)); 

  /////////////
  // TEST:
  // One weight, one part per proc.
  // Some part sizes are 2 and some are 1.

  int numGlobalParts = nprocs;
  int nWeights = 1;

  ArrayRCP<ArrayRCP<part_t> > ids;
  ArrayRCP<ArrayRCP<scalar_t> > sizes;

  memset(lengths, 0, sizeof(int) * maxNumWeights);

  lengths[0] = 1;                    // We give a size for 1 part.
  idLists[0][0] = rank;              // The part is my part.
  sizeLists[0][0] = rank%2 + 1.0;    // The size is 1.0 or 2.0

  makeArrays(1, lengths, idLists, sizeLists, ids, sizes);

  // Normalized part size for every part, for checking later on

  scalar_t *normalizedPartSizes = new scalar_t [numGlobalParts];
  scalar_t sumSizes=0;
  for (int i=0; i < numGlobalParts; i++){
    normalizedPartSizes[i] = 1.0;
    if (i % 2) normalizedPartSizes[i] = 2.0;
    sumSizes += normalizedPartSizes[i];
  }
  for (int i=0; i < numGlobalParts; i++)
    normalizedPartSizes[i] /= sumSizes;

  /////////////
  // Create a solution object with part size information, and check it.

  RCP<Zoltan2::PartitioningSolution<idInput_t> > solution;

  try{
    solution = rcp(new Zoltan2::PartitioningSolution<idInput_t>(
      env,                // application environment info
      comm,               // problem communicator
      idMap,              // problem identifiers (global Ids, local Ids)
      nWeights,                  // number of weights
      ids.view(0,nWeights),      // part ids
      sizes.view(0,nWeights))); // part sizes
  }
  catch (std::exception &e){
    fail=1;
  }

  TEST_FAIL_AND_EXIT(*comm, fail==0, "constructor call 1", 1);

  // Test the Solution queries that are used by algorithms

  if (solution->getTargetGlobalNumberOfParts() != size_t(numGlobalParts))
    fail=2;

  if (!fail && solution->getLocalNumberOfParts() != 1)
    fail=3;

  if (!fail && !solution->oneToOnePartDistribution())
    fail=4;

  if (!fail && solution->getPartDistribution() != NULL)
    fail=5;

  if (!fail && solution->getProcDistribution() != NULL)
    fail=6;
      
  if (!fail && 
        ((nprocs>1 && solution->criteriaHasUniformPartSizes(0)) ||
         (nprocs==1 && !solution->criteriaHasUniformPartSizes(0))) )
    fail=8;

  if (!fail){
    for (int partId=0; !fail && partId < numGlobalParts; partId++){
      scalar_t psize = solution->getCriteriaPartSize(0, partId);

      if ( psize < normalizedPartSizes[partId] - epsilon ||
           psize > normalizedPartSizes[partId] + epsilon )
        fail=9;
    }
  }

  delete [] normalizedPartSizes;

  gfail = globalFail(comm, fail);
  if (gfail){
    printFailureCode(comm, fail);   // exits after printing "FAIL"
  }

  // Test the Solution set method that is called by algorithms

  part_t *partAssignments = new part_t [numIdsPerProc];
  for (int i=0; i < numIdsPerProc; i++){
    partAssignments[i] = myGids[i] % numGlobalParts;  // round robin
  }
  ArrayRCP<part_t> partList = arcp(partAssignments, 0, numIdsPerProc);

  try{
    solution->setParts(gidArray, partList, true);
  }
  catch (std::exception &e){
    fail=10;
  }

  gfail = globalFail(comm, fail);
  if (gfail){
    printFailureCode(comm, fail);   // exits after printing "FAIL"
  }

  // Test the Solution get methods that may be called by users 
  // or migration functions.

  if (solution->getLocalNumberOfIds() != size_t(numIdsPerProc))
    fail = 11;

  if (!fail){
    const gno_t *gids = solution->getIdList();
    for (int i=0; !fail && i < numIdsPerProc; i++){
      if (gids[i] != myGids[i])
        fail = 12;
    }
  }

  if (!fail){
    const part_t *parts = solution->getPartList();
    for (int i=0; !fail && i < numIdsPerProc; i++){
      if (parts[i] != myGids[i] % numGlobalParts)
        fail = 13;
    }
  }

  gfail = globalFail(comm, fail);
  if (gfail){
    printFailureCode(comm, fail);   // exits after printing "FAIL"
  }

  if (rank==0)
    std::cout << "PASS" << std::endl;
  
  ///////////////////////////////////////////////////////////////////
  //  TODO:  
  /////////////
  // Create a solution object without part size information, and check it.
  /////////////
  // Test multiple weights.
  /////////////
  // Test multiple parts per process.
  /////////////
  // Specify a list of parts of size 0.  (The rest should be uniform.)

  delete [] lengths;
  for (int w=0; w < maxNumWeights; w++){
    delete [] idLists[w];
    delete [] sizeLists[w];
  }
  delete [] idLists;
  delete [] sizeLists;
}
Example n. 25
  TEUCHOS_UNIT_TEST(Aggregates, UncoupledPhase3)
  {
    out << "version: " << MueLu::Version() << std::endl;

    RCP<Matrix> A = TestHelpers::TestFactory<SC, LO, GO, NO>::Build1DPoisson(36);
    RCP<const Map> rowmap = A->getRowMap();
    RCP<AmalgamationInfo> amalgInfo;
    RCP<Aggregates> aggregates = gimmeUncoupledAggregates(A, amalgInfo,false,false,false,true);
    GO numAggs = aggregates->GetNumAggregates();
    RCP<const Teuchos::Comm<int> > comm = TestHelpers::Parameters::getDefaultComm();

    TEST_EQUALITY(aggregates->AggregatesCrossProcessors(),false);

    ArrayRCP<LO> aggSizes = Teuchos::ArrayRCP<LO>(numAggs);
    ArrayRCP<LO> aggStart;
    ArrayRCP<GO> aggToRowMap;
    amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap);
    for (LO i = 0; i < numAggs; ++i)
      aggSizes[i] = aggStart[i+1] - aggStart[i];

    bool foundAggNotSize2=false;
    for (int i=0; i<aggSizes.size(); ++i)
      if (aggSizes[i] != 2) {
        foundAggNotSize2=true;
        break;
      }

    switch (comm->getSize()) {

      case 1 :
        TEST_EQUALITY(numAggs, 18);
        TEST_EQUALITY(foundAggNotSize2, false);
        break;

      case 2:
        TEST_EQUALITY(numAggs, 9);
        TEST_EQUALITY(foundAggNotSize2, false);
        break;

      case 3:
        TEST_EQUALITY(numAggs, 6);
        TEST_EQUALITY(foundAggNotSize2, false);
        break;

      case 4:
        TEST_EQUALITY(numAggs, 4);
        TEST_EQUALITY(foundAggNotSize2, true);
        break;

      default:
        std::string msg = "Only 1-4 MPI processes are supported.";
        //throw(MueLu::Exceptions::NotImplemented(msg));
        out << msg << std::endl;
        break;
    }

    //ArrayRCP< ArrayRCP<GO> > aggToRowMap(numAggs);
    int root = out.getOutputToRootOnly();
    out.setOutputToRootOnly(-1);
    for (int j=0; j<comm->getSize(); ++j) {
      if (comm->getRank() == j) {
        out << "++ pid " << j << " ++" << std::endl;
        out << "   num local DOFs = " << rowmap->getNodeNumElements() << std::endl;
        for (int i=0; i< numAggs; ++i) {
          out << "   aggregate " << i << ": ";
          for (int k=aggStart[i]; k< aggStart[i+1]; ++k)
            out << aggToRowMap[k] << " ";
          out << std::endl;
        }
      }
      comm->barrier();
    }
    out.setOutputToRootOnly(root);

  } //UncoupledPhase3
Example #26
/*! \brief Create a mesh of approximately the desired size.
 *
 *  We want 3 dimensions close to equal in length.
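 *
 *  For example, with numGlobalCoords = 8000, exp(log(8000)/3) is roughly 20,
 *  so the dimensions below work out to a 20 x 20 x 20 mesh.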
 */
const RCP<tMVector_t> getMeshCoordinates(
    const RCP<const Teuchos::Comm<int> > & comm,
    zgno_t numGlobalCoords)
{
  int rank = comm->getRank();
  int nprocs = comm->getSize();

  double k = log(numGlobalCoords) / 3;
  double xdimf = exp(k) + 0.5;
  ssize_t xdim = static_cast<ssize_t>(floor(xdimf));
  ssize_t ydim = xdim;
  ssize_t zdim = numGlobalCoords / (xdim*ydim);
  ssize_t num=xdim*ydim*zdim;
  ssize_t diff = numGlobalCoords - num;
  ssize_t newdiff = 0;

  while (diff > 0){
    if (zdim > xdim && zdim > ydim){
      zdim++;
      newdiff = diff - (xdim*ydim);
      if (newdiff < 0)
        if (diff < -newdiff)
          zdim--;
    }
    else if (ydim > xdim && ydim > zdim){
      ydim++;
      newdiff = diff - (xdim*zdim);
      if (newdiff < 0)
        if (diff < -newdiff)
          ydim--;
    }
    else{
      xdim++;
      newdiff = diff - (ydim*zdim);
      if (newdiff < 0)
        if (diff < -newdiff)
          xdim--;
    }

    diff = newdiff;
  }

  num=xdim*ydim*zdim;
  diff = numGlobalCoords - num;
  if (diff < 0)
    diff = -diff;

  // Relative difference between the requested and the actual mesh size;
  // computed in floating point so small discrepancies are not truncated away.
  double relDiff = double(diff) / double(numGlobalCoords);

  if (rank == 0){
    if (relDiff > .01)
      cout << "Warning: Difference " << relDiff*100 << " percent" << endl;
    cout << "Mesh size: " << xdim << "x" << ydim << "x" <<
      zdim << ", " << num << " vertices." << endl;
  }

  // Divide coordinates.

  ssize_t numLocalCoords = num / nprocs;
  ssize_t leftOver = num % nprocs;
  ssize_t gid0 = 0;

  if (rank <= leftOver)
    gid0 = zgno_t(rank) * (numLocalCoords+1);
  else
    gid0 = (leftOver * (numLocalCoords+1)) + 
           ((zgno_t(rank) - leftOver) * numLocalCoords);

  if (rank < leftOver)
    numLocalCoords++;

  ssize_t gid1 = gid0 + numLocalCoords;

  zgno_t *ids = new zgno_t [numLocalCoords];
  if (!ids)
    throw bad_alloc();
  ArrayRCP<zgno_t> idArray(ids, 0, numLocalCoords, true);
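  // idArray owns and retains the original pointer, so advancing the ids
  // pointer in the fill loop below is safe.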

  for (ssize_t i=gid0; i < gid1; i++)
    *ids++ = zgno_t(i);   

  RCP<const tMap_t> idMap = rcp(
    new tMap_t(num, idArray.view(0, numLocalCoords), 0, comm));

  // Create a Tpetra::MultiVector of coordinates.

  zscalar_t *x = new zscalar_t [numLocalCoords*3]; 
  if (!x)
    throw bad_alloc();
  ArrayRCP<zscalar_t> coordArray(x, 0, numLocalCoords*3, true);

  zscalar_t *y = x + numLocalCoords;
  zscalar_t *z = y + numLocalCoords;
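  // x, y and z occupy consecutive thirds of the single coordArray
  // allocation; coordArray owns the whole block of 3*numLocalCoords values.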

  zgno_t xStart = 0;
  zgno_t yStart = 0;
  zgno_t xyPlane = xdim*ydim;
  zgno_t zStart = gid0 / xyPlane;
  zgno_t rem = gid0 % xyPlane;
  if (rem > 0){
    yStart = rem / xdim;
    xStart = rem % xdim;
  }

  zlno_t next = 0;
  for (zscalar_t zval=zStart; next < numLocalCoords && zval < zdim; zval++){
    for (zscalar_t yval=yStart; next < numLocalCoords && yval < ydim; yval++){
      for (zscalar_t xval=xStart; next < numLocalCoords && xval < xdim; xval++){
        x[next] = xval;
        y[next] = yval;
        z[next] = zval;
        next++;
      }
      xStart = 0;
    }
    yStart = 0;
  }

  ArrayView<const zscalar_t> xArray(x, numLocalCoords);
  ArrayView<const zscalar_t> yArray(y, numLocalCoords);
  ArrayView<const zscalar_t> zArray(z, numLocalCoords);
  ArrayRCP<ArrayView<const zscalar_t> > coordinates =
    arcp(new ArrayView<const zscalar_t> [3], 0, 3);
  coordinates[0] = xArray;
  coordinates[1] = yArray;
  coordinates[2] = zArray;

  ArrayRCP<const ArrayView<const zscalar_t> > constCoords =
   coordinates.getConst();

  RCP<tMVector_t> meshCoords = rcp(new tMVector_t(
    idMap, constCoords.view(0,3), 3));

  return meshCoords;
}
Example #27
 /*! \brief Return the metric values.
  *  \return the array of metric values.
  */
 ArrayRCP<const MetricValues<scalar_t> > getMetrics() const{
   if (metricsConst_.is_null()) return metrics_;
   return metricsConst_;
 }
Example #28
ArrayView<const T>::ArrayView( const ArrayRCP<const T> &arcp )
  : ptr_(arcp.getRawPtr()), size_(arcp.size()), arcp_(arcp)
{}
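
This converting constructor lets reference-counted data be passed to code
that only needs a non-owning view. A minimal usage sketch (the names are
illustrative, not from the source):

  Teuchos::ArrayRCP<double> data = Teuchos::arcp<double>(10);   // owning, reference-counted
  Teuchos::ArrayView<const double> view(data.getConst());       // non-owning view via the constructor above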
Example #29
 /*! \brief Print all the metrics
  */
 void printMetrics(std::ostream &os) const {
   Zoltan2::printMetrics<scalar_t, part_t>(os, 
     targetGlobalParts_, numGlobalParts_, numNonEmpty_, 
     metrics_.view(0, metrics_.size()));
 }
Example #30
 Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Vector(
                             const RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> > &map,
                             const ArrayRCP<Scalar> &view, EPrivateComputeViewConstructor /* dummy */)
 : MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>(map,view,view.size(),1,COMPUTE_VIEW_CONSTRUCTOR) {
 }