Example #1
0
void ParmetisPartitioner::initialize (const MeshBase & mesh,
                                      const unsigned int n_sbdmns)
{
  const dof_id_type n_active_local_elem = mesh.n_active_local_elem();

  // Set parameters.
  _pmetis->wgtflag = 2;                                      // weights on vertices only
  _pmetis->ncon    = 1;                                      // one weight per vertex
  _pmetis->numflag = 0;                                      // C-style 0-based numbering
  _pmetis->nparts  = static_cast<Parmetis::idx_t>(n_sbdmns); // number of subdomains to create
  _pmetis->edgecut = 0;                                      // the numbers of edges cut by the
                                                             // partition

  // Initialize data structures for ParMETIS
  _pmetis->vtxdist.resize (mesh.n_processors()+1); std::fill (_pmetis->vtxdist.begin(), _pmetis->vtxdist.end(), 0);
  _pmetis->tpwgts.resize  (_pmetis->nparts);       std::fill (_pmetis->tpwgts.begin(),  _pmetis->tpwgts.end(),  1./_pmetis->nparts);
  _pmetis->ubvec.resize   (_pmetis->ncon);         std::fill (_pmetis->ubvec.begin(),   _pmetis->ubvec.end(),   1.05);
  _pmetis->part.resize    (n_active_local_elem);   std::fill (_pmetis->part.begin(),    _pmetis->part.end(), 0);
  _pmetis->options.resize (5);
  _pmetis->vwgt.resize    (n_active_local_elem);

  // Set the options
  _pmetis->options[0] = 1;  // don't use default options
  _pmetis->options[1] = 0;  // default (level of timing)
  _pmetis->options[2] = 15; // random seed (default)
  _pmetis->options[3] = 2;  // processor distribution and subdomain distribution are decoupled

  // Find the number of active elements on each processor.  We cannot use
  // mesh.n_active_elem_on_proc(pid) since that only returns the number of
  // elements assigned to pid which are currently stored on the calling
  // processor. This will not in general be correct for parallel meshes
  // when (pid!=mesh.processor_id()).
  _n_active_elem_on_proc.resize(mesh.n_processors());
  mesh.comm().allgather(n_active_local_elem, _n_active_elem_on_proc);

  // count the total number of active elements in the mesh.  Note we cannot
  // use mesh.n_active_elem() in general since this only returns the number
  // of active elements which are stored on the calling processor.
  // We should not use n_active_elem for any allocation because that will
  // be inheritly unscalable, but it can be useful for libmesh_assertions.
  dof_id_type n_active_elem=0;

  // Set up the vtxdist array.  This will be the same on each processor.
  // ***** Consult the Parmetis documentation. *****
  libmesh_assert_equal_to (_pmetis->vtxdist.size(),
                           cast_int<std::size_t>(mesh.n_processors()+1));
  libmesh_assert_equal_to (_pmetis->vtxdist[0], 0);

  for (processor_id_type pid=0; pid<mesh.n_processors(); pid++)
    {
      _pmetis->vtxdist[pid+1] = _pmetis->vtxdist[pid] + _n_active_elem_on_proc[pid];
      n_active_elem += _n_active_elem_on_proc[pid];
    }
  libmesh_assert_equal_to (_pmetis->vtxdist.back(), static_cast<Parmetis::idx_t>(n_active_elem));

  // ParMetis expects the elements to be numbered in contiguous blocks
  // by processor, i.e. [0, ne0), [ne0, ne0+ne1), ...
  // Since we only partition active elements we should have no expectation
  // that we currently have such a distribution.  So we need to create it.
  // Also, at the same time we are going to map all the active elements into a globally
  // unique range [0,n_active_elem) which is *independent* of the current partitioning.
  // This can be fed to ParMetis as the initial partitioning of the subdomains (decoupled
  // from the partitioning of the objects themselves).  This allows us to get the same
  // resultant partitioning independed of the input partitioning.
  MeshTools::BoundingBox bbox =
    MeshTools::bounding_box(mesh);

  _global_index_by_pid_map.clear();

  // Maps active element ids into a contiguous range independent of partitioning.
  // (only needs local scope)
  vectormap<dof_id_type, dof_id_type> global_index_map;

  {
    std::vector<dof_id_type> global_index;

    // create the mapping which is contiguous by processor
    dof_id_type pid_offset=0;
    for (processor_id_type pid=0; pid<mesh.n_processors(); pid++)
      {
        MeshBase::const_element_iterator       it  = mesh.active_pid_elements_begin(pid);
        const MeshBase::const_element_iterator end = mesh.active_pid_elements_end(pid);

        // note that we may not have all (or any!) the active elements which belong on this processor,
        // but by calling this on all processors a unique range in [0,_n_active_elem_on_proc[pid])
        // is constructed.  Only the indices for the elements we pass in are returned in the array.
        MeshCommunication().find_global_indices (mesh.comm(),
                                                 bbox, it, end,
                                                 global_index);

        for (dof_id_type cnt=0; it != end; ++it)
          {
            const Elem * elem = *it;
            libmesh_assert (!_global_index_by_pid_map.count(elem->id()));
            libmesh_assert_less (cnt, global_index.size());
            libmesh_assert_less (global_index[cnt], _n_active_elem_on_proc[pid]);

            _global_index_by_pid_map.insert(std::make_pair(elem->id(), global_index[cnt++] + pid_offset));
          }

        pid_offset += _n_active_elem_on_proc[pid];
      }

    // create the unique mapping for all active elements independent of partitioning
    {
      MeshBase::const_element_iterator       it  = mesh.active_elements_begin();
      const MeshBase::const_element_iterator end = mesh.active_elements_end();

      // Calling this on all processors a unique range in [0,n_active_elem) is constructed.
      // Only the indices for the elements we pass in are returned in the array.
      MeshCommunication().find_global_indices (mesh.comm(),
                                               bbox, it, end,
                                               global_index);

      for (dof_id_type cnt=0; it != end; ++it)
        {
          const Elem * elem = *it;
          libmesh_assert (!global_index_map.count(elem->id()));
          libmesh_assert_less (cnt, global_index.size());
          libmesh_assert_less (global_index[cnt], n_active_elem);

          global_index_map.insert(std::make_pair(elem->id(), global_index[cnt++]));
        }
    }
    // really, shouldn't be close!
    libmesh_assert_less_equal (global_index_map.size(), n_active_elem);
    libmesh_assert_less_equal (_global_index_by_pid_map.size(), n_active_elem);

    // At this point the two maps should be the same size.  If they are not
    // then the number of active elements is not the same as the sum over all
    // processors of the number of active elements per processor, which means
    // there must be some unpartitioned objects out there.
    if (global_index_map.size() != _global_index_by_pid_map.size())
      libmesh_error_msg("ERROR:  ParmetisPartitioner cannot handle unpartitioned objects!");
  }

  // Finally, we need to initialize the vertex (partition) weights and the initial subdomain
  // mapping.  The subdomain mapping will be independent of the processor mapping, and is
  // defined by a simple mapping of the global indices we just found.
  {
    std::vector<dof_id_type> subdomain_bounds(mesh.n_processors());

    const dof_id_type first_local_elem = _pmetis->vtxdist[mesh.processor_id()];

    for (processor_id_type pid=0; pid<mesh.n_processors(); pid++)
      {
        dof_id_type tgt_subdomain_size = 0;

        // watch out for the case that n_subdomains < n_processors
        if (pid < static_cast<unsigned int>(_pmetis->nparts))
          {
            tgt_subdomain_size = n_active_elem/std::min
              (cast_int<Parmetis::idx_t>(mesh.n_processors()), _pmetis->nparts);

            if (pid < n_active_elem%_pmetis->nparts)
              tgt_subdomain_size++;
          }
        if (pid == 0)
          subdomain_bounds[0] = tgt_subdomain_size;
        else
          subdomain_bounds[pid] = subdomain_bounds[pid-1] + tgt_subdomain_size;
      }

    libmesh_assert_equal_to (subdomain_bounds.back(), n_active_elem);

    MeshBase::const_element_iterator       elem_it  = mesh.active_local_elements_begin();
    const MeshBase::const_element_iterator elem_end = mesh.active_local_elements_end();

    for (; elem_it != elem_end; ++elem_it)
      {
        const Elem * elem = *elem_it;

        libmesh_assert (_global_index_by_pid_map.count(elem->id()));
        const dof_id_type global_index_by_pid =
          _global_index_by_pid_map[elem->id()];
        libmesh_assert_less (global_index_by_pid, n_active_elem);

        const dof_id_type local_index =
          global_index_by_pid - first_local_elem;

        libmesh_assert_less (local_index, n_active_local_elem);
        libmesh_assert_less (local_index, _pmetis->vwgt.size());

        // TODO:[BSK] maybe there is a better weight?
        _pmetis->vwgt[local_index] = elem->n_nodes();

        // find the subdomain this element belongs in
        libmesh_assert (global_index_map.count(elem->id()));
        const dof_id_type global_index =
          global_index_map[elem->id()];

        libmesh_assert_less (global_index, subdomain_bounds.back());

        const unsigned int subdomain_id =
          std::distance(subdomain_bounds.begin(),
                        std::lower_bound(subdomain_bounds.begin(),
                                         subdomain_bounds.end(),
                                         global_index));
        libmesh_assert_less (subdomain_id, static_cast<unsigned int>(_pmetis->nparts));
        libmesh_assert_less (local_index, _pmetis->part.size());

        _pmetis->part[local_index] = subdomain_id;
      }
  }
}