void
MultiAppProjectionTransfer::initialSetup()
{
  getAppInfo();

  _proj_sys.resize(_to_problems.size(), NULL);

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    FEProblemBase & to_problem = *_to_problems[i_to];
    EquationSystems & to_es = to_problem.es();

    // Add the projection system.
    FEType fe_type = to_problem.getVariable(0, _to_var_name).feType();
    LinearImplicitSystem & proj_sys = to_es.add_system<LinearImplicitSystem>("proj-sys-" + name());
    _proj_var_num = proj_sys.add_variable("var", fe_type);
    proj_sys.attach_assemble_function(assemble_l2);
    _proj_sys[i_to] = &proj_sys;

    // Prevent the projection system from being written to checkpoint
    // files.  In the event of a recover or restart, we'll read the checkpoint
    // before this initialSetup method is called.  As a result, we'll find
    // systems in the checkpoint (the projection systems) that we don't know
    // what to do with, and there will be a crash.  We could fix this by making
    // the systems in the constructor, except we don't know how many sub apps
    // there are at the time of construction.  So instead, we'll just nuke the
    // projection system and rebuild it from scratch every recover/restart.
    proj_sys.hide_output() = true;

    // Reinitialize EquationSystems since we added a system.
    to_es.reinit();
  }

  if (_fixed_meshes)
  {
    _cached_qps.resize(n_processors());
    _cached_index_map.resize(n_processors());
  }
}
void
MultiAppNearestNodeTransfer::execute()
{
  _console << "Beginning NearestNodeTransfer " << name() << std::endl;

  getAppInfo();

  // Get the bounding boxes for the "from" domains.
  std::vector<BoundingBox> bboxes = getFromBoundingBoxes();

  // Figure out how many "from" domains each processor owns.
  std::vector<unsigned int> froms_per_proc = getFromsPerProc();

  ////////////////////
  // For every point in the local "to" domain, figure out which "from" domains
  // might contain it's nearest neighbor, and send that point to the processors
  // that own those "from" domains.
  //
  // How do we know which "from" domains might contain the nearest neighbor, you
  // ask?  Well, consider two "from" domains, A and B.  If every point in A is
  // closer than every point in B, then we know that B cannot possibly contain
  // the nearest neighbor.  Hence, we'll only check A for the nearest neighbor.
  // We'll use the functions bboxMaxDistance and bboxMinDistance to figure out
  // if every point in A is closer than every point in B.
  ////////////////////

  // outgoing_qps = nodes/centroids we'll send to other processors.
  std::vector<std::vector<Point>> outgoing_qps(n_processors());
  // When we get results back, node_index_map will tell us which results go with
  // which points
  std::vector<std::map<std::pair<unsigned int, unsigned int>, unsigned int>> node_index_map(
      n_processors());

  if (!_neighbors_cached)
  {
    for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
    {
      System * to_sys = find_sys(*_to_es[i_to], _to_var_name);
      unsigned int sys_num = to_sys->number();
      unsigned int var_num = to_sys->variable_number(_to_var_name);
      MeshBase * to_mesh = &_to_meshes[i_to]->getMesh();
      bool is_nodal = to_sys->variable_type(var_num).family == LAGRANGE;

      if (is_nodal)
      {
        std::vector<Node *> target_local_nodes;

        if (isParamValid("target_boundary"))
        {
          BoundaryID target_bnd_id =
              _to_meshes[i_to]->getBoundaryID(getParam<BoundaryName>("target_boundary"));

          ConstBndNodeRange & bnd_nodes = *(_to_meshes[i_to])->getBoundaryNodeRange();
          for (const auto & bnode : bnd_nodes)
            if (bnode->_bnd_id == target_bnd_id && bnode->_node->processor_id() == processor_id())
              target_local_nodes.push_back(bnode->_node);
        }
        else
        {
          target_local_nodes.resize(to_mesh->n_local_nodes());
          unsigned int i = 0;
          for (auto & node : to_mesh->local_node_ptr_range())
            target_local_nodes[i++] = node;
        }

        for (const auto & node : target_local_nodes)
        {
          // Skip this node if the variable has no dofs at it.
          if (node->n_dofs(sys_num, var_num) < 1)
            continue;

          // Find which bboxes might have the nearest node to this point.
          Real nearest_max_distance = std::numeric_limits<Real>::max();
          for (const auto & bbox : bboxes)
          {
            Real distance = bboxMaxDistance(*node, bbox);
            if (distance < nearest_max_distance)
              nearest_max_distance = distance;
          }

          unsigned int from0 = 0;
          for (processor_id_type i_proc = 0; i_proc < n_processors();
               from0 += froms_per_proc[i_proc], i_proc++)
          {
            bool qp_found = false;
            for (unsigned int i_from = from0; i_from < from0 + froms_per_proc[i_proc] && !qp_found;
                 i_from++)
            {
              Real distance = bboxMinDistance(*node, bboxes[i_from]);
              if (distance < nearest_max_distance || bboxes[i_from].contains_point(*node))
              {
                std::pair<unsigned int, unsigned int> key(i_to, node->id());
                node_index_map[i_proc][key] = outgoing_qps[i_proc].size();
                outgoing_qps[i_proc].push_back(*node + _to_positions[i_to]);
                qp_found = true;
              }
            }
          }
        }
      }
      else // Elemental
      {
        for (auto & elem : as_range(to_mesh->local_elements_begin(), to_mesh->local_elements_end()))
        {
          Point centroid = elem->centroid();

          // Skip this element if the variable has no dofs at it.
          if (elem->n_dofs(sys_num, var_num) < 1)
            continue;

          // Find which bboxes might have the nearest node to this point.
          Real nearest_max_distance = std::numeric_limits<Real>::max();
          for (const auto & bbox : bboxes)
          {
            Real distance = bboxMaxDistance(centroid, bbox);
            if (distance < nearest_max_distance)
              nearest_max_distance = distance;
          }

          unsigned int from0 = 0;
          for (processor_id_type i_proc = 0; i_proc < n_processors();
               from0 += froms_per_proc[i_proc], i_proc++)
          {
            bool qp_found = false;
            for (unsigned int i_from = from0; i_from < from0 + froms_per_proc[i_proc] && !qp_found;
                 i_from++)
            {
              Real distance = bboxMinDistance(centroid, bboxes[i_from]);
              if (distance < nearest_max_distance || bboxes[i_from].contains_point(centroid))
              {
                std::pair<unsigned int, unsigned int> key(i_to, elem->id());
                node_index_map[i_proc][key] = outgoing_qps[i_proc].size();
                outgoing_qps[i_proc].push_back(centroid + _to_positions[i_to]);
                qp_found = true;
              }
            }
          }
        }
      }
    }
  }

  ////////////////////
  // Send local node/centroid positions off to the other processors and take
  // care of points sent to this processor.  We'll need to check the points
  // against all of the "from" domains that this processor owns.  For each
  // point, we'll find the nearest node, then we'll send the value at that node
  // and the distance between the node and the point back to the processor that
  // requested that point.
  ////////////////////

  std::vector<std::vector<Real>> incoming_evals(n_processors());
  std::vector<Parallel::Request> send_qps(n_processors());
  std::vector<Parallel::Request> send_evals(n_processors());

  // Create these here so that they live the entire life of this function
  // and are NOT reused per processor.
  std::vector<std::vector<Real>> processor_outgoing_evals(n_processors());

  if (!_neighbors_cached)
  {
    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
    {
      if (i_proc == processor_id())
        continue;
      _communicator.send(i_proc, outgoing_qps[i_proc], send_qps[i_proc]);
    }

    // Build an array of pointers to all of this processor's local nodes.  We
    // need to do this to avoid the expense of using LibMesh iterators.  This
    // step also takes care of limiting the search to boundary nodes, if
    // applicable.
    std::vector<std::vector<Node *>> local_nodes(froms_per_proc[processor_id()]);
    for (unsigned int i = 0; i < froms_per_proc[processor_id()]; i++)
    {
      getLocalNodes(_from_meshes[i], local_nodes[i]);
    }

    if (_fixed_meshes)
    {
      _cached_froms.resize(n_processors());
      _cached_dof_ids.resize(n_processors());
    }

    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
    {
      std::vector<Point> incoming_qps;
      if (i_proc == processor_id())
        incoming_qps = outgoing_qps[i_proc];
      else
        _communicator.receive(i_proc, incoming_qps);

      if (_fixed_meshes)
      {
        _cached_froms[i_proc].resize(incoming_qps.size());
        _cached_dof_ids[i_proc].resize(incoming_qps.size());
      }

      std::vector<Real> & outgoing_evals = processor_outgoing_evals[i_proc];
      outgoing_evals.resize(2 * incoming_qps.size());

      for (unsigned int qp = 0; qp < incoming_qps.size(); qp++)
      {
        Point qpt = incoming_qps[qp];
        outgoing_evals[2 * qp] = std::numeric_limits<Real>::max();
        for (unsigned int i_local_from = 0; i_local_from < froms_per_proc[processor_id()];
             i_local_from++)
        {
          MooseVariableFEBase & from_var =
              _from_problems[i_local_from]->getVariable(0,
                                                        _from_var_name,
                                                        Moose::VarKindType::VAR_ANY,
                                                        Moose::VarFieldType::VAR_FIELD_STANDARD);
          System & from_sys = from_var.sys().system();
          unsigned int from_sys_num = from_sys.number();
          unsigned int from_var_num = from_sys.variable_number(from_var.name());

          for (unsigned int i_node = 0; i_node < local_nodes[i_local_from].size(); i_node++)
          {
            Real current_distance =
                (qpt - *(local_nodes[i_local_from][i_node]) - _from_positions[i_local_from]).norm();
            if (current_distance < outgoing_evals[2 * qp])
            {
              // Assuming LAGRANGE!
              if (local_nodes[i_local_from][i_node]->n_dofs(from_sys_num, from_var_num) > 0)
              {
                dof_id_type from_dof =
                    local_nodes[i_local_from][i_node]->dof_number(from_sys_num, from_var_num, 0);

                outgoing_evals[2 * qp] = current_distance;
                outgoing_evals[2 * qp + 1] = (*from_sys.solution)(from_dof);

                if (_fixed_meshes)
                {
                  // Cache the nearest nodes.
                  _cached_froms[i_proc][qp] = i_local_from;
                  _cached_dof_ids[i_proc][qp] = from_dof;
                }
              }
            }
          }
        }
      }

      if (i_proc == processor_id())
        incoming_evals[i_proc] = outgoing_evals;
      else
        _communicator.send(i_proc, outgoing_evals, send_evals[i_proc]);
    }
  }

  else // We've cached the nearest nodes.
  {
    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
    {
      std::vector<Real> & outgoing_evals = processor_outgoing_evals[i_proc];
      outgoing_evals.resize(_cached_froms[i_proc].size());

      for (unsigned int qp = 0; qp < outgoing_evals.size(); qp++)
      {
        MooseVariableFEBase & from_var = _from_problems[_cached_froms[i_proc][qp]]->getVariable(
            0,
            _from_var_name,
            Moose::VarKindType::VAR_ANY,
            Moose::VarFieldType::VAR_FIELD_STANDARD);
        System & from_sys = from_var.sys().system();
        dof_id_type from_dof = _cached_dof_ids[i_proc][qp];
        // outgoing_evals[qp] = (*from_sys.solution)(_cached_dof_ids[i_proc][qp]);
        outgoing_evals[qp] = (*from_sys.solution)(from_dof);
      }

      if (i_proc == processor_id())
        incoming_evals[i_proc] = outgoing_evals;
      else
        _communicator.send(i_proc, outgoing_evals, send_evals[i_proc]);
    }
  }

  ////////////////////
  // Gather all of the evaluations, find the nearest one for each node/element,
  // and apply the values.
  ////////////////////

  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;

    _communicator.receive(i_proc, incoming_evals[i_proc]);
  }

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    // Loop over the master nodes and set the value of the variable
    System * to_sys = find_sys(*_to_es[i_to], _to_var_name);

    unsigned int sys_num = to_sys->number();
    unsigned int var_num = to_sys->variable_number(_to_var_name);

    NumericVector<Real> * solution = nullptr;
    switch (_direction)
    {
      case TO_MULTIAPP:
        solution = &getTransferVector(i_to, _to_var_name);
        break;
      case FROM_MULTIAPP:
        solution = to_sys->solution.get();
        break;
      default:
        mooseError("Unknown direction");
    }

    MeshBase * to_mesh = &_to_meshes[i_to]->getMesh();

    bool is_nodal = to_sys->variable_type(var_num).family == LAGRANGE;

    if (is_nodal)
    {
      std::vector<Node *> target_local_nodes;

      if (isParamValid("target_boundary"))
      {
        BoundaryID target_bnd_id =
            _to_meshes[i_to]->getBoundaryID(getParam<BoundaryName>("target_boundary"));

        ConstBndNodeRange & bnd_nodes = *(_to_meshes[i_to])->getBoundaryNodeRange();
        for (const auto & bnode : bnd_nodes)
          if (bnode->_bnd_id == target_bnd_id && bnode->_node->processor_id() == processor_id())
            target_local_nodes.push_back(bnode->_node);
      }
      else
      {
        target_local_nodes.resize(to_mesh->n_local_nodes());
        unsigned int i = 0;
        for (auto & node : to_mesh->local_node_ptr_range())
          target_local_nodes[i++] = node;
      }

      for (const auto & node : target_local_nodes)
      {
        // Skip this node if the variable has no dofs at it.
        if (node->n_dofs(sys_num, var_num) < 1)
          continue;

        Real best_val = 0;
        if (!_neighbors_cached)
        {
          Real min_dist = std::numeric_limits<Real>::max();
          for (unsigned int i_from = 0; i_from < incoming_evals.size(); i_from++)
          {
            std::pair<unsigned int, unsigned int> key(i_to, node->id());
            if (node_index_map[i_from].find(key) == node_index_map[i_from].end())
              continue;
            unsigned int qp_ind = node_index_map[i_from][key];
            if (incoming_evals[i_from][2 * qp_ind] >= min_dist)
              continue;
            min_dist = incoming_evals[i_from][2 * qp_ind];
            best_val = incoming_evals[i_from][2 * qp_ind + 1];

            if (_fixed_meshes)
            {
              // Cache these indices.
              _cached_from_inds[node->id()] = i_from;
              _cached_qp_inds[node->id()] = qp_ind;
            }
          }
        }

        else
        {
          best_val = incoming_evals[_cached_from_inds[node->id()]][_cached_qp_inds[node->id()]];
        }

        dof_id_type dof = node->dof_number(sys_num, var_num, 0);
        solution->set(dof, best_val);
      }
    }
    else // Elemental
    {
      for (auto & elem : as_range(to_mesh->local_elements_begin(), to_mesh->local_elements_end()))
      {
        // Skip this element if the variable has no dofs at it.
        if (elem->n_dofs(sys_num, var_num) < 1)
          continue;

        Real best_val = 0;
        if (!_neighbors_cached)
        {
          Real min_dist = std::numeric_limits<Real>::max();
          for (unsigned int i_from = 0; i_from < incoming_evals.size(); i_from++)
          {
            std::pair<unsigned int, unsigned int> key(i_to, elem->id());
            if (node_index_map[i_from].find(key) == node_index_map[i_from].end())
              continue;
            unsigned int qp_ind = node_index_map[i_from][key];
            if (incoming_evals[i_from][2 * qp_ind] >= min_dist)
              continue;
            min_dist = incoming_evals[i_from][2 * qp_ind];
            best_val = incoming_evals[i_from][2 * qp_ind + 1];

            if (_fixed_meshes)
            {
              // Cache these indices.
              _cached_from_inds[elem->id()] = i_from;
              _cached_qp_inds[elem->id()] = qp_ind;
            }
          }
        }

        else
        {
          best_val = incoming_evals[_cached_from_inds[elem->id()]][_cached_qp_inds[elem->id()]];
        }

        dof_id_type dof = elem->dof_number(sys_num, var_num, 0);
        solution->set(dof, best_val);
      }
    }
    solution->close();
    to_sys->update();
  }

  if (_fixed_meshes)
    _neighbors_cached = true;

  // Make sure all our sends succeeded.
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;
    send_qps[i_proc].wait();
    send_evals[i_proc].wait();
  }

  _console << "Finished NearestNodeTransfer " << name() << std::endl;
}
void
MultiAppProjectionTransfer::execute()
{
  _console << "Beginning projection transfer " << name() << std::endl;

  getAppInfo();

  ////////////////////
  // We are going to project the solutions by solving some linear systems.  In
  // order to assemble the systems, we need to evaluate the "from" domain
  // solutions at quadrature points in the "to" domain.  Some parallel
  // communication is necessary because each processor doesn't necessarily have
  // all the "from" information it needs to set its "to" values.  We don't want
  // to use a bunch of big all-to-all broadcasts, so we'll use bounding boxes to
  // figure out which processors have the information we need and only
  // communicate with those processors.
  //
  // Each processor will
  // 1. Check its local quadrature points in the "to" domains to see which
  //    "from" domains they might be in.
  // 2. Send quadrature points to the processors with "from" domains that might
  //    contain those points.
  // 3. Recieve quadrature points from other processors, evaluate its mesh
  //    functions at those points, and send the values back to the proper
  //    processor
  // 4. Recieve mesh function evaluations from all relevant processors and
  //    decide which one to use at every quadrature point (the lowest global app
  //    index always wins)
  // 5. And use the mesh function evaluations to assemble and solve an L2
  //    projection system on its local elements.
  ////////////////////

  ////////////////////
  // For every combination of global "from" problem and local "to" problem, find
  // which "from" bounding boxes overlap with which "to" elements.  Keep track
  // of which processors own bounding boxes that overlap with which elements.
  // Build vectors of quadrature points to send to other processors for mesh
  // function evaluations.
  ////////////////////

  // Get the bounding boxes for the "from" domains.
  std::vector<MeshTools::BoundingBox> bboxes = getFromBoundingBoxes();

  // Figure out how many "from" domains each processor owns.
  std::vector<unsigned int> froms_per_proc = getFromsPerProc();

  std::vector<std::vector<Point> > outgoing_qps(n_processors());
  std::vector<std::map<std::pair<unsigned int, unsigned int>, unsigned int> > element_index_map(n_processors());
  // element_index_map[i_to, element_id] = index
  // outgoing_qps[index] is the first quadrature point in element

  if (! _qps_cached)
  {
    for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
    {
      MeshBase & to_mesh = _to_meshes[i_to]->getMesh();

      LinearImplicitSystem & system = * _proj_sys[i_to];

      FEType fe_type = system.variable_type(0);
      std::unique_ptr<FEBase> fe(FEBase::build(to_mesh.mesh_dimension(), fe_type));
      QGauss qrule(to_mesh.mesh_dimension(), fe_type.default_quadrature_order());
      fe->attach_quadrature_rule(&qrule);
      const std::vector<Point> & xyz = fe->get_xyz();

      MeshBase::const_element_iterator       el     = to_mesh.local_elements_begin();
      const MeshBase::const_element_iterator end_el = to_mesh.local_elements_end();

      unsigned int from0 = 0;
      for (processor_id_type i_proc = 0;
           i_proc < n_processors();
           from0 += froms_per_proc[i_proc], i_proc++)
      {
        for (el = to_mesh.local_elements_begin(); el != end_el; el++)
        {
          const Elem* elem = *el;
          fe->reinit (elem);

          bool qp_hit = false;
          for (unsigned int i_from = 0;
               i_from < froms_per_proc[i_proc] && ! qp_hit; i_from++)
          {
            for (unsigned int qp = 0;
                 qp < qrule.n_points() && ! qp_hit; qp ++)
            {
              Point qpt = xyz[qp];
              if (bboxes[from0 + i_from].contains_point(qpt + _to_positions[i_to]))
                qp_hit = true;
            }
          }

          if (qp_hit)
          {
            // The selected processor's bounding box contains at least one
            // quadrature point from this element.  Send all qps from this element
            // and remember where they are in the array using the map.
            std::pair<unsigned int, unsigned int> key(i_to, elem->id());
            element_index_map[i_proc][key] = outgoing_qps[i_proc].size();
            for (unsigned int qp = 0; qp < qrule.n_points(); qp ++)
            {
              Point qpt = xyz[qp];
              outgoing_qps[i_proc].push_back(qpt + _to_positions[i_to]);
            }
          }
        }
      }
    }

    if (_fixed_meshes)
      _cached_index_map = element_index_map;
  }
  else
  {
    element_index_map = _cached_index_map;
  }

  ////////////////////
  // Request quadrature point evaluations from other processors and handle
  // requests sent to this processor.
  ////////////////////

  // Non-blocking send quadrature points to other processors.
  std::vector<Parallel::Request> send_qps(n_processors());
  if (! _qps_cached)
    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
      if (i_proc != processor_id())
        _communicator.send(i_proc, outgoing_qps[i_proc], send_qps[i_proc]);

  // Get the local bounding boxes.
  std::vector<MeshTools::BoundingBox> local_bboxes(froms_per_proc[processor_id()]);
  {
    // Find the index to the first of this processor's local bounding boxes.
    unsigned int local_start = 0;
    for (processor_id_type i_proc = 0;
         i_proc < n_processors() && i_proc != processor_id();
         i_proc++)
      local_start += froms_per_proc[i_proc];

    // Extract the local bounding boxes.
    for (unsigned int i_from = 0; i_from < froms_per_proc[processor_id()]; i_from++)
      local_bboxes[i_from] = bboxes[local_start + i_from];
  }

  // Setup the local mesh functions.
  std::vector<MeshFunction *> local_meshfuns(froms_per_proc[processor_id()], NULL);
  for (unsigned int i_from = 0; i_from < _from_problems.size(); i_from++)
  {
    FEProblemBase & from_problem = *_from_problems[i_from];
    MooseVariable & from_var = from_problem.getVariable(0, _from_var_name);
    System & from_sys = from_var.sys().system();
    unsigned int from_var_num = from_sys.variable_number(from_var.name());

    MeshFunction * from_func = new MeshFunction(from_problem.es(),
         *from_sys.current_local_solution, from_sys.get_dof_map(), from_var_num);
    from_func->init(Trees::ELEMENTS);
    from_func->enable_out_of_mesh_mode(OutOfMeshValue);
    local_meshfuns[i_from] = from_func;
  }

  // Recieve quadrature points from other processors, evaluate mesh frunctions
  // at those points, and send the values back.
  std::vector<Parallel::Request> send_evals(n_processors());
  std::vector<Parallel::Request> send_ids(n_processors());
  std::vector<std::vector<Real> > outgoing_evals(n_processors());
  std::vector<std::vector<unsigned int> > outgoing_ids(n_processors());
  std::vector<std::vector<Real> > incoming_evals(n_processors());
  std::vector<std::vector<unsigned int> > incoming_app_ids(n_processors());
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    // Use the cached qps if they're available.
    std::vector<Point> incoming_qps;
    if (! _qps_cached)
    {
      if (i_proc == processor_id())
        incoming_qps = outgoing_qps[i_proc];
      else
        _communicator.receive(i_proc, incoming_qps);
      // Cache these qps for later if _fixed_meshes
      if (_fixed_meshes)
        _cached_qps[i_proc] = incoming_qps;
    }
    else
    {
      incoming_qps = _cached_qps[i_proc];
    }

    outgoing_evals[i_proc].resize(incoming_qps.size(), OutOfMeshValue);
    if (_direction == FROM_MULTIAPP)
      outgoing_ids[i_proc].resize(incoming_qps.size(), libMesh::invalid_uint);
    for (unsigned int qp = 0; qp < incoming_qps.size(); qp++)
    {
      Point qpt = incoming_qps[qp];

      // Loop until we've found the lowest-ranked app that actually contains
      // the quadrature point.
      for (unsigned int i_from = 0; i_from < _from_problems.size(); i_from++)
      {
        if (local_bboxes[i_from].contains_point(qpt))
        {
          outgoing_evals[i_proc][qp] = (* local_meshfuns[i_from])(qpt - _from_positions[i_from]);
          if (_direction == FROM_MULTIAPP)
            outgoing_ids[i_proc][qp] = _local2global_map[i_from];
        }
      }
    }

    if (i_proc == processor_id())
    {
      incoming_evals[i_proc] = outgoing_evals[i_proc];
      if (_direction == FROM_MULTIAPP)
        incoming_app_ids[i_proc] = outgoing_ids[i_proc];
    }
    else
    {
      _communicator.send(i_proc, outgoing_evals[i_proc], send_evals[i_proc]);
      if (_direction == FROM_MULTIAPP)
        _communicator.send(i_proc, outgoing_ids[i_proc], send_ids[i_proc]);
    }
  }

  ////////////////////
  // Gather all of the qp evaluations and pick out the best ones for each qp.
  ////////////////////
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;
    _communicator.receive(i_proc, incoming_evals[i_proc]);
    if (_direction == FROM_MULTIAPP)
      _communicator.receive(i_proc, incoming_app_ids[i_proc]);
  }

  std::vector<std::vector<Real> > final_evals(_to_problems.size());
  std::vector<std::map<unsigned int, unsigned int> > trimmed_element_maps(_to_problems.size());

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    MeshBase & to_mesh = _to_meshes[i_to]->getMesh();
    LinearImplicitSystem & system = * _proj_sys[i_to];

    FEType fe_type = system.variable_type(0);
    std::unique_ptr<FEBase> fe(FEBase::build(to_mesh.mesh_dimension(), fe_type));
    QGauss qrule(to_mesh.mesh_dimension(), fe_type.default_quadrature_order());
    fe->attach_quadrature_rule(&qrule);
    const std::vector<Point> & xyz = fe->get_xyz();

    MeshBase::const_element_iterator       el     = to_mesh.local_elements_begin();
    const MeshBase::const_element_iterator end_el = to_mesh.local_elements_end();

    for (el = to_mesh.active_local_elements_begin(); el != end_el; el++)
    {
      const Elem* elem = *el;
      fe->reinit (elem);

      bool element_is_evaled = false;
      std::vector<Real> evals(qrule.n_points(), 0.);

      for (unsigned int qp = 0; qp < qrule.n_points(); qp++)
      {
        Point qpt = xyz[qp];

        unsigned int lowest_app_rank = libMesh::invalid_uint;
        for (unsigned int i_proc = 0; i_proc < n_processors(); i_proc++)
        {
          // Ignore the selected processor if the element wasn't found in it's
          // bounding box.
          std::map<std::pair<unsigned int, unsigned int>, unsigned int> & map = element_index_map[i_proc];
          std::pair<unsigned int, unsigned int> key(i_to, elem->id());
          if (map.find(key) == map.end())
            continue;
          unsigned int qp0 = map[key];

          // Ignore the selected processor if it's app has a higher rank than the
          // previously found lowest app rank.
          if (_direction == FROM_MULTIAPP)
            if (incoming_app_ids[i_proc][qp0 + qp] >= lowest_app_rank)
              continue;

          // Ignore the selected processor if the qp was actually outside the
          // processor's subapp's mesh.
          if (incoming_evals[i_proc][qp0 + qp] == OutOfMeshValue)
            continue;

          // This is the best meshfunction evaluation so far, save it.
          element_is_evaled = true;
          evals[qp] = incoming_evals[i_proc][qp0 + qp];
        }
      }

      // If we found good evaluations for any of the qps in this element, save
      // those evaluations for later.
      if (element_is_evaled)
      {
        trimmed_element_maps[i_to][elem->id()] = final_evals[i_to].size();
        for (unsigned int qp = 0; qp < qrule.n_points(); qp++)
          final_evals[i_to].push_back(evals[qp]);
      }
    }
  }

  ////////////////////
  // We now have just one or zero mesh function values at all of our local
  // quadrature points.  Stash those values (and a map linking them to element
  // ids) in the equation systems parameters and project the solution.
  ////////////////////

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    _to_es[i_to]->parameters.set<std::vector<Real>*>("final_evals") = & final_evals[i_to];
    _to_es[i_to]->parameters.set<std::map<unsigned int, unsigned int>*>("element_map") = & trimmed_element_maps[i_to];
    projectSolution(i_to);
    _to_es[i_to]->parameters.set<std::vector<Real>*>("final_evals") = NULL;
    _to_es[i_to]->parameters.set<std::map<unsigned int, unsigned int>*>("element_map") = NULL;
  }

  for (unsigned int i = 0; i < _from_problems.size(); i++)
    delete local_meshfuns[i];


  // Make sure all our sends succeeded.
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;
    if (! _qps_cached)
      send_qps[i_proc].wait();
    send_evals[i_proc].wait();
    if (_direction == FROM_MULTIAPP)
      send_ids[i_proc].wait();
  }

  if (_fixed_meshes)
    _qps_cached = true;

  _console << "Finished projection transfer " << name() << std::endl;
}
void
MultiAppMeshFunctionTransfer::execute()
{
  Moose::out << "Beginning MeshFunctionTransfer " << name() << std::endl;

  getAppInfo();

  /**
   * For every combination of global "from" problem and local "to" problem, find
   * which "from" bounding boxes overlap with which "to" elements.  Keep track
   * of which processors own bounding boxes that overlap with which elements.
   * Build vectors of node locations/element centroids to send to other
   * processors for mesh function evaluations.
   */

  // Get the bounding boxes for the "from" domains.
  std::vector<MeshTools::BoundingBox> bboxes = getFromBoundingBoxes();

  // Figure out how many "from" domains each processor owns.
  std::vector<unsigned int> froms_per_proc = getFromsPerProc();

  std::vector<std::vector<Point> > outgoing_points(n_processors());
  std::vector<std::map<std::pair<unsigned int, unsigned int>, unsigned int> > point_index_map(n_processors());
  // point_index_map[i_to, element_id] = index
  // outgoing_points[index] is the first quadrature point in element

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    System * to_sys = find_sys(*_to_es[i_to], _to_var_name);
    unsigned int sys_num = to_sys->number();
    unsigned int var_num = to_sys->variable_number(_to_var_name);
    MeshBase * to_mesh = & _to_meshes[i_to]->getMesh();
    bool is_nodal = to_sys->variable_type(var_num).family == LAGRANGE;

    if (is_nodal)
    {
      MeshBase::const_node_iterator node_it = to_mesh->local_nodes_begin();
      MeshBase::const_node_iterator node_end = to_mesh->local_nodes_end();

      for (; node_it != node_end; ++node_it)
      {
        Node * node = *node_it;

        // Skip this node if the variable has no dofs at it.
        if (node->n_dofs(sys_num, var_num) < 1)
          continue;

        // Loop over the "froms" on processor i_proc.  If the node is found in
        // any of the "froms", add that node to the vector that will be sent to
        // i_proc.
        unsigned int from0 = 0;
        for (processor_id_type i_proc = 0;
             i_proc < n_processors();
             from0 += froms_per_proc[i_proc], i_proc++)
        {
          bool point_found = false;
          for (unsigned int i_from = from0; i_from < from0 + froms_per_proc[i_proc] && ! point_found; i_from++)
          {
            if (bboxes[i_from].contains_point(*node + _to_positions[i_to]))
            {
              std::pair<unsigned int, unsigned int> key(i_to, node->id());
              point_index_map[i_proc][key] = outgoing_points[i_proc].size();
              outgoing_points[i_proc].push_back(*node + _to_positions[i_to]);
              point_found = true;
            }
          }
        }
      }
    }
    else // Elemental
    {
      MeshBase::const_element_iterator elem_it = to_mesh->local_elements_begin();
      MeshBase::const_element_iterator elem_end = to_mesh->local_elements_end();

      for (; elem_it != elem_end; ++elem_it)
      {
        Elem * elem = *elem_it;

        Point centroid = elem->centroid();

        // Skip this element if the variable has no dofs at it.
        if (elem->n_dofs(sys_num, var_num) < 1)
          continue;

        // Loop over the "froms" on processor i_proc.  If the elem is found in
        // any of the "froms", add that elem to the vector that will be sent to
        // i_proc.
        unsigned int from0 = 0;
        for (processor_id_type i_proc = 0;
             i_proc < n_processors();
             from0 += froms_per_proc[i_proc], i_proc++)
        {
          bool point_found = false;
          for (unsigned int i_from = from0; i_from < from0 + froms_per_proc[i_proc] && ! point_found; i_from++)
          {
            if (bboxes[i_from].contains_point(centroid + _to_positions[i_to]))
            {
              std::pair<unsigned int, unsigned int> key(i_to, elem->id());
              point_index_map[i_proc][key] = outgoing_points[i_proc].size();
              outgoing_points[i_proc].push_back(centroid + _to_positions[i_to]);
              point_found = true;
            }
          }
        }
      }
    }
  }

  /**
   * Request point evaluations from other processors and handle requests sent to
   * this processor.
   */

  // Get the local bounding boxes.
  std::vector<MeshTools::BoundingBox> local_bboxes(froms_per_proc[processor_id()]);
  {
    // Find the index to the first of this processor's local bounding boxes.
    unsigned int local_start = 0;
    for (processor_id_type i_proc = 0;
         i_proc < n_processors() && i_proc != processor_id();
         i_proc++)
    {
      local_start += froms_per_proc[i_proc];
    }

    // Extract the local bounding boxes.
    for (unsigned int i_from = 0; i_from < froms_per_proc[processor_id()]; i_from++)
    {
      local_bboxes[i_from] = bboxes[local_start + i_from];
    }
  }

  // Setup the local mesh functions.
  std::vector<MooseSharedPointer<MeshFunction> > local_meshfuns;
  for (unsigned int i_from = 0; i_from < _from_problems.size(); i_from++)
  {
    FEProblem & from_problem = *_from_problems[i_from];
    MooseVariable & from_var = from_problem.getVariable(0, _from_var_name);
    System & from_sys = from_var.sys().system();
    unsigned int from_var_num = from_sys.variable_number(from_var.name());

    MooseSharedPointer<MeshFunction> from_func;
    //TODO: make MultiAppTransfer give me the right es
    if (_displaced_source_mesh && from_problem.getDisplacedProblem())
      from_func.reset(new MeshFunction(from_problem.getDisplacedProblem()->es(),
           *from_sys.current_local_solution, from_sys.get_dof_map(), from_var_num));
    else
      from_func.reset(new MeshFunction(from_problem.es(),
           *from_sys.current_local_solution, from_sys.get_dof_map(), from_var_num));
    from_func->init(Trees::ELEMENTS);
    from_func->enable_out_of_mesh_mode(OutOfMeshValue);
    local_meshfuns.push_back(from_func);
  }

  // Send points to other processors.
  std::vector<std::vector<Real> > incoming_evals(n_processors());
  std::vector<std::vector<unsigned int> > incoming_app_ids(n_processors());
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;
    _communicator.send(i_proc, outgoing_points[i_proc]);
  }

  // Recieve points from other processors, evaluate mesh frunctions at those
  // points, and send the values back.
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    std::vector<Point> incoming_points;
    if (i_proc == processor_id())
      incoming_points = outgoing_points[i_proc];
    else
      _communicator.receive(i_proc, incoming_points);

    std::vector<Real> outgoing_evals(incoming_points.size(), OutOfMeshValue);
    std::vector<unsigned int> outgoing_ids(incoming_points.size(), -1); // -1 = largest unsigned int
    for (unsigned int i_pt = 0; i_pt < incoming_points.size(); i_pt++)
    {
      Point pt = incoming_points[i_pt];

      // Loop until we've found the lowest-ranked app that actually contains
      // the quadrature point.
      for (unsigned int i_from = 0; i_from < _from_problems.size() && outgoing_evals[i_pt] == OutOfMeshValue; i_from++)
      {
        if (local_bboxes[i_from].contains_point(pt))
        {
          outgoing_evals[i_pt] = (* local_meshfuns[i_from])(pt - _from_positions[i_from]);
          if (_direction == FROM_MULTIAPP)
            outgoing_ids[i_pt] = _local2global_map[i_from];
        }
      }
    }

    if (i_proc == processor_id())
    {
      incoming_evals[i_proc] = outgoing_evals;
      if (_direction == FROM_MULTIAPP)
        incoming_app_ids[i_proc] = outgoing_ids;
    }
    else
    {
      _communicator.send(i_proc, outgoing_evals);
      if (_direction == FROM_MULTIAPP)
        _communicator.send(i_proc, outgoing_ids);
    }
  }

  /**
   * Gather all of the evaluations, pick out the best ones for each point, and
   * apply them to the solution vector.  When we are transferring from
   * multiapps, there may be multiple overlapping apps for a particular point.
   * In that case, we'll try to use the value from the app with the lowest id.
   */

  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;

    _communicator.receive(i_proc, incoming_evals[i_proc]);
    if (_direction == FROM_MULTIAPP)
      _communicator.receive(i_proc, incoming_app_ids[i_proc]);
  }

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    System * to_sys = find_sys(*_to_es[i_to], _to_var_name);

    unsigned int sys_num = to_sys->number();
    unsigned int var_num = to_sys->variable_number(_to_var_name);

    NumericVector<Real> * solution;
    switch (_direction)
    {
      case TO_MULTIAPP:
        solution = & getTransferVector(i_to, _to_var_name);
        break;
      case FROM_MULTIAPP:
        solution = to_sys->solution.get();
        break;
    }

    MeshBase * to_mesh = & _to_meshes[i_to]->getMesh();

    bool is_nodal = to_sys->variable_type(var_num).family == LAGRANGE;

    if (is_nodal)
    {
      MeshBase::const_node_iterator node_it = to_mesh->local_nodes_begin();
      MeshBase::const_node_iterator node_end = to_mesh->local_nodes_end();

      for (; node_it != node_end; ++node_it)
      {
        Node * node = *node_it;

        // Skip this node if the variable has no dofs at it.
        if (node->n_dofs(sys_num, var_num) < 1)
          continue;

        unsigned int lowest_app_rank = libMesh::invalid_uint;
        Real best_val = 0.;
        bool point_found = false;
        for (unsigned int i_proc = 0; i_proc < incoming_evals.size(); i_proc++)
        {
          // Skip this proc if the node wasn't in it's bounding boxes.
          std::pair<unsigned int, unsigned int> key(i_to, node->id());
          if (point_index_map[i_proc].find(key) == point_index_map[i_proc].end())
            continue;
          unsigned int i_pt = point_index_map[i_proc][key];

          // Ignore this proc if it's app has a higher rank than the
          // previously found lowest app rank.
          if (_direction == FROM_MULTIAPP)
          {
            if (incoming_app_ids[i_proc][i_pt] >= lowest_app_rank)
              continue;
          }

          // Ignore this proc if the point was actually outside its meshes.
          if (incoming_evals[i_proc][i_pt] == OutOfMeshValue)
            continue;

          best_val = incoming_evals[i_proc][i_pt];
          point_found = true;
        }

        if (_error_on_miss && ! point_found)
          mooseError("Point not found! " << *node + _to_positions[i_to]);

        dof_id_type dof = node->dof_number(sys_num, var_num, 0);
        solution->set(dof, best_val);
      }
    }
    else // Elemental
    {
      MeshBase::const_element_iterator elem_it = to_mesh->local_elements_begin();
      MeshBase::const_element_iterator elem_end = to_mesh->local_elements_end();

      for (; elem_it != elem_end; ++elem_it)
      {
        Elem * elem = *elem_it;

        // Skip this element if the variable has no dofs at it.
        if (elem->n_dofs(sys_num, var_num) < 1)
          continue;

        unsigned int lowest_app_rank = libMesh::invalid_uint;
        Real best_val = 0;
        bool point_found = false;
        for (unsigned int i_proc = 0; i_proc < incoming_evals.size(); i_proc++)
        {
          // Skip this proc if the elem wasn't in it's bounding boxes.
          std::pair<unsigned int, unsigned int> key(i_to, elem->id());
          if (point_index_map[i_proc].find(key) == point_index_map[i_proc].end())
            continue;
          unsigned int i_pt = point_index_map[i_proc][key];

          // Ignore this proc if it's app has a higher rank than the
          // previously found lowest app rank.
          if (_direction == FROM_MULTIAPP)
          {
            if (incoming_app_ids[i_proc][i_pt] >= lowest_app_rank)
              continue;
          }

          // Ignore this proc if the point was actually outside its meshes.
          if (incoming_evals[i_proc][i_pt] == OutOfMeshValue)
            continue;

          best_val = incoming_evals[i_proc][i_pt];
          point_found = true;
        }

        if (_error_on_miss && ! point_found)
          mooseError("Point not found! " << elem->centroid() + _to_positions[i_to]);

        dof_id_type dof = elem->dof_number(sys_num, var_num, 0);
        solution->set(dof, best_val);
      }
    }
    solution->close();
    to_sys->update();
  }

  _console << "Finished MeshFunctionTransfer " << name() << std::endl;
}
void
MultiAppNearestNodeTransfer::execute()
{
  _console << "Beginning NearestNodeTransfer " << name() << std::endl;

  getAppInfo();

  // Get the bounding boxes for the "from" domains.
  std::vector<BoundingBox> bboxes;
  if (isParamValid("source_boundary"))
    bboxes = getFromBoundingBoxes(
        _from_meshes[0]->getBoundaryID(getParam<BoundaryName>("source_boundary")));
  else
    bboxes = getFromBoundingBoxes();

  // Figure out how many "from" domains each processor owns.
  std::vector<unsigned int> froms_per_proc = getFromsPerProc();

  ////////////////////
  // For every point in the local "to" domain, figure out which "from" domains
  // might contain it's nearest neighbor, and send that point to the processors
  // that own those "from" domains.
  //
  // How do we know which "from" domains might contain the nearest neighbor, you
  // ask?  Well, consider two "from" domains, A and B.  If every point in A is
  // closer than every point in B, then we know that B cannot possibly contain
  // the nearest neighbor.  Hence, we'll only check A for the nearest neighbor.
  // We'll use the functions bboxMaxDistance and bboxMinDistance to figure out
  // if every point in A is closer than every point in B.
  ////////////////////

  // outgoing_qps = nodes/centroids we'll send to other processors.
  std::vector<std::vector<Point>> outgoing_qps(n_processors());
  // When we get results back, node_index_map will tell us which results go with
  // which points
  std::vector<std::map<std::pair<unsigned int, unsigned int>, unsigned int>> node_index_map(
      n_processors());

  if (!_neighbors_cached)
  {
    for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
    {
      System * to_sys = find_sys(*_to_es[i_to], _to_var_name);
      unsigned int sys_num = to_sys->number();
      unsigned int var_num = to_sys->variable_number(_to_var_name);
      MeshBase * to_mesh = &_to_meshes[i_to]->getMesh();
      bool is_to_nodal = to_sys->variable_type(var_num).family == LAGRANGE;

      if (is_to_nodal)
      {
        std::vector<Node *> target_local_nodes;

        if (isParamValid("target_boundary"))
        {
          BoundaryID target_bnd_id =
              _to_meshes[i_to]->getBoundaryID(getParam<BoundaryName>("target_boundary"));

          ConstBndNodeRange & bnd_nodes = *(_to_meshes[i_to])->getBoundaryNodeRange();
          for (const auto & bnode : bnd_nodes)
            if (bnode->_bnd_id == target_bnd_id && bnode->_node->processor_id() == processor_id())
              target_local_nodes.push_back(bnode->_node);
        }
        else
        {
          target_local_nodes.resize(to_mesh->n_local_nodes());
          unsigned int i = 0;
          for (auto & node : to_mesh->local_node_ptr_range())
            target_local_nodes[i++] = node;
        }

        // For error checking: keep track of all target_local_nodes
        // which are successfully mapped to at least one domain where
        // the nearest neighbor might be found.
        std::set<Node *> local_nodes_found;

        for (const auto & node : target_local_nodes)
        {
          // Skip this node if the variable has no dofs at it.
          if (node->n_dofs(sys_num, var_num) < 1)
            continue;

          // Find which bboxes might have the nearest node to this point.
          Real nearest_max_distance = std::numeric_limits<Real>::max();
          for (const auto & bbox : bboxes)
          {
            Real distance = bboxMaxDistance(*node, bbox);
            if (distance < nearest_max_distance)
              nearest_max_distance = distance;
          }

          unsigned int from0 = 0;
          for (processor_id_type i_proc = 0; i_proc < n_processors();
               from0 += froms_per_proc[i_proc], i_proc++)
          {
            bool qp_found = false;

            for (unsigned int i_from = from0; i_from < from0 + froms_per_proc[i_proc] && !qp_found;
                 i_from++)
            {

              Real distance = bboxMinDistance(*node, bboxes[i_from]);

              if (distance <= nearest_max_distance || bboxes[i_from].contains_point(*node))
              {
                std::pair<unsigned int, unsigned int> key(i_to, node->id());
                node_index_map[i_proc][key] = outgoing_qps[i_proc].size();
                outgoing_qps[i_proc].push_back(*node + _to_positions[i_to]);
                qp_found = true;
                local_nodes_found.insert(node);
              }
            }
          }
        }

        // By the time we get to here, we should have found at least
        // one candidate BoundingBox for every node in the
        // target_local_nodes array that has dofs for the current
        // variable in the current System.
        for (const auto & node : target_local_nodes)
          if (node->n_dofs(sys_num, var_num) && !local_nodes_found.count(node))
            mooseError("In ",
                       name(),
                       ": No candidate BoundingBoxes found for node ",
                       node->id(),
                       " at position ",
                       *node);
      }
      else // Elemental
      {
        // For error checking: keep track of all local elements
        // which are successfully mapped to at least one domain where
        // the nearest neighbor might be found.
        std::set<Elem *> local_elems_found;

        for (auto & elem : as_range(to_mesh->local_elements_begin(), to_mesh->local_elements_end()))
        {
          Point centroid = elem->centroid();

          // Skip this element if the variable has no dofs at it.
          if (elem->n_dofs(sys_num, var_num) < 1)
            continue;

          // Find which bboxes might have the nearest node to this point.
          Real nearest_max_distance = std::numeric_limits<Real>::max();
          for (const auto & bbox : bboxes)
          {
            Real distance = bboxMaxDistance(centroid, bbox);
            if (distance < nearest_max_distance)
              nearest_max_distance = distance;
          }

          unsigned int from0 = 0;
          for (processor_id_type i_proc = 0; i_proc < n_processors();
               from0 += froms_per_proc[i_proc], i_proc++)
          {
            bool qp_found = false;
            for (unsigned int i_from = from0; i_from < from0 + froms_per_proc[i_proc] && !qp_found;
                 i_from++)
            {
              Real distance = bboxMinDistance(centroid, bboxes[i_from]);
              if (distance <= nearest_max_distance || bboxes[i_from].contains_point(centroid))
              {
                std::pair<unsigned int, unsigned int> key(i_to, elem->id());
                node_index_map[i_proc][key] = outgoing_qps[i_proc].size();
                outgoing_qps[i_proc].push_back(centroid + _to_positions[i_to]);
                qp_found = true;
                local_elems_found.insert(elem);
              }
            }
          }
        }

        // Verify that we found at least one candidate bounding
        // box for each local element with dofs for the current
        // variable in the current System.
        for (auto & elem : as_range(to_mesh->local_elements_begin(), to_mesh->local_elements_end()))
          if (elem->n_dofs(sys_num, var_num) && !local_elems_found.count(elem))
            mooseError("In ",
                       name(),
                       ": No candidate BoundingBoxes found for Elem ",
                       elem->id(),
                       ", centroid = ",
                       elem->centroid());
      }
    }
  }

  ////////////////////
  // Send local node/centroid positions off to the other processors and take
  // care of points sent to this processor.  We'll need to check the points
  // against all of the "from" domains that this processor owns.  For each
  // point, we'll find the nearest node, then we'll send the value at that node
  // and the distance between the node and the point back to the processor that
  // requested that point.
  ////////////////////

  std::vector<std::vector<Real>> incoming_evals(n_processors());
  std::vector<Parallel::Request> send_qps(n_processors());
  std::vector<Parallel::Request> send_evals(n_processors());

  // Create these here so that they live the entire life of this function
  // and are NOT reused per processor.
  std::vector<std::vector<Real>> processor_outgoing_evals(n_processors());

  if (!_neighbors_cached)
  {
    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
    {
      if (i_proc == processor_id())
        continue;
      _communicator.send(i_proc, outgoing_qps[i_proc], send_qps[i_proc]);
    }

    // Build an array of pointers to all of this processor's local entities (nodes or
    // elements).  We need to do this to avoid the expense of using LibMesh iterators.
    // This step also takes care of limiting the search to boundary nodes, if
    // applicable.
    std::vector<std::vector<std::pair<Point, DofObject *>>> local_entities(
        froms_per_proc[processor_id()]);

    // Local array of all from Variable references
    std::vector<std::reference_wrapper<MooseVariableFEBase>> _from_vars;

    for (unsigned int i = 0; i < froms_per_proc[processor_id()]; i++)
    {
      MooseVariableFEBase & from_var = _from_problems[i]->getVariable(
          0, _from_var_name, Moose::VarKindType::VAR_ANY, Moose::VarFieldType::VAR_FIELD_STANDARD);
      bool is_to_nodal = from_var.feType().family == LAGRANGE;

      _from_vars.emplace_back(from_var);
      getLocalEntities(_from_meshes[i], local_entities[i], is_to_nodal);
    }

    if (_fixed_meshes)
    {
      _cached_froms.resize(n_processors());
      _cached_dof_ids.resize(n_processors());
    }

    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
    {
      // We either use our own outgoing_qps or receive them from
      // another processor.
      std::vector<Point> incoming_qps;
      if (i_proc == processor_id())
        incoming_qps = outgoing_qps[i_proc];
      else
        _communicator.receive(i_proc, incoming_qps);

      if (_fixed_meshes)
      {
        _cached_froms[i_proc].resize(incoming_qps.size());
        _cached_dof_ids[i_proc].resize(incoming_qps.size());
      }

      std::vector<Real> & outgoing_evals = processor_outgoing_evals[i_proc];
      // Resize this vector to two times the size of the incoming_qps
      // vector because we are going to store both the value from the nearest
      // local node *and* the distance between the incoming_qp and that node
      // for later comparison purposes.
      outgoing_evals.resize(2 * incoming_qps.size());

      for (unsigned int qp = 0; qp < incoming_qps.size(); qp++)
      {
        const Point & qpt = incoming_qps[qp];
        outgoing_evals[2 * qp] = std::numeric_limits<Real>::max();
        for (unsigned int i_local_from = 0; i_local_from < froms_per_proc[processor_id()];
             i_local_from++)
        {
          MooseVariableFEBase & from_var = _from_vars[i_local_from];
          System & from_sys = from_var.sys().system();
          unsigned int from_sys_num = from_sys.number();
          unsigned int from_var_num = from_sys.variable_number(from_var.name());

          for (unsigned int i_node = 0; i_node < local_entities[i_local_from].size(); i_node++)
          {
            // Compute distance between the current incoming_qp to local node i_node.
            Real current_distance =
                (qpt - local_entities[i_local_from][i_node].first - _from_positions[i_local_from])
                    .norm();

            // If an incoming_qp is equally close to two or more local nodes, then
            // the first one we test will "win", even though any of the others could
            // also potentially be chosen instead... there's no way to decide among
            // the set of all equidistant points.
            //
            // outgoing_evals[2 * qp] is the current closest distance between a local point and
            // the incoming_qp.
            if (current_distance < outgoing_evals[2 * qp])
            {
              // Assuming LAGRANGE!
              if (local_entities[i_local_from][i_node].second->n_dofs(from_sys_num, from_var_num) >
                  0)
              {
                dof_id_type from_dof = local_entities[i_local_from][i_node].second->dof_number(
                    from_sys_num, from_var_num, 0);

                // The indexing of the outgoing_evals vector looks
                // like [(distance, value), (distance, value), ...]
                // for each incoming_qp. We only keep the value from
                // the node with the smallest distance to the
                // incoming_qp, and then we compare across all
                // processors later and pick the closest one.
                outgoing_evals[2 * qp] = current_distance;
                outgoing_evals[2 * qp + 1] = (*from_sys.solution)(from_dof);

                if (_fixed_meshes)
                {
                  // Cache the nearest nodes.
                  _cached_froms[i_proc][qp] = i_local_from;
                  _cached_dof_ids[i_proc][qp] = from_dof;
                }
              }
            }
          }
        }
      }

      if (i_proc == processor_id())
        incoming_evals[i_proc] = outgoing_evals;
      else
        _communicator.send(i_proc, outgoing_evals, send_evals[i_proc]);
    }
  }

  else // We've cached the nearest nodes.
  {
    for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
    {
      std::vector<Real> & outgoing_evals = processor_outgoing_evals[i_proc];
      outgoing_evals.resize(_cached_froms[i_proc].size());

      for (unsigned int qp = 0; qp < outgoing_evals.size(); qp++)
      {
        MooseVariableFEBase & from_var = _from_problems[_cached_froms[i_proc][qp]]->getVariable(
            0,
            _from_var_name,
            Moose::VarKindType::VAR_ANY,
            Moose::VarFieldType::VAR_FIELD_STANDARD);
        System & from_sys = from_var.sys().system();
        dof_id_type from_dof = _cached_dof_ids[i_proc][qp];
        // outgoing_evals[qp] = (*from_sys.solution)(_cached_dof_ids[i_proc][qp]);
        outgoing_evals[qp] = (*from_sys.solution)(from_dof);
      }

      if (i_proc == processor_id())
        incoming_evals[i_proc] = outgoing_evals;
      else
        _communicator.send(i_proc, outgoing_evals, send_evals[i_proc]);
    }
  }

  ////////////////////
  // Gather all of the evaluations, find the nearest one for each node/element,
  // and apply the values.
  ////////////////////

  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;

    _communicator.receive(i_proc, incoming_evals[i_proc]);
  }

  for (unsigned int i_to = 0; i_to < _to_problems.size(); i_to++)
  {
    // Loop over the master nodes and set the value of the variable
    System * to_sys = find_sys(*_to_es[i_to], _to_var_name);

    unsigned int sys_num = to_sys->number();
    unsigned int var_num = to_sys->variable_number(_to_var_name);

    NumericVector<Real> * solution = nullptr;
    switch (_direction)
    {
      case TO_MULTIAPP:
        solution = &getTransferVector(i_to, _to_var_name);
        break;
      case FROM_MULTIAPP:
        solution = to_sys->solution.get();
        break;
      default:
        mooseError("Unknown direction");
    }

    const MeshBase & to_mesh = _to_meshes[i_to]->getMesh();

    bool is_to_nodal = to_sys->variable_type(var_num).family == LAGRANGE;

    if (is_to_nodal)
    {
      std::vector<Node *> target_local_nodes;

      if (isParamValid("target_boundary"))
      {
        BoundaryID target_bnd_id =
            _to_meshes[i_to]->getBoundaryID(getParam<BoundaryName>("target_boundary"));

        ConstBndNodeRange & bnd_nodes = *(_to_meshes[i_to])->getBoundaryNodeRange();
        for (const auto & bnode : bnd_nodes)
          if (bnode->_bnd_id == target_bnd_id && bnode->_node->processor_id() == processor_id())
            target_local_nodes.push_back(bnode->_node);
      }
      else
      {
        target_local_nodes.resize(to_mesh.n_local_nodes());
        unsigned int i = 0;
        for (auto & node : to_mesh.local_node_ptr_range())
          target_local_nodes[i++] = node;
      }

      for (const auto & node : target_local_nodes)
      {
        // Skip this node if the variable has no dofs at it.
        if (node->n_dofs(sys_num, var_num) < 1)
          continue;

        // If nothing is in the node_index_map for a given local node,
        // it will get the value 0.
        Real best_val = 0;
        if (!_neighbors_cached)
        {
          // Search through all the incoming evaluation points from
          // different processors for the one with the closest
          // point. If there are multiple values from other processors
          // which are equidistant, the first one we check will "win".
          Real min_dist = std::numeric_limits<Real>::max();
          for (unsigned int i_from = 0; i_from < incoming_evals.size(); i_from++)
          {
            std::pair<unsigned int, unsigned int> key(i_to, node->id());
            if (node_index_map[i_from].find(key) == node_index_map[i_from].end())
              continue;
            unsigned int qp_ind = node_index_map[i_from][key];
            if (incoming_evals[i_from][2 * qp_ind] >= min_dist)
              continue;

            // If we made it here, we are going set a new value and
            // distance because we found one that was closer.
            min_dist = incoming_evals[i_from][2 * qp_ind];
            best_val = incoming_evals[i_from][2 * qp_ind + 1];

            if (_fixed_meshes)
            {
              // Cache these indices.
              _cached_from_inds[node->id()] = i_from;
              _cached_qp_inds[node->id()] = qp_ind;
            }
          }
        }

        else
        {
          best_val = incoming_evals[_cached_from_inds[node->id()]][_cached_qp_inds[node->id()]];
        }

        dof_id_type dof = node->dof_number(sys_num, var_num, 0);
        solution->set(dof, best_val);
      }
    }
    else // Elemental
    {
      for (auto & elem : to_mesh.active_local_element_ptr_range())
      {
        // Skip this element if the variable has no dofs at it.
        if (elem->n_dofs(sys_num, var_num) < 1)
          continue;

        Real best_val = 0;
        if (!_neighbors_cached)
        {
          Real min_dist = std::numeric_limits<Real>::max();
          for (unsigned int i_from = 0; i_from < incoming_evals.size(); i_from++)
          {
            std::pair<unsigned int, unsigned int> key(i_to, elem->id());
            if (node_index_map[i_from].find(key) == node_index_map[i_from].end())
              continue;
            unsigned int qp_ind = node_index_map[i_from][key];
            if (incoming_evals[i_from][2 * qp_ind] >= min_dist)
              continue;
            min_dist = incoming_evals[i_from][2 * qp_ind];
            best_val = incoming_evals[i_from][2 * qp_ind + 1];

            if (_fixed_meshes)
            {
              // Cache these indices.
              _cached_from_inds[elem->id()] = i_from;
              _cached_qp_inds[elem->id()] = qp_ind;
            }
          }
        }

        else
        {
          best_val = incoming_evals[_cached_from_inds[elem->id()]][_cached_qp_inds[elem->id()]];
        }

        dof_id_type dof = elem->dof_number(sys_num, var_num, 0);
        solution->set(dof, best_val);
      }
    }
    solution->close();
    to_sys->update();
  }

  if (_fixed_meshes)
    _neighbors_cached = true;

  // Make sure all our sends succeeded.
  for (processor_id_type i_proc = 0; i_proc < n_processors(); i_proc++)
  {
    if (i_proc == processor_id())
      continue;
    send_qps[i_proc].wait();
    send_evals[i_proc].wait();
  }

  _console << "Finished NearestNodeTransfer " << name() << std::endl;
}
static int main(int argc, char** argv) {
    // Skip over the first argument.
    argc--;
    argv++;

    bool generateFlag = false;
    String8 targetConfigStr;
    Vector<String8> splitApkPaths;
    String8 baseApkPath;
    while (argc > 0) {
        const String8 arg(*argv);
        if (arg == "--target") {
            argc--;
            argv++;
            if (argc < 1) {
                fprintf(stderr, "error: missing parameter for --target.\n");
                usage();
                return 1;
            }
            targetConfigStr.setTo(*argv);
        } else if (arg == "--split") {
            argc--;
            argv++;
            if (argc < 1) {
                fprintf(stderr, "error: missing parameter for --split.\n");
                usage();
                return 1;
            }
            splitApkPaths.add(String8(*argv));
        } else if (arg == "--base") {
            argc--;
            argv++;
            if (argc < 1) {
                fprintf(stderr, "error: missing parameter for --base.\n");
                usage();
                return 1;
            }

            if (baseApkPath.size() > 0) {
                fprintf(stderr, "error: multiple --base flags not allowed.\n");
                usage();
                return 1;
            }
            baseApkPath.setTo(*argv);
        } else if (arg == "--generate") {
            generateFlag = true;
        } else if (arg == "--help") {
            help();
            return 0;
        } else {
            fprintf(stderr, "error: unknown argument '%s'.\n", arg.string());
            usage();
            return 1;
        }
        argc--;
        argv++;
    }

    if (!generateFlag && targetConfigStr == "") {
        usage();
        return 1;
    }

    if (baseApkPath.size() == 0) {
        fprintf(stderr, "error: missing --base argument.\n");
        usage();
        return 1;
    }

    // Find out some details about the base APK.
    AppInfo baseAppInfo;
    if (!getAppInfo(baseApkPath, baseAppInfo)) {
        fprintf(stderr, "error: unable to read base APK: '%s'.\n", baseApkPath.string());
        return 1;
    }

    SplitDescription targetSplit;
    if (!generateFlag) {
        if (!SplitDescription::parse(targetConfigStr, &targetSplit)) {
            fprintf(stderr, "error: invalid --target config: '%s'.\n",
                    targetConfigStr.string());
            usage();
            return 1;
        }

        // We don't want to match on things that will change at run-time
        // (orientation, w/h, etc.).
        removeRuntimeQualifiers(&targetSplit.config);
    }

    splitApkPaths.add(baseApkPath);

    KeyedVector<String8, Vector<SplitDescription> > apkPathSplitMap;
    KeyedVector<SplitDescription, String8> splitApkPathMap;
    Vector<SplitDescription> splitConfigs;
    const size_t splitCount = splitApkPaths.size();
    for (size_t i = 0; i < splitCount; i++) {
        Vector<SplitDescription> splits = extractSplitDescriptionsFromApk(splitApkPaths[i]);
        if (splits.isEmpty()) {
            fprintf(stderr, "error: invalid --split path: '%s'. No splits found.\n",
                    splitApkPaths[i].string());
            usage();
            return 1;
        }
        apkPathSplitMap.replaceValueFor(splitApkPaths[i], splits);
        const size_t apkSplitDescriptionCount = splits.size();
        for (size_t j = 0; j < apkSplitDescriptionCount; j++) {
            splitApkPathMap.replaceValueFor(splits[j], splitApkPaths[i]);
        }
        splitConfigs.appendVector(splits);
    }

    if (!generateFlag) {
        Vector<SplitDescription> matchingConfigs = select(targetSplit, splitConfigs);
        const size_t matchingConfigCount = matchingConfigs.size();
        SortedVector<String8> matchingSplitPaths;
        for (size_t i = 0; i < matchingConfigCount; i++) {
            matchingSplitPaths.add(splitApkPathMap.valueFor(matchingConfigs[i]));
        }

        const size_t matchingSplitApkPathCount = matchingSplitPaths.size();
        for (size_t i = 0; i < matchingSplitApkPathCount; i++) {
            if (matchingSplitPaths[i] != baseApkPath) {
                fprintf(stdout, "%s\n", matchingSplitPaths[i].string());
            }
        }
    } else {
        generate(apkPathSplitMap, baseApkPath);
    }
    return 0;
}