std::vector<std::vector<size_t> > 
     get_procs_with_keys(const std::vector<size_t>& local_key_list, Graph& g) {
   // this machine will get all keys from each processor where
   // key = procid mod numprocs
   std::vector<std::vector<size_t> > procs_with_keys(rmi.numprocs());
   for (size_t i = 0; i < local_key_list.size(); ++i) {
     if (g.l_vertex(i).owned() && local_key_list[i] != (size_t)(-1)) {
       procid_t target_procid = local_key_list[i] % rmi.numprocs();
       procs_with_keys[target_procid].push_back(local_key_list[i]);
     }
   }
   rmi.all_to_all(procs_with_keys);
   return procs_with_keys;
 }
    // Ships the data of every owned source vertex that has a join partner to
    // the processor recorded in source.opposing_join_proc, then applies
    // joinop(target_vertex, source_vertex_data) to the target vertex with the
    // same key. Finishes by calling target_graph.synchronize().
    //
    // target / target_graph: index and graph on the receiving side.
    // source / source_graph: index and graph on the sending side.
    // joinop: callable invoked as joinop(vertex, data) once per matched pair.
    void injective_join(injective_join_index& target,
                        TargetGraph& target_graph,
                        injective_join_index& source,
                        SourceGraph& source_graph,
                        JoinOp joinop) {
      typedef std::pair<size_t, typename SourceGraph::vertex_data_type>
          keyed_data_type;
      typedef std::vector<keyed_data_type> keyed_data_list;

      // one outgoing list of (key, vertex data) pairs per processor
      std::vector<keyed_data_list> exchange(rmi.numprocs());

      const size_t num_entries = source.opposing_join_proc.size();
      for (size_t lvid = 0; lvid < num_entries; ++lvid) {
        // only the owner of a vertex sends its data
        if (!source_graph.l_vertex(lvid).owned()) continue;
        procid_t destination = source.opposing_join_proc[lvid];
        // anything outside [0, numprocs) means there is no join partner
        if (destination < 0 || destination >= rmi.numprocs()) continue;
        exchange[destination].push_back(
            std::make_pair(source.vtx_to_key[lvid],
                           source_graph.l_vertex(lvid).data()));
      }

      // swap the lists between processors; afterwards the lists hold the
      // pairs to be joined against the local target vertices
      rmi.all_to_all(exchange);

      // apply the join to the matching local target vertices
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (size_t p = 0; p < exchange.size(); ++p) {
        keyed_data_list& received = exchange[p];
        for (size_t i = 0; i < received.size(); ++i) {
          keyed_data_type& entry = received[i];
          // every received key must be present in the target index
          hopscotch_map<size_t, vertex_id_type>::const_iterator iter = 
              target.key_to_vtx.find(entry.first);
          ASSERT_TRUE(iter != target.key_to_vtx.end());
          // wrap the local vertex and hand it to the join operation
          typename TargetGraph::local_vertex_type 
              local_vtx = target_graph.l_vertex(iter->second);
          typename TargetGraph::vertex_type joined_vtx(local_vtx);
          joinop(joined_vtx, entry.second);
        }
      }
      target_graph.synchronize();
    }
    void compute_injective_join() {
      std::vector<std::vector<size_t> > left_keys = 
          get_procs_with_keys(left_inj_index.vtx_to_key, left_graph);
      std::vector<std::vector<size_t> > right_keys = 
          get_procs_with_keys(right_inj_index.vtx_to_key, right_graph);
      // now. for each key on the right, I need to figure out which proc it
      // belongs in. and vice versa. This is actually kind of annoying.
      // but since it is one-to-one, I only need to make a hash map of one side.
      hopscotch_map<size_t, procid_t> left_key_to_procs;

      // construct a hash table of keys to procs
      // clear frequently to use less memory
      for (size_t p = 0; p < left_keys.size(); ++p) {
        for (size_t i = 0; i < left_keys[p].size(); ++i) {
          ASSERT_MSG(left_key_to_procs.count(left_keys[p][i]) == 0,
                     "Duplicate keys not permitted for left graph keys in injective join");
          left_key_to_procs.insert(std::make_pair(left_keys[p][i], p));
        }
        std::vector<size_t>().swap(left_keys[p]);
      }
      left_keys.clear();
     
      std::vector<
          std::vector<
              std::pair<size_t, procid_t> > > left_match(rmi.numprocs());
      std::vector<
          std::vector<
              std::pair<size_t, procid_t> > > right_match(rmi.numprocs());

      // now for each key on the right, find the matching key on the left
      for (size_t p = 0; p < right_keys.size(); ++p) {
        for (size_t i = 0; i < right_keys[p].size(); ++i) {
          size_t key = right_keys[p][i];
          hopscotch_map<size_t, procid_t>::iterator iter =
              left_key_to_procs.find(key);
          if (iter != left_key_to_procs.end()) {
            ASSERT_MSG(iter->second != (procid_t)(-1),
                       "Duplicate keys not permitted for right graph keys in injective join");
            // we have a match
            procid_t left_proc = iter->second;
            procid_t right_proc = p;
            // now. left has to be told about right and right
            // has to be told about left
            left_match[left_proc].push_back(std::make_pair(key, right_proc));
            right_match[right_proc].push_back(std::make_pair(key, left_proc));
            // set the map entry to -1 
            // so we know if it is ever reused
            iter->second = (procid_t)(-1); 
          }
        }
        std::vector<size_t>().swap(right_keys[p]);
      }
      right_keys.clear();

      rmi.all_to_all(left_match);
      rmi.all_to_all(right_match);
      // fill in the index
      // go through the left match and set up the opposing index to based
      // on the match result
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (size_t p = 0;p < left_match.size(); ++p) {
        for (size_t i = 0;i < left_match[p].size(); ++i) {
          // search for the key in the left index
          hopscotch_map<size_t, vertex_id_type>::const_iterator iter = 
              left_inj_index.key_to_vtx.find(left_match[p][i].first);
          ASSERT_TRUE(iter != left_inj_index.key_to_vtx.end());
          // fill in the match
          left_inj_index.opposing_join_proc[iter->second] = left_match[p][i].second;
        }
      }
      left_match.clear();
      // repeat for the right match
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (size_t p = 0;p < right_match.size(); ++p) {
        for (size_t i = 0;i < right_match[p].size(); ++i) {
          // search for the key in the right index
          hopscotch_map<size_t, vertex_id_type>::const_iterator iter = 
              right_inj_index.key_to_vtx.find(right_match[p][i].first);
          ASSERT_TRUE(iter != right_inj_index.key_to_vtx.end());
          // fill in the match
          right_inj_index.opposing_join_proc[iter->second] = right_match[p][i].second;
        }
      }
      right_match.clear();
      // ok done.
    }