Exemplo n.º 1
0
void writeTextGraph_MPI(graph_t *G)
{
    uint32_t TotVertices;
    TotVertices=G->n;
    int size = G->nproc;
    int rank = G->rank;
    int lgsize;
    char filename[FNAME_LEN];
    for (lgsize = 0; lgsize < size; ++lgsize) {
        if ((1 << lgsize) == size) break;
    }
    if (strlen(G->filename) == 0) {
        sprintf(filename, "graph.%d", G->rank);
    } else {
        sprintf(filename, "%s.%d", G->filename, G->rank);
    }
    printf("%d:start write to file = %s\n",G->rank,filename);
    
    FILE *F = fopen(filename, "w");
    if (!F) error(EXIT_FAILURE, 0, "Error in opening file %s", filename);

    for (vertex_id_t i = 0; i < G->local_n; i++) {
        fprintf(F, " %" PRIu32 ": [",  VERTEX_TO_GLOBAL(i,TotVertices,size,rank));
        for (edge_id_t j = G->rowsIndices[i]; j < G->rowsIndices[i+1]; j++) {
            fprintf(F, "[%d,%f]", G->endV[j], G->weights[j]);
            if (j != G->rowsIndices[i+1] - 1) fprintf(F, ",");
        }
        fprintf(F, "]\n");
    }
    fclose(F);
}
Exemplo n.º 2
0
int64_t vertex_to_global_for_pred(int v_rank, size_t v_local)
{
	return VERTEX_TO_GLOBAL(v_rank, v_local);
}
Exemplo n.º 3
0
int validate_bfs_result(const csr_graph* const g, const int64_t root, const int64_t* const pred, const int64_t nvisited) {
  int validation_passed = 1;
  int root_is_mine = (VERTEX_OWNER(root) == rank);

  const size_t nlocalverts = g->nlocalverts;
  const size_t nlocaledges = g->nlocaledges;
  const int64_t nglobalverts = g->nglobalverts;

  /* Check that root is its own parent. */
  if (root_is_mine) {
    if (pred[VERTEX_LOCAL(root)] != root) {
      fprintf(stderr, "%d: Validation error: parent of root vertex %" PRId64 " is %" PRId64 ", not the root itself.\n", rank, root, pred[VERTEX_LOCAL(root)]);
      validation_passed = 0;
    }
  }

  /* Check that nothing else is its own parent, and check for in-range
   * values. */
  int any_range_errors = 0;
  size_t i;
  for (i = 0; i < nlocalverts; ++i) {
    int64_t v = VERTEX_TO_GLOBAL(i);
    assert (VERTEX_OWNER(v) == rank);
    assert (VERTEX_LOCAL(v) == i);
    if (v != root && pred[i] == v) {
      fprintf(stderr, "%d: Validation error: parent of non-root vertex %" PRId64 " is itself.\n", rank, v);
      validation_passed = 0;
    }
    if (pred[i] < -1 || pred[i] >= nglobalverts) {
      fprintf(stderr, "%d: Validation error: parent of vertex %" PRId64 " is out-of-range value %" PRId64 ".\n", rank, v, pred[i]);
      validation_passed = 0;
      any_range_errors = 1;
    }
  }
  MPI_Allreduce(MPI_IN_PLACE, &any_range_errors, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);

  /* Check that nvisited is correct. */
  int64_t nvisited_actual = 0;
  for (i = 0; i < nlocalverts; ++i) {
    if (pred[i] != -1) ++nvisited_actual;
  }
  MPI_Allreduce(MPI_IN_PLACE, &nvisited_actual, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  if (nvisited_actual != nvisited) {
    fprintf(stderr, "%d: Validation error: claimed visit count %" PRId64 " is different from actual count %" PRId64 ".\n", rank, nvisited, nvisited_actual);
    validation_passed = 0;
  }

  if (!any_range_errors) { /* Other parts of validation assume in-range values */

    /* Check that there is an edge from each vertex to its claimed
     * predecessor. */
    size_t i;
    for (i = 0; i < nlocalverts; ++i) {
      int64_t v = VERTEX_TO_GLOBAL(i);
      int64_t p = pred[i];
      if (p == -1) continue;
      int found_pred_edge = 0;
      if (v == p) found_pred_edge = 1; /* Root vertex */
      size_t ei, ei_end = g->rowstarts[i + 1];
      for (ei = g->rowstarts[i]; ei < ei_end; ++ei) {
        int64_t w = g->column[ei];
        if (w == p) {
          found_pred_edge = 1;
          break;
        }
      }
      if (!found_pred_edge) {
        fprintf(stderr, "%d: Validation error: no graph edge from vertex %" PRId64 " to its parent %" PRId64 ".\n", rank, v, p);
        validation_passed = 0;
      }
    }

    /* Create a vertex depth map to use for later validation. */
    int64_t* depth = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t));
    { /* Scope some code that has a lot of temporary variables. */
      int64_t* pred_depth = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Depth of predecessor vertex for each local vertex */
      size_t i;
      for (i = 0; i < nlocalverts; ++i) depth[i] = INT64_MAX;
      if (root_is_mine) depth[VERTEX_LOCAL(root)] = 0;
      /* Send each vertex that appears in the local part of the predecessor map
       * to its owner; record the original locations so we can put the answers
       * into pred_depth. */
      /* Do a histogram sort by owner (this same kind of sort is used other
       * places as well).  First, count the number of vertices going to each
       * destination. */
      int* num_preds_per_owner = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
      for (i = 0; i < nlocalverts; ++i) {
        ++num_preds_per_owner[pred[i] == -1 ? size - 1 : VERTEX_OWNER(pred[i])];
      }
      int64_t* preds_per_owner = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Predecessors sorted by owner */
      int64_t* preds_per_owner_results_offsets = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Indices into pred_depth to write */
      /* Second, do a prefix sum to get the displacements of the different
       * owners in the outgoing array. */
      int* pred_owner_displs = (int*)xmalloc((size + 1) * sizeof(int));
      pred_owner_displs[0] = 0;
      int r;
      for (r = 0; r < size; ++r) {
        pred_owner_displs[r + 1] = pred_owner_displs[r] + num_preds_per_owner[r];
      }
      /* Last, put the vertices into the correct positions in the array, based
       * on their owners and the counts and displacements computed earlier. */
      int* pred_owner_offsets = (int*)xmalloc((size + 1) * sizeof(int));
      memcpy(pred_owner_offsets, pred_owner_displs, (size + 1) * sizeof(int));
      for (i = 0; i < nlocalverts; ++i) {
        int* offset_ptr = &pred_owner_offsets[pred[i] == -1 ? size - 1 : VERTEX_OWNER(pred[i])];
        preds_per_owner[*offset_ptr] = pred[i];
        preds_per_owner_results_offsets[*offset_ptr] = i;
        ++*offset_ptr;
      }
      for (r = 0; r < size; ++r) {
        assert (pred_owner_offsets[r] == pred_owner_displs[r + 1]);
      }
      free(pred_owner_offsets);

      /* Send around the number of vertices that will be sent to each destination. */
      int* num_my_preds_per_sender = (int*)xmalloc(size * sizeof(int));
      MPI_Alltoall(num_preds_per_owner, 1, MPI_INT,
                   num_my_preds_per_sender, 1, MPI_INT,
                   MPI_COMM_WORLD);
      int* my_preds_per_sender_displs = (int*)xmalloc((size + 1) * sizeof(int));
      my_preds_per_sender_displs[0] = 0;
      for (r = 0; r < size; ++r) {
        my_preds_per_sender_displs[r + 1] = my_preds_per_sender_displs[r] + num_my_preds_per_sender[r];
      }
      /* Send around the actual vertex data (list of depth requests that will
       * be responded to at each BFS iteration). */
      int64_t* my_depth_requests = (int64_t*)xmalloc(my_preds_per_sender_displs[size] * sizeof(int64_t));
      int64_t* my_depth_replies = (int64_t*)xmalloc(my_preds_per_sender_displs[size] * sizeof(int64_t));
      MPI_Alltoallv(preds_per_owner, num_preds_per_owner, pred_owner_displs, INT64_T_MPI_TYPE,
                    my_depth_requests, num_my_preds_per_sender, my_preds_per_sender_displs, INT64_T_MPI_TYPE,
                    MPI_COMM_WORLD);

      int64_t* pred_depth_raw = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Depth of predecessor vertex for each local vertex, ordered by source proc */

      /* Do a mini-BFS (naively) over just the predecessor graph (hopefully a
       * tree) produced by the real BFS; fill in the depth map. */
      while (1) {
        int any_changed = 0;
        int i;
        /* Create and send the depth values requested by other nodes.  The list
         * of requests is sent once, and are stored on the receiver so the
         * replies can be sent (possibly with updated depth values) at every
         * iteration. */
        for (i = 0; i < my_preds_per_sender_displs[size]; ++i) {
          my_depth_replies[i] = (my_depth_requests[i] == -1 ? INT64_MAX : depth[VERTEX_LOCAL(my_depth_requests[i])]);
        }
        MPI_Alltoallv(my_depth_replies, num_my_preds_per_sender, my_preds_per_sender_displs, INT64_T_MPI_TYPE,
                      pred_depth_raw, num_preds_per_owner, pred_owner_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);
        {
          size_t i;
          /* Put the received depths into the local array. */
          for (i = 0; i < nlocalverts; ++i) {
            pred_depth[preds_per_owner_results_offsets[i]] = pred_depth_raw[i];
          }
          /* Check those values to determine if they violate any correctness
           * conditions. */
          for (i = 0; i < nlocalverts; ++i) {
            int64_t v = VERTEX_TO_GLOBAL(i);
            if (v == root) {
              /* The depth and predecessor for this were checked earlier. */
            } else if (depth[i] == INT64_MAX && pred_depth[i] == INT64_MAX) {
              /* OK -- depth should be filled in later. */
            } else if (depth[i] == INT64_MAX && pred_depth[i] != INT64_MAX) {
              depth[i] = pred_depth[i] + 1;
              any_changed = 1;
            } else if (depth[i] != pred_depth[i] + 1) {
              fprintf(stderr, "%d: Validation error: BFS predecessors do not form a tree; see vertices %" PRId64 " (depth %" PRId64 ") and %" PRId64 " (depth %" PRId64 ").\n", rank, v, depth[i], pred[i], pred_depth[i]);
              validation_passed = 0;
            } else {
              /* Vertex already has its correct depth value. */
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &any_changed, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
        if (!any_changed) break;
      }

      free(num_preds_per_owner);
      free(num_my_preds_per_sender);
      free(preds_per_owner);
      free(preds_per_owner_results_offsets);
      free(my_preds_per_sender_displs);
      free(my_depth_requests);
      free(my_depth_replies);
      free(pred_owner_displs);
      free(pred_depth);
      free(pred_depth_raw);
    }

    /* Check that all edges connect vertices whose depths differ by at most
     * one. */
    {
      int64_t maxlocaledges = 0;
      MPI_Allreduce((void*)&nlocaledges, &maxlocaledges, 1, INT64_T_MPI_TYPE, MPI_MAX, MPI_COMM_WORLD);
      /* We break the total list of overall edges into chunks to reduce the
       * amount of data to be sent at a time (since we are using MPI_Alltoallv
       * to send data collectively). */
      const int edge_chunk_size = (1 << 23); /* Reduce memory usage */
      int num_edge_groups = (maxlocaledges + edge_chunk_size - 1) / edge_chunk_size;
      int eg;
      for (eg = 0; eg < num_edge_groups; ++eg) {
        size_t first_edge_index = (size_t)(eg * edge_chunk_size);
        if (first_edge_index > nlocaledges) first_edge_index = nlocaledges;
        size_t last_edge_index = (size_t)((eg + 1) * edge_chunk_size);
        if (last_edge_index > nlocaledges) last_edge_index = nlocaledges;
        /* Sort the edge targets in this chunk by their owners (histogram
         * sort); see the BFS code above for details of the steps of the
         * algorithm. */
        int* num_edge_targets_by_owner = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
        size_t ei;
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          ++num_edge_targets_by_owner[VERTEX_OWNER(g->column[ei])];
        }
        int* edge_targets_by_owner_displs = (int*)xmalloc((size + 1) * sizeof(int));
        edge_targets_by_owner_displs[0] = 0;
        int i;
        for (i = 0; i < size; ++i) {
          edge_targets_by_owner_displs[i + 1] = edge_targets_by_owner_displs[i] + num_edge_targets_by_owner[i];
        }
        int64_t* edge_targets_by_owner = (int64_t*)xmalloc(edge_targets_by_owner_displs[size] * sizeof(int64_t));
        int64_t* edge_targets_by_owner_indices = (int64_t*)xmalloc(edge_targets_by_owner_displs[size] * sizeof(int64_t)); /* Source indices for where to write the targets */
        int* edge_targets_by_owner_offsets = (int*)xmalloc((size + 1) * sizeof(int));
        memcpy(edge_targets_by_owner_offsets, edge_targets_by_owner_displs, (size + 1) * sizeof(int));
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          edge_targets_by_owner[edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])]] = g->column[ei];
          edge_targets_by_owner_indices[edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])]] = ei;
          ++edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])];
        }
        for (i = 0; i < size; ++i) {
          assert (edge_targets_by_owner_offsets[i] == edge_targets_by_owner_displs[i + 1]);
        }
        free(edge_targets_by_owner_offsets);

        /* Send around the number of data elements that will be sent later. */
        int* num_incoming_targets_by_src = (int*)xmalloc(size * sizeof(int));
        MPI_Alltoall(num_edge_targets_by_owner, 1, MPI_INT,
                     num_incoming_targets_by_src, 1, MPI_INT,
                     MPI_COMM_WORLD);
        int* incoming_targets_by_src_displs = (int*)xmalloc((size + 1) * sizeof(int));
        incoming_targets_by_src_displs[0] = 0;
        for (i = 0; i < size; ++i) {
          incoming_targets_by_src_displs[i + 1] = incoming_targets_by_src_displs[i] + num_incoming_targets_by_src[i];
        }

        int64_t* target_depth_requests = (int64_t*)xmalloc(incoming_targets_by_src_displs[size] * sizeof(int64_t));
        int64_t* target_depth_replies = (int64_t*)xmalloc(incoming_targets_by_src_displs[size] * sizeof(int64_t));

        /* Send the actual requests for the depths of edge targets. */
        MPI_Alltoallv(edge_targets_by_owner, num_edge_targets_by_owner, edge_targets_by_owner_displs, INT64_T_MPI_TYPE,
                      target_depth_requests, num_incoming_targets_by_src, incoming_targets_by_src_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);

        free(edge_targets_by_owner);

        /* Fill in the replies for the requests sent to me. */
        for (i = 0; i < incoming_targets_by_src_displs[size]; ++i) {
          assert (VERTEX_OWNER(target_depth_requests[i]) == rank);
          target_depth_replies[i] = depth[VERTEX_LOCAL(target_depth_requests[i])];
        }

        free(target_depth_requests);

        int64_t* target_depth_raw = (int64_t*)xmalloc((last_edge_index - first_edge_index) * sizeof(int64_t));

        /* Send back the replies. */
        MPI_Alltoallv(target_depth_replies, num_incoming_targets_by_src, incoming_targets_by_src_displs, INT64_T_MPI_TYPE,
                      target_depth_raw, num_edge_targets_by_owner, edge_targets_by_owner_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);

        free(target_depth_replies);
        free(num_incoming_targets_by_src);
        free(num_edge_targets_by_owner);
        free(incoming_targets_by_src_displs);
        free(edge_targets_by_owner_displs);

        int64_t* target_depth = (int64_t*)xmalloc((last_edge_index - first_edge_index) * sizeof(int64_t));

        /* Put the replies into the proper order (original order of the edges).
         * */
        for (ei = 0; ei < last_edge_index - first_edge_index; ++ei) {
          target_depth[edge_targets_by_owner_indices[ei] - first_edge_index] = target_depth_raw[ei];
        }

        free(target_depth_raw);
        free(edge_targets_by_owner_indices);

        /* Check the depth relationship of the endpoints of each edge in the
         * current chunk. */
        size_t src_i = 0;
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          while (ei >= g->rowstarts[src_i + 1]) {
            ++src_i;
          }
          int64_t src = VERTEX_TO_GLOBAL(src_i);
          int64_t src_depth = depth[src_i];
          int64_t tgt = g->column[ei];
          int64_t tgt_depth = target_depth[ei - first_edge_index];
          if (src_depth != INT64_MAX && tgt_depth == INT64_MAX) {
            fprintf(stderr, "%d: Validation error: edge connects vertex %" PRId64 " in the BFS tree (depth %" PRId64 ") to vertex %" PRId64 " outside the tree.\n", rank, src, src_depth, tgt);
            validation_passed = 0;
          } else if (src_depth == INT64_MAX && tgt_depth != INT64_MAX) {
            /* Skip this for now; this problem will be caught when scanning
             * reversed copy of this edge.  Set the failure flag, though,
             * just in case. */
            validation_passed = 0;
          } else if (src_depth - tgt_depth < -1 ||
                     src_depth - tgt_depth > 1) {
            fprintf(stderr, "%d: Validation error: depths of edge endpoints %" PRId64 " (depth %" PRId64 ") and %" PRId64 " (depth %" PRId64 ") are too far apart (abs. val. > 1).\n", rank, src, src_depth, tgt, tgt_depth);
            validation_passed = 0;
          }
        }
        free(target_depth);
      }
    }

    free(depth);

  } /* End of part skipped by range errors */
  
  /* Collect the global validation result. */
  MPI_Allreduce(MPI_IN_PLACE, &validation_passed, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
  return validation_passed;
}
Exemplo n.º 4
0
/* This BFS represents its queues as bitmaps and uses some data representation
 * tricks to fit with the use of MPI one-sided operations.  It is not much
 * faster than the standard version on the machines I have tested it on, but
 * systems that have good RDMA hardware and good MPI one-sided implementations
 * might get better performance from it.  This code might also be good to
 * translate to UPC, Co-array Fortran, SHMEM, or GASNet since those systems are
 * more designed for one-sided remote memory operations. */
void run_mpi_bfs(const csr_graph* const g, int64_t root, int64_t* pred, int64_t* nvisited) {
  const size_t nlocalverts = g->nlocalverts;
  const int64_t nglobalverts = g->nglobalverts;
  int64_t nvisited_local = 0;

  /* Set up a second predecessor map so we can read from one and modify the
   * other. */
  int64_t* orig_pred = pred;
  int64_t* pred2 = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  /* The queues (old and new) are represented as bitmaps.  Each bit in the
   * queue bitmap says to check elts_per_queue_bit elements in the predecessor
   * map for vertices that need to be visited.  In other words, the queue
   * bitmap is an overapproximation of the actual queue; because MPI_Accumulate
   * does not get any information on the result of the update, sometimes
   * elements are also added to the bitmap when they were actually already
   * black.  Because of this, the predecessor map needs to be checked to be
   * sure a given vertex actually needs to be processed. */
  const int elts_per_queue_bit = 4;
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  int64_t queue_nbits = (nlocalverts + elts_per_queue_bit - 1) / elts_per_queue_bit;
  int64_t queue_nwords = (queue_nbits + ulong_bits - 1) / ulong_bits;
  unsigned long* queue_bitmap1 = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  unsigned long* queue_bitmap2 = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  memset(queue_bitmap1, 0, queue_nwords * sizeof(unsigned long));

  /* List of local vertices (used as sources in MPI_Accumulate). */
  int64_t* local_vertices = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));
  {size_t i; for (i = 0; i < nlocalverts; ++i) local_vertices[i] = VERTEX_TO_GLOBAL(i);}

  /* List of all bit masks for an unsigned long (used as sources in
   * MPI_Accumulate). */
  unsigned long masks[ulong_bits];
  {int i; for (i = 0; i < ulong_bits; ++i) masks[i] = (1UL << i);}

  /* Coding of predecessor map: */
  /* - White (not visited): INT64_MAX */
  /* - Grey (in queue): 0 .. nglobalverts-1 */
  /* - Black (done): -nglobalverts .. -1 */

  /* Set initial predecessor map. */
  {size_t i; for (i = 0; i < nlocalverts; ++i) pred[i] = INT64_MAX;}

  /* Mark root as grey and add it to the queue. */
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    queue_bitmap1[VERTEX_LOCAL(root) / elts_per_queue_bit / ulong_bits] |= (1UL << ((VERTEX_LOCAL(root) / elts_per_queue_bit) % ulong_bits));
  }

  /* Create MPI windows on the two predecessor arrays and the two queues. */
  MPI_Win pred_win, pred2_win, queue1_win, queue2_win;
  MPI_Win_create(pred, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &pred_win);
  MPI_Win_create(pred2, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &pred2_win);
  MPI_Win_create(queue_bitmap1, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &queue1_win);
  MPI_Win_create(queue_bitmap2, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &queue2_win);

  while (1) {
    int64_t i;
    /* Clear the next-level queue. */
    memset(queue_bitmap2, 0, queue_nwords * sizeof(unsigned long));

    /* The pred2 array is pred with all grey vertices changed to black. */
    memcpy(pred2, pred, nlocalverts * sizeof(int64_t));
    for (i = 0; i < (int64_t)nlocalverts; ++i) {
      if (pred2[i] >= 0 && pred2[i] < nglobalverts) pred2[i] -= nglobalverts;
    }

    /* Start one-sided operations for this level. */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, pred2_win);
    MPI_Win_fence(MPI_MODE_NOPRECEDE, queue2_win);

    /* Step through the words of the queue bitmap. */
    for (i = 0; i < queue_nwords; ++i) {
      unsigned long val = queue_bitmap1[i];
      int bitnum;
      /* Skip any that are all zero. */
      if (!val) continue;
      /* Scan the bits in the word. */
      for (bitnum = 0; bitnum < ulong_bits; ++bitnum) {
        size_t first_v_local = (size_t)((i * ulong_bits + bitnum) * elts_per_queue_bit);
        if (first_v_local >= nlocalverts) break;
        int bit = (int)((val >> bitnum) & 1);
        /* Skip any that are zero. */
        if (!bit) continue;
        /* Scan the queue elements corresponding to this bit. */
        int qelem_idx;
        for (qelem_idx = 0; qelem_idx < elts_per_queue_bit; ++qelem_idx) {
          size_t v_local = first_v_local + qelem_idx;
          if (v_local >= nlocalverts) continue;
          /* Since the queue is an overapproximation, check the predecessor map
           * to be sure this vertex is grey. */
          if (pred[v_local] >= 0 && pred[v_local] < nglobalverts) {
            ++nvisited_local;
            size_t ei, ei_end = g->rowstarts[v_local + 1];
            /* Walk the incident edges. */
            for (ei = g->rowstarts[v_local]; ei < ei_end; ++ei) {
              int64_t w = g->column[ei];
              if (w == VERTEX_TO_GLOBAL(v_local)) continue; /* Self-loop */
              /* Set the predecessor of the other edge endpoint (note use of
               * MPI_MIN and the coding of the predecessor map). */
              MPI_Accumulate(&local_vertices[v_local], 1, INT64_T_MPI_TYPE, VERTEX_OWNER(w), VERTEX_LOCAL(w), 1, INT64_T_MPI_TYPE, MPI_MIN, pred2_win);
              /* Mark the endpoint in the remote queue (note that the min may
               * not do an update, so the queue is an overapproximation in this
               * way as well). */
              MPI_Accumulate(&masks[((VERTEX_LOCAL(w) / elts_per_queue_bit) % ulong_bits)], 1, MPI_UNSIGNED_LONG, VERTEX_OWNER(w), VERTEX_LOCAL(w) / elts_per_queue_bit / ulong_bits, 1, MPI_UNSIGNED_LONG, MPI_BOR, queue2_win);
            }
          }
        }
      }
    }
    /* End one-sided operations. */
    MPI_Win_fence(MPI_MODE_NOSUCCEED, queue2_win);
    MPI_Win_fence(MPI_MODE_NOSUCCEED, pred2_win);

    /* Test if there are any elements in the next-level queue (globally); stop
     * if none. */
    int any_set = 0;
    for (i = 0; i < queue_nwords; ++i) {
      if (queue_bitmap2[i] != 0) {any_set = 1; break;}
    }
    MPI_Allreduce(MPI_IN_PLACE, &any_set, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
    if (!any_set) break;

    /* Swap queues and predecessor maps. */
    {MPI_Win temp = queue1_win; queue1_win = queue2_win; queue2_win = temp;}
    {unsigned long* temp = queue_bitmap1; queue_bitmap1 = queue_bitmap2; queue_bitmap2 = temp;}
    {MPI_Win temp = pred_win; pred_win = pred2_win; pred2_win = temp;}
    {int64_t* temp = pred; pred = pred2; pred2 = temp;}
  }
  MPI_Win_free(&pred_win);
  MPI_Win_free(&pred2_win);
  MPI_Win_free(&queue1_win);
  MPI_Win_free(&queue2_win);
  MPI_Free_mem(local_vertices);
  MPI_Free_mem(queue_bitmap1);
  MPI_Free_mem(queue_bitmap2);

  /* Clean up the predecessor map swapping since the surrounding code does not
   * allow the BFS to change the predecessor map pointer. */
  if (pred2 != orig_pred) {
    memcpy(orig_pred, pred2, nlocalverts * sizeof(int64_t));
    MPI_Free_mem(pred2);
  } else {
    MPI_Free_mem(pred);
  }

  /* Change from special coding of predecessor map to the one the benchmark
   * requires. */
  size_t i;
  for (i = 0; i < nlocalverts; ++i) {
    if (orig_pred[i] < 0) {
      orig_pred[i] += nglobalverts;
    } else if (orig_pred[i] == INT64_MAX) {
      orig_pred[i] = -1;
    }
  }

  /* Count visited vertices. */
  MPI_Allreduce(MPI_IN_PLACE, &nvisited_local, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  *nvisited = nvisited_local;
}