Ejemplo n.º 1
0
/* Use the predecessors in the given map to write the BFS levels to the high 16
 * bits of each element in pred; this also catches some problems in pred
 * itself.  Returns true if the predecessor map is valid. */
static int build_bfs_depth_map(const int64_t nglobalverts, const size_t nlocalverts, const size_t maxlocalverts, const int64_t root, int64_t* const pred) {
  (void)nglobalverts;
  int validation_passed = 1;
  int root_owner;

  size_t root_local;
  get_vertex_distribution_for_pred(1, &root, &root_owner, &root_local);
  int root_is_mine = (root_owner == rank);
  if (root_is_mine) assert (root_local < nlocalverts);

  {
    ptrdiff_t i;
#pragma omp parallel for
    for (i = 0; i < (ptrdiff_t)nlocalverts; ++i) write_pred_entry_depth(&pred[i], UINT16_MAX);
    if (root_is_mine) write_pred_entry_depth(&pred[root_local], 0);
  }
  int64_t* restrict pred_pred = (int64_t*)xMPI_Alloc_mem(size_min(CHUNKSIZE, nlocalverts) * sizeof(int64_t)); /* Predecessor info of predecessor vertex for each local vertex */
  gather* pred_win = init_gather((void*)pred, nlocalverts, sizeof(int64_t), pred_pred, size_min(CHUNKSIZE, nlocalverts), size_min(CHUNKSIZE, nlocalverts), MPI_INT64_T);
  int64_t* restrict pred_vtx = (int64_t*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(int64_t)); /* Vertex (not depth) part of pred map */
  int* restrict pred_owner = (int*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(int));
  size_t* restrict pred_local = (size_t*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(size_t));
  int iter_number = 0;
  {
    /* Iteratively update depth[v] = min(depth[v], depth[pred[v]] + 1) [saturating at UINT16_MAX] until no changes. */
    while (1) {
      ++iter_number;
      int any_changes = 0;
      ptrdiff_t ii;
      for (ii = 0; ii < (ptrdiff_t)maxlocalverts; ii += CHUNKSIZE) {
        ptrdiff_t i_start = ptrdiff_min(ii, nlocalverts);
        ptrdiff_t i_end = ptrdiff_min(ii + CHUNKSIZE, nlocalverts);
        begin_gather(pred_win);
        ptrdiff_t i;
        assert (i_start >= 0 && i_start <= (ptrdiff_t)nlocalverts);
        assert (i_end >= 0 && i_end <= (ptrdiff_t)nlocalverts);
#pragma omp parallel for
        for (i = i_start; i < i_end; ++i) {
          pred_vtx[i - i_start] = get_pred_from_pred_entry(pred[i]);
        }
        get_vertex_distribution_for_pred(i_end - i_start, pred_vtx, pred_owner, pred_local);
#pragma omp parallel for
        for (i = i_start; i < i_end; ++i) {
          if (pred[i] != -1) {
            add_gather_request(pred_win, i - i_start, pred_owner[i - i_start], pred_local[i - i_start], i - i_start); //shit happened here first
          } else {
            pred_pred[i - i_start] = -1;
          }
        }
        end_gather(pred_win);

#pragma omp parallel for reduction(&&:validation_passed) reduction(||:any_changes)
        for (i = i_start; i < i_end; ++i) {
          if (rank == root_owner && (size_t)i == root_local) continue;
          if (get_depth_from_pred_entry(pred_pred[i - i_start]) != UINT16_MAX) {
            if (get_depth_from_pred_entry(pred[i]) != UINT16_MAX && get_depth_from_pred_entry(pred[i]) != get_depth_from_pred_entry(pred_pred[i - i_start]) + 1) {
              fprintf(stderr, "%d: Validation error: BFS predecessors do not form a tree; see vertices %" PRId64 " (depth %" PRIu16 ") and %" PRId64 " (depth %" PRIu16 ").\n", rank, vertex_to_global_for_pred(rank, i), get_depth_from_pred_entry(pred[i]), get_pred_from_pred_entry(pred[i]), get_depth_from_pred_entry(pred_pred[i - i_start]));
              validation_passed = 0;
            } else if (get_depth_from_pred_entry(pred[i]) == get_depth_from_pred_entry(pred_pred[i - i_start]) + 1) {
              /* Nothing to do */
            } else {
              write_pred_entry_depth(&pred[i], get_depth_from_pred_entry(pred_pred[i - i_start]) + 1);
              any_changes = 1;
            }
          }
        }
      }
      MPI_Allreduce(MPI_IN_PLACE, &any_changes, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
      if (!any_changes) break;
    }
  }
  destroy_gather(pred_win);
  MPI_Free_mem(pred_pred);
  free(pred_owner);
  free(pred_local);
  free(pred_vtx);
  return validation_passed;
}
Ejemplo n.º 2
0
/* Returns true if result is valid.  Also, updates high 16 bits of each element
 * of pred to contain the BFS level number (or -1 if not visited) of each
 * vertex; this is based on the predecessor map if the user didn't provide it.
 *
 *   tg                   - the edge list the BFS was run on
 *   nglobalverts         - exclusive upper bound for valid vertex ids
 *   nlocalverts          - number of entries of pred owned by this rank
 *   root                 - global index of the BFS root
 *   pred                 - predecessor map produced by the BFS (modified when
 *                          the depth map has to be built here)
 *   edge_visit_count_ptr - receives the global count of edges whose endpoints
 *                          are in the BFS tree; set to 0 on early failure
 * */
int validate_bfs_result(const tuple_graph* const tg, const int64_t nglobalverts, const size_t nlocalverts, const int64_t root, int64_t* const pred, int64_t* const edge_visit_count_ptr) {

  assert (tg->edgememory_size >= 0 && tg->max_edgememory_size >= tg->edgememory_size && tg->max_edgememory_size <= tg->nglobaledges);
  assert (pred);
  *edge_visit_count_ptr = 0; /* Ensure it is a valid pointer */
  /* Step 1: range checks on pred and on the root; bail out early on failure. */
  int ranges_ok = check_value_ranges(nglobalverts, nlocalverts, pred);
  if (root < 0 || root >= nglobalverts) {
    fprintf(stderr, "%d: Validation error: root vertex %" PRId64 " is invalid.\n", rank, root);
    ranges_ok = 0;
  }
  if (!ranges_ok) return 0; /* Fail */

  assert (tg->edgememory_size >= 0 && tg->max_edgememory_size >= tg->edgememory_size && tg->max_edgememory_size <= tg->nglobaledges);
  assert (pred);

  int validation_passed = 1;
  int root_owner;
  size_t root_local;
  get_vertex_distribution_for_pred(1, &root, &root_owner, &root_local);
  int root_is_mine = (root_owner == rank);

  /* Get maximum values so loop counts are consistent across ranks. */
  uint64_t maxlocalverts_ui = nlocalverts;
  MPI_Allreduce(MPI_IN_PLACE, &maxlocalverts_ui, 1, MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD);
  size_t maxlocalverts = (size_t)maxlocalverts_ui;

  /* Edges are processed in chunks of at most HALF_CHUNKSIZE; each edge uses
   * two slots (one per endpoint) in the buffers allocated further down. */
  ptrdiff_t max_bufsize = tuple_graph_max_bufsize(tg);
  ptrdiff_t edge_chunk_size = ptrdiff_min(HALF_CHUNKSIZE, max_bufsize);

  assert (tg->edgememory_size >= 0 && tg->max_edgememory_size >= tg->edgememory_size && tg->max_edgememory_size <= tg->nglobaledges);
  assert (pred);

  /* Check that root is its own parent. */
  if (root_is_mine) {
    assert (root_local < nlocalverts);
    if (get_pred_from_pred_entry(pred[root_local]) != root) {
      fprintf(stderr, "%d: Validation error: parent of root vertex %" PRId64 " is %" PRId64 ", not the root itself.\n", rank, root, get_pred_from_pred_entry(pred[root_local]));
      validation_passed = 0;
    }
  }

  assert (tg->edgememory_size >= 0 && tg->max_edgememory_size >= tg->edgememory_size && tg->max_edgememory_size <= tg->nglobaledges);
  assert (pred);
  
  /* Check that nothing else is its own parent. */
  {
    int* restrict pred_owner = (int*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(int));
    size_t* restrict pred_local = (size_t*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(size_t));
    int64_t* restrict pred_vtx = (int64_t*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(int64_t)); /* Vertex (not depth) part of pred map */
    ptrdiff_t ii;
    /* Purely local check (no communication calls in this loop), so it only
     * runs over this rank's nlocalverts. */
    for (ii = 0; ii < (ptrdiff_t)nlocalverts; ii += CHUNKSIZE) {
      ptrdiff_t i_start = ii;
      ptrdiff_t i_end = ptrdiff_min(ii + CHUNKSIZE, nlocalverts);
      ptrdiff_t i;
      assert (i_start >= 0 && i_start <= (ptrdiff_t)nlocalverts);
      assert (i_end >= 0 && i_end <= (ptrdiff_t)nlocalverts);
#pragma omp parallel for
      for (i = i_start; i < i_end; ++i) {
        pred_vtx[i - i_start] = get_pred_from_pred_entry(pred[i]);
      }
      /* Translate each predecessor id into (owner rank, local index). */
      get_vertex_distribution_for_pred(i_end - i_start, pred_vtx, pred_owner, pred_local);
#pragma omp parallel for reduction(&&:validation_passed)
      for (i = i_start; i < i_end; ++i) {

        /* A non-root vertex with a predecessor that maps back to this very
         * slot is its own parent. */
        if ((!root_is_mine || (size_t)i != root_local) &&
            get_pred_from_pred_entry(pred[i]) != -1 &&
            pred_owner[i - i_start] == rank &&
            pred_local[i - i_start] == (size_t)i) {
          fprintf(stderr, "%d: Validation error: parent of non-root vertex %" PRId64 " is itself.\n", rank, vertex_to_global_for_pred(rank, i));
          validation_passed = 0;
        }
      }
    }
    free(pred_owner);
    free(pred_local);
    free(pred_vtx);
  }

  assert (tg->edgememory_size >= 0 && tg->max_edgememory_size >= tg->edgememory_size && tg->max_edgememory_size <= tg->nglobaledges);
  assert (pred);

  /* Either verify the depths the BFS already stored in pred, or compute them
   * here from the predecessors. */
  if (bfs_writes_depth_map()) {
    int check_ok = check_bfs_depth_map_using_predecessors(tg, nglobalverts, nlocalverts, maxlocalverts, root, pred);
    if (!check_ok) validation_passed = 0;
  } else {
    
    /* Create a vertex depth map to use for later validation. */
    int pred_ok = build_bfs_depth_map(nglobalverts, nlocalverts, maxlocalverts, root, pred);
    if (!pred_ok) validation_passed = 0;
  }

  {
    /* Check that all edges connect vertices whose depths differ by at most
     * one, and check that there is an edge from each vertex to its claimed
     * predecessor.  Also, count visited edges (including duplicates and
     * self-loops).  */
    /* pred_valid[v] becomes 1 once some edge joining local vertex v to its
     * claimed predecessor has been seen; it is written through the scatter
     * window below. */
    unsigned char* restrict pred_valid = (unsigned char*)xMPI_Alloc_mem(nlocalverts * sizeof(unsigned char));
    memset(pred_valid, 0, nlocalverts * sizeof(unsigned char));
    int64_t* restrict edge_endpoint = (int64_t*)xmalloc(2 * edge_chunk_size * sizeof(int64_t));
    int* restrict edge_owner = (int*)xmalloc(2 * edge_chunk_size * sizeof(int));
    size_t* restrict edge_local = (size_t*)xmalloc(2 * edge_chunk_size * sizeof(size_t));
    int64_t* restrict edge_preds = (int64_t*)xMPI_Alloc_mem(2 * edge_chunk_size * sizeof(int64_t));
    gather* pred_win = init_gather((void*)pred, nlocalverts, sizeof(int64_t), edge_preds, 2 * edge_chunk_size, 2 * edge_chunk_size, MPI_INT64_T);
    unsigned char one = 1;
    scatter_constant* pred_valid_win = init_scatter_constant((void*)pred_valid, nlocalverts, sizeof(unsigned char), &one, 2 * edge_chunk_size, MPI_UNSIGNED_CHAR);
    int64_t edge_visit_count = 0;
    ITERATE_TUPLE_GRAPH_BEGIN(tg, buf, bufsize) {
      ptrdiff_t ii;
      /* The bound is max_bufsize, not bufsize, with the chunk limits clamped
       * to bufsize; presumably this keeps the number of gather/scatter rounds
       * identical on every rank -- TODO confirm against the gather
       * implementation. */
      for (ii = 0; ii < max_bufsize; ii += HALF_CHUNKSIZE) {
        ptrdiff_t i_start = ptrdiff_min(ii, bufsize);
        ptrdiff_t i_end = ptrdiff_min(ii + HALF_CHUNKSIZE, bufsize);
        assert (i_end - i_start <= edge_chunk_size);
        ptrdiff_t i;
        /* Lay out the endpoints of edge k of this chunk at slots 2k and 2k+1. */
#pragma omp parallel for
        for (i = i_start; i < i_end; ++i) {
          int64_t v0 = get_v0_from_edge(&buf[i]);
          int64_t v1 = get_v1_from_edge(&buf[i]);
          edge_endpoint[(i - i_start) * 2 + 0] = v0;
          edge_endpoint[(i - i_start) * 2 + 1] = v1;
        }
        get_vertex_distribution_for_pred(2 * (i_end - i_start), edge_endpoint, edge_owner, edge_local);
        /* Fetch the pred entry (predecessor + depth) of both endpoints. */
        begin_gather(pred_win);
#pragma omp parallel for
        for (i = i_start; i < i_end; ++i) {
          add_gather_request(pred_win, (i - i_start) * 2 + 0, edge_owner[(i - i_start) * 2 + 0], edge_local[(i - i_start) * 2 + 0], (i - i_start) * 2 + 0);
          add_gather_request(pred_win, (i - i_start) * 2 + 1, edge_owner[(i - i_start) * 2 + 1], edge_local[(i - i_start) * 2 + 1], (i - i_start) * 2 + 1);
        }
        end_gather(pred_win);
        begin_scatter_constant(pred_valid_win);
#pragma omp parallel for reduction(&&:validation_passed) reduction(+:edge_visit_count)
        for (i = i_start; i < i_end; ++i) {
          int64_t src = get_v0_from_edge(&buf[i]);
          int64_t tgt = get_v1_from_edge(&buf[i]);
          uint16_t src_depth = get_depth_from_pred_entry(edge_preds[(i - i_start) * 2 + 0]);
          uint16_t tgt_depth = get_depth_from_pred_entry(edge_preds[(i - i_start) * 2 + 1]);
          /* Depth UINT16_MAX marks a vertex outside the BFS tree; an edge may
           * not have exactly one endpoint in the tree. */
          if (src_depth != UINT16_MAX && tgt_depth == UINT16_MAX) {
            fprintf(stderr, "%d: Validation error: edge connects vertex %" PRId64 " in the BFS tree (depth %" PRIu16 ") to vertex %" PRId64 " outside the tree.\n", rank, src, src_depth, tgt);
            validation_passed = 0;
          } else if (src_depth == UINT16_MAX && tgt_depth != UINT16_MAX) {
            fprintf(stderr, "%d: Validation error: edge connects vertex %" PRId64 " in the BFS tree (depth %" PRIu16 ") to vertex %" PRId64 " outside the tree.\n", rank, tgt, tgt_depth, src);
            validation_passed = 0;
          } else if (src_depth - tgt_depth < -1 ||
                     src_depth - tgt_depth > 1) {
            /* The uint16_t operands are promoted to int, so the difference
             * can be negative. */
            fprintf(stderr, "%d: Validation error: depths of edge endpoints %" PRId64 " (depth %" PRIu16 ") and %" PRId64 " (depth %" PRIu16 ") are too far apart (abs. val. > 1).\n", rank, src, src_depth, tgt, tgt_depth);
            validation_passed = 0;
          } else if (src_depth != UINT16_MAX) {
            ++edge_visit_count;
          }
          /* If this edge joins a vertex to its claimed predecessor, flag that
           * vertex's pred_valid slot on its owner. */
          if (get_pred_from_pred_entry(edge_preds[(i - i_start) * 2 + 0]) == tgt) {
            add_scatter_constant_request(pred_valid_win, edge_owner[(i - i_start) * 2 + 0], edge_local[(i - i_start) * 2 + 0], (i - i_start) * 2 + 0);
          }
          if (get_pred_from_pred_entry(edge_preds[(i - i_start) * 2 + 1]) == src) {
            add_scatter_constant_request(pred_valid_win, edge_owner[(i - i_start) * 2 + 1], edge_local[(i - i_start) * 2 + 1], (i - i_start) * 2 + 1);
          }
        }
        end_scatter_constant(pred_valid_win);
      }
    } ITERATE_TUPLE_GRAPH_END;
    destroy_gather(pred_win);
    MPI_Free_mem(edge_preds);
    free(edge_owner);
    free(edge_local);
    free(edge_endpoint);
    destroy_scatter_constant(pred_valid_win);
    ptrdiff_t i;
    /* Every local vertex with a predecessor (other than the root) must have
     * been flagged by some edge above. */
#pragma omp parallel for reduction(&&:validation_passed)
    for (i = 0; i < (ptrdiff_t)nlocalverts; ++i) {
      int64_t p = get_pred_from_pred_entry(pred[i]);
      if (p == -1) continue;
      int found_pred_edge = pred_valid[i];
      if (root_owner == rank && root_local == (size_t)i) found_pred_edge = 1; /* Root vertex */
      if (!found_pred_edge) {
        int64_t v = vertex_to_global_for_pred(rank, i);
        fprintf(stderr, "%d: Validation error: no graph edge from vertex %" PRId64 " to its parent %" PRId64 ".\n", rank, v, get_pred_from_pred_entry(pred[i]));
        validation_passed = 0;
      }
    }
    MPI_Free_mem(pred_valid);

    /* Sum the per-rank edge counts and hand the total back to the caller. */
    MPI_Allreduce(MPI_IN_PLACE, &edge_visit_count, 1, MPI_INT64_T, MPI_SUM, MPI_COMM_WORLD);
    *edge_visit_count_ptr = edge_visit_count;
  }
Ejemplo n.º 3
0
/* Benchmark driver.
 *
 * Usage: <prog> filename SCALE [edgefactor]
 *   filename   - edge file to create; it must not already exist, because it is
 *                opened with MPI_MODE_CREATE | MPI_MODE_EXCL
 *   SCALE      - log_2(# vertices)
 *   edgefactor - (# edges) / (# vertices), defaults to 16
 *
 * Generates the Kronecker edge list (written to the file, or kept in memory
 * if no filename is set), picks up to 64 BFS roots that have at least one
 * non-self-loop edge, and -- unless compiled with GEN_ONLY -- builds the graph
 * data structure, runs and validates one BFS per root, and prints timing and
 * TEPS statistics on rank 0.
 *
 * Returns 0; invalid arguments end in MPI_Abort. */
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  setup_globals();

  /* Parse arguments. */
  int SCALE = 16;
  int edgefactor = 16; /* nedges / nvertices, i.e., 2*avg. degree */
  char* name = argv[1];
  if (argc >= 3) SCALE = atoi(argv[2]);
  if (argc >= 4) edgefactor = atoi(argv[3]);
  if (argc <= 2 || argc >= 5 || SCALE == 0 || edgefactor == 0) {
    if (rank == 0) {
      fprintf(stderr, "Usage: %s filename SCALE edgefactor\n  SCALE = log_2(# vertices) [integer, required]\n  edgefactor = (# edges) / (# vertices) = .5 * (average vertex degree) [integer, defaults to 16]\n(Random number seed and Kronecker initiator are in main.c)\n", argv[0]);
    }
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  uint64_t seed1 = 2, seed2 = 3;

  const char* filename = name;

  /* If filename is NULL, store data in memory */

  tuple_graph tg;
  tg.nglobaledges = (int64_t)(edgefactor) << SCALE;
  int64_t nglobalverts = (int64_t)(1) << SCALE;

  tg.data_in_file = (filename != NULL);

  if (tg.data_in_file) {
      printf("data in file \n");

    MPI_File_set_errhandler(MPI_FILE_NULL, MPI_ERRORS_ARE_FATAL);
    /* MPI_MODE_DELETE_ON_CLOSE is not passed, so the edge file is kept after
     * it is closed. */
    MPI_File_open(MPI_COMM_WORLD, (char*)filename, MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_UNIQUE_OPEN, MPI_INFO_NULL, &tg.edgefile);
    MPI_File_set_size(tg.edgefile, tg.nglobaledges * sizeof(packed_edge));
    MPI_File_set_view(tg.edgefile, 0, packed_edge_mpi_type, packed_edge_mpi_type, "native", MPI_INFO_NULL);
    MPI_File_set_atomicity(tg.edgefile, 0);
  }

  /* Make the raw graph edges. */
  /* Get roots for BFS runs, plus maximum vertex with non-zero degree (used by
   * validator). */
  int num_bfs_roots = 64;
  int64_t* bfs_roots = (int64_t*)xmalloc(num_bfs_roots * sizeof(int64_t));
  int64_t max_used_vertex = 0;

  double make_graph_start = MPI_Wtime();
  {
    /* Spread the two 64-bit numbers into five nonzero values in the correct
     * range. */
    uint_fast32_t seed[5];
    make_mrg_seed(seed1, seed2, seed);

    /* As the graph is being generated, also keep a bitmap of vertices with
     * incident edges.  We keep a grid of processes, each row of which has a
     * separate copy of the bitmap (distributed among the processes in the
     * row), and then do an allreduce at the end.  This scheme is used to avoid
     * non-local communication and reading the file separately just to find BFS
     * roots. */
    MPI_Offset nchunks_in_file = (tg.nglobaledges + FILE_CHUNKSIZE - 1) / FILE_CHUNKSIZE;
    int64_t bitmap_size_in_bytes = int64_min(BITMAPSIZE, (nglobalverts + CHAR_BIT - 1) / CHAR_BIT);
    if (bitmap_size_in_bytes * size * CHAR_BIT < nglobalverts) {
      bitmap_size_in_bytes = (nglobalverts + size * CHAR_BIT - 1) / (size * CHAR_BIT);
    }
    int ranks_per_row = ((nglobalverts + CHAR_BIT - 1) / CHAR_BIT + bitmap_size_in_bytes - 1) / bitmap_size_in_bytes;
    int nrows = size / ranks_per_row;
    int my_row = -1, my_col = -1;
    unsigned char* restrict has_edge = NULL;
    MPI_Comm cart_comm;
    {
      int dims[2] = {size / ranks_per_row, ranks_per_row};
      int periods[2] = {0, 0};
      MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &cart_comm);
    }
    int in_generating_rectangle = 0;
    /* Ranks that do not fit into the nrows x ranks_per_row grid receive
     * MPI_COMM_NULL and take no part in generation. */
    if (cart_comm != MPI_COMM_NULL) {
      in_generating_rectangle = 1;
      {
        int dims[2], periods[2], coords[2];
        MPI_Cart_get(cart_comm, 2, dims, periods, coords);
        my_row = coords[0];
        my_col = coords[1];
      }
      MPI_Comm this_col;
      MPI_Comm_split(cart_comm, my_col, my_row, &this_col);
      MPI_Comm_free(&cart_comm);
      has_edge = (unsigned char*)xMPI_Alloc_mem(bitmap_size_in_bytes);
      memset(has_edge, 0, bitmap_size_in_bytes);
      /* Every rank in a given row creates the same vertices (for updating the
       * bitmap); only one writes them to the file (or final memory buffer). */
      packed_edge* buf = (packed_edge*)xmalloc(FILE_CHUNKSIZE * sizeof(packed_edge));
      MPI_Offset block_limit = (nchunks_in_file + nrows - 1) / nrows;
      /* fprintf(stderr, "%d: nchunks_in_file = %" PRId64 ", block_limit = %" PRId64 " in grid of %d rows, %d cols\n", rank, (int64_t)nchunks_in_file, (int64_t)block_limit, nrows, ranks_per_row); */
      if (tg.data_in_file) {
        tg.edgememory_size = 0;
        tg.edgememory = NULL;
      } else {
        int my_pos = my_row + my_col * nrows;
        int last_pos = (tg.nglobaledges % ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row) != 0) ?
                       (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row) :
                       -1;
        int64_t edges_left = tg.nglobaledges % FILE_CHUNKSIZE;
        int64_t nedges = FILE_CHUNKSIZE * (tg.nglobaledges / ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row)) +
                         FILE_CHUNKSIZE * (my_pos < (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row)) +
                         (my_pos == last_pos ? edges_left : 0);
        /* fprintf(stderr, "%d: nedges = %" PRId64 " of %" PRId64 "\n", rank, (int64_t)nedges, (int64_t)tg.nglobaledges); */
        tg.edgememory_size = nedges;
        tg.edgememory = (packed_edge*)xmalloc(nedges * sizeof(packed_edge));
      }
      MPI_Offset block_idx;
      for (block_idx = 0; block_idx < block_limit; ++block_idx) {
        /* fprintf(stderr, "%d: On block %d of %d\n", rank, (int)block_idx, (int)block_limit); */
        MPI_Offset start_edge_index = int64_min(FILE_CHUNKSIZE * (block_idx * nrows + my_row), tg.nglobaledges);
        MPI_Offset edge_count = int64_min(tg.nglobaledges - start_edge_index, FILE_CHUNKSIZE);
        packed_edge* actual_buf = (!tg.data_in_file && block_idx % ranks_per_row == my_col) ?
                                  tg.edgememory + FILE_CHUNKSIZE * (block_idx / ranks_per_row) :
                                  buf;
        /* fprintf(stderr, "%d: My range is [%" PRId64 ", %" PRId64 ") %swriting into index %" PRId64 "\n", rank, (int64_t)start_edge_index, (int64_t)(start_edge_index + edge_count), (my_col == (block_idx % ranks_per_row)) ? "" : "not ", (int64_t)(FILE_CHUNKSIZE * (block_idx / ranks_per_row))); */
        if (!tg.data_in_file && block_idx % ranks_per_row == my_col) {
          assert (FILE_CHUNKSIZE * (block_idx / ranks_per_row) + edge_count <= tg.edgememory_size);
        }

        generate_kronecker_range(seed, SCALE, start_edge_index, start_edge_index + edge_count, actual_buf);
        if (tg.data_in_file && my_col == (block_idx % ranks_per_row)) { /* Try to spread writes among ranks */
          /* Debug trace of the range this rank writes.  MPI_Offset is not an
           * int, so it is converted explicitly for printing. */
          printf("%d: %" PRId64 ", %" PRId64 "\n", rank, (int64_t)start_edge_index, (int64_t)edge_count);

          MPI_File_write_at(tg.edgefile, start_edge_index, actual_buf, edge_count, packed_edge_mpi_type, MPI_STATUS_IGNORE);
        }
        /* Record which vertices have at least one non-self-loop edge; each
         * rank only tracks the slice of the bitmap belonging to its column. */
        ptrdiff_t i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (i = 0; i < edge_count; ++i) {
          int64_t src = get_v0_from_edge(&actual_buf[i]);
          int64_t tgt = get_v1_from_edge(&actual_buf[i]);
          if (src == tgt) continue;
          if (src / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(src / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (src % CHAR_BIT));
          }
          if (tgt / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(tgt / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (tgt % CHAR_BIT));
          }
        }
      }
      free(buf);
#if 0
      /* The allreduce for each root acts like we did this: */
      MPI_Allreduce(MPI_IN_PLACE, has_edge, bitmap_size_in_bytes, MPI_UNSIGNED_CHAR, MPI_BOR, this_col);
#endif
      MPI_Comm_free(&this_col);
    } else {
      tg.edgememory = NULL;
      tg.edgememory_size = 0;
    }
    MPI_Allreduce(&tg.edgememory_size, &tg.max_edgememory_size, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);

#ifndef GEN_ONLY
    /* Find roots and max used vertex */
    {
      uint64_t counter = 0;
      int bfs_root_idx;
      for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
        int64_t root;
        while (1) {
          double d[2];
          make_random_numbers(2, seed1, seed2, counter, d);
          root = (int64_t)((d[0] + d[1]) * nglobalverts) % nglobalverts;
          counter += 2;
          /* NOTE(review): this only leaves the while loop, so once the
           * candidate budget is exhausted the remaining slots are filled with
           * unchecked candidates -- confirm whether that is intended. */
          if (counter > 2 * nglobalverts) break;
          int is_duplicate = 0;
          int i;
          for (i = 0; i < bfs_root_idx; ++i) {
            if (root == bfs_roots[i]) {
              is_duplicate = 1;
              break;
            }
          }
          if (is_duplicate) continue; /* Everyone takes the same path here */
          int root_ok = 0;
          if (in_generating_rectangle && (root / CHAR_BIT / bitmap_size_in_bytes) == my_col) {
            root_ok = (has_edge[(root / CHAR_BIT) % bitmap_size_in_bytes] & (1 << (root % CHAR_BIT))) != 0;
          }
          MPI_Allreduce(MPI_IN_PLACE, &root_ok, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
          if (root_ok) break;
        }
        bfs_roots[bfs_root_idx] = root;
      }
      num_bfs_roots = bfs_root_idx;

      /* Find maximum non-zero-degree vertex. */
      {
        int64_t i;
        max_used_vertex = 0;
        if (in_generating_rectangle) {
          for (i = bitmap_size_in_bytes * CHAR_BIT; i > 0; --i) {
            if (i > nglobalverts) continue;
            if (has_edge[(i - 1) / CHAR_BIT] & (1 << ((i - 1) % CHAR_BIT))) {
              max_used_vertex = (i - 1) + my_col * CHAR_BIT * bitmap_size_in_bytes;
              break;
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &max_used_vertex, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
      }
    }
#endif

    if (in_generating_rectangle) {
      MPI_Free_mem(has_edge);
    }
    if (tg.data_in_file) {
      MPI_File_sync(tg.edgefile);
    }
  }

  double make_graph_stop = MPI_Wtime();
  double make_graph_time = make_graph_stop - make_graph_start;
  if (rank == 0) { /* Not an official part of the results */
    fprintf(stderr, "graph_generation:               %f s\n", make_graph_time);
  }

#ifndef GEN_ONLY //!GEN_ONLY

  /* Make user's graph data structure. */
  double data_struct_start = MPI_Wtime();
  make_graph_data_structure(&tg);
  double data_struct_stop = MPI_Wtime();
  double data_struct_time = data_struct_stop - data_struct_start;
  if (rank == 0) { /* Not an official part of the results */
    fprintf(stderr, "construction_time:              %f s\n", data_struct_time);
  }

  /* Number of edges visited in each BFS; a double so get_statistics can be
   * used directly. */
  double* edge_counts = (double*)xmalloc(num_bfs_roots * sizeof(double));

  /* Run BFS. */
  int validation_passed = 1;
  double* bfs_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  double* validate_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  uint64_t nlocalverts = get_nlocalverts_for_pred();
  int64_t* pred = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  int bfs_root_idx;
  for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
    int64_t root = bfs_roots[bfs_root_idx];

    if (rank == 0) fprintf(stderr, "Running BFS %d\n", bfs_root_idx);

    /* Clear the pred array. */
    memset(pred, 0, nlocalverts * sizeof(int64_t));

    /* Do the actual BFS. */
    double bfs_start = MPI_Wtime();
    run_bfs(root, &pred[0]);
    double bfs_stop = MPI_Wtime();
    bfs_times[bfs_root_idx] = bfs_stop - bfs_start;
    if (rank == 0) fprintf(stderr, "Time for BFS %d is %f\n", bfs_root_idx, bfs_times[bfs_root_idx]);

    /* Validate result. */
    if (rank == 0) fprintf(stderr, "Validating BFS %d\n", bfs_root_idx);

    double validate_start = MPI_Wtime();
    int64_t edge_visit_count;
    int validation_passed_one = validate_bfs_result(&tg, max_used_vertex + 1, nlocalverts, root, pred, &edge_visit_count);
    double validate_stop = MPI_Wtime();
    validate_times[bfs_root_idx] = validate_stop - validate_start;
    if (rank == 0) fprintf(stderr, "Validate time for BFS %d is %f\n", bfs_root_idx, validate_times[bfs_root_idx]);
    edge_counts[bfs_root_idx] = (double)edge_visit_count;
    if (rank == 0) fprintf(stderr, "TEPS for BFS %d is %g\n", bfs_root_idx, edge_visit_count / bfs_times[bfs_root_idx]);

    if (!validation_passed_one) {
      validation_passed = 0;
      if (rank == 0) fprintf(stderr, "Validation failed for this BFS root; skipping rest.\n");
      break;
    }
  }

  MPI_Free_mem(pred);
  free_graph_data_structure();

#endif //!GEN_ONLY

  /* Freed outside the GEN_ONLY guard: the array is allocated in both builds. */
  free(bfs_roots);

  if (tg.data_in_file) {
    MPI_File_close(&tg.edgefile);
  } else {
    free(tg.edgememory); tg.edgememory = NULL;
  }

#ifndef GEN_ONLY
  /* Print results. */
  if (rank == 0) {
    if (!validation_passed) {
      fprintf(stdout, "No results printed for invalid run.\n");
    } else {
      int i;
      fprintf(stdout, "SCALE:                          %d\n", SCALE);
      fprintf(stdout, "edgefactor:                     %d\n", edgefactor);
      fprintf(stdout, "NBFS:                           %d\n", num_bfs_roots);
      fprintf(stdout, "graph_generation:               %g\n", make_graph_time);
      fprintf(stdout, "num_mpi_processes:              %d\n", size);
      fprintf(stdout, "construction_time:              %g\n", data_struct_time);
      double stats[s_LAST];
      get_statistics(bfs_times, num_bfs_roots, stats);
      fprintf(stdout, "min_time:                       %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_time:             %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_time:                    %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_time:             %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_time:                       %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_time:                      %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_time:                    %g\n", stats[s_std]);
      get_statistics(edge_counts, num_bfs_roots, stats);
      fprintf(stdout, "min_nedge:                      %.11g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_nedge:            %.11g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_nedge:                   %.11g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_nedge:            %.11g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_nedge:                      %.11g\n", stats[s_maximum]);
      fprintf(stdout, "mean_nedge:                     %.11g\n", stats[s_mean]);
      fprintf(stdout, "stddev_nedge:                   %.11g\n", stats[s_std]);
      /* TEPS statistics are computed on seconds-per-edge, so minimum and
       * maximum (and the quartiles) swap roles when inverted below. */
      double* secs_per_edge = (double*)xmalloc(num_bfs_roots * sizeof(double));
      for (i = 0; i < num_bfs_roots; ++i) secs_per_edge[i] = bfs_times[i] / edge_counts[i];
      get_statistics(secs_per_edge, num_bfs_roots, stats);
      fprintf(stdout, "min_TEPS:                       %g\n", 1. / stats[s_maximum]);
      fprintf(stdout, "firstquartile_TEPS:             %g\n", 1. / stats[s_thirdquartile]);
      fprintf(stdout, "median_TEPS:                    %g\n", 1. / stats[s_median]);
      fprintf(stdout, "thirdquartile_TEPS:             %g\n", 1. / stats[s_firstquartile]);
      fprintf(stdout, "max_TEPS:                       %g\n", 1. / stats[s_minimum]);
      fprintf(stdout, "harmonic_mean_TEPS:             %g\n", 1. / stats[s_mean]);
      /* Formula from:
       * Title: The Standard Errors of the Geometric and Harmonic Means and
       *        Their Application to Index Numbers
       * Author(s): Nilan Norris
       * Source: The Annals of Mathematical Statistics, Vol. 11, No. 4 (Dec., 1940), pp. 445-448
       * Publisher(s): Institute of Mathematical Statistics
       * Stable URL: http://www.jstor.org/stable/2235723
       * (same source as in specification). */
      fprintf(stdout, "harmonic_stddev_TEPS:           %g\n", stats[s_std] / (stats[s_mean] * stats[s_mean] * sqrt(num_bfs_roots - 1)));
      free(secs_per_edge); secs_per_edge = NULL;
      get_statistics(validate_times, num_bfs_roots, stats);
      fprintf(stdout, "min_validate:                   %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_validate:         %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_validate:                %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_validate:         %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_validate:                   %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_validate:                  %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_validate:                %g\n", stats[s_std]);
#if 0
      for (i = 0; i < num_bfs_roots; ++i) {
        fprintf(stdout, "Run %3d:                        %g s, validation %g s\n", i + 1, bfs_times[i], validate_times[i]);
      }
#endif
    }
  }
  /* Released on every rank and on the failed-validation path as well. */
  free(edge_counts);
  free(bfs_times);
  free(validate_times);

#endif
  cleanup_globals();
  MPI_Finalize();
  return 0;
}
Ejemplo n.º 4
0
/* Check the BFS levels in pred against the predecessors given there.
 *
 * Each entry of pred packs a predecessor vertex and a depth; the two parts are
 * read with get_pred_from_pred_entry() and get_depth_from_pred_entry().  The
 * checks made are:
 *   - the root (when this rank owns it) has depth 0;
 *   - a vertex has predecessor -1 exactly when its depth is UINT16_MAX;
 *   - every other non-root vertex has a predecessor that is not marked
 *     unreachable and whose depth is exactly one less than its own.
 *
 * Parameters:
 *   tg            - edge list; only consulted in assertions.
 *   nglobalverts  - total number of vertices; only consulted in assertions.
 *   nlocalverts   - number of entries of pred held by this rank.
 *   maxlocalverts - bound of the chunk loop (presumably the largest
 *                   nlocalverts over all ranks, so that every rank makes the
 *                   same number of begin_gather/end_gather calls -- confirm
 *                   against the caller).
 *   root          - global index of the BFS root.
 *   pred          - this rank's part of the predecessor/depth map.
 *
 * Returns true if no error was found in this rank's part of the map; every
 * error found is reported on stderr.  No reduction across ranks is done
 * here. */
static int check_bfs_depth_map_using_predecessors(const tuple_graph* const tg, const int64_t nglobalverts, const size_t nlocalverts, const size_t maxlocalverts, const int64_t root, const int64_t* const pred) {
  (void)nglobalverts; /* Avoid warning */
  (void)tg; /* Likewise: only used in the assertions below */
  assert (tg->edgememory_size >= 0 && tg->max_edgememory_size >= tg->edgememory_size && tg->max_edgememory_size <= tg->nglobaledges);
  assert (root >= 0 && root < nglobalverts);
  assert (nglobalverts >= 0);
  assert (pred);

  int validation_passed = 1;
  int root_owner;
  size_t root_local;
  get_vertex_distribution_for_pred(1, &root, &root_owner, &root_local);
  int root_is_mine = (root_owner == rank);
  if (root_is_mine) assert (root_local < nlocalverts);

  /* First pass (purely local): root depth, and agreement between "no
   * predecessor" and "depth is UINT16_MAX" in both directions. */
  {
    ptrdiff_t i;
    if (root_is_mine && get_depth_from_pred_entry(pred[root_local]) != 0) {
      fprintf(stderr, "%d: Validation error: depth of root vertex %" PRId64 " is %" PRIu16 ", not 0.\n", rank, root, get_depth_from_pred_entry(pred[root_local]));
      validation_passed = 0;
    }
#pragma omp parallel for reduction(&&:validation_passed)
    for (i = 0; i < (ptrdiff_t)nlocalverts; ++i) {
      if (get_pred_from_pred_entry(pred[i]) == -1 &&
          get_depth_from_pred_entry(pred[i]) != UINT16_MAX) {
        fprintf(stderr, "%d: Validation error: depth of vertex %" PRId64 " with no predecessor is %" PRIu16 ", not UINT16_MAX.\n", rank, vertex_to_global_for_pred(rank, i), get_depth_from_pred_entry(pred[i]));
        validation_passed = 0;
      } else if (get_pred_from_pred_entry(pred[i]) != -1 &&
                 get_depth_from_pred_entry(pred[i]) == UINT16_MAX) {
        fprintf(stderr, "%d: Validation error: predecessor of claimed unreachable vertex %" PRId64 " is %" PRId64 ", not -1.\n", rank, vertex_to_global_for_pred(rank, i), get_pred_from_pred_entry(pred[i]));
        validation_passed = 0;
      }
    }
  }

  /* Second pass: fetch the entry of each vertex's predecessor (which may live
   * on another rank) and compare depths.  The scratch buffers hold at most one
   * chunk of CHUNKSIZE vertices. */
  int64_t* restrict pred_pred = (int64_t*)xMPI_Alloc_mem(size_min(CHUNKSIZE, nlocalverts) * sizeof(int64_t)); /* Predecessor info of predecessor vertex for each local vertex */
  gather* pred_win = init_gather((void*)pred, nlocalverts, sizeof(int64_t), pred_pred, size_min(CHUNKSIZE, nlocalverts), size_min(CHUNKSIZE, nlocalverts), MPI_INT64_T);
  int64_t* restrict pred_vtx = (int64_t*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(int64_t)); /* Vertex (not depth) part of pred map */
  int* restrict pred_owner = (int*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(int));
  size_t* restrict pred_local = (size_t*)xmalloc(size_min(CHUNKSIZE, nlocalverts) * sizeof(size_t));
  size_t ii;
  for (ii = 0; ii < maxlocalverts; ii += CHUNKSIZE) {
    /* Clamp the chunk to the local range; once ii passes nlocalverts the chunk
     * is empty but the gather calls below are still made. */
    ptrdiff_t i_start = ptrdiff_min(ii, nlocalverts);
    ptrdiff_t i_end = ptrdiff_min(ii + CHUNKSIZE, nlocalverts);
    begin_gather(pred_win);
    ptrdiff_t i;
    assert (i_start >= 0 && i_start <= (ptrdiff_t)nlocalverts);
    assert (i_end >= 0 && i_end <= (ptrdiff_t)nlocalverts);
    assert (i_end >= i_start);
    assert (i_end - i_start >= 0 && i_end - i_start <= (ptrdiff_t)size_min(CHUNKSIZE, nlocalverts));
    /* Strip the depth bits so that only the predecessor vertices remain. */
#pragma omp parallel for
    for (i = i_start; i < i_end; ++i) {
      pred_vtx[i - i_start] = get_pred_from_pred_entry(pred[i]);
    }
    get_vertex_distribution_for_pred(i_end - i_start, pred_vtx, pred_owner, pred_local);
#pragma omp parallel for
    for (i = i_start; i < i_end; ++i) {
      /* Decide on the decoded predecessor vertex, not on the raw packed entry:
       * an entry with predecessor -1 but a depth other than UINT16_MAX
       * (reported by the first pass, which does not stop the function) is not
       * equal to -1 as a whole, and must not turn into a gather request for
       * vertex -1. */
      if (pred_vtx[i - i_start] != -1) {
        add_gather_request(pred_win, i - i_start, pred_owner[i - i_start], pred_local[i - i_start], i - i_start);
      } else {
        pred_pred[i - i_start] = -1;
      }
    }
    end_gather(pred_win);
#pragma omp parallel for reduction(&&:validation_passed)
    for (i = i_start; i < i_end; ++i) {
      /* The root is exempt from the depth comparison below. */
      if (rank == root_owner && (size_t)i == root_local) continue;
      if (get_pred_from_pred_entry(pred[i]) == -1) continue; /* Already checked */
      if (get_depth_from_pred_entry(pred_pred[i - i_start]) == UINT16_MAX) {
        fprintf(stderr, "%d: Validation error: predecessor %" PRId64 " of vertex %" PRId64 " (depth %" PRIu16 ") is marked as unreachable.\n", rank, get_pred_from_pred_entry(pred[i]), vertex_to_global_for_pred(rank, i), get_depth_from_pred_entry(pred[i]));
        validation_passed = 0;
      }
      if (get_depth_from_pred_entry(pred[i]) != get_depth_from_pred_entry(pred_pred[i - i_start]) + 1) {
        fprintf(stderr, "%d: Validation error: BFS predecessors do not form a tree; see vertices %" PRId64 " (depth %" PRIu16 ") and %" PRId64 " (depth %" PRIu16 ").\n", rank, vertex_to_global_for_pred(rank, i), get_depth_from_pred_entry(pred[i]), get_pred_from_pred_entry(pred[i]), get_depth_from_pred_entry(pred_pred[i - i_start]));
        validation_passed = 0;
      }
    }
  }
  destroy_gather(pred_win);
  MPI_Free_mem(pred_pred);
  free(pred_owner);
  free(pred_local);
  free(pred_vtx);
  return validation_passed;
}
Ejemplo n.º 5
0
/* Breadth-first search built on MPI one-sided operations, with the frontier
 * queues stored as bitmaps.  The original author notes that it is not much
 * faster than the standard version on the machines tested, but that systems
 * with good RDMA hardware and good MPI one-sided implementations might do
 * better with it, and that it may be a good candidate for translation to UPC,
 * Co-array Fortran, SHMEM, or GASNet.
 *
 * Parameters:
 *   g        - local part of the graph (rowstarts/column arrays plus the
 *              local and global vertex counts).
 *   root     - global index of the search root.
 *   pred     - caller-owned array of g->nlocalverts entries; on return it
 *              holds, per local vertex, the predecessor found by the search
 *              or -1 if the vertex was not reached.
 *   nvisited - receives the number of vertices expanded, summed over all
 *              ranks. */
void run_mpi_bfs(const csr_graph* const g, int64_t root, int64_t* pred, int64_t* nvisited) {
  const size_t nlocalverts = g->nlocalverts;
  const int64_t nglobalverts = g->nglobalverts;
  int64_t visited_here = 0;

  /* Two predecessor maps are kept: one is read while the other is the target
   * of remote updates.  The caller's array is one of them; remember it so the
   * result can be put back there at the end. */
  int64_t* const caller_pred = pred;
  int64_t* next_pred = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  /* Current and next queue are bitmaps in which one bit stands for
   * verts_per_bit consecutive entries of the predecessor map.  A set bit only
   * means "look here": MPI_Accumulate reports nothing about whether an update
   * took effect, so bits also get set for vertices that were already black.
   * The predecessor map therefore has the final say on whether a vertex is
   * actually processed. */
  const int verts_per_bit = 4;
  const int word_bits = sizeof(unsigned long) * CHAR_BIT;
  int64_t nbits = (nlocalverts + verts_per_bit - 1) / verts_per_bit;
  int64_t nwords = (nbits + word_bits - 1) / word_bits;
  unsigned long* cur_queue = (unsigned long*)xMPI_Alloc_mem(nwords * sizeof(unsigned long));
  unsigned long* next_queue = (unsigned long*)xMPI_Alloc_mem(nwords * sizeof(unsigned long));
  memset(cur_queue, 0, nwords * sizeof(unsigned long));

  /* Origin buffers for MPI_Accumulate: the global id of every local vertex,
   * and every single-bit mask of an unsigned long. */
  int64_t* global_ids = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));
  unsigned long masks[word_bits];
  for (int b = 0; b < word_bits; ++b) {
    masks[b] = (1UL << b);
  }

  /* Predecessor map coding used during the search:
   *   white (not visited): INT64_MAX
   *   grey  (in queue):    0 .. nglobalverts-1
   *   black (done):        -nglobalverts .. -1
   * Every vertex starts out white. */
  for (size_t v = 0; v < nlocalverts; ++v) {
    global_ids[v] = VERTEX_TO_GLOBAL(v);
    pred[v] = INT64_MAX;
  }

  /* The owner of the root makes it grey and queues it. */
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    cur_queue[VERTEX_LOCAL(root) / verts_per_bit / word_bits] |= masks[(VERTEX_LOCAL(root) / verts_per_bit) % word_bits];
  }

  /* One window per predecessor map and per queue bitmap. */
  MPI_Win cur_pred_win, next_pred_win, cur_queue_win, next_queue_win;
  MPI_Win_create(pred, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &cur_pred_win);
  MPI_Win_create(next_pred, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &next_pred_win);
  MPI_Win_create(cur_queue, nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &cur_queue_win);
  MPI_Win_create(next_queue, nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &next_queue_win);

  for (;;) {
    /* The next-level queue starts out empty. */
    memset(next_queue, 0, nwords * sizeof(unsigned long));

    /* next_pred is a copy of pred in which every grey vertex has been turned
     * black. */
    for (int64_t v = 0; v < (int64_t)nlocalverts; ++v) {
      const int64_t p = pred[v];
      next_pred[v] = (p >= 0 && p < nglobalverts) ? p - nglobalverts : p;
    }

    /* Open the access epoch for this level. */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, next_pred_win);
    MPI_Win_fence(MPI_MODE_NOPRECEDE, next_queue_win);

    /* Walk the current queue word by word, bit by bit. */
    for (int64_t word_idx = 0; word_idx < nwords; ++word_idx) {
      const unsigned long word = cur_queue[word_idx];
      if (word == 0) continue; /* Nothing queued in this word */

      for (int b = 0; b < word_bits; ++b) {
        const size_t group_start = (size_t)((word_idx * word_bits + b) * verts_per_bit);
        if (group_start >= nlocalverts) break;
        if (((word >> b) & 1UL) == 0) continue; /* Bit not set */

        /* Look at each predecessor-map entry covered by this bit. */
        for (int k = 0; k < verts_per_bit; ++k) {
          const size_t v_local = group_start + k;
          /* Skip entries past the end and, because the queue is an
           * overapproximation, anything that is not actually grey. */
          if (v_local >= nlocalverts || pred[v_local] < 0 || pred[v_local] >= nglobalverts) continue;

          ++visited_here;
          const size_t edge_end = g->rowstarts[v_local + 1];
          for (size_t e = g->rowstarts[v_local]; e < edge_end; ++e) {
            const int64_t w = g->column[e];
            if (w == VERTEX_TO_GLOBAL(v_local)) continue; /* Self-loop */
            /* Offer this vertex as predecessor of w.  With MPI_MIN and the
             * coding above, a black (negative) entry is never overwritten by
             * a grey (non-negative) one, and white (INT64_MAX) always is. */
            MPI_Accumulate(&global_ids[v_local], 1, INT64_T_MPI_TYPE, VERTEX_OWNER(w), VERTEX_LOCAL(w), 1, INT64_T_MPI_TYPE, MPI_MIN, next_pred_win);
            /* Set w's bit in its owner's next queue.  The min above may have
             * changed nothing, which is another reason the queue is only an
             * overapproximation. */
            MPI_Accumulate(&masks[((VERTEX_LOCAL(w) / verts_per_bit) % word_bits)], 1, MPI_UNSIGNED_LONG, VERTEX_OWNER(w), VERTEX_LOCAL(w) / verts_per_bit / word_bits, 1, MPI_UNSIGNED_LONG, MPI_BOR, next_queue_win);
          }
        }
      }
    }

    /* Close the access epoch. */
    MPI_Win_fence(MPI_MODE_NOSUCCEED, next_queue_win);
    MPI_Win_fence(MPI_MODE_NOSUCCEED, next_pred_win);

    /* The search is over once no rank has anything in its next queue. */
    int any_set = 0;
    {
      int64_t word_idx = 0;
      while (word_idx < nwords && !any_set) {
        any_set = (next_queue[word_idx] != 0);
        ++word_idx;
      }
    }
    MPI_Allreduce(MPI_IN_PLACE, &any_set, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
    if (!any_set) break;

    /* Exchange the roles of the two queues and of the two predecessor maps,
     * together with their windows. */
    MPI_Win spare_win = cur_queue_win;
    cur_queue_win = next_queue_win;
    next_queue_win = spare_win;

    unsigned long* spare_queue = cur_queue;
    cur_queue = next_queue;
    next_queue = spare_queue;

    spare_win = cur_pred_win;
    cur_pred_win = next_pred_win;
    next_pred_win = spare_win;

    int64_t* spare_pred = pred;
    pred = next_pred;
    next_pred = spare_pred;
  }

  MPI_Win_free(&cur_pred_win);
  MPI_Win_free(&next_pred_win);
  MPI_Win_free(&cur_queue_win);
  MPI_Win_free(&next_queue_win);
  MPI_Free_mem(global_ids);
  MPI_Free_mem(cur_queue);
  MPI_Free_mem(next_queue);

  /* The final map is in next_pred.  The surrounding code does not allow the
   * BFS to change the predecessor map pointer, so make sure the result ends
   * up in the caller's array, and release whichever buffer was allocated
   * here. */
  if (next_pred == caller_pred) {
    MPI_Free_mem(pred);
  } else {
    memcpy(caller_pred, next_pred, nlocalverts * sizeof(int64_t));
    MPI_Free_mem(next_pred);
  }

  /* Translate from the colour coding to what the benchmark requires: black
   * entries become plain vertex numbers again, white entries become -1. */
  for (size_t v = 0; v < nlocalverts; ++v) {
    const int64_t p = caller_pred[v];
    if (p == INT64_MAX) {
      caller_pred[v] = -1;
    } else if (p < 0) {
      caller_pred[v] = p + nglobalverts;
    }
  }

  /* Total the visited-vertex counts of all ranks. */
  MPI_Allreduce(MPI_IN_PLACE, &visited_here, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  *nvisited = visited_here;
}