void get_vertex_distribution_for_pred(size_t count, const int64_t* vertex_p,
                                      int* owner_p, size_t* local_p) {
  const int64_t* restrict vertex = vertex_p;
  int* restrict owner = owner_p;
  size_t* restrict local = local_p;
  ptrdiff_t i;
#pragma omp parallel for
  for (i = 0; i < (ptrdiff_t)count; ++i) {
    owner[i] = VERTEX_OWNER(vertex[i]);
    local[i] = VERTEX_LOCAL(vertex[i]);
  }
}
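/* For reference: one plausible set of definitions for the VERTEX_OWNER /
 * VERTEX_LOCAL / VERTEX_TO_GLOBAL macros used throughout this file.  The real
 * definitions live in a shared header and may differ (e.g. a power-of-two
 * shift/mask variant); this is only an illustrative sketch of the cyclic
 * vertex distribution they implement, assuming globals `rank` and `size`. */
#if 0
#define VERTEX_OWNER(v) ((int)((v) % size))                /* rank owning global vertex v */
#define VERTEX_LOCAL(v) ((size_t)((v) / size))             /* v's index on its owning rank */
#define VERTEX_TO_GLOBAL(i) ((int64_t)((i) * size + rank)) /* inverse of the pair above */
#endif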
int validate_bfs_result(const csr_graph* const g, const int64_t root, const int64_t* const pred, const int64_t nvisited) {
  int validation_passed = 1;
  int root_is_mine = (VERTEX_OWNER(root) == rank);
  const size_t nlocalverts = g->nlocalverts;
  const size_t nlocaledges = g->nlocaledges;
  const int64_t nglobalverts = g->nglobalverts;

  /* Check that root is its own parent. */
  if (root_is_mine) {
    if (pred[VERTEX_LOCAL(root)] != root) {
      fprintf(stderr, "%d: Validation error: parent of root vertex %" PRId64 " is %" PRId64 ", not the root itself.\n", rank, root, pred[VERTEX_LOCAL(root)]);
      validation_passed = 0;
    }
  }

  /* Check that nothing else is its own parent, and check for in-range
   * values. */
  int any_range_errors = 0;
  size_t i;
  for (i = 0; i < nlocalverts; ++i) {
    int64_t v = VERTEX_TO_GLOBAL(i);
    assert (VERTEX_OWNER(v) == rank);
    assert (VERTEX_LOCAL(v) == i);
    if (v != root && pred[i] == v) {
      fprintf(stderr, "%d: Validation error: parent of non-root vertex %" PRId64 " is itself.\n", rank, v);
      validation_passed = 0;
    }
    if (pred[i] < -1 || pred[i] >= nglobalverts) {
      fprintf(stderr, "%d: Validation error: parent of vertex %" PRId64 " is out-of-range value %" PRId64 ".\n", rank, v, pred[i]);
      validation_passed = 0;
      any_range_errors = 1;
    }
  }
  MPI_Allreduce(MPI_IN_PLACE, &any_range_errors, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);

  /* Check that nvisited is correct. */
  int64_t nvisited_actual = 0;
  for (i = 0; i < nlocalverts; ++i) {
    if (pred[i] != -1) ++nvisited_actual;
  }
  MPI_Allreduce(MPI_IN_PLACE, &nvisited_actual, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  if (nvisited_actual != nvisited) {
    fprintf(stderr, "%d: Validation error: claimed visit count %" PRId64 " is different from actual count %" PRId64 ".\n", rank, nvisited, nvisited_actual);
    validation_passed = 0;
  }

  if (!any_range_errors) { /* Other parts of validation assume in-range values */
    /* Check that there is an edge from each vertex to its claimed
     * predecessor. */
    size_t i;
    for (i = 0; i < nlocalverts; ++i) {
      int64_t v = VERTEX_TO_GLOBAL(i);
      int64_t p = pred[i];
      if (p == -1) continue;
      int found_pred_edge = 0;
      if (v == p) found_pred_edge = 1; /* Root vertex */
      size_t ei, ei_end = g->rowstarts[i + 1];
      for (ei = g->rowstarts[i]; ei < ei_end; ++ei) {
        int64_t w = g->column[ei];
        if (w == p) {
          found_pred_edge = 1;
          break;
        }
      }
      if (!found_pred_edge) {
        fprintf(stderr, "%d: Validation error: no graph edge from vertex %" PRId64 " to its parent %" PRId64 ".\n", rank, v, p);
        validation_passed = 0;
      }
    }

    /* Create a vertex depth map to use for later validation. */
    int64_t* depth = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t));
    { /* Scope some code that has a lot of temporary variables. */
      int64_t* pred_depth = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Depth of predecessor vertex for each local vertex */
      size_t i;
      for (i = 0; i < nlocalverts; ++i) depth[i] = INT64_MAX;
      if (root_is_mine) depth[VERTEX_LOCAL(root)] = 0;
      /* Send each vertex that appears in the local part of the predecessor map
       * to its owner; record the original locations so we can put the answers
       * into pred_depth. */
      /* Do a histogram sort by owner (this same kind of sort is used in other
       * places as well).  First, count the number of vertices going to each
       * destination. */
      int* num_preds_per_owner = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
      for (i = 0; i < nlocalverts; ++i) {
        ++num_preds_per_owner[pred[i] == -1 ? size - 1 : VERTEX_OWNER(pred[i])];
      }
      int64_t* preds_per_owner = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Predecessors sorted by owner */
      int64_t* preds_per_owner_results_offsets = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Indices into pred_depth to write */
      /* Second, do a prefix sum to get the displacements of the different
       * owners in the outgoing array. */
      int* pred_owner_displs = (int*)xmalloc((size + 1) * sizeof(int));
      pred_owner_displs[0] = 0;
      int r;
      for (r = 0; r < size; ++r) {
        pred_owner_displs[r + 1] = pred_owner_displs[r] + num_preds_per_owner[r];
      }
      /* Last, put the vertices into the correct positions in the array, based
       * on their owners and the counts and displacements computed earlier. */
      int* pred_owner_offsets = (int*)xmalloc((size + 1) * sizeof(int));
      memcpy(pred_owner_offsets, pred_owner_displs, (size + 1) * sizeof(int));
      for (i = 0; i < nlocalverts; ++i) {
        int* offset_ptr = &pred_owner_offsets[pred[i] == -1 ? size - 1 : VERTEX_OWNER(pred[i])];
        preds_per_owner[*offset_ptr] = pred[i];
        preds_per_owner_results_offsets[*offset_ptr] = i;
        ++*offset_ptr;
      }
      for (r = 0; r < size; ++r) {
        assert (pred_owner_offsets[r] == pred_owner_displs[r + 1]);
      }
      free(pred_owner_offsets);
      /* Send around the number of vertices that will be sent to each
       * destination. */
      int* num_my_preds_per_sender = (int*)xmalloc(size * sizeof(int));
      MPI_Alltoall(num_preds_per_owner, 1, MPI_INT,
                   num_my_preds_per_sender, 1, MPI_INT, MPI_COMM_WORLD);
      int* my_preds_per_sender_displs = (int*)xmalloc((size + 1) * sizeof(int));
      my_preds_per_sender_displs[0] = 0;
      for (r = 0; r < size; ++r) {
        my_preds_per_sender_displs[r + 1] = my_preds_per_sender_displs[r] + num_my_preds_per_sender[r];
      }
      /* Send around the actual vertex data (the list of depth requests that
       * will be responded to at each BFS iteration). */
      int64_t* my_depth_requests = (int64_t*)xmalloc(my_preds_per_sender_displs[size] * sizeof(int64_t));
      int64_t* my_depth_replies = (int64_t*)xmalloc(my_preds_per_sender_displs[size] * sizeof(int64_t));
      MPI_Alltoallv(preds_per_owner, num_preds_per_owner, pred_owner_displs, INT64_T_MPI_TYPE,
                    my_depth_requests, num_my_preds_per_sender, my_preds_per_sender_displs, INT64_T_MPI_TYPE,
                    MPI_COMM_WORLD);
      int64_t* pred_depth_raw = (int64_t*)xmalloc(nlocalverts * sizeof(int64_t)); /* Depth of predecessor vertex for each local vertex, ordered by source proc */
      /* Do a mini-BFS (naively) over just the predecessor graph (hopefully a
       * tree) produced by the real BFS; fill in the depth map. */
      while (1) {
        int any_changed = 0;
        int i;
        /* Create and send the depth values requested by other nodes.  The
         * list of requests is sent once, and is stored on the receiver so the
         * replies can be sent (possibly with updated depth values) at every
         * iteration. */
        for (i = 0; i < my_preds_per_sender_displs[size]; ++i) {
          my_depth_replies[i] = (my_depth_requests[i] == -1 ? INT64_MAX : depth[VERTEX_LOCAL(my_depth_requests[i])]);
        }
        MPI_Alltoallv(my_depth_replies, num_my_preds_per_sender, my_preds_per_sender_displs, INT64_T_MPI_TYPE,
                      pred_depth_raw, num_preds_per_owner, pred_owner_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);
        {
          size_t i;
          /* Put the received depths into the local array. */
          for (i = 0; i < nlocalverts; ++i) {
            pred_depth[preds_per_owner_results_offsets[i]] = pred_depth_raw[i];
          }
          /* Check those values to determine if they violate any correctness
           * conditions. */
          for (i = 0; i < nlocalverts; ++i) {
            int64_t v = VERTEX_TO_GLOBAL(i);
            if (v == root) {
              /* The depth and predecessor for this vertex were checked earlier. */
            } else if (depth[i] == INT64_MAX && pred_depth[i] == INT64_MAX) {
              /* OK -- depth should be filled in later. */
            } else if (depth[i] == INT64_MAX && pred_depth[i] != INT64_MAX) {
              depth[i] = pred_depth[i] + 1;
              any_changed = 1;
            } else if (depth[i] != pred_depth[i] + 1) {
              fprintf(stderr, "%d: Validation error: BFS predecessors do not form a tree; see vertices %" PRId64 " (depth %" PRId64 ") and %" PRId64 " (depth %" PRId64 ").\n", rank, v, depth[i], pred[i], pred_depth[i]);
              validation_passed = 0;
            } else {
              /* Vertex already has its correct depth value. */
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &any_changed, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
        if (!any_changed) break;
      }
      free(num_preds_per_owner);
      free(num_my_preds_per_sender);
      free(preds_per_owner);
      free(preds_per_owner_results_offsets);
      free(my_preds_per_sender_displs);
      free(my_depth_requests);
      free(my_depth_replies);
      free(pred_owner_displs);
      free(pred_depth);
      free(pred_depth_raw);
    }

    /* Check that all edges connect vertices whose depths differ by at most
     * one. */
    {
      int64_t maxlocaledges = 0;
      MPI_Allreduce((void*)&nlocaledges, &maxlocaledges, 1, INT64_T_MPI_TYPE, MPI_MAX, MPI_COMM_WORLD);
      /* We break the overall list of edges into chunks to reduce the amount
       * of data to be sent at a time (since we are using MPI_Alltoallv to
       * send data collectively). */
      const int edge_chunk_size = (1 << 23); /* Reduce memory usage */
      int num_edge_groups = (maxlocaledges + edge_chunk_size - 1) / edge_chunk_size;
      int eg;
      for (eg = 0; eg < num_edge_groups; ++eg) {
        /* Cast before multiplying so the index arithmetic is done in size_t,
         * avoiding int overflow for graphs with more than 2^31 local edges. */
        size_t first_edge_index = (size_t)eg * edge_chunk_size;
        if (first_edge_index > nlocaledges) first_edge_index = nlocaledges;
        size_t last_edge_index = ((size_t)eg + 1) * edge_chunk_size;
        if (last_edge_index > nlocaledges) last_edge_index = nlocaledges;
        /* Sort the edge targets in this chunk by their owners (histogram
         * sort); see the BFS code above for details of the steps of the
         * algorithm. */
        int* num_edge_targets_by_owner = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
        size_t ei;
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          ++num_edge_targets_by_owner[VERTEX_OWNER(g->column[ei])];
        }
        int* edge_targets_by_owner_displs = (int*)xmalloc((size + 1) * sizeof(int));
        edge_targets_by_owner_displs[0] = 0;
        int i;
        for (i = 0; i < size; ++i) {
          edge_targets_by_owner_displs[i + 1] = edge_targets_by_owner_displs[i] + num_edge_targets_by_owner[i];
        }
        int64_t* edge_targets_by_owner = (int64_t*)xmalloc(edge_targets_by_owner_displs[size] * sizeof(int64_t));
        int64_t* edge_targets_by_owner_indices = (int64_t*)xmalloc(edge_targets_by_owner_displs[size] * sizeof(int64_t)); /* Source indices for where to write the targets */
        int* edge_targets_by_owner_offsets = (int*)xmalloc((size + 1) * sizeof(int));
        memcpy(edge_targets_by_owner_offsets, edge_targets_by_owner_displs, (size + 1) * sizeof(int));
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          edge_targets_by_owner[edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])]] = g->column[ei];
          edge_targets_by_owner_indices[edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])]] = ei;
          ++edge_targets_by_owner_offsets[VERTEX_OWNER(g->column[ei])];
        }
        for (i = 0; i < size; ++i) {
          assert (edge_targets_by_owner_offsets[i] == edge_targets_by_owner_displs[i + 1]);
        }
        free(edge_targets_by_owner_offsets);

        /* Send around the number of data elements that will be sent later. */
        int* num_incoming_targets_by_src = (int*)xmalloc(size * sizeof(int));
        MPI_Alltoall(num_edge_targets_by_owner, 1, MPI_INT,
                     num_incoming_targets_by_src, 1, MPI_INT, MPI_COMM_WORLD);
        int* incoming_targets_by_src_displs = (int*)xmalloc((size + 1) * sizeof(int));
        incoming_targets_by_src_displs[0] = 0;
        for (i = 0; i < size; ++i) {
          incoming_targets_by_src_displs[i + 1] = incoming_targets_by_src_displs[i] + num_incoming_targets_by_src[i];
        }
        int64_t* target_depth_requests = (int64_t*)xmalloc(incoming_targets_by_src_displs[size] * sizeof(int64_t));
        int64_t* target_depth_replies = (int64_t*)xmalloc(incoming_targets_by_src_displs[size] * sizeof(int64_t));
        /* Send the actual requests for the depths of edge targets. */
        MPI_Alltoallv(edge_targets_by_owner, num_edge_targets_by_owner, edge_targets_by_owner_displs, INT64_T_MPI_TYPE,
                      target_depth_requests, num_incoming_targets_by_src, incoming_targets_by_src_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);
        free(edge_targets_by_owner);
        /* Fill in the replies for the requests sent to me. */
        for (i = 0; i < incoming_targets_by_src_displs[size]; ++i) {
          assert (VERTEX_OWNER(target_depth_requests[i]) == rank);
          target_depth_replies[i] = depth[VERTEX_LOCAL(target_depth_requests[i])];
        }
        free(target_depth_requests);
        int64_t* target_depth_raw = (int64_t*)xmalloc((last_edge_index - first_edge_index) * sizeof(int64_t));
        /* Send back the replies. */
        MPI_Alltoallv(target_depth_replies, num_incoming_targets_by_src, incoming_targets_by_src_displs, INT64_T_MPI_TYPE,
                      target_depth_raw, num_edge_targets_by_owner, edge_targets_by_owner_displs, INT64_T_MPI_TYPE,
                      MPI_COMM_WORLD);
        free(target_depth_replies);
        free(num_incoming_targets_by_src);
        free(num_edge_targets_by_owner);
        free(incoming_targets_by_src_displs);
        free(edge_targets_by_owner_displs);
        int64_t* target_depth = (int64_t*)xmalloc((last_edge_index - first_edge_index) * sizeof(int64_t));
        /* Put the replies into the proper order (the original order of the
         * edges). */
        for (ei = 0; ei < last_edge_index - first_edge_index; ++ei) {
          target_depth[edge_targets_by_owner_indices[ei] - first_edge_index] = target_depth_raw[ei];
        }
        free(target_depth_raw);
        free(edge_targets_by_owner_indices);
        /* Check the depth relationship of the endpoints of each edge in the
         * current chunk. */
        size_t src_i = 0;
        for (ei = first_edge_index; ei < last_edge_index; ++ei) {
          while (ei >= g->rowstarts[src_i + 1]) {
            ++src_i;
          }
          int64_t src = VERTEX_TO_GLOBAL(src_i);
          int64_t src_depth = depth[src_i];
          int64_t tgt = g->column[ei];
          int64_t tgt_depth = target_depth[ei - first_edge_index];
          if (src_depth != INT64_MAX && tgt_depth == INT64_MAX) {
            fprintf(stderr, "%d: Validation error: edge connects vertex %" PRId64 " in the BFS tree (depth %" PRId64 ") to vertex %" PRId64 " outside the tree.\n", rank, src, src_depth, tgt);
            validation_passed = 0;
          } else if (src_depth == INT64_MAX && tgt_depth != INT64_MAX) {
            /* Skip this for now; the problem will be caught when the reversed
             * copy of this edge is scanned.  Set the failure flag, though,
             * just in case. */
            validation_passed = 0;
          } else if (src_depth - tgt_depth < -1 || src_depth - tgt_depth > 1) {
            fprintf(stderr, "%d: Validation error: depths of edge endpoints %" PRId64 " (depth %" PRId64 ") and %" PRId64 " (depth %" PRId64 ") are too far apart (abs. val. > 1).\n", rank, src, src_depth, tgt, tgt_depth);
            validation_passed = 0;
          }
        }
        free(target_depth);
      }
    }
    free(depth);
  } /* End of part skipped by range errors */

  /* Collect the global validation result. */
  MPI_Allreduce(MPI_IN_PLACE, &validation_passed, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
  return validation_passed;
}
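/* The validation above uses the same histogram-sort-by-owner pattern in two
 * places (depth requests and edge-target requests).  The following disabled
 * sketch distills that pattern into one standalone routine; the names
 * (histogram_sort_by_owner, owner_of) are hypothetical and not part of the
 * benchmark.  The resulting counts/displs arrays feed MPI_Alltoallv directly. */
#if 0
static void histogram_sort_by_owner(const int64_t* in, size_t n, int nbuckets,
                                    int (*owner_of)(int64_t),
                                    int64_t* out, size_t* orig_index,
                                    int* counts, int* displs) {
  size_t i;
  int b;
  /* Step 1: count the elements destined for each bucket (owner). */
  for (b = 0; b < nbuckets; ++b) counts[b] = 0;
  for (i = 0; i < n; ++i) ++counts[owner_of(in[i])];
  /* Step 2: prefix sum over the counts gives each bucket's displacement. */
  displs[0] = 0;
  for (b = 0; b < nbuckets; ++b) displs[b + 1] = displs[b] + counts[b];
  /* Step 3: scatter elements to their buckets, remembering where each one
   * came from so replies can later be written back in the original order. */
  int* offsets = (int*)xmalloc((nbuckets + 1) * sizeof(int));
  memcpy(offsets, displs, (nbuckets + 1) * sizeof(int));
  for (i = 0; i < n; ++i) {
    int* p = &offsets[owner_of(in[i])];
    out[*p] = in[i];
    orig_index[*p] = i;
    ++*p;
  }
  free(offsets);
}
#endif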
/* This BFS represents its queues as bitmaps and uses some data representation
 * tricks to fit with the use of MPI one-sided operations.  It is not much
 * faster than the standard version on the machines I have tested it on, but
 * systems that have good RDMA hardware and good MPI one-sided implementations
 * might get better performance from it.  This code might also be good to
 * translate to UPC, Co-array Fortran, SHMEM, or GASNet since those systems
 * are more designed for one-sided remote memory operations. */
void run_mpi_bfs(const csr_graph* const g, int64_t root, int64_t* pred, int64_t* nvisited) {
  const size_t nlocalverts = g->nlocalverts;
  const int64_t nglobalverts = g->nglobalverts;
  int64_t nvisited_local = 0;

  /* Set up a second predecessor map so we can read from one and modify the
   * other. */
  int64_t* orig_pred = pred;
  int64_t* pred2 = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  /* The queues (old and new) are represented as bitmaps.  Each bit in the
   * queue bitmap says to check elts_per_queue_bit elements in the predecessor
   * map for vertices that need to be visited.  In other words, the queue
   * bitmap is an overapproximation of the actual queue; because
   * MPI_Accumulate returns no information about the result of the update,
   * elements are sometimes added to the bitmap even when they were actually
   * already black.  Because of this, the predecessor map needs to be checked
   * to be sure a given vertex actually needs to be processed. */
  const int elts_per_queue_bit = 4;
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  int64_t queue_nbits = (nlocalverts + elts_per_queue_bit - 1) / elts_per_queue_bit;
  int64_t queue_nwords = (queue_nbits + ulong_bits - 1) / ulong_bits;
  unsigned long* queue_bitmap1 = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  unsigned long* queue_bitmap2 = (unsigned long*)xMPI_Alloc_mem(queue_nwords * sizeof(unsigned long));
  memset(queue_bitmap1, 0, queue_nwords * sizeof(unsigned long));

  /* List of local vertices (used as sources in MPI_Accumulate). */
  int64_t* local_vertices = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));
  {size_t i; for (i = 0; i < nlocalverts; ++i) local_vertices[i] = VERTEX_TO_GLOBAL(i);}

  /* List of all bit masks for an unsigned long (used as sources in
   * MPI_Accumulate). */
  unsigned long masks[ulong_bits];
  {int i; for (i = 0; i < ulong_bits; ++i) masks[i] = (1UL << i);}

  /* Coding of predecessor map: */
  /* - White (not visited): INT64_MAX */
  /* - Grey (in queue): 0 .. nglobalverts-1 */
  /* - Black (done): -nglobalverts .. -1 */

  /* Set initial predecessor map. */
  {size_t i; for (i = 0; i < nlocalverts; ++i) pred[i] = INT64_MAX;}

  /* Mark root as grey and add it to the queue. */
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    queue_bitmap1[VERTEX_LOCAL(root) / elts_per_queue_bit / ulong_bits] |= (1UL << ((VERTEX_LOCAL(root) / elts_per_queue_bit) % ulong_bits));
  }

  /* Create MPI windows on the two predecessor arrays and the two queues. */
  MPI_Win pred_win, pred2_win, queue1_win, queue2_win;
  MPI_Win_create(pred, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &pred_win);
  MPI_Win_create(pred2, nlocalverts * sizeof(int64_t), sizeof(int64_t), MPI_INFO_NULL, MPI_COMM_WORLD, &pred2_win);
  MPI_Win_create(queue_bitmap1, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &queue1_win);
  MPI_Win_create(queue_bitmap2, queue_nwords * sizeof(unsigned long), sizeof(unsigned long), MPI_INFO_NULL, MPI_COMM_WORLD, &queue2_win);

  while (1) {
    int64_t i;
    /* Clear the next-level queue. */
    memset(queue_bitmap2, 0, queue_nwords * sizeof(unsigned long));

    /* The pred2 array is pred with all grey vertices changed to black. */
    memcpy(pred2, pred, nlocalverts * sizeof(int64_t));
    for (i = 0; i < (int64_t)nlocalverts; ++i) {
      if (pred2[i] >= 0 && pred2[i] < nglobalverts) pred2[i] -= nglobalverts;
    }

    /* Start one-sided operations for this level. */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, pred2_win);
    MPI_Win_fence(MPI_MODE_NOPRECEDE, queue2_win);

    /* Step through the words of the queue bitmap. */
    for (i = 0; i < queue_nwords; ++i) {
      unsigned long val = queue_bitmap1[i];
      int bitnum;
      /* Skip any words that are all zero. */
      if (!val) continue;
      /* Scan the bits in the word. */
      for (bitnum = 0; bitnum < ulong_bits; ++bitnum) {
        size_t first_v_local = (size_t)((i * ulong_bits + bitnum) * elts_per_queue_bit);
        if (first_v_local >= nlocalverts) break;
        int bit = (int)((val >> bitnum) & 1);
        /* Skip any bits that are zero. */
        if (!bit) continue;
        /* Scan the queue elements corresponding to this bit. */
        int qelem_idx;
        for (qelem_idx = 0; qelem_idx < elts_per_queue_bit; ++qelem_idx) {
          size_t v_local = first_v_local + qelem_idx;
          if (v_local >= nlocalverts) continue;
          /* Since the queue is an overapproximation, check the predecessor
           * map to be sure this vertex is grey. */
          if (pred[v_local] >= 0 && pred[v_local] < nglobalverts) {
            ++nvisited_local;
            size_t ei, ei_end = g->rowstarts[v_local + 1];
            /* Walk the incident edges. */
            for (ei = g->rowstarts[v_local]; ei < ei_end; ++ei) {
              int64_t w = g->column[ei];
              if (w == VERTEX_TO_GLOBAL(v_local)) continue; /* Self-loop */
              /* Set the predecessor of the other edge endpoint (note the use
               * of MPI_MIN and the coding of the predecessor map). */
              MPI_Accumulate(&local_vertices[v_local], 1, INT64_T_MPI_TYPE, VERTEX_OWNER(w), VERTEX_LOCAL(w), 1, INT64_T_MPI_TYPE, MPI_MIN, pred2_win);
              /* Mark the endpoint in the remote queue (note that the min may
               * not do an update, so the queue is an overapproximation in
               * this way as well). */
              MPI_Accumulate(&masks[((VERTEX_LOCAL(w) / elts_per_queue_bit) % ulong_bits)], 1, MPI_UNSIGNED_LONG, VERTEX_OWNER(w), VERTEX_LOCAL(w) / elts_per_queue_bit / ulong_bits, 1, MPI_UNSIGNED_LONG, MPI_BOR, queue2_win);
            }
          }
        }
      }
    }
    /* End one-sided operations. */
    MPI_Win_fence(MPI_MODE_NOSUCCEED, queue2_win);
    MPI_Win_fence(MPI_MODE_NOSUCCEED, pred2_win);

    /* Test if there are any elements in the next-level queue (globally); stop
     * if there are none. */
    int any_set = 0;
    for (i = 0; i < queue_nwords; ++i) {
      if (queue_bitmap2[i] != 0) {any_set = 1; break;}
    }
    MPI_Allreduce(MPI_IN_PLACE, &any_set, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
    if (!any_set) break;

    /* Swap queues and predecessor maps. */
    {MPI_Win temp = queue1_win; queue1_win = queue2_win; queue2_win = temp;}
    {unsigned long* temp = queue_bitmap1; queue_bitmap1 = queue_bitmap2; queue_bitmap2 = temp;}
    {MPI_Win temp = pred_win; pred_win = pred2_win; pred2_win = temp;}
    {int64_t* temp = pred; pred = pred2; pred2 = temp;}
  }
  MPI_Win_free(&pred_win);
  MPI_Win_free(&pred2_win);
  MPI_Win_free(&queue1_win);
  MPI_Win_free(&queue2_win);
  MPI_Free_mem(local_vertices);
  MPI_Free_mem(queue_bitmap1);
  MPI_Free_mem(queue_bitmap2);

  /* Clean up the predecessor map swapping since the surrounding code does not
   * allow the BFS to change the predecessor map pointer. */
  if (pred2 != orig_pred) {
    memcpy(orig_pred, pred2, nlocalverts * sizeof(int64_t));
    MPI_Free_mem(pred2);
  } else {
    MPI_Free_mem(pred);
  }

  /* Change from the special coding of the predecessor map to the one the
   * benchmark requires. */
  size_t i;
  for (i = 0; i < nlocalverts; ++i) {
    if (orig_pred[i] < 0) {
      orig_pred[i] += nglobalverts; /* Black: recover the parent's global ID */
    } else if (orig_pred[i] == INT64_MAX) {
      orig_pred[i] = -1;            /* White: never visited */
    }
  }

  /* Count visited vertices. */
  MPI_Allreduce(MPI_IN_PLACE, &nvisited_local, 1, INT64_T_MPI_TYPE, MPI_SUM, MPI_COMM_WORLD);
  *nvisited = nvisited_local;
}
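/* A disabled sketch (not part of the benchmark) of the white/grey/black
 * coding used by run_mpi_bfs above.  Because MPI_Accumulate with MPI_MIN is
 * the only remote update, the coding is chosen so that "more finished" states
 * compare smaller:
 *   black (done):   -nglobalverts .. -1   (smallest -- never overwritten)
 *   grey  (queued):  0 .. nglobalverts-1  (a candidate parent's global ID)
 *   white (unseen):  INT64_MAX            (largest -- any candidate wins)
 * so a remote MPI_MIN atomically means "claim this vertex unless it is
 * already black or already grey with a smaller parent". */
#if 0
static int64_t encode_black(int64_t grey_pred, int64_t nglobalverts) {
  return grey_pred - nglobalverts;  /* shift a grey entry into -nglobalverts..-1 */
}
static int64_t decode_final(int64_t coded, int64_t nglobalverts) {
  if (coded == INT64_MAX) return -1;               /* white: unreachable */
  return coded < 0 ? coded + nglobalverts : coded; /* black or grey: parent ID */
}
#endif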
/* This version is the traditional level-synchronized BFS using two queues.  A
 * bitmap is used to indicate which vertices have been visited.  Messages are
 * sent and processed asynchronously throughout the code to hopefully overlap
 * communication with computation. */
void run_bfs(int64_t root, int64_t* pred) {
  allocate_memory();
  const ptrdiff_t nlocalverts = g.nlocalverts;
  const size_t* const restrict rowstarts = g.rowstarts;
  const int64_t* const restrict column = g.column;
  int64_t maxlocalverts = g.max_nlocalverts;

  /* Set up the visited bitmap. */
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  const int ulong_bits_squared = ulong_bits * ulong_bits;
  int64_t local_queue_summary_size = (maxlocalverts + ulong_bits_squared - 1) / ulong_bits_squared;
  int64_t local_queue_size = local_queue_summary_size * ulong_bits;
  int lg_local_queue_size = lg_int64_t(local_queue_size);
  int64_t global_queue_summary_size = MUL_SIZE(local_queue_summary_size);
  int64_t global_queue_size = MUL_SIZE(local_queue_size);

#define SWIZZLE_VERTEX(c) ((VERTEX_OWNER(c) << lg_local_queue_size) * ulong_bits | VERTEX_LOCAL(c))

#if 0
  int64_t* restrict column_swizzled = (int64_t*)xmalloc(nlocaledges * sizeof(int64_t));
  {
    size_t i;
    for (i = 0; i < nlocaledges; ++i) {
      int64_t c = column[i];
      column_swizzled[i] = SWIZZLE_VERTEX(c);
    }
  }
#endif

  unsigned long* restrict in_queue = g_in_queue;
  memset(in_queue, 0, global_queue_size * sizeof(unsigned long));
  unsigned long* restrict in_queue_summary = g_in_queue_summary;
  memset(in_queue_summary, 0, global_queue_summary_size * sizeof(unsigned long));
  unsigned long* restrict out_queue = g_out_queue;
  unsigned long* restrict out_queue_summary = g_out_queue_summary;
  unsigned long* restrict visited = g_visited;
  memset(visited, 0, local_queue_size * sizeof(unsigned long));

#define SET_IN(v) \
  do { \
    int64_t vs = SWIZZLE_VERTEX(v); \
    size_t word_idx = vs / ulong_bits; \
    int bit_idx = vs % ulong_bits; \
    unsigned long mask = (1UL << bit_idx); \
    in_queue_summary[word_idx / ulong_bits] |= (1UL << (word_idx % ulong_bits)); \
    in_queue[word_idx] |= mask; \
  } while (0)
#define TEST_IN(vs) \
  (((in_queue_summary[vs / ulong_bits / ulong_bits] & (1UL << ((vs / ulong_bits) % ulong_bits))) != 0) && \
   ((in_queue[vs / ulong_bits] & (1UL << (vs % ulong_bits))) != 0))
#define TEST_VISITED_LOCAL(v) ((visited[(v) / ulong_bits] & (1UL << ((v) % ulong_bits))) != 0)
// #define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); __sync_fetch_and_or(&visited[word_idx], mask); __sync_fetch_and_or(&out_queue[word_idx], mask);} while (0)
#define SET_VISITED_LOCAL(v) \
  do { \
    size_t word_idx = (v) / ulong_bits; \
    int bit_idx = (v) % ulong_bits; \
    unsigned long mask = (1UL << bit_idx); \
    visited[word_idx] |= mask; \
    out_queue[word_idx] |= mask; \
  } while (0)

  SET_IN(root);
  {
    ptrdiff_t i;
    _Pragma("omp parallel for schedule(static)")
    for (i = 0; i < nlocalverts; ++i) pred[i] = -1;
  }
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    SET_VISITED_LOCAL(VERTEX_LOCAL(root));
  }
  uint16_t cur_level = 0;
  while (1) {
    ++cur_level;
#if 0
    if (rank == 0) fprintf(stderr, "BFS level %" PRIu16 "\n", cur_level);
#endif
    memset(out_queue, 0, local_queue_size * sizeof(unsigned long));
    // memset(out_queue_summary, 0, local_queue_summary_size * sizeof(unsigned long));
    ptrdiff_t i, ii;
#if 0
#pragma omp parallel for schedule(static)
    for (i = 0; i < global_queue_summary_size; ++i) {
      unsigned long val = 0UL;
      int j;
      unsigned long mask = 1UL;
      for (j = 0; j < ulong_bits; ++j, mask <<= 1) {
        if (in_queue[i * ulong_bits + j]) val |= mask;
      }
      in_queue_summary[i] = val;
    }
#endif
    unsigned long not_done = 0;

#pragma omp parallel for schedule(static) reduction(|:not_done)
    for (ii = 0; ii < nlocalverts; ii += ulong_bits) {
      size_t i, i_end = ii + ulong_bits;
      if (i_end > (size_t)nlocalverts) i_end = nlocalverts;
      for (i = ii; i < i_end; ++i) {
        if (!TEST_VISITED_LOCAL(i)) {
          size_t j, j_end = rowstarts[i + 1];
          for (j = rowstarts[i]; j < j_end; ++j) {
            int64_t v1 = column[j];
            int64_t v1_swizzled = SWIZZLE_VERTEX(v1);
            if (TEST_IN(v1_swizzled)) {
              /* Pack the BFS level into the top 16 bits of the predecessor
               * entry; the parent's global ID stays in the low 48 bits. */
              pred[i] = (v1 & INT64_C(0xFFFFFFFFFFFF)) | ((int64_t)cur_level << 48);
              not_done |= 1;
              SET_VISITED_LOCAL(i);
              break;
            }
          }
        }
      }
    }

#if 1
#pragma omp parallel for schedule(static)
    for (i = 0; i < local_queue_summary_size; ++i) {
      unsigned long val = 0UL;
      int j;
      unsigned long mask = 1UL;
      for (j = 0; j < ulong_bits; ++j, mask <<= 1) {
        unsigned long full_val = out_queue[i * ulong_bits + j];
        visited[i * ulong_bits + j] |= full_val;
        if (full_val) val |= mask;
      }
      out_queue_summary[i] = val;
      // not_done |= val;
    }
#endif
    MPI_Allreduce(MPI_IN_PLACE, &not_done, 1, MPI_UNSIGNED_LONG, MPI_BOR, MPI_COMM_WORLD);
    if (not_done == 0) break;
    MPI_Allgather(out_queue, local_queue_size, MPI_UNSIGNED_LONG, in_queue, local_queue_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);
    MPI_Allgather(out_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, in_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);
  }
  deallocate_memory();
}
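/* A disabled sketch (hypothetical helpers, not used by the benchmark) of the
 * predecessor packing that run_bfs performs above: each pred[] entry carries
 * the parent's global ID in its low 48 bits and the BFS level in its top 16
 * bits, so both would be unpacked before handing the map to validation. */
#if 0
static inline uint16_t pred_level(int64_t p) {
  return (uint16_t)((uint64_t)p >> 48);       /* top 16 bits: BFS level */
}
static inline int64_t pred_vertex(int64_t p) {
  return p & INT64_C(0xFFFFFFFFFFFF);         /* low 48 bits: parent's ID */
}
#endif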
void readGraph_singleFile_MPI(graph_t *G, char *filename) {
  uint8_t align;
  int rank, size;
  /* Use MPI_Offset for file offsets; the original int offsets overflow for
   * files larger than 2 GB. */
  MPI_Offset offset, offset_row, offset_col, offset_weight;
  edge_id_t my_edges[2];
  vertex_id_t local_n = 0;
  edge_id_t local_m = 0;
  int k;
  uint32_t TotVertices;

  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  G->rank = rank;
  G->nproc = size;
  G->filename[0] = '\0';
  sprintf(G->filename, "%s", filename);
  int lgsize;
  for (lgsize = 0; lgsize < size; ++lgsize) {
    if ((1 << lgsize) == size) break;
  }

  MPI_File fh;
  MPI_Status status;
  MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);

  /* Read the header: vertex count, edge count, directedness flag, and an
   * alignment byte. */
  MPI_File_read(fh, &G->n, 1, MPI_UINT32_T, &status);
  offset = sizeof(vertex_id_t);
  TotVertices = G->n;
  MPI_File_read_at(fh, offset, &G->m, 1, MPI_UINT64_T, &status);
  offset += sizeof(edge_id_t);
  MPI_File_read_at(fh, offset, &G->directed, 1, MPI_C_BOOL, &status);
  offset += sizeof(bool);
  MPI_File_read_at(fh, offset, &align, 1, MPI_UINT8_T, &status);
  offset += sizeof(uint8_t);
  offset_row = offset;

  /* First pass: count the vertices and edges owned by this rank. */
  for (uint32_t i = 0; i < G->n; i++) {
    if (rank == VERTEX_OWNER(i, TotVertices, size)) {
      MPI_File_read_at(fh, offset_row + i * sizeof(edge_id_t), &my_edges[0], 2, MPI_UINT64_T, &status);
      local_n++;
      local_m += my_edges[1] - my_edges[0];
    }
  }
  G->local_n = local_n;
  G->local_m = local_m;
  offset_col = offset_row + (G->n + 1) * sizeof(edge_id_t);
  offset_weight = offset_col + G->m * sizeof(vertex_id_t);

  G->rowsIndices = (edge_id_t *)malloc((G->local_n + 1) * sizeof(edge_id_t));
  G->endV = (vertex_id_t *)malloc(G->local_m * sizeof(vertex_id_t));
  G->weights = (weight_t *)malloc(G->local_m * sizeof(weight_t));

  /* Second pass: build the local CSR row index and read this rank's slice of
   * the adjacency and weight arrays.  (MPI_DOUBLE assumes weight_t is
   * double.) */
  G->rowsIndices[0] = 0;
  k = 1;
  for (uint32_t i = 0; i < G->n; i++) {
    if (rank == VERTEX_OWNER(i, TotVertices, size)) {
      MPI_File_read_at(fh, offset_row + i * sizeof(edge_id_t), &my_edges[0], 2, MPI_UINT64_T, &status);
      G->rowsIndices[k] = G->rowsIndices[k - 1] + my_edges[1] - my_edges[0];
      MPI_File_read_at(fh, offset_col + my_edges[0] * sizeof(vertex_id_t), &G->endV[G->rowsIndices[k - 1]],
                       G->rowsIndices[k] - G->rowsIndices[k - 1], MPI_UINT32_T, &status);
      MPI_File_read_at(fh, offset_weight + my_edges[0] * sizeof(weight_t), &G->weights[G->rowsIndices[k - 1]],
                       G->rowsIndices[k] - G->rowsIndices[k - 1], MPI_DOUBLE, &status);
      k++;
    }
  }
  MPI_File_close(&fh);
}
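/* Minimal usage sketch for the reader above (a hypothetical driver, not part
 * of the benchmark): every rank calls readGraph_singleFile_MPI collectively
 * on the same file and receives only its own slice of the CSR arrays. */
#if 0
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  graph_t G;
  readGraph_singleFile_MPI(&G, argv[1]);
  printf("rank %d: %" PRIu64 " local vertices, %" PRIu64 " local edges\n",
         G.rank, (uint64_t)G.local_n, (uint64_t)G.local_m);
  free(G.rowsIndices);
  free(G.endV);
  free(G.weights);
  MPI_Finalize();
  return 0;
}
#endif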