/* Allocate the per-rank output/visited bitmaps and the replicated (global)
 * input-queue bitmaps.  Sizes are derived from the largest number of local
 * vertices on any rank so that every rank's slice of the replicated bitmaps
 * has the same length. */
static void allocate_memory(void) {
  int64_t maxlocalverts = g.max_nlocalverts;
  /* Pad the per-rank bitmap to a multiple of ulong_bits_squared bits so each
   * summary word covers a whole block of ulong_bits queue words. */
  int lg_local_queue_size = lg_int64_t((maxlocalverts + ulong_bits_squared - 1) / ulong_bits_squared * ulong_bits_squared);
  g_lg_local_queue_size = lg_local_queue_size;
  int64_t local_queue_summary_size = (INT64_C(1) << lg_local_queue_size) / ulong_bits_squared;
  int64_t local_queue_size = local_queue_summary_size * ulong_bits;
  g_local_queue_summary_size = local_queue_summary_size;
  g_local_queue_size = local_queue_size;
  /* The global (replicated) bitmaps hold one local-sized slice per rank. */
  int64_t global_queue_summary_size = MUL_SIZE(local_queue_summary_size);
  int64_t global_queue_size = MUL_SIZE(local_queue_size);
  g_global_queue_summary_size = global_queue_summary_size;
  g_global_queue_size = global_queue_size;
  g_in_queue = (unsigned long*)xmalloc(global_queue_size * sizeof(unsigned long));
  g_in_queue_summary = (unsigned long*)xmalloc(global_queue_summary_size * sizeof(unsigned long));
  g_out_queue = (unsigned long*)xmalloc(local_queue_size * sizeof(unsigned long));
  g_out_queue_summary = (unsigned long*)xmalloc(local_queue_summary_size * sizeof(unsigned long));
  g_visited = (unsigned long*)xmalloc(local_queue_size * sizeof(unsigned long));
}
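/* Sizing sketch (illustrative, not from the original code): assuming a 64-bit
 * unsigned long, ulong_bits = 64 and ulong_bits_squared = 4096.  With, say,
 * maxlocalverts = 3000, the per-rank bitmap is padded to 4096 bits, giving
 * lg_local_queue_size = 12, local_queue_summary_size = 1 summary word, and
 * local_queue_size = 64 queue words; the g_global_* arrays are those sizes
 * scaled across all ranks by MUL_SIZE. */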
/* This version is the traditional level-synchronized BFS using two queues.  A
 * bitmap is used to indicate which vertices have been visited.  Messages are
 * sent and processed asynchronously throughout the code to hopefully overlap
 * communication with computation. */
void run_bfs(int64_t root, int64_t* pred) {
  allocate_memory();
  const ptrdiff_t nlocalverts = g.nlocalverts;
  const size_t* const restrict rowstarts = g.rowstarts;
  const int64_t* const restrict column = g.column;
  int64_t maxlocalverts = g.max_nlocalverts;

  /* Set up the visited bitmap. */
  const int ulong_bits = sizeof(unsigned long) * CHAR_BIT;
  const int ulong_bits_squared = ulong_bits * ulong_bits;
  int64_t local_queue_summary_size = (maxlocalverts + ulong_bits_squared - 1) / ulong_bits_squared;
  int64_t local_queue_size = local_queue_summary_size * ulong_bits;
  int lg_local_queue_size = lg_int64_t(local_queue_size);
  int64_t global_queue_summary_size = MUL_SIZE(local_queue_summary_size);
  int64_t global_queue_size = MUL_SIZE(local_queue_size);

  /* Map a global vertex number to its bit position in the replicated bitmap:
   * the owner rank selects the slice, the local index selects the bit. */
#define SWIZZLE_VERTEX(c) ((VERTEX_OWNER(c) << lg_local_queue_size) * ulong_bits | VERTEX_LOCAL(c))

#if 0
  int64_t* restrict column_swizzled = (int64_t*)xmalloc(nlocaledges * sizeof(int64_t));
  {
    size_t i;
    for (i = 0; i < nlocaledges; ++i) {
      int64_t c = column[i];
      column_swizzled[i] = SWIZZLE_VERTEX(c);
    }
  }
#endif

  unsigned long* restrict in_queue = g_in_queue;
  memset(in_queue, 0, global_queue_size * sizeof(unsigned long));
  unsigned long* restrict in_queue_summary = g_in_queue_summary;
  memset(in_queue_summary, 0, global_queue_summary_size * sizeof(unsigned long));
  unsigned long* restrict out_queue = g_out_queue;
  unsigned long* restrict out_queue_summary = g_out_queue_summary;
  unsigned long* restrict visited = g_visited;
  memset(visited, 0, local_queue_size * sizeof(unsigned long));

  /* Set/test bits in the replicated input queue (with its summary bitmap) and
   * in the local visited/output bitmaps. */
#define SET_IN(v) do {int64_t vs = SWIZZLE_VERTEX(v); size_t word_idx = vs / ulong_bits; int bit_idx = vs % ulong_bits; unsigned long mask = (1UL << bit_idx); in_queue_summary[word_idx / ulong_bits] |= (1UL << (word_idx % ulong_bits)); in_queue[word_idx] |= mask;} while (0)
#define TEST_IN(vs) (((in_queue_summary[vs / ulong_bits / ulong_bits] & (1UL << ((vs / ulong_bits) % ulong_bits))) != 0) && ((in_queue[vs / ulong_bits] & (1UL << (vs % ulong_bits))) != 0))
#define TEST_VISITED_LOCAL(v) ((visited[(v) / ulong_bits] & (1UL << ((v) % ulong_bits))) != 0)
// #define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); __sync_fetch_and_or(&visited[word_idx], mask); __sync_fetch_and_or(&out_queue[word_idx], mask);} while (0)
#define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); visited[word_idx] |= mask; out_queue[word_idx] |= mask;} while (0)

  SET_IN(root);
  {
    ptrdiff_t i;
    _Pragma("omp parallel for schedule(static)")
    for (i = 0; i < nlocalverts; ++i) pred[i] = -1;
  }
  if (VERTEX_OWNER(root) == rank) {
    pred[VERTEX_LOCAL(root)] = root;
    SET_VISITED_LOCAL(VERTEX_LOCAL(root));
  }

  uint16_t cur_level = 0;
  while (1) {
    ++cur_level;
#if 0
    if (rank == 0) fprintf(stderr, "BFS level %" PRIu16 "\n", cur_level);
#endif
    memset(out_queue, 0, local_queue_size * sizeof(unsigned long));
    // memset(out_queue_summary, 0, local_queue_summary_size * sizeof(unsigned long));
    ptrdiff_t i, ii;
#if 0
#pragma omp parallel for schedule(static)
    for (i = 0; i < global_queue_summary_size; ++i) {
      unsigned long val = 0UL;
      int j;
      unsigned long mask = 1UL;
      for (j = 0; j < ulong_bits; ++j, mask <<= 1) {
        if (in_queue[i * ulong_bits + j]) val |= mask;
      }
      in_queue_summary[i] = val;
    }
#endif
    unsigned long not_done = 0;

    /* Bottom-up step: each unvisited local vertex scans its neighbors and
     * claims the first one found in the current frontier as its parent.
     * Chunking by ulong_bits keeps each thread on whole bitmap words. */
#pragma omp parallel for schedule(static) reduction(|:not_done)
    for (ii = 0; ii < nlocalverts; ii += ulong_bits) {
      size_t i, i_end = ii + ulong_bits;
      if (i_end > nlocalverts) i_end = nlocalverts;
      for (i = ii; i < i_end; ++i) {
        if (!TEST_VISITED_LOCAL(i)) {
          size_t j, j_end = rowstarts[i + 1];
          for (j = rowstarts[i]; j < j_end; ++j) {
            int64_t v1 = column[j];
            int64_t v1_swizzled = SWIZZLE_VERTEX(v1);
            if (TEST_IN(v1_swizzled)) {
              /* Pack the BFS level into the top 16 bits of the predecessor. */
              pred[i] = (v1 & INT64_C(0xFFFFFFFFFFFF)) | ((int64_t)cur_level << 48);
              not_done |= 1;
              SET_VISITED_LOCAL(i);
              break;
            }
          }
        }
      }
    }

#if 1
    /* Fold the new frontier into the visited bitmap and build its summary. */
#pragma omp parallel for schedule(static)
    for (i = 0; i < local_queue_summary_size; ++i) {
      unsigned long val = 0UL;
      int j;
      unsigned long mask = 1UL;
      for (j = 0; j < ulong_bits; ++j, mask <<= 1) {
        unsigned long full_val = out_queue[i * ulong_bits + j];
        visited[i * ulong_bits + j] |= full_val;
        if (full_val) val |= mask;
      }
      out_queue_summary[i] = val;
      // not_done |= val;
    }
#endif

    MPI_Allreduce(MPI_IN_PLACE, &not_done, 1, MPI_UNSIGNED_LONG, MPI_BOR, MPI_COMM_WORLD);
    if (not_done == 0) break;
    /* Replicate every rank's new frontier (and its summary) into the input
     * queue for the next level. */
    MPI_Allgather(out_queue, local_queue_size, MPI_UNSIGNED_LONG, in_queue, local_queue_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);
    MPI_Allgather(out_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, in_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);
  }
  deallocate_memory();
}
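/* Illustrative sketch, not part of the original code: decoding the packed
 * pred[] entries written in the search loop above.  Each such entry keeps the
 * predecessor vertex in the low 48 bits and the BFS level in the high 16 bits
 * (the root's own entry and the -1 "unreached" sentinel are stored unpacked).
 * The helper names below are hypothetical. */
#if 0
static inline int64_t pred_entry_vertex(int64_t p) {
  return p & INT64_C(0xFFFFFFFFFFFF);    /* low 48 bits: predecessor vertex */
}
static inline uint16_t pred_entry_level(int64_t p) {
  return (uint16_t)((uint64_t)p >> 48);  /* high 16 bits: BFS level */
}
#endif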