/* This version is for sequential machines, OpenMP, and the XMT. */
void scramble_edges_shared(uint64_t userseed1, uint64_t userseed2,
                           int64_t nedges,
                           int64_t* result /* Input and output array of edges (size = 2 * nedges) */) {
  mrg_state st;
  uint_fast32_t seed[5];
  int64_t* new_result;
  int64_t i;
  int64_t* perm = (int64_t*)xmalloc(nedges * sizeof(int64_t));
  make_mrg_seed(userseed1, userseed2, seed);
  mrg_seed(&st, seed);
  mrg_skip(&st, 5, 0, 0); /* To make offset different from other PRNG uses */
  rand_sort_shared(&st, nedges, perm);
  new_result = (int64_t*)xmalloc(nedges * 2 * sizeof(int64_t));
#ifdef __MTA__
#pragma mta assert parallel
#pragma mta block schedule
#endif
#ifdef GRAPH_GENERATOR_OMP
#pragma omp parallel for
#endif
  for (i = 0; i < nedges; ++i) {
    int64_t p = perm[i];
    new_result[i * 2 + 0] = result[p * 2 + 0];
    new_result[i * 2 + 1] = result[p * 2 + 1];
  }
  free(perm);
  memcpy(result, new_result, nedges * 2 * sizeof(int64_t));
  free(new_result);
}
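/* Illustrative usage sketch (not part of the original source): scramble a
 * tiny hand-built edge list in place.  The seeds are arbitrary; the same
 * (userseed1, userseed2, nedges) always yields the same permutation, which is
 * what makes runs reproducible. */
void example_scramble_edges_shared(void) {
  int64_t edges[6] = {0, 1,  1, 2,  2, 0}; /* 3 edges stored as (v0, v1) pairs */
  scramble_edges_shared(1, 2, 3, edges);
  /* edges now holds the same 3 edges in a pseudorandom order. */
}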
void make_graph(int log_numverts, int64_t M, uint64_t userseed1, uint64_t userseed2,
                int64_t* nedges_ptr, packed_edge** result_ptr) {
  int rank, size;

  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int64_t start_idx, end_idx;
  compute_edge_range(rank, size, M, &start_idx, &end_idx);
  int64_t nedges = end_idx - start_idx;

  packed_edge* local_edges = (packed_edge*)xmalloc(nedges * sizeof(packed_edge));

  double start = MPI_Wtime();
  generate_kronecker_range(seed, log_numverts, start_idx, end_idx, local_edges);
  double gen_time = MPI_Wtime() - start;

  *result_ptr = local_edges;
  *nedges_ptr = nedges;

  if (rank == 0) {
    fprintf(stdout, "graph_generation: %f s\n", gen_time);
  }
}
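/* Hypothetical sketch (an assumption; the real compute_edge_range lives
 * elsewhere in the source tree) of the block partitioning this function
 * relies on: split the global range [0, M) into near-equal contiguous
 * blocks, giving the first M % size ranks one extra edge each. */
static void compute_edge_range_sketch(int rank, int size, int64_t M,
                                      int64_t* start_idx, int64_t* end_idx) {
  int64_t base = M / size;
  int64_t extra = M % size;
  *start_idx = rank * base + (rank < extra ? rank : extra);
  *end_idx = *start_idx + base + (rank < extra ? 1 : 0);
}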
void make_graph(int log_numverts, int64_t M, uint64_t userseed1, uint64_t userseed2,
                int64_t* nedges_ptr_in, packed_edge** result_ptr_in) {
  /* Add restrict to input pointers. */
  int64_t* restrict nedges_ptr = nedges_ptr_in;
  packed_edge* restrict* restrict result_ptr = result_ptr_in;

  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);

  int fd = 1; /* Write generated edges to stdout. */
  /*
  if (dumpname)
    fd = open(dumpname, O_WRONLY|O_CREAT|O_TRUNC, 0666);
  else
    fd = 1;
  if (fd < 0) {
    fprintf(stderr, "Cannot open output file : %s\n", (dumpname ? dumpname : "stdout"));
    exit(EXIT_FAILURE);
  }
  */

  /* Generate and write the edges in fixed-size chunks so memory use stays
   * bounded regardless of M. */
  const int64_t edges_per_time = 10000000;
  int64_t edge_start = 0;
  for (int64_t iter = edges_per_time; ; iter += edges_per_time) {
    if (iter % 100000000 == 0) {
      fprintf(stderr, "Made: %" PRId64 "/%" PRId64 "\n", iter, M);
    }
    if (iter > M) iter = M;
    *nedges_ptr = iter;
    packed_edge* edges = (packed_edge*)xmalloc((iter - edge_start) * sizeof(packed_edge));
    *result_ptr = edges;
    /* In OpenMP and XMT versions, the inner loop in generate_kronecker_range is
     * parallel. */
    generate_kronecker_range(seed, log_numverts, edge_start, iter, edges);
    write(fd, edges, (iter - edge_start) * sizeof(packed_edge));
    edge_start = iter;
    /* Each chunk is freed after being written, so on return *result_ptr no
     * longer points at valid edge data; the edges live in the output file. */
    free(edges);
    edges = NULL;
    if (iter == M) break;
  }
  close(fd);
}
void make_graph(int log_numverts, int64_t M, uint64_t userseed1, uint64_t userseed2,
                int64_t* nedges_ptr_in, packed_edge** result_ptr_in) {
  /* Add restrict to input pointers. */
  int64_t* restrict nedges_ptr = nedges_ptr_in;
  packed_edge* restrict* restrict result_ptr = result_ptr_in;

  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);

  *nedges_ptr = M;
  packed_edge* edges = (packed_edge*)xmalloc(M * sizeof(packed_edge));
  *result_ptr = edges;

  /* In OpenMP and XMT versions, the inner loop in generate_kronecker_range is
   * parallel. */
  generate_kronecker_range(seed, log_numverts, 0, M, edges);
}
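/* Minimal usage sketch (an assumption, not from the original source):
 * generate a small Kronecker graph and print its edges.  Assumes the
 * packed_edge accessors get_v0_from_edge / get_v1_from_edge used elsewhere in
 * this code, plus <stdio.h> and <inttypes.h>. */
void example_make_graph(void) {
  int64_t nedges;
  packed_edge* edges;
  int log_numverts = 4;                      /* 2^4 = 16 vertices */
  make_graph(log_numverts, 16 * 16, 1, 2, &nedges, &edges); /* edgefactor 16 */
  for (int64_t i = 0; i < nedges; ++i) {
    printf("%" PRId64 " %" PRId64 "\n",
           get_v0_from_edge(&edges[i]), get_v1_from_edge(&edges[i]));
  }
  free(edges); /* Caller owns the xmalloc'd buffer. */
}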
/* PRNG interface for implementations; takes seed in same format as given by
 * users, and creates a vector of doubles in a reproducible (and
 * random-access) way. */
void make_random_numbers(
    /* in */ int64_t nvalues    /* Number of values to generate */,
    /* in */ uint64_t userseed1 /* Arbitrary 64-bit seed value */,
    /* in */ uint64_t userseed2 /* Arbitrary 64-bit seed value */,
    /* in */ int64_t position   /* Start index in random number stream */,
    /* out */ double* result    /* Returned array of values */) {
  int64_t i;
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);
  mrg_state st;
  mrg_seed(&st, seed);
  mrg_skip(&st, 2, 0, 2 * position); /* Each double takes two PRNG outputs */
  for (i = 0; i < nvalues; ++i) {
    result[i] = mrg_get_double_orig(&st);
  }
}
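/* Usage sketch (not part of the original source) demonstrating the
 * random-access property noted above: generating positions 0..9 in one call,
 * or positions 5..9 by starting at position 5, must yield identical values
 * because the stream is skipped deterministically to the start position.
 * Assumes <assert.h>. */
void example_random_access(void) {
  double all[10], tail[5];
  make_random_numbers(10, 1, 2, 0, all);  /* stream positions 0..9 */
  make_random_numbers(5, 1, 2, 5, tail);  /* stream positions 5..9 */
  for (int i = 0; i < 5; ++i) assert(all[5 + i] == tail[i]);
}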
/* For MPI distributed memory. */
void scramble_edges_mpi(MPI_Comm comm,
                        const uint64_t userseed1, const uint64_t userseed2,
                        const int64_t local_nedges_in,
                        const int64_t* const local_edges_in,
                        int64_t* const local_nedges_out_ptr,
                        int64_t** const local_edges_out_ptr /* Allocated using xmalloc() by scramble_edges_mpi */) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  mrg_state st;
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);
  mrg_seed(&st, seed);
  mrg_skip(&st, 5, 0, 0); /* To make offset different from other PRNG uses */
  int64_t total_nedges;
  MPI_Allreduce((void*)&local_nedges_in, &total_nedges, 1, INT64_T_MPI_TYPE, MPI_SUM, comm);
  int64_t local_nedges_out; /* = local permutation size */
  int64_t* local_perm;
  rand_sort_mpi(comm, &st, total_nedges, &local_nedges_out, &local_perm);
  *local_nedges_out_ptr = local_nedges_out;

  /* Gather permutation information and fast owner lookup cache (code in
   * apply_permutation_mpi.c). */
  int64_t* edge_displs = (int64_t*)xmalloc((size + 1) * sizeof(int64_t));
  int* edge_owner_table;
  int64_t* edge_owner_cutoff;
  int lg_minedgecount;
  int64_t maxedgecount;
  gather_block_distribution_info(comm, local_nedges_in, total_nedges, edge_displs,
                                 &edge_owner_table, &edge_owner_cutoff,
                                 &lg_minedgecount, &maxedgecount);

  /* Originally from apply_permutation_mpi.c */
#define LOOKUP_EDGE_OWNER(v) \
  (edge_owner_table[(v) >> lg_minedgecount] + \
   ((v) >= edge_owner_cutoff[(v) >> lg_minedgecount]))

  /* Apply permutation.  Output distribution is same as distribution of
   * generated edge permutation. */
  /* Count number of requests to send to each destination. */
  int* send_counts = (int*)xcalloc(size, sizeof(int)); /* Uses zero-init */
  int64_t i;
  for (i = 0; i < local_nedges_out; ++i) {
    ++send_counts[LOOKUP_EDGE_OWNER(local_perm[i])];
  }
  /* Prefix sum to get displacements. */
  int* send_displs = (int*)xmalloc((size + 1) * sizeof(int));
  send_displs[0] = 0;
  for (i = 0; i < size; ++i) {
    send_displs[i + 1] = send_displs[i] + send_counts[i];
  }
  assert (send_displs[size] == local_nedges_out);
  /* Put edges into buffer by destination; also keep around index values for
   * where to write the result. */
  int64_t* sendbuf = (int64_t*)xmalloc(local_nedges_out * sizeof(int64_t));
  int64_t* reply_loc_buf = (int64_t*)xmalloc(local_nedges_out * sizeof(int64_t));
  int* send_offsets = (int*)xmalloc((size + 1) * sizeof(int));
  memcpy(send_offsets, send_displs, (size + 1) * sizeof(int));
  for (i = 0; i < local_nedges_out; ++i) {
    int write_index = send_offsets[LOOKUP_EDGE_OWNER(local_perm[i])];
    sendbuf[write_index] = local_perm[i];
    reply_loc_buf[write_index] = i;
    ++send_offsets[LOOKUP_EDGE_OWNER(local_perm[i])];
  }
  for (i = 0; i < size; ++i) assert (send_offsets[i] == send_displs[i + 1]);
  free(send_offsets); send_offsets = NULL;
  free(local_perm); local_perm = NULL;
#undef LOOKUP_EDGE_OWNER
  free(edge_owner_table); edge_owner_table = NULL;
  free(edge_owner_cutoff); edge_owner_cutoff = NULL;
  /* Find out how many requests I will be receiving. */
  int* recv_counts = (int*)xmalloc(size * sizeof(int));
  MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm);
  /* Compute their displacements. */
  int* recv_displs = (int*)xmalloc((size + 1) * sizeof(int));
  recv_displs[0] = 0;
  for (i = 0; i < size; ++i) {
    recv_displs[i + 1] = recv_displs[i] + recv_counts[i];
  }
  /* Make receive and reply buffers. */
  int64_t* recvbuf = (int64_t*)xmalloc(recv_displs[size] * sizeof(int64_t));
  int64_t* replybuf = (int64_t*)xmalloc(recv_displs[size] * 2 * sizeof(int64_t));
  /* Move requests for edges into receive buffer. */
  MPI_Alltoallv(sendbuf, send_counts, send_displs, INT64_T_MPI_TYPE,
                recvbuf, recv_counts, recv_displs, INT64_T_MPI_TYPE, comm);
  free(sendbuf); sendbuf = NULL;
  /* Put requested edges into response buffer. */
  int64_t my_edge_offset = edge_displs[rank];
  for (i = 0; i < recv_displs[size]; ++i) {
    replybuf[i * 2 + 0] = local_edges_in[(recvbuf[i] - my_edge_offset) * 2 + 0];
    replybuf[i * 2 + 1] = local_edges_in[(recvbuf[i] - my_edge_offset) * 2 + 1];
  }
  free(recvbuf); recvbuf = NULL;
  free(edge_displs); edge_displs = NULL;
  /* Send replies back. */
  int64_t* reply_edges = (int64_t*)xmalloc(local_nedges_out * 2 * sizeof(int64_t));
  for (i = 0; i < size; ++i) {
    /* Sending back two values for each request */
    recv_counts[i] *= 2;
    recv_displs[i] *= 2;
    send_counts[i] *= 2;
    send_displs[i] *= 2;
  }
  MPI_Alltoallv(replybuf, recv_counts, recv_displs, INT64_T_MPI_TYPE,
                reply_edges, send_counts, send_displs, INT64_T_MPI_TYPE, comm);
  free(replybuf); replybuf = NULL;
  free(recv_counts); recv_counts = NULL;
  free(recv_displs); recv_displs = NULL;
  free(send_counts); send_counts = NULL;
  free(send_displs); send_displs = NULL;
  /* Make output array of edges. */
  int64_t* local_edges_out = (int64_t*)xmalloc(local_nedges_out * 2 * sizeof(int64_t));
  *local_edges_out_ptr = local_edges_out;
  /* Put edges into output array. */
  for (i = 0; i < local_nedges_out; ++i) {
    local_edges_out[reply_loc_buf[i] * 2 + 0] = reply_edges[2 * i + 0];
    local_edges_out[reply_loc_buf[i] * 2 + 1] = reply_edges[2 * i + 1];
  }
  free(reply_loc_buf); reply_loc_buf = NULL;
  free(reply_edges); reply_edges = NULL;
}
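/* Reference sketch (an assumption, for clarity only): LOOKUP_EDGE_OWNER above
 * is a table-accelerated version of this linear scan over the block
 * distribution, where rank r owns global edge indices in
 * [edge_displs[r], edge_displs[r + 1]). */
static int edge_owner_linear(const int64_t* edge_displs, int size, int64_t v) {
  int r;
  for (r = 0; r < size; ++r) {
    if (v >= edge_displs[r] && v < edge_displs[r + 1]) return r;
  }
  return -1; /* v outside [edge_displs[0], edge_displs[size]) */
}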
int main(int argc, char** argv) {
  struct options options;
  if (process_options(argc, argv, true, &options) != 0) return 0;

  if (options.rmat.a + options.rmat.b + options.rmat.c >= 1) {
    printf("Error: A + B + C must be less than 1 so that D = 1 - (A + B + C) is positive\n");
    return 0;
  }
  double d = 1 - (options.rmat.a + options.rmat.b + options.rmat.c);

  xscale_node = options.rmat.xscale_node;
  xscale_interval = options.rmat.xscale_interval;

  uint_fast32_t seed[5];
  make_mrg_seed(options.rng.userseed1, options.rng.userseed2, seed);
  mrg_state state;
  mrg_seed(&state, seed);
  //mrg_skip(&new_state, 50, 7, 0); // Do an initial skip?

  /* This node generates every xscale_interval-th edge; round up when the
   * remainder covers this node's slot. */
  edge_t total_edges = options.rmat.edges;
  if ((total_edges % options.rmat.xscale_interval) > options.rmat.xscale_node) {
    total_edges /= options.rmat.xscale_interval;
    total_edges++;
  } else {
    total_edges /= options.rmat.xscale_interval;
  }

  if (options.global.symmetric) {
    total_edges *= 2;
  }

  printf("Generator type: R-MAT\n");
  printf("Scale: %d (%" PRIu64 " vertices)\n", options.rmat.scale, ((uint64_t)1 << options.rmat.scale));
  printf("Edges: %" PRIet "\n", total_edges);
  printf("Probabilities: A=%4.2f, B=%4.2f, C=%4.2f, D=%4.2f\n",
         options.rmat.a, options.rmat.b, options.rmat.c, d);

  double start = get_time();

  // io thread
  size_t buffer_size = calculate_buffer_size(options.global.buffer_size);
  buffer_queue flushq;
  buffer_manager manager(&flushq, options.global.buffers_per_thread, buffer_size);
  io_thread_func io_func(options.global.graphname.c_str(), total_edges, &flushq, &manager, buffer_size);
  boost::thread io_thread(boost::ref(io_func));

  // worker threads
  int nthreads = options.global.nthreads;
  edge_t edges_per_thread = options.rmat.edges / nthreads;
  threadid_t* workers[nthreads];
  boost::thread* worker_threads[nthreads];
  for (int i = 0; i < nthreads; i++) {
    workers[i] = new threadid_t(i);
    thread_buffer* buffer = manager.register_thread(*workers[i]);
    // last thread gets the remainder (if any)
    edge_t start = i * edges_per_thread;
    edge_t end = (i == nthreads - 1) ? (options.rmat.edges) : ((i + 1) * edges_per_thread);
    worker_threads[i] = new boost::thread(generate, buffer, state, options.rmat.scale,
                                          start, end,
                                          options.rmat.a, options.rmat.b, options.rmat.c,
                                          /*d,*/ options.global.symmetric);
  }

  // Wait until work completes
  for (int i = 0; i < nthreads; i++) {
    worker_threads[i]->join();
  }
  io_func.stop();
  io_thread.join();

  // cleanup
  for (int i = 0; i < nthreads; i++) {
    manager.unregister_thread(*workers[i]);
    delete worker_threads[i];
    delete workers[i];
  }

  double elapsed = get_time() - start;
  printf("Generation time: %fs\n", elapsed);

  make_ini_file(options.global.graphname.c_str(), (uint64_t)1 << options.rmat.scale, total_edges);
  return 0;
}
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  setup_globals();

  /* Parse arguments. */
  int SCALE = 16;
  int edgefactor = 16; /* nedges / nvertices, i.e., 2*avg. degree */
  char* name = argv[1];
  if (argc >= 3) SCALE = atoi(argv[2]);
  if (argc >= 4) edgefactor = atoi(argv[3]);
  if (argc <= 2 || argc >= 5 || SCALE == 0 || edgefactor == 0) {
    if (rank == 0) {
      fprintf(stderr, "Usage: %s filename SCALE edgefactor\n  SCALE = log_2(# vertices) [integer, required]\n  edgefactor = (# edges) / (# vertices) = .5 * (average vertex degree) [integer, defaults to 16]\n(Random number seed and Kronecker initiator are in main.c)\n", argv[0]);
    }
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  uint64_t seed1 = 2, seed2 = 3;

  const char* filename = name;
  /* If filename is NULL, store data in memory */
  tuple_graph tg;
  tg.nglobaledges = (int64_t)(edgefactor) << SCALE;
  int64_t nglobalverts = (int64_t)(1) << SCALE;

  tg.data_in_file = (filename != NULL);

  if (tg.data_in_file) {
    printf("data in file\n");
    MPI_File_set_errhandler(MPI_FILE_NULL, MPI_ERRORS_ARE_FATAL);
    /* Originally MPI_MODE_DELETE_ON_CLOSE was also set, making the edge file
     * temporary; it is omitted here so the file persists. */
    MPI_File_open(MPI_COMM_WORLD, (char*)filename,
                  MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_UNIQUE_OPEN,
                  MPI_INFO_NULL, &tg.edgefile);
    MPI_File_set_size(tg.edgefile, tg.nglobaledges * sizeof(packed_edge));
    MPI_File_set_view(tg.edgefile, 0, packed_edge_mpi_type, packed_edge_mpi_type, "native", MPI_INFO_NULL);
    MPI_File_set_atomicity(tg.edgefile, 0);
  }

  /* Make the raw graph edges. */
  /* Get roots for BFS runs, plus maximum vertex with non-zero degree (used by
   * validator). */
  int num_bfs_roots = 64;
  int64_t* bfs_roots = (int64_t*)xmalloc(num_bfs_roots * sizeof(int64_t));
  int64_t max_used_vertex = 0;

  double make_graph_start = MPI_Wtime();
  {
    /* Spread the two 64-bit numbers into five nonzero values in the correct
     * range. */
    uint_fast32_t seed[5];
    make_mrg_seed(seed1, seed2, seed);

    /* As the graph is being generated, also keep a bitmap of vertices with
     * incident edges.  We keep a grid of processes, each row of which has a
     * separate copy of the bitmap (distributed among the processes in the
     * row), and then do an allreduce at the end.  This scheme is used to
     * avoid non-local communication and reading the file separately just to
     * find BFS roots. */
    MPI_Offset nchunks_in_file = (tg.nglobaledges + FILE_CHUNKSIZE - 1) / FILE_CHUNKSIZE;
    int64_t bitmap_size_in_bytes = int64_min(BITMAPSIZE, (nglobalverts + CHAR_BIT - 1) / CHAR_BIT);
    if (bitmap_size_in_bytes * size * CHAR_BIT < nglobalverts) {
      bitmap_size_in_bytes = (nglobalverts + size * CHAR_BIT - 1) / (size * CHAR_BIT);
    }
    int ranks_per_row = ((nglobalverts + CHAR_BIT - 1) / CHAR_BIT + bitmap_size_in_bytes - 1) / bitmap_size_in_bytes;
    int nrows = size / ranks_per_row;
    int my_row = -1, my_col = -1;
    unsigned char* restrict has_edge = NULL;
    MPI_Comm cart_comm;
    {
      int dims[2] = {size / ranks_per_row, ranks_per_row};
      int periods[2] = {0, 0};
      MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &cart_comm);
    }
    int in_generating_rectangle = 0;
    if (cart_comm != MPI_COMM_NULL) {
      in_generating_rectangle = 1;
      {
        int dims[2], periods[2], coords[2];
        MPI_Cart_get(cart_comm, 2, dims, periods, coords);
        my_row = coords[0];
        my_col = coords[1];
      }
      MPI_Comm this_col;
      MPI_Comm_split(cart_comm, my_col, my_row, &this_col);
      MPI_Comm_free(&cart_comm);
      has_edge = (unsigned char*)xMPI_Alloc_mem(bitmap_size_in_bytes);
      memset(has_edge, 0, bitmap_size_in_bytes);
      /* Every rank in a given row creates the same vertices (for updating the
       * bitmap); only one writes them to the file (or final memory buffer). */
      packed_edge* buf = (packed_edge*)xmalloc(FILE_CHUNKSIZE * sizeof(packed_edge));
      MPI_Offset block_limit = (nchunks_in_file + nrows - 1) / nrows;
      if (tg.data_in_file) {
        tg.edgememory_size = 0;
        tg.edgememory = NULL;
      } else {
        int my_pos = my_row + my_col * nrows;
        int last_pos = (tg.nglobaledges % ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row) != 0) ?
                       (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row) : -1;
        int64_t edges_left = tg.nglobaledges % FILE_CHUNKSIZE;
        int64_t nedges = FILE_CHUNKSIZE * (tg.nglobaledges / ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row)) +
                         FILE_CHUNKSIZE * (my_pos < (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row)) +
                         (my_pos == last_pos ? edges_left : 0);
        tg.edgememory_size = nedges;
        tg.edgememory = (packed_edge*)xmalloc(nedges * sizeof(packed_edge));
      }
      MPI_Offset block_idx;
      for (block_idx = 0; block_idx < block_limit; ++block_idx) {
        MPI_Offset start_edge_index = int64_min(FILE_CHUNKSIZE * (block_idx * nrows + my_row), tg.nglobaledges);
        MPI_Offset edge_count = int64_min(tg.nglobaledges - start_edge_index, FILE_CHUNKSIZE);
        packed_edge* actual_buf = (!tg.data_in_file && block_idx % ranks_per_row == my_col) ?
                                  tg.edgememory + FILE_CHUNKSIZE * (block_idx / ranks_per_row) :
                                  buf;
        if (!tg.data_in_file && block_idx % ranks_per_row == my_col) {
          assert (FILE_CHUNKSIZE * (block_idx / ranks_per_row) + edge_count <= tg.edgememory_size);
        }
        generate_kronecker_range(seed, SCALE, start_edge_index, start_edge_index + edge_count, actual_buf);
        if (tg.data_in_file && my_col == (block_idx % ranks_per_row)) {
          /* Try to spread writes among ranks */
          MPI_File_write_at(tg.edgefile, start_edge_index, actual_buf, edge_count,
                            packed_edge_mpi_type, MPI_STATUS_IGNORE);
        }
        ptrdiff_t i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (i = 0; i < edge_count; ++i) {
          int64_t src = get_v0_from_edge(&actual_buf[i]);
          int64_t tgt = get_v1_from_edge(&actual_buf[i]);
          if (src == tgt) continue;
          if (src / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(src / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (src % CHAR_BIT));
          }
          if (tgt / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(tgt / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (tgt % CHAR_BIT));
          }
        }
      }
      free(buf);
#if 0
      /* The allreduce for each root acts like we did this: */
      MPI_Allreduce(MPI_IN_PLACE, has_edge, bitmap_size_in_bytes, MPI_UNSIGNED_CHAR, MPI_BOR, this_col);
#endif
      MPI_Comm_free(&this_col);
    } else {
      tg.edgememory = NULL;
      tg.edgememory_size = 0;
    }
    MPI_Allreduce(&tg.edgememory_size, &tg.max_edgememory_size, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);

#ifndef GEN_ONLY
    /* Find roots and max used vertex */
    {
      uint64_t counter = 0;
      int bfs_root_idx;
      for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
        int64_t root;
        while (1) {
          double d[2];
          make_random_numbers(2, seed1, seed2, counter, d);
          root = (int64_t)((d[0] + d[1]) * nglobalverts) % nglobalverts;
          counter += 2;
          if (counter > 2 * nglobalverts) break;
          int is_duplicate = 0;
          int i;
          for (i = 0; i < bfs_root_idx; ++i) {
            if (root == bfs_roots[i]) {
              is_duplicate = 1;
              break;
            }
          }
          if (is_duplicate) continue; /* Everyone takes the same path here */
          int root_ok = 0;
          if (in_generating_rectangle && (root / CHAR_BIT / bitmap_size_in_bytes) == my_col) {
            root_ok = (has_edge[(root / CHAR_BIT) % bitmap_size_in_bytes] & (1 << (root % CHAR_BIT))) != 0;
          }
          MPI_Allreduce(MPI_IN_PLACE, &root_ok, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
          if (root_ok) break;
        }
        bfs_roots[bfs_root_idx] = root;
      }
      num_bfs_roots = bfs_root_idx;

      /* Find maximum non-zero-degree vertex. */
      {
        int64_t i;
        max_used_vertex = 0;
        if (in_generating_rectangle) {
          for (i = bitmap_size_in_bytes * CHAR_BIT; i > 0; --i) {
            if (i > nglobalverts) continue;
            if (has_edge[(i - 1) / CHAR_BIT] & (1 << ((i - 1) % CHAR_BIT))) {
              max_used_vertex = (i - 1) + my_col * CHAR_BIT * bitmap_size_in_bytes;
              break;
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &max_used_vertex, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
      }
    }
#endif
    if (in_generating_rectangle) {
      MPI_Free_mem(has_edge);
    }
    if (tg.data_in_file) {
      MPI_File_sync(tg.edgefile);
    }
  }
  double make_graph_stop = MPI_Wtime();
  double make_graph_time = make_graph_stop - make_graph_start;
  if (rank == 0) {
    /* Not an official part of the results */
    fprintf(stderr, "graph_generation: %f s\n", make_graph_time);
  }

#ifndef GEN_ONLY
  /* Make user's graph data structure. */
  double data_struct_start = MPI_Wtime();
  make_graph_data_structure(&tg);
  double data_struct_stop = MPI_Wtime();
  double data_struct_time = data_struct_stop - data_struct_start;
  if (rank == 0) {
    /* Not an official part of the results */
    fprintf(stderr, "construction_time: %f s\n", data_struct_time);
  }

  /* Number of edges visited in each BFS; a double so get_statistics can be
   * used directly. */
  double* edge_counts = (double*)xmalloc(num_bfs_roots * sizeof(double));

  /* Run BFS. */
  int validation_passed = 1;
  double* bfs_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  double* validate_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  uint64_t nlocalverts = get_nlocalverts_for_pred();
  int64_t* pred = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  int bfs_root_idx;
  for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
    int64_t root = bfs_roots[bfs_root_idx];
    if (rank == 0) fprintf(stderr, "Running BFS %d\n", bfs_root_idx);
    /* Clear the pred array. */
    memset(pred, 0, nlocalverts * sizeof(int64_t));
    /* Do the actual BFS. */
    double bfs_start = MPI_Wtime();
    run_bfs(root, &pred[0]);
    double bfs_stop = MPI_Wtime();
    bfs_times[bfs_root_idx] = bfs_stop - bfs_start;
    if (rank == 0) fprintf(stderr, "Time for BFS %d is %f\n", bfs_root_idx, bfs_times[bfs_root_idx]);
    /* Validate result. */
    if (rank == 0) fprintf(stderr, "Validating BFS %d\n", bfs_root_idx);
    double validate_start = MPI_Wtime();
    int64_t edge_visit_count;
    int validation_passed_one = validate_bfs_result(&tg, max_used_vertex + 1, nlocalverts, root, pred, &edge_visit_count);
    double validate_stop = MPI_Wtime();
    validate_times[bfs_root_idx] = validate_stop - validate_start;
    if (rank == 0) fprintf(stderr, "Validate time for BFS %d is %f\n", bfs_root_idx, validate_times[bfs_root_idx]);
    edge_counts[bfs_root_idx] = (double)edge_visit_count;
    if (rank == 0) fprintf(stderr, "TEPS for BFS %d is %g\n", bfs_root_idx, edge_visit_count / bfs_times[bfs_root_idx]);
    if (!validation_passed_one) {
      validation_passed = 0;
      if (rank == 0) fprintf(stderr, "Validation failed for this BFS root; skipping rest.\n");
      break;
    }
  }

  MPI_Free_mem(pred);
  free(bfs_roots);
  free_graph_data_structure();
#endif

  if (tg.data_in_file) {
    MPI_File_close(&tg.edgefile);
  } else {
    free(tg.edgememory);
    tg.edgememory = NULL;
  }

#ifndef GEN_ONLY
  /* Print results. */
  if (rank == 0) {
    if (!validation_passed) {
      fprintf(stdout, "No results printed for invalid run.\n");
    } else {
      int i;
      fprintf(stdout, "SCALE: %d\n", SCALE);
      fprintf(stdout, "edgefactor: %d\n", edgefactor);
      fprintf(stdout, "NBFS: %d\n", num_bfs_roots);
      fprintf(stdout, "graph_generation: %g\n", make_graph_time);
      fprintf(stdout, "num_mpi_processes: %d\n", size);
      fprintf(stdout, "construction_time: %g\n", data_struct_time);
      double stats[s_LAST];
      get_statistics(bfs_times, num_bfs_roots, stats);
      fprintf(stdout, "min_time: %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_time: %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_time: %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_time: %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_time: %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_time: %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_time: %g\n", stats[s_std]);
      get_statistics(edge_counts, num_bfs_roots, stats);
      fprintf(stdout, "min_nedge: %.11g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_nedge: %.11g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_nedge: %.11g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_nedge: %.11g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_nedge: %.11g\n", stats[s_maximum]);
      fprintf(stdout, "mean_nedge: %.11g\n", stats[s_mean]);
      fprintf(stdout, "stddev_nedge: %.11g\n", stats[s_std]);
      double* secs_per_edge = (double*)xmalloc(num_bfs_roots * sizeof(double));
      for (i = 0; i < num_bfs_roots; ++i) secs_per_edge[i] = bfs_times[i] / edge_counts[i];
      get_statistics(secs_per_edge, num_bfs_roots, stats);
      fprintf(stdout, "min_TEPS: %g\n", 1. / stats[s_maximum]);
      fprintf(stdout, "firstquartile_TEPS: %g\n", 1. / stats[s_thirdquartile]);
      fprintf(stdout, "median_TEPS: %g\n", 1. / stats[s_median]);
      fprintf(stdout, "thirdquartile_TEPS: %g\n", 1. / stats[s_firstquartile]);
      fprintf(stdout, "max_TEPS: %g\n", 1. / stats[s_minimum]);
      fprintf(stdout, "harmonic_mean_TEPS: %g\n", 1. / stats[s_mean]);
      /* Formula from:
       * Title: The Standard Errors of the Geometric and Harmonic Means and
       *        Their Application to Index Numbers
       * Author(s): Nilan Norris
       * Source: The Annals of Mathematical Statistics, Vol. 11, No. 4 (Dec., 1940), pp. 445-448
       * Publisher(s): Institute of Mathematical Statistics
       * Stable URL: http://www.jstor.org/stable/2235723
       * (same source as in specification). */
      fprintf(stdout, "harmonic_stddev_TEPS: %g\n",
              stats[s_std] / (stats[s_mean] * stats[s_mean] * sqrt(num_bfs_roots - 1)));
      free(secs_per_edge); secs_per_edge = NULL;
      free(edge_counts); edge_counts = NULL;
      get_statistics(validate_times, num_bfs_roots, stats);
      fprintf(stdout, "min_validate: %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_validate: %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_validate: %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_validate: %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_validate: %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_validate: %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_validate: %g\n", stats[s_std]);
#if 0
      for (i = 0; i < num_bfs_roots; ++i) {
        fprintf(stdout, "Run %3d: %g s, validation %g s\n", i + 1, bfs_times[i], validate_times[i]);
      }
#endif
    }
  }
  free(bfs_times);
  free(validate_times);
#endif

  cleanup_globals();
  MPI_Finalize();
  return 0;
}
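/* Worked sketch (not part of the original source) of the harmonic-mean
 * statistics printed above: TEPS is reported as 1 / mean(secs_per_edge), and
 * its standard deviation follows Norris (1940): s / (m^2 * sqrt(n - 1)),
 * where m and s are the mean and standard deviation of secs_per_edge and n is
 * the number of BFS runs.  Uses sqrt from <math.h>, already required above. */
static double harmonic_stddev_teps(double mean_secs_per_edge,
                                   double stddev_secs_per_edge, int nruns) {
  return stddev_secs_per_edge /
         (mean_secs_per_edge * mean_secs_per_edge * sqrt((double)(nruns - 1)));
}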
void make_graph(int log_numverts, int64_t desired_nedges, uint64_t userseed1, uint64_t userseed2,
                const double initiator[4], int64_t* nedges_ptr, int64_t** result_ptr) {
  int64_t N, M;
  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  generated_edge* edges;
#else
  int64_t* edges;
#endif
  int64_t nedges;
  int64_t* vertex_perm;
  int64_t* result;
  int64_t i;
  mrg_state state;
  int64_t v1;
  int64_t v2;

  N = (int64_t)pow(GRAPHGEN_INITIATOR_SIZE, log_numverts);
  M = desired_nedges;

  make_mrg_seed(userseed1, userseed2, seed);

  nedges = compute_edge_array_size(0, 1, M);
  *nedges_ptr = nedges;
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  edges = (generated_edge*)xcalloc(nedges, sizeof(generated_edge)); /* multiplicity set to 0 for unused edges */
#else
  edges = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#endif

  generate_kronecker(0, 1, seed, log_numverts, M, initiator, edges);

  vertex_perm = (int64_t*)xmalloc(N * sizeof(int64_t));
  /* result; AL: this is a needless warning about unused code. */
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  result = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#else
  result = edges;
#endif
  *result_ptr = result;

  mrg_seed(&state, seed);
  rand_sort_shared(&state, N, vertex_perm);

  /* Apply vertex permutation to graph, optionally copying into user's result
   * array. */
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  for (i = 0; i < nedges; ++i) {
    if (edges[i].multiplicity != 0) {
      v1 = vertex_perm[edges[i].src];
      v2 = vertex_perm[edges[i].tgt];
      /* Sort these since otherwise the directions of the permuted edges would
       * give away the unscrambled vertex order. */
      result[i * 2] = (v1 < v2) ? v1 : v2;
      result[i * 2 + 1] = (v1 < v2) ? v2 : v1;
    } else {
      result[i * 2] = result[i * 2 + 1] = (int64_t)(-1);
    }
  }
  free(edges);
#else
  for (i = 0; i < 2 * nedges; i += 2) {
    if (edges[i] != (int64_t)(-1)) {
      v1 = vertex_perm[edges[i]];
      v2 = vertex_perm[edges[i + 1]];
      /* Sort these since otherwise the directions of the permuted edges would
       * give away the unscrambled vertex order. */
      edges[i] = (v1 < v2) ? v1 : v2;
      edges[i + 1] = (v1 < v2) ? v2 : v1;
    }
  }
#endif
  free(vertex_perm);

  /* Randomly mix up the order of the edges.  Operate on result: under
   * GRAPHGEN_KEEP_MULTIPLICITIES the edges buffer has already been freed. */
  scramble_edges_shared(userseed1, userseed2, nedges, result);
}
void make_graph(int log_numverts, int64_t desired_nedges, uint64_t userseed1, uint64_t userseed2,
                const double initiator[4], int64_t* nedges_ptr, int64_t** result_ptr) {
  int64_t N, M;
  int rank, size;

  N = (int64_t)pow(GRAPHGEN_INITIATOR_SIZE, log_numverts);
  M = desired_nedges;

  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int64_t nedges = compute_edge_array_size(rank, size, M);
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  generated_edge* local_edges = (generated_edge*)xmalloc(nedges * sizeof(generated_edge));
#else
  int64_t* local_edges = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#endif

  double start = MPI_Wtime();
  generate_kronecker(rank, size, seed, log_numverts, M, initiator, local_edges);
  double gen_time = MPI_Wtime() - start;

  int64_t* local_vertex_perm = NULL;

  mrg_state state;
  mrg_seed(&state, seed);
  start = MPI_Wtime();
  int64_t perm_local_size;
  rand_sort_mpi(MPI_COMM_WORLD, &state, N, &perm_local_size, &local_vertex_perm);
  double perm_gen_time = MPI_Wtime() - start;

  /* Copy the edge endpoints into the result array if necessary. */
  int64_t* result;
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  result = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
  int64_t i;
  for (i = 0; i < nedges; ++i) {
    if (local_edges[i].multiplicity != 0) {
      result[i * 2] = local_edges[i].src;
      result[i * 2 + 1] = local_edges[i].tgt;
    } else {
      result[i * 2] = result[i * 2 + 1] = (int64_t)(-1);
    }
  }
  free(local_edges); local_edges = NULL;
#else
  result = local_edges;
  *result_ptr = result;
  local_edges = NULL; /* Freed by caller */
#endif

  /* Apply vertex permutation to graph. */
  start = MPI_Wtime();
  apply_permutation_mpi(MPI_COMM_WORLD, perm_local_size, local_vertex_perm, N, nedges, result);
  double perm_apply_time = MPI_Wtime() - start;

  free(local_vertex_perm); local_vertex_perm = NULL;

  /* Randomly mix up the order of the edges. */
  start = MPI_Wtime();
  int64_t* new_result;
  int64_t nedges_out;
  scramble_edges_mpi(MPI_COMM_WORLD, userseed1, userseed2, nedges, result, &nedges_out, &new_result);
  double edge_scramble_time = MPI_Wtime() - start;

  free(result); result = NULL;

  *result_ptr = new_result;
  *nedges_ptr = nedges_out;

  if (rank == 0) {
    fprintf(stdout, "unpermuted_graph_generation: %f s\n", gen_time);
    fprintf(stdout, "vertex_permutation_generation: %f s\n", perm_gen_time);
    fprintf(stdout, "vertex_permutation_application: %f s\n", perm_apply_time);
    fprintf(stdout, "edge_scrambling: %f s\n", edge_scramble_time);
  }
}
void make_graph(int log_numverts, int64_t desired_nedges, uint64_t userseed1, uint64_t userseed2,
                const double initiator[4], int64_t* nedges_ptr, int64_t** result_ptr) {
  int64_t N, M;

  N = (int64_t)pow(GRAPHGEN_INITIATOR_SIZE, log_numverts);
  M = (int64_t)desired_nedges;

  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);

  int64_t nedges = compute_edge_array_size(0, 1, M);
  *nedges_ptr = nedges;
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  generated_edge* edges = (generated_edge*)xcalloc(nedges, sizeof(generated_edge)); /* multiplicity set to 0 for unused edges */
#else
  int64_t* edges = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#endif

  int rank, size;
  /* The "for all streams" here is in compiler versions >= 6.4 */
#pragma mta use 100 streams
#pragma mta for all streams rank of size
  {
    double my_initiator[GRAPHGEN_INITIATOR_SIZE * GRAPHGEN_INITIATOR_SIZE]; /* Local copy */
    int i;
    for (i = 0; i < GRAPHGEN_INITIATOR_SIZE * GRAPHGEN_INITIATOR_SIZE; ++i) {
      my_initiator[i] = initiator[i];
    }
    generate_kronecker(rank, size, seed, log_numverts, M, my_initiator, edges);
  }

  int64_t* vertex_perm = (int64_t*)xmalloc(N * sizeof(int64_t));
  int64_t* result;
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  result = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#else
  result = edges;
#endif
  *result_ptr = result;

  mrg_state state;
  mrg_seed(&state, seed);
  rand_sort_shared(&state, N, vertex_perm);

  int64_t i;
  /* Apply vertex permutation to graph, optionally copying into user's result
   * array. */
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
#pragma mta assert parallel
#pragma mta block schedule
  for (i = 0; i < nedges; ++i) {
    if (edges[i].multiplicity != 0) {
      int64_t v1 = vertex_perm[edges[i].src];
      int64_t v2 = vertex_perm[edges[i].tgt];
      /* Sort these since otherwise the directions of the permuted edges would
       * give away the unscrambled vertex order. */
      result[i * 2] = MTA_INT_MIN(v1, v2);
      result[i * 2 + 1] = MTA_INT_MAX(v1, v2);
    } else {
      result[i * 2] = result[i * 2 + 1] = (int64_t)(-1);
    }
  }
  free(edges);
#else
#pragma mta assert parallel
#pragma mta block schedule
  for (i = 0; i < 2 * nedges; i += 2) {
    if (edges[i] != (int64_t)(-1)) {
      int64_t v1 = vertex_perm[edges[i]];
      int64_t v2 = vertex_perm[edges[i + 1]];
      /* Sort these since otherwise the directions of the permuted edges would
       * give away the unscrambled vertex order. */
      edges[i] = MTA_INT_MIN(v1, v2);
      edges[i + 1] = MTA_INT_MAX(v1, v2);
    }
  }
#endif
  free(vertex_perm);

  /* Randomly mix up the order of the edges.  Operate on result: under
   * GRAPHGEN_KEEP_MULTIPLICITIES the edges buffer has already been freed. */
  scramble_edges_shared(userseed1, userseed2, nedges, result);
}
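/* Hedged analogue (an assumption, not XMT code): on a non-XMT system, the
 * "#pragma mta for all streams rank of size" region above corresponds roughly
 * to an OpenMP parallel region in which each thread learns its own id and the
 * team size, then works on its share of the edge range. */
#ifdef _OPENMP
#include <omp.h>
void streams_analogue_sketch(void) {
#pragma omp parallel
  {
    int rank = omp_get_thread_num();
    int size = omp_get_num_threads();
    /* e.g. generate_kronecker(rank, size, seed, log_numverts, M, initiator, edges); */
    (void)rank; (void)size;
  }
}
#endif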