void bitonic_merge(_Iterator begin, _Iterator end, _Compare comp, const mxx::comm& comm, int pbeg, int pend, int dir) {
    // Recursive bitonic merge over the processor range [pbeg, pend) of `comm`,
    // handling non-power-of-two processor counts: processors are paired at
    // distance p2/2 (p2 = next power of two >= size); a processor whose
    // partner would fall outside [pbeg, pend) skips the split but still
    // participates in the recursive sub-merge.
    MXX_ASSERT(pbeg <= comm.rank() && comm.rank() < pend);
    // get size and terminate at recursive base-case
    int size = pend - pbeg;
    if (size <= 1)
        return;
    // next power of two >= size, computed with exact integer arithmetic.
    // (The original pow(2, ceil(log(size)/log(2))) relies on floating-point
    // rounding behaving perfectly at integer boundaries.)
    int p2 = 1;
    while (p2 < size)
        p2 <<= 1;
    // merge with splits as is done in the power of 2 case
    int pmid = pbeg + p2/2;
    if (comm.rank() < pmid && comm.rank() + p2/2 < pend) {
        // this processor has a partner in the second half
        int partner_rank = comm.rank() + p2/2;
        bitonic_split(begin, end, comp, comm, partner_rank, dir);
        bitonic_merge(begin, end, comp, comm, pbeg, pmid, dir);
    } else if (comm.rank() < pmid) {
        // this process doesn't have a partner but has to recursively
        // participate in the next merge
        bitonic_merge(begin, end, comp, comm, pbeg, pmid, dir);
    } else { // if (comm.rank() >= pmid)
        int partner_rank = comm.rank() - p2/2;
        bitonic_split(begin, end, comp, comm, partner_rank, dir);
        bitonic_merge(begin, end, comp, comm, pmid, pend, dir);
    }
}
static void unpack_envelope(MPI_Datatype type, flat_repr& f) {
    // Recursively walk an MPI datatype's construction tree via the
    // envelope/contents API, recording each builtin (named) leaf type at its
    // byte offset (tracked in f.cur_offset) into the flat map f.m.
    int num_ints, num_addr, num_dt, comb;
    MPI_Type_get_envelope(type, &num_ints, &num_addr, &num_dt, &comb);
    if (comb == MPI_COMBINER_NAMED) {
        // builtin leaf type: record it at the current offset and stop
        //std::cout << "Type: " << builtin_typename_map::get_typeid_name(type) << std::endl;
        f.m.emplace(f.cur_offset, type);
        return;
    }
    // allocate the output buffers for get_contents
    std::vector<int> ints;
    ints.resize(num_ints);
    std::vector<MPI_Aint> addrs;
    addrs.resize(num_addr);
    std::vector<MPI_Datatype> types;
    types.resize(num_dt);
    // use data() instead of &v[0]: some combiners report zero ints/addrs
    // (e.g. DUP asserts num_ints == 0 below) and &v[0] on an empty vector
    // is undefined behavior
    MPI_Type_get_contents(type, num_ints, num_addr, num_dt, ints.data(), addrs.data(), types.data());
    switch(comb) {
    case MPI_COMBINER_DUP:
        MXX_ASSERT(num_ints == 0 && num_addr == 0 && num_dt == 1);
        unpack_envelope(types[0], f);
        break;
    case MPI_COMBINER_CONTIGUOUS:
        // NOTE(review): only the first element's layout is unpacked; the
        // repeat count ints[0] is printed but offsets of the remaining
        // copies are not recorded — confirm whether callers rely on this
        std::cout << "Contiguous: " << ints[0] << " x ";
        unpack_envelope(types[0], f);
        break;
    case MPI_COMBINER_VECTOR:
    case MPI_COMBINER_HVECTOR:
    case MPI_COMBINER_INDEXED:
    case MPI_COMBINER_HINDEXED:
    case MPI_COMBINER_INDEXED_BLOCK:
    case MPI_COMBINER_HINDEXED_BLOCK:
        std::cout << "NOT YET SUPPORTED vector/indexed/indexed_block" << std::endl;
        break;
    case MPI_COMBINER_STRUCT:
    {
        // struct contents layout: ints = [count, blocklen_0..blocklen_{count-1}],
        // addrs = member displacements
        int count = ints[0];
        // block lengths span ints[1]..ints[count]; the original end pointer
        // (&ints[0]+count) stopped one element short, copying count-1 lengths
        std::vector<int> blen(&ints[1], &ints[1] + count);
        std::vector<MPI_Aint> displ = addrs;
        std::cout << "Struct: " << std::endl;
        // recurse into each member at its displaced offset, then restore
        // the offset for the caller
        MPI_Aint offset = f.cur_offset;
        for (int i = 0; i < count; ++i) {
            f.cur_offset = offset + displ[i];
            unpack_envelope(types[i], f);
        }
        f.cur_offset = offset;
    }
        break;
    case MPI_COMBINER_RESIZED:
        // TODO: lb/extent from addrs[0]/addrs[1] are printed but not applied
        std::cout << "resized to [" << addrs[0] << "," << addrs[1] << "): " << std::endl;
        unpack_envelope(types[0], f);
        break;
    case MPI_COMBINER_SUBARRAY:
    case MPI_COMBINER_DARRAY:
        std::cout << "NOT YET SUPPORTED subarray/darray" << std::endl;
        break;
    }
}
size_t offset_from_ptr(M* m) {
    // Recover the byte offset of a member of type M within the aggregate
    // type U, where `m` is a member pointer rebased at address 0
    // (offsetof-style trick).
    // NOTE(review): relies on implementation-defined pointer-to-integer
    // conversion; verify that `m` really originates from a null-based object.
    size_t offset = reinterpret_cast<size_t>(m);
    // size_t is unsigned, so the original `0 <= offset` clause was a
    // tautology; only the upper bound is meaningful — the member must lie
    // entirely within U.
    MXX_ASSERT(offset + sizeof(M) <= sizeof(U));
    return offset;
}
int main(int argc, char* argv[]) {
    // Benchmark driver: validates the process layout, parses
    // "[-m <GB-per-node>] [output.csv]" arguments, and runs bm_all2all over
    // all power-of-two node / processors-per-node combinations.
    mxx::env e(argc, argv);
    mxx::comm comm;
    // print out node and rank distribution
    mxx::print_node_distribution(comm);
    // create shared-mem MPI+MPI hybrid communicator
    mxx::hybrid_comm hc(comm);
    // assert same number processors per node
    int proc_per_node = hc.local.size();
    if (!mxx::all_same(proc_per_node, comm)) {
        std::cerr << "Error: this benchmark assumes the same number of processors per node" << std::endl;
        MPI_Abort(comm, -1);
    }
    // assert we have an even number of nodes
    int num_nodes = hc.num_nodes();
    if (num_nodes > 1 && num_nodes % 2 != 0) {
        std::cerr << "Error: this benchmark assumes an even number of nodes" << std::endl;
        MPI_Abort(comm, -1);
    }
    // default args
    size_t mem_per_node_gb = 32; // setting the max experiment at 32 GB per node
    std::string filename = "all2all_benchmark.csv";
    // parse input arguments
    exec_name = argv[0];
    argv++; argc--; // skip program name
    if (argc >= 2) {
        // an optional "-m <GB>" flag must come first; any other leading
        // argument while >= 2 args remain is treated as a usage error below
        std::string x(argv[0]);
        if (x == "-m") {
            // NOTE(review): atoi returns 0 on parse failure, which the
            // range check below rejects
            mem_per_node_gb = atoi(argv[1]);
            argv += 2; argc -= 2;
        }
        // reject unknown flags and out-of-range sizes (0 or more than 1024 GB)
        if (x != "-m" || mem_per_node_gb > 1024 || mem_per_node_gb == 0) {
            print_usage();
            MPI_Abort(comm, -1);
        }
    }
    // optional positional argument: output CSV filename
    if (argc > 0) {
        filename = argv[0];
        argv++; argc--;
    }
    // any remaining arguments are a usage error
    if (argc > 0) {
        print_usage();
        MPI_Abort(comm, -1);
    }
    // all ranks must agree on the memory budget
    MXX_ASSERT(mxx::all_same(mem_per_node_gb, comm));
    // benchmark all:
    std::ofstream of;
    if (hc.global.rank() == 0) {
        // only the global root opens the CSV output and writes the header
        of.open(filename);
        of << "p,nnodes,q,m,n,min,avg,max" << std::endl;
    }
    // 32 GB/node max?
    size_t mempernode = mem_per_node_gb << 30; // convert GB to bytes
    mxx::forall_p2_nnodes_and_ppn(hc, [&](const mxx::hybrid_comm& hc){
        bm_all2all(hc, of, mempernode);
    });
    return 0;
}