void Test(api::Context& ctx) {
    for (size_t outer = 0; outer < outer_repeats_; ++outer) {
        common::StatsTimerStopped t;

        size_t dummy = +4915221495089;

        t.Start();
        for (size_t inner = 0; inner < inner_repeats_; ++inner) {
            dummy = ctx.net.Broadcast(dummy);
        }
        t.Stop();

        size_t n = ctx.num_workers();
        size_t time = t.Microseconds();
        // calculate maximum time.
        time = ctx.net.AllReduce(time, common::maximum<size_t>());

        if (ctx.my_rank() == 0) {
            std::cout << "RESULT"
                      << " datatype=" << "size_t"
                      << " operation=" << "broadcast"
                      << " workers=" << n
                      << " inner_repeats=" << inner_repeats_
                      << " time[us]=" << time
                      << " time_per_op[us]="
                      << static_cast<double>(time) / inner_repeats_
                      << std::endl;
        }
    }
}
void Sender(api::Context& ctx, size_t peer_id, size_t inner_repeat) {
    net::Group& group = ctx.net.group();
    net::Connection& peer = group.connection(peer_id);

    common::StatsTimerStart inner_timer;
    // send blocks to peer
    for (size_t i = 0; i != block_count_; ++i) {
        data_block_.front() = counter_;
        data_block_.back() = counter_;
        ++counter_;
        peer.SyncSend(data_block_.data(), block_size_);
    }

    // wait for response pong
    size_t value;
    peer.Receive(&value);
    die_unequal(value, counter_);

    inner_timer.Stop();

    double bw = CalcMiBs(block_count_ * block_size_, inner_timer);

    sLOG0 << "bandwidth" << ctx.host_rank() << "->" << peer_id
          << "inner_repeat" << inner_repeat
          << bw << "MiB/s"
          << "time"
          << (static_cast<double>(inner_timer.Microseconds()) * 1e-6);

    bandwidth_(ctx.host_rank(), peer_id).add(bw);
}
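// CalcMiBs() is used by the bandwidth and dispatcher benchmarks in this
// listing but is not defined here. A minimal sketch, assuming it converts a
// byte count and an elapsed time (given either as a timer object or directly
// in microseconds) into MiB/s:
static double CalcMiBs(size_t bytes, size_t microseconds) {
    return static_cast<double>(bytes) / 1024.0 / 1024.0
           / static_cast<double>(microseconds) * 1e6;
}

template <typename Timer>
static double CalcMiBs(size_t bytes, const Timer& timer) {
    return CalcMiBs(bytes, timer.Microseconds());
}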
void Test(api::Context& ctx) {
    common::StatsTimerStopped t;

    // only work with first thread on this host.
    if (ctx.local_worker_id() == 0) {
        mem::Manager mem_manager(nullptr, "Dispatcher");

        group_ = &ctx.net.group();

        std::unique_ptr<net::Dispatcher> dispatcher =
            group_->ConstructDispatcher();
        dispatcher_ = dispatcher.get();

        t.Start();
        for (size_t outer = 0; outer < outer_repeats_; ++outer) {
            rnd_ = std::default_random_engine(123456);

            active_ = 0;
            remaining_requests_ = num_requests_;

            while (active_ < limit_active_ && remaining_requests_ > 0) {
                if (MaybeStartRequest()) {
                    ++active_;
                }
            }

            dispatcher_->Loop();
        }
        t.Stop();

        // must clean up dispatcher prior to using group for other things.
    }

    size_t time = t.Microseconds();
    // calculate maximum time.
    time = ctx.net.AllReduce(time, common::maximum<size_t>());

    if (ctx.my_rank() == 0) {
        std::cout << "RESULT"
                  << " operation=" << "rblocks"
                  << " hosts=" << group_->num_hosts()
                  << " requests=" << num_requests_
                  << " block_size=" << block_size_
                  << " limit_active=" << limit_active_
                  << " time[us]=" << time
                  << " time_per_op[us]="
                  << static_cast<double>(time) / num_requests_
                  << " total_bytes=" << block_size_ * num_requests_
                  << " total_bandwidth[MiB/s]="
                  << CalcMiBs(block_size_ * num_requests_, time)
                  << std::endl;
    }
}
void ConductExperiment(uint64_t bytes, int iterations,
                       api::Context& ctx1, api::Context& ctx2,
                       const std::string& type_as_string) {

    auto data = generate<Type>(bytes, 1, 100);
    common::ThreadPool pool;

    for (int i = 0; i < iterations; i++) {
        StatsTimer<true> write_timer;
        pool.Enqueue([&data, &ctx1, &write_timer]() {
            auto stream = ctx1.GetNewCatStream();
            auto writers = stream->OpenWriters();
            assert(writers.size() == 2);
            write_timer.Start();
            auto& writer = writers[1];
            for (auto& s : data) {
                writer(s);
            }
            writer.Close();
            writers[0].Close();
            write_timer.Stop();
        });

        StatsTimer<true> read_timer;
        pool.Enqueue([&ctx2, &read_timer]() {
            auto stream = ctx2.GetNewCatStream();
            auto readers = stream->OpenReaders();
            assert(readers.size() == 2);
            auto& reader = readers[0];
            read_timer.Start();
            while (reader.HasNext()) {
                reader.Next<Type>();
            }
            read_timer.Stop();
        });

        pool.LoopUntilEmpty();

        std::cout << "RESULT"
                  << " datatype=" << type_as_string
                  << " size=" << bytes
                  << " write_time=" << write_timer
                  << " read_time=" << read_timer
                  << std::endl;
    }
}
void Sender(api::Context& ctx, size_t peer, size_t iteration) {
    net::Group& group = ctx.net.group();

    // do an extra ping/pong round to synchronize.
    {
        // send ping to peer
        size_t value = counter_++;
        group.SendTo(peer, value);

        // wait for ping
        group.ReceiveFrom(peer, &value);
        die_unequal(value, counter_);
    }

    common::StatsTimerStart inner_timer;
    for (size_t inner = 0; inner < inner_repeats_; ++inner) {
        // send ping to peer
        size_t value = counter_++;
        group.SendTo(peer, value);

        // wait for ping
        group.ReceiveFrom(peer, &value);
        die_unequal(value, counter_);
    }
    inner_timer.Stop();

    double avg =
        static_cast<double>(inner_timer.Microseconds()) /
        static_cast<double>(inner_repeats_);

    sLOG0 << "bandwidth" << ctx.host_rank() << "->" << peer
          << "iteration" << iteration << "latency" << avg;

    latency_(ctx.host_rank(), peer).add(avg);
}
void Test(api::Context& ctx) {
    for (size_t outer = 0; outer < outer_repeats_; ++outer) {
        common::StatsTimerStopped t;

        size_t n = ctx.num_workers();

        t.Start();
        for (size_t inner = 0; inner < inner_repeats_; ++inner) {
            // allreduce a different value in each iteration
            size_t value = inner + ctx.my_rank();
            value = ctx.net.AllReduce(value);
            size_t expected =
                (n + inner) * ((n + inner) - 1) / 2 - inner * (inner - 1) / 2;
            die_unequal(value, expected);
        }
        t.Stop();

        size_t time = t.Microseconds();
        // calculate maximum time.
        time = ctx.net.AllReduce(time, common::maximum<size_t>());

        if (ctx.my_rank() == 0) {
            std::cout << "RESULT"
                      << " datatype=" << "size_t"
                      << " operation=" << "allreduce"
                      << " workers=" << n
                      << " inner_repeats=" << inner_repeats_
                      << " time[us]=" << time
                      << " time_per_op[us]="
                      << static_cast<double>(time) / inner_repeats_
                      << std::endl;
        }
    }
}
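// Why the die_unequal() check above holds (assuming the parameterless
// ctx.net.AllReduce() sums its inputs, as the check implies): worker r
// contributes value = inner + r, so the reduced result over n workers is
//   sum_{r=0}^{n-1} (inner + r) = n*inner + n*(n-1)/2,
// which the code writes as a difference of triangular numbers
//   T(n+inner-1) - T(inner-1)  with  T(m) = m*(m+1)/2.
// Example: n = 4, inner = 2 gives 2+3+4+5 = 14 = 15 - 1.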
void ExperimentFull(api::Context& ctx, const std::string& type_as_string) {
    // transmit data to all workers.
    auto stream = ctx.GetNewCatStream();

    // write phase
    StatsTimer<true> write_timer(true);
    {
        auto writers = stream->OpenWriters();
        auto data = Generator<Type>(g_bytes);

        while (data.HasNext()) {
            Type value = data.Next();
            for (size_t tgt = 0; tgt < ctx.num_workers(); ++tgt) {
                writers[tgt](value);
            }
        }
    }
    write_timer.Stop();

    // read phase
    StatsTimer<true> read_timer(true);
    {
        auto reader = stream->OpenCatReader(true);
        while (reader.HasNext()) {
            reader.Next<Type>();
        }
    }
    read_timer.Stop();

    size_t read_microsecs = read_timer.Microseconds();
    read_microsecs = ctx.AllReduce(read_microsecs, common::maximum<size_t>());

    size_t write_microsecs = write_timer.Microseconds();
    write_microsecs = ctx.AllReduce(write_microsecs, common::maximum<size_t>());

    uint64_t host_volume = ctx.num_workers() * g_bytes;
    uint64_t total_volume = ctx.num_workers() * ctx.num_workers() * g_bytes;

    if (ctx.my_rank() == 0) {
        std::cout << "RESULT"
                  << " datatype=" << type_as_string
                  << " size=" << g_bytes
                  << " write_time=" << write_microsecs
                  << " read_time=" << read_microsecs
                  << " write_speed_MiBs="
                  << (g_bytes / write_microsecs * 1000000 / 1024 / 1024)
                  << " read_speed_MiBs="
                  << (g_bytes / read_microsecs * 1000000 / 1024 / 1024)
                  << " host_write_speed_MiBs="
                  << (host_volume / write_microsecs * 1000000 / 1024 / 1024)
                  << " host_read_speed_MiBs="
                  << (host_volume / read_microsecs * 1000000 / 1024 / 1024)
                  << " total_write_speed_MiBs="
                  << (total_volume / write_microsecs * 1000000 / 1024 / 1024)
                  << " total_read_speed_MiBs="
                  << (total_volume / read_microsecs * 1000000 / 1024 / 1024)
                  << std::endl;
    }
}
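// The *_speed_MiBs fields above evaluate bytes / time[us] * 10^6 / 2^20, i.e.
// throughput in MiB per second; for example, 2^30 bytes in 2'000'000 us comes
// out at roughly 512 MiB/s (the expression is computed in integer arithmetic,
// so the printed value is a truncated approximation).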
void ExperimentAllPairs(api::Context& ctx, const std::string& type_as_string) {
    for (size_t src = 0; src < ctx.num_workers(); ++src) {
        for (size_t tgt = 0; tgt < ctx.num_workers(); ++tgt) {
            // transmit data from worker src -> tgt: only send data if we are
            // src, but as tgt receive from all.
            auto stream = ctx.GetNewCatStream();

            // write phase
            StatsTimer<true> write_timer(true);
            {
                auto writers = stream->OpenWriters();

                if (ctx.my_rank() == src) {
                    auto data = Generator<Type>(g_bytes);
                    auto& writer = writers[tgt];
                    while (data.HasNext()) {
                        writer(data.Next());
                    }
                }
            }
            write_timer.Stop();

            // read phase
            StatsTimer<true> read_timer(true);
            {
                auto reader = stream->OpenCatReader(true);
                while (reader.HasNext()) {
                    reader.Next<Type>();
                }
            }
            read_timer.Stop();

            size_t read_microsecs = read_timer.Microseconds();
            read_microsecs =
                ctx.AllReduce(read_microsecs, common::maximum<size_t>());

            size_t write_microsecs = write_timer.Microseconds();
            write_microsecs =
                ctx.AllReduce(write_microsecs, common::maximum<size_t>());

            if (ctx.my_rank() == 0) {
                std::cout << "RESULT"
                          << " datatype=" << type_as_string
                          << " size=" << g_bytes
                          << " src=" << src
                          << " tgt=" << tgt
                          << " write_time=" << write_microsecs
                          << " read_time=" << read_microsecs
                          << " write_speed_MiBs="
                          << (g_bytes / write_microsecs * 1000000 / 1024 / 1024)
                          << " read_speed_MiBs="
                          << (g_bytes / read_microsecs * 1000000 / 1024 / 1024)
                          << std::endl;
            }
        }
    }
}
void ConductExperiment(uint64_t bytes, int iterations,
                       api::Context& ctx0, api::Context& ctx1, api::Context& ctx2,
                       const std::string& type_as_string) {

    // prepare file with random data
    auto data0 = generate<Type>(bytes / 2, 1, 100);
    auto data1 = generate<Type>(bytes / 2, 1, 100);

    std::vector<data::File> files;
    files.reserve(3);
    {
        files.emplace_back(ctx0.GetFile());
        auto writer0 = files[0].GetWriter();
        for (auto& d : data0) writer0(d);

        files.emplace_back(ctx1.GetFile());
        auto writer1 = files[1].GetWriter();
        for (auto& d : data1) writer1(d);

        files.emplace_back(ctx2.GetFile());
        auto writer2 = files[2].GetWriter();
    }

    // worker 0 and worker 1 hold 50% each
    // worker 0 keeps 2/3 of his data, sends 1/3 to worker 1
    // worker 1 keeps first 1/3 of his data, sends 2/3 to worker 2
    // worker 2 receives 2/3 from worker 1
    // afterwards everybody holds 33% of the data
    std::vector<std::vector<size_t> > offsets;
    offsets.push_back(
        { (size_t)(2 * data0.size() / 3), data0.size(), data0.size() });
    offsets.push_back(
        { 0, (size_t)(data1.size() / 3), data1.size() });
    offsets.push_back({ 0, 0, 0 });

    std::vector<std::shared_ptr<data::CatStream> > streams;
    streams.push_back(ctx0.GetNewCatStream());
    streams.push_back(ctx1.GetNewCatStream());
    streams.push_back(ctx2.GetNewCatStream());

    std::vector<StatsTimer<true> > read_timers(3);
    std::vector<StatsTimer<true> > write_timers(3);

    common::ThreadPool pool;

    for (int i = 0; i < iterations; i++) {
        for (int id = 0; id < 3; id++) {
            pool.Enqueue(
                [&files, &streams, &offsets, &read_timers, &write_timers, id]() {
                    write_timers[id].Start();
                    streams[id]->Scatter<Type>(files[id], offsets[id]);
                    write_timers[id].Stop();

                    auto reader = streams[id]->OpenCatReader(true);
                    read_timers[id].Start();
                    while (reader.HasNext()) {
                        reader.Next<Type>();
                    }
                    read_timers[id].Stop();
                });
        }
        pool.LoopUntilEmpty();

        std::cout << "RESULT"
                  << " datatype=" << type_as_string
                  << " size=" << bytes
                  << " write_time_worker0=" << write_timers[0].Microseconds()
                  << " read_time_worker0=" << read_timers[0].Microseconds()
                  << " write_time_worker1=" << write_timers[1].Microseconds()
                  << " read_time_worker1=" << read_timers[1].Microseconds()
                  << " write_time_worker2=" << write_timers[2].Microseconds()
                  << " read_time_worker2=" << read_timers[2].Microseconds()
                  << std::endl;
    }
}
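// How the offset vectors above drive Scatter(), assuming offsets[id][t] is the
// exclusive end of the item range that worker id sends to worker t (with the
// previous entry as the start). With data0.size() == data1.size() == 90:
//   worker 0: { 60, 90, 90 } -> keeps items [0,60), sends [60,90) to worker 1,
//                               sends nothing to worker 2;
//   worker 1: { 0, 30, 90 }  -> sends nothing to worker 0, keeps [0,30),
//                               sends [30,90) to worker 2;
//   worker 2: { 0, 0, 0 }    -> sends nothing, only receives.
// Afterwards each worker holds 60 of the 180 items, i.e. one third each, as
// the comments in ConductExperiment() state.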
void Bandwidth::Test(api::Context& ctx) {
    // only work with first thread on this host.
    if (ctx.local_worker_id() != 0) return;

    net::Group& group = ctx.net.group();

    bandwidth_ = AggMatrix(group.num_hosts());

    // data block to send or receive
    block_count_ = data_size_ / block_size_;
    data_block_.resize(block_size_ / sizeof(size_t), 42u);

    for (size_t outer_repeat = 0;
         outer_repeat < outer_repeats_; ++outer_repeat) {

        common::StatsTimerStopped timer;

        timer.Start();
        for (size_t inner_repeat = 0;
             inner_repeat < inner_repeats_; inner_repeat++) {
            // perform 1-factor ping pongs (without barriers)
            for (size_t round = 0; round < group.OneFactorSize(); ++round) {

                size_t peer = group.OneFactorPeer(round);

                sLOG0 << "round" << round
                      << "me" << ctx.host_rank() << "peer_id" << peer;

                if (ctx.host_rank() < peer) {
                    Sender(ctx, peer, inner_repeat);
                    Receiver(ctx, peer);
                }
                else if (ctx.host_rank() > peer) {
                    Receiver(ctx, peer);
                    Sender(ctx, peer, inner_repeat);
                }
                else {
                    // not participating in this round
                    counter_ += 2 * block_count_;
                }
            }
        }
        timer.Stop();

        size_t time = timer.Microseconds();
        // calculate maximum time.
        group.AllReduce(time, common::maximum<size_t>());

        if (ctx.my_rank() == 0) {
            std::cout << "RESULT"
                      << " benchmark=" << benchmark
                      << " hosts=" << ctx.num_hosts()
                      << " outer_repeat=" << outer_repeat
                      << " inner_repeats=" << inner_repeats_
                      << " time[us]=" << time
                      << " time_per_ping_pong[us]="
                      << static_cast<double>(time) / static_cast<double>(counter_)
                      << std::endl;
        }
    }

    // reduce (add) matrix to root.
    group.Reduce(bandwidth_);

    // print matrix
    if (ctx.my_rank() == 0) PrintMatrix(bandwidth_);
}
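// OneFactorSize()/OneFactorPeer() above schedule the ping-pong rounds so that
// every pair of hosts meets exactly once. A sketch of one common construction
// (the "circle method"); the library's own implementation may differ in
// details, so OneFactorPeerSketch() here is only illustrative. With an odd
// number of hosts there are n rounds and in round r host i meets
// (2r - i) mod n, the self-paired host sitting out -- which is what the
// "not participating" branch above handles; with an even number of hosts
// there are n - 1 rounds, host n-1 meets host r, and every other host i meets
// (2r - i) mod (n-1), the self-paired one being redirected to host n-1.
static size_t OneFactorPeerSketch(size_t round, size_t rank, size_t num_hosts) {
    if (num_hosts % 2 == 1) {
        // odd host count: plain circle method, self-pairing means "sit out".
        return (2 * round + num_hosts - rank) % num_hosts;
    }
    size_t m = num_hosts - 1;
    if (rank == m)
        return round;                    // the fixed host meets host `round`
    size_t peer = (2 * round + m - rank) % m;
    return peer == rank ? m : peer;      // self-pair is redirected to host n-1
}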