Example #1
    void Test(api::Context& ctx) {

        for (size_t outer = 0; outer < outer_repeats_; ++outer) {

            common::StatsTimerStopped t;

            size_t dummy = 4915221495089;

            t.Start();
            for (size_t inner = 0; inner < inner_repeats_; ++inner) {
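                // Broadcast() returns the origin worker's value on every
                // worker; feeding it back into dummy creates a dependency so
                // the loop cannot be optimized away.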
                dummy = ctx.net.Broadcast(dummy);
            }
            t.Stop();

            size_t n = ctx.num_workers();
            size_t time = t.Microseconds();
            // calculate maximum time.
            time = ctx.net.AllReduce(time, common::maximum<size_t>());

            if (ctx.my_rank() == 0) {
                std::cout
                    << "RESULT"
                    << " datatype=" << "size_t"
                    << " operation=" << "broadcast"
                    << " workers=" << n
                    << " inner_repeats=" << inner_repeats_
                    << " time[us]=" << time
                    << " time_per_op[us]="
                    << static_cast<double>(time) / inner_repeats_
                    << std::endl;
            }
        }
    }
Example #2
    void Sender(api::Context& ctx, size_t peer_id, size_t inner_repeat) {

        net::Group& group = ctx.net.group();
        net::Connection& peer = group.connection(peer_id);

        common::StatsTimerStart inner_timer;
        // send blocks to peer
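        // the first and last word of each block are stamped with a running
        // counter so the receiver can check that no block was lost.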
        for (size_t i = 0; i != block_count_; ++i) {
            data_block_.front() = counter_;
            data_block_.back() = counter_;
            ++counter_;
            peer.SyncSend(data_block_.data(), block_size_);
        }

        // wait for response pong
        size_t value;
        peer.Receive(&value);
        die_unequal(value, counter_);

        inner_timer.Stop();

        double bw = CalcMiBs(block_count_ * block_size_, inner_timer);

        sLOG0 << "bandwidth" << ctx.host_rank() << "->" << peer_id
              << "inner_repeat" << inner_repeat
              << bw << "MiB/s"
              << "time"
              << (static_cast<double>(inner_timer.Microseconds()) * 1e-6);

        bandwidth_(ctx.host_rank(), peer_id).add(bw);
    }
Example #3
    void Test(api::Context& ctx) {

        common::StatsTimerStopped t;

        // only work with first thread on this host.
        if (ctx.local_worker_id() == 0)
        {
            mem::Manager mem_manager(nullptr, "Dispatcher");

            group_ = &ctx.net.group();
            std::unique_ptr<net::Dispatcher> dispatcher =
                group_->ConstructDispatcher();
            dispatcher_ = dispatcher.get();

            t.Start();

            for (size_t outer = 0; outer < outer_repeats_; ++outer)
            {
                rnd_ = std::default_random_engine(123456);

                active_ = 0;
                remaining_requests_ = num_requests_;

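                // fill the window of up to limit_active_ concurrent
                // asynchronous requests; Loop() below then drives the
                // callbacks until all requests have completed.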
                while (active_ < limit_active_ && remaining_requests_ > 0)
                {
                    if (MaybeStartRequest()) {
                        ++active_;
                    }
                }

                dispatcher_->Loop();
            }

            t.Stop();

            // must clean up dispatcher prior to using group for other things.
        }

        size_t time = t.Microseconds();
        // calculate maximum time.
        time = ctx.net.AllReduce(time, common::maximum<size_t>());

        if (ctx.my_rank() == 0) {
            std::cout
                << "RESULT"
                << " operation=" << "rblocks"
                << " hosts=" << group_->num_hosts()
                << " requests=" << num_requests_
                << " block_size=" << block_size_
                << " limit_active=" << limit_active_
                << " time[us]=" << time
                << " time_per_op[us]="
                << static_cast<double>(time) / num_requests_
                << " total_bytes=" << block_size_ * num_requests_
                << " total_bandwidth[MiB/s]="
                << CalcMiBs(block_size_ * num_requests_, time)
                << std::endl;
        }
    }
Example #4
void ConductExperiment(uint64_t bytes, int iterations,
                       api::Context& ctx1, api::Context& ctx2,
                       const std::string& type_as_string) {

    auto data = generate<Type>(bytes, 1, 100);
    common::ThreadPool pool;
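    // the pool runs writer and reader concurrently: ctx1 writes into the
    // CatStream while ctx2 drains it, mimicking two workers on one host.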
    for (int i = 0; i < iterations; i++) {
        StatsTimer<true> write_timer;
        pool.Enqueue([&data, &ctx1, &write_timer]() {
                         auto stream = ctx1.GetNewCatStream();
                         auto writers = stream->OpenWriters();
                         assert(writers.size() == 2);
                         write_timer.Start();
                         auto& writer = writers[1];
                         for (auto& s : data) {
                             writer(s);
                         }
                         writer.Close();
                         writers[0].Close();
                         write_timer.Stop();
                     });

        StatsTimer<true> read_timer;
        pool.Enqueue([&ctx2, &read_timer]() {
                         auto stream = ctx2.GetNewCatStream();
                         auto readers = stream->OpenReaders();
                         assert(readers.size() == 2);
                         auto& reader = readers[0];
                         read_timer.Start();
                         while (reader.HasNext()) {
                             reader.Next<Type>();
                         }
                         read_timer.Stop();
                     });
        pool.LoopUntilEmpty();
        std::cout << "RESULT"
                  << " datatype=" << type_as_string
                  << " size=" << bytes
                  << " write_time=" << write_timer
                  << " read_time=" << read_timer
                  << std::endl;
    }
}
Example #5
    void Sender(api::Context& ctx, size_t peer, size_t iteration) {

        net::Group& group = ctx.net.group();

        // do an extra ping/pong round to synchronize.
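        // (the warm-up round aligns both peers before the timed loop and
        // keeps connection setup out of the measurement)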
        {
            // send ping to peer
            size_t value = counter_++;
            group.SendTo(peer, value);

            // wait for pong reply
            group.ReceiveFrom(peer, &value);
            die_unequal(value, counter_);
        }

        common::StatsTimerStart inner_timer;

        for (size_t inner = 0; inner < inner_repeats_; ++inner) {

            // send ping to peer
            size_t value = counter_++;
            group.SendTo(peer, value);

            // wait for pong reply
            group.ReceiveFrom(peer, &value);
            die_unequal(value, counter_);
        }
        inner_timer.Stop();

        double avg =
            static_cast<double>(inner_timer.Microseconds()) /
            static_cast<double>(inner_repeats_);

        sLOG0 << "bandwidth" << ctx.host_rank() << "->" << peer
              << "iteration" << iteration
              << "latency" << avg;

        latency_(ctx.host_rank(), peer).add(avg);
    }
Example #6
    void Test(api::Context& ctx) {

        for (size_t outer = 0; outer < outer_repeats_; ++outer) {

            common::StatsTimerStopped t;

            size_t n = ctx.num_workers();

            t.Start();
            for (size_t inner = 0; inner < inner_repeats_; ++inner) {
                // allreduce a different value in each iteration
                size_t value = inner + ctx.my_rank();
                value = ctx.net.AllReduce(value);
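                // every worker contributes inner + rank, so the global sum
                // is sum_{r=0}^{n-1} (inner + r), i.e. the difference of the
                // triangular numbers T(n + inner - 1) and T(inner - 1).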
                size_t expected =
                    (n + inner) * ((n + inner) - 1) / 2
                    - inner * (inner - 1) / 2;
                die_unequal(value, expected);
            }
            t.Stop();

            size_t time = t.Microseconds();
            // calculate maximum time.
            time = ctx.net.AllReduce(time, common::maximum<size_t>());

            if (ctx.my_rank() == 0) {
                std::cout
                    << "RESULT"
                    << " datatype=" << "size_t"
                    << " operation=" << "allreduce"
                    << " workers=" << n
                    << " inner_repeats=" << inner_repeats_
                    << " time[us]=" << time
                    << " time_per_op[us]="
                    << static_cast<double>(time) / inner_repeats_
                    << std::endl;
            }
        }
    }
Example #7
void ExperimentFull(
    api::Context& ctx, const std::string& type_as_string) {

    // transmit data to all workers.

    auto stream = ctx.GetNewCatStream();

    // write phase
    StatsTimer<true> write_timer(true);
    {
        auto writers = stream->OpenWriters();
        auto data = Generator<Type>(g_bytes);

        while (data.HasNext()) {
            Type value = data.Next();
            for (size_t tgt = 0; tgt < ctx.num_workers(); ++tgt) {
                writers[tgt](value);
            }
        }
    }
    write_timer.Stop();

    // read phase
    StatsTimer<true> read_timer(true);
    {
        auto reader = stream->OpenCatReader(true);

        while (reader.HasNext()) {
            reader.Next<Type>();
        }
    }
    read_timer.Stop();

    size_t read_microsecs = read_timer.Microseconds();
    read_microsecs =
        ctx.AllReduce(read_microsecs, common::maximum<size_t>());

    size_t write_microsecs = write_timer.Microseconds();
    write_microsecs =
        ctx.AllReduce(write_microsecs, common::maximum<size_t>());

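    // each worker writes g_bytes to every one of the num_workers() targets,
    // so one sender emits num_workers * g_bytes and the network in total
    // carries num_workers^2 * g_bytes.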
    uint64_t host_volume = ctx.num_workers() * g_bytes;
    uint64_t total_volume = ctx.num_workers() * ctx.num_workers() * g_bytes;

    if (ctx.my_rank() == 0) {
        std::cout
            << "RESULT"
            << " datatype=" << type_as_string
            << " size=" << g_bytes
            << " write_time=" << write_microsecs
            << " read_time=" << read_microsecs
            << " write_speed_MiBs="
            << (g_bytes / write_microsecs * 1000000 / 1024 / 1024)
            << " read_speed_MiBs="
            << (g_bytes / read_microsecs * 1000000 / 1024 / 1024)
            << " host_write_speed_MiBs="
            << (host_volume / write_microsecs * 1000000 / 1024 / 1024)
            << " host_read_speed_MiBs="
            << (host_volume / read_microsecs * 1000000 / 1024 / 1024)
            << " total_write_speed_MiBs="
            << (total_volume / write_microsecs * 1000000 / 1024 / 1024)
            << " total_read_speed_MiBs="
            << (total_volume / read_microsecs * 1000000 / 1024 / 1024)
            << std::endl;
    }
}
Example #8
void ExperimentAllPairs(
    api::Context& ctx, const std::string& type_as_string) {

    for (size_t src = 0; src < ctx.num_workers(); ++src) {
        for (size_t tgt = 0; tgt < ctx.num_workers(); ++tgt) {
            // transmit data from worker src -> tgt: only src writes data,
            // and only into the writer for tgt; all workers open the stream
            // and read, but only tgt actually receives anything.

            auto stream = ctx.GetNewCatStream();

            // write phase
            StatsTimer<true> write_timer(true);
            {
                auto writers = stream->OpenWriters();

                if (ctx.my_rank() == src) {
                    auto data = Generator<Type>(g_bytes);

                    auto& writer = writers[tgt];
                    while (data.HasNext()) {
                        writer(data.Next());
                    }
                }
            }
            write_timer.Stop();

            // read phase
            StatsTimer<true> read_timer(true);
            {
                auto reader = stream->OpenCatReader(true);

                while (reader.HasNext()) {
                    reader.Next<Type>();
                }
            }
            read_timer.Stop();

            size_t read_microsecs = read_timer.Microseconds();
            read_microsecs =
                ctx.AllReduce(read_microsecs, common::maximum<size_t>());

            size_t write_microsecs = write_timer.Microseconds();
            write_microsecs =
                ctx.AllReduce(write_microsecs, common::maximum<size_t>());

            if (ctx.my_rank() == 0) {
                std::cout
                    << "RESULT"
                    << " datatype=" << type_as_string
                    << " size=" << g_bytes
                    << " src=" << src << " tgt=" << tgt
                    << " write_time=" << write_microsecs
                    << " read_time=" << read_microsecs
                    << " write_speed_MiBs="
                    << (g_bytes / write_microsecs * 1000000 / 1024 / 1024)
                    << " read_speed_MiBs="
                    << (g_bytes / read_microsecs * 1000000 / 1024 / 1024)
                    << std::endl;
            }
        }
    }
}
Example #9
void ConductExperiment(uint64_t bytes, int iterations,
                       api::Context& ctx0, api::Context& ctx1, api::Context& ctx2,
                       const std::string& type_as_string) {

    // prepare file with random data
    auto data0 = generate<Type>(bytes / 2, 1, 100);
    auto data1 = generate<Type>(bytes / 2, 1, 100);
    std::vector<data::File> files;
    files.reserve(3);
    {
        files.emplace_back(ctx0.GetFile());
        auto writer0 = files[0].GetWriter();
        for (auto& d : data0)
            writer0(d);

        files.emplace_back(ctx1.GetFile());
        auto writer1 = files[1].GetWriter();
        for (auto& d : data1)
            writer1(d);

        files.emplace_back(ctx2.GetFile());
        auto writer2 = files[2].GetWriter();
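        // worker 2 starts without data; the writer is opened (and closed on
        // scope exit) only to initialize files[2] as a valid, empty file.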
    }

    // worker 0 and worker 1 each hold 50% of the data.
    // worker 0 keeps 2/3 of its data and sends 1/3 to worker 1;
    // worker 1 keeps the first 1/3 of its data and sends 2/3 to worker 2;
    // worker 2 only receives, namely 2/3 from worker 1.
    // afterwards every worker holds 33% of the data.
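    // offsets[w][t] is the exclusive end index (in items) of the range of
    // file w that Scatter() sends to worker t: worker t receives the slice
    // [offsets[w][t-1], offsets[w][t]), with an implicit leading 0.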
    std::vector<std::vector<size_t> > offsets;
    offsets.push_back({ (size_t)(2 * data0.size() / 3), data0.size(), data0.size() });
    offsets.push_back({ 0, (size_t)(data1.size() / 3), data1.size() });
    offsets.push_back({ 0, 0, 0 });

    std::vector<std::shared_ptr<data::CatStream> > streams;
    streams.push_back(ctx0.GetNewCatStream());
    streams.push_back(ctx1.GetNewCatStream());
    streams.push_back(ctx2.GetNewCatStream());

    std::vector<StatsTimer<true> > read_timers(3);
    std::vector<StatsTimer<true> > write_timers(3);

    common::ThreadPool pool;
    for (int i = 0; i < iterations; i++) {
        for (int id = 0; id < 3; id++) {
            pool.Enqueue([&files, &streams, &offsets, &read_timers, &write_timers, id]() {
                             write_timers[id].Start();
                             streams[id]->Scatter<Type>(files[id], offsets[id]);
                             write_timers[id].Stop();
                             auto reader = streams[id]->OpenCatReader(true);
                             read_timers[id].Start();
                             while (reader.HasNext()) {
                                 reader.Next<Type>();
                             }
                             read_timers[id].Stop();
                         });
        }
        pool.LoopUntilEmpty();
        std::cout << "RESULT"
                  << " datatype=" << type_as_string
                  << " size=" << bytes
                  << " write_time_worker0=" << write_timers[0].Microseconds()
                  << " read_time_worker0=" << read_timers[0].Microseconds()
                  << " write_time_worker1=" << write_timers[1].Microseconds()
                  << " read_time_worker1=" << read_timers[1].Microseconds()
                  << " write_time_worker2=" << write_timers[2].Microseconds()
                  << " read_time_worker2=" << read_timers[2].Microseconds()
                  << std::endl;
    }
}
Example #10
void Bandwidth::Test(api::Context& ctx) {

    // only work with first thread on this host.
    if (ctx.local_worker_id() != 0) return;

    net::Group& group = ctx.net.group();

    bandwidth_ = AggMatrix(group.num_hosts());

    // data block to send or receive
    block_count_ = data_size_ / block_size_;
    data_block_.resize(block_size_ / sizeof(size_t), 42u);

    for (size_t outer_repeat = 0;
         outer_repeat < outer_repeats_; ++outer_repeat) {

        common::StatsTimerStopped timer;

        timer.Start();
        for (size_t inner_repeat = 0;
             inner_repeat < inner_repeats_; inner_repeat++) {
            // perform 1-factor ping pongs (without barriers)
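            // (a 1-factorization of the complete graph pairs every host with
            // exactly one distinct peer per round, so all hosts measure
            // simultaneously and no pair is measured twice)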
            for (size_t round = 0; round < group.OneFactorSize(); ++round) {

                size_t peer = group.OneFactorPeer(round);

                sLOG0 << "round" << round
                      << "me" << ctx.host_rank() << "peer_id" << peer;

                if (ctx.host_rank() < peer) {
                    Sender(ctx, peer, inner_repeat);
                    Receiver(ctx, peer);
                }
                else if (ctx.host_rank() > peer) {
                    Receiver(ctx, peer);
                    Sender(ctx, peer, inner_repeat);
                }
                else {
                    // not participating in this round
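                    // (with an odd number of hosts one host pairs with
                    // itself each round; advance counter_ as if a full
                    // Sender and Receiver pass had run, to stay in sync)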
                    counter_ += 2 * block_count_;
                }
            }
        }
        timer.Stop();

        size_t time = timer.Microseconds();
        // calculate maximum time.
        group.AllReduce(time, common::maximum<size_t>());

        if (ctx.my_rank() == 0) {
            std::cout
                << "RESULT"
                << " benchmark=" << benchmark
                << " hosts=" << ctx.num_hosts()
                << " outer_repeat=" << outer_repeat
                << " inner_repeats=" << inner_repeats_
                << " time[us]=" << time
                << " time_per_ping_pong[us]="
                << static_cast<double>(time) / static_cast<double>(counter_)
                << std::endl;
        }
    }

    // reduce (add) matrix to root.
    group.Reduce(bandwidth_);

    // print matrix
    if (ctx.my_rank() == 0)
        PrintMatrix(bandwidth_);
}