int hpx_main(
    variables_map& vm
    )
{
    if (vm.count("no-header"))
        header = false;

    {
        if (0 == tasks)
            throw std::invalid_argument("count of 0 tasks specified\n");

        // Start the clock.
        high_resolution_timer t;

        for (boost::uint64_t i = 0; i < tasks; ++i)
            register_work(HPX_STD_BIND(&invoke_worker, delay));

        // Reschedule hpx_main until all other hpx-threads have finished. We
        // should be resumed after most of the null HPX-threads have been
        // executed. If we haven't, we just reschedule ourselves again.
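        // Note: get_thread_count() includes this hpx_main thread, which is a
        // normal-priority HPX-thread itself, hence the comparison against 1
        // rather than 0.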
        do {
            suspend();
        } while (get_thread_count(hpx::threads::thread_priority_normal) > 1);

        print_results(get_os_thread_count(), t.elapsed());
    }

    return finalize();
}
int hpx_main(
    variables_map& vm
    )
{
    if (vm.count("no-header"))
        header = false;

    {
        if (0 == tasks)
            throw std::invalid_argument("count of 0 tasks specified\n");

        if (0 == feeders || feeders > get_os_thread_count())
            throw std::invalid_argument("number of feeders must be between 1 and OS-thread-count\n");

        // Start the clock.
        thread_aware_timer t;

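        // Spawn the feeder tasks: by default they create normal, stackful
        // HPX-threads; if the "no-stack" option was given they create
        // stackless tasks instead.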
        if (0 == vm.count("no-stack")) {
            for (boost::uint64_t i = 0; i != feeders; ++i)
                register_work(HPX_STD_BIND(&create_tasks, tasks/feeders));
        }
        else {
            for (boost::uint64_t i = 0; i != feeders; ++i)
                register_work(HPX_STD_BIND(&create_stackless_tasks, tasks/feeders));
        }

        // Reschedule hpx_main until all other hpx-threads have finished. We
        // should be resumed after most of the null HPX-threads have been
        // executed. If we haven't, we just reschedule ourselves again.
        do {
            suspend();
        } while (get_thread_count(hpx::threads::thread_priority_normal) > 1);

        print_results(get_os_thread_count(), t.elapsed());
    }

    return finalize();
}
int hpx_main(
    variables_map& vm
    )
{
    if (vm.count("no-header"))
        header = false;

    ///////////////////////////////////////////////////////////////////////////
    // Initialize the PRNG seed.
    if (!seed)
        seed = boost::uint64_t(std::time(0));

    {

        ///////////////////////////////////////////////////////////////////////
        // Validate command-line arguments.
        if (0 == tasks)
            throw std::invalid_argument("count of 0 tasks specified\n");

        if (min_delay > max_delay)
            throw std::invalid_argument("minimum delay cannot be larger than "
                                        "maximum delay\n");

        if (min_delay > total_delay)
            throw std::invalid_argument("minimum delay cannot be larger than"
                                        "total delay\n");

        if (max_delay > total_delay)
            throw std::invalid_argument("maximum delay cannot be larger than "
                                        "total delay\n");

        if ((min_delay * tasks) > total_delay)
            throw std::invalid_argument("minimum delay is too small for the "
                                        "specified total delay and number of "
                                        "tasks\n");

        if ((max_delay * tasks) < total_delay)
            throw std::invalid_argument("maximum delay is too small for the "
                                        "specified total delay and number of "
                                        "tasks\n");

        ///////////////////////////////////////////////////////////////////////
        // Randomly generate a description of the heterogeneous workload.
        std::vector<boost::uint64_t> payloads;
        payloads.reserve(tasks);

        // For random numbers, we use a 64-bit specialization of Boost.Random's
        // mersenne twister engine (good uniform distribution up to 311
        // dimensions, cycle length 2 ^ 19937 - 1)
        boost::random::mt19937_64 prng(seed);

        boost::uint64_t current_sum = 0;

        for (boost::uint64_t i = 0; i < tasks; ++i)
        {
            // Credit to Spencer Ruport for putting this algorithm on
            // stackoverflow.
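            // The bounds below keep the running sum feasible: 'low' is the
            // smallest payload that still lets the remaining tasks reach
            // total_delay even if they all draw max_delay, and 'high' is the
            // largest payload that still leaves room for the remaining tasks
            // to each draw at least min_delay. Both are clamped to the
            // [min_delay, max_delay] range.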
            boost::uint64_t const low_calc
                = (total_delay - current_sum) - (max_delay * (tasks - 1 - i));

            bool const negative
                = (total_delay - current_sum) < (max_delay * (tasks - 1 - i));

            boost::uint64_t const low
                = (negative || (low_calc < min_delay)) ? min_delay : low_calc;

            boost::uint64_t const high_calc
                = (total_delay - current_sum) - (min_delay * (tasks - 1 - i));

            boost::uint64_t const high
                = (high_calc > max_delay) ? max_delay : high_calc;

            // Our range is [low, high].
            boost::random::uniform_int_distribution<boost::uint64_t>
                dist(low, high);

            boost::uint64_t const payload = dist(prng);

            if (payload < min_delay)
                throw std::logic_error("task delay is below minimum");

            if (payload > max_delay)
                throw std::logic_error("task delay is above maximum");

            current_sum += payload;
            payloads.push_back(payload);
        }

        // Randomly shuffle the entire sequence to deal with drift.
        boost::function<boost::uint64_t(boost::uint64_t)> shuffler_f =
            boost::bind(&shuffler, boost::ref(prng), _1);
        std::random_shuffle(payloads.begin(), payloads.end()
                          , shuffler_f);
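        // The feasibility clamping above constrains the later payloads more
        // tightly than the earlier ones, so shuffling removes any correlation
        // between a task's position in the queue and its delay.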

        ///////////////////////////////////////////////////////////////////////
        // Validate the payloads.
        if (payloads.size() != tasks)
            throw std::logic_error("incorrect number of tasks generated");

        boost::uint64_t const payloads_sum =
            std::accumulate(payloads.begin(), payloads.end(), 0ULL);
        if (payloads_sum != total_delay)
            throw std::logic_error("incorrect total delay generated");

        ///////////////////////////////////////////////////////////////////////
        // Start the clock.
        high_resolution_timer t;

        ///////////////////////////////////////////////////////////////////////
        // Queue the tasks in a serial loop.
        for (boost::uint64_t i = 0; i < tasks; ++i)
            register_work(HPX_STD_BIND(&invoke_worker, payloads[i]));

        ///////////////////////////////////////////////////////////////////////
        // Wait for the work to finish.
        do {
            // Reschedule hpx_main until all other HPX-threads have finished. We
            // should be resumed after most of the null HPX-threads have been
            // executed. If we haven't, we just reschedule ourselves again.
            suspend();
        } while (get_thread_count(hpx::threads::thread_priority_normal) > 1);

        ///////////////////////////////////////////////////////////////////////
        // Print the results.
        print_results(get_os_thread_count(), t.elapsed());
    }

    finalize();
    return 0;
}
int hpx_main(
    variables_map& vm
    )
{
    {
        if (vm.count("no-header"))
            header = false;

        if (vm.count("csv-header"))
            csv_header = true;

        if (0 == tasks)
            throw std::invalid_argument("count of 0 tasks specified\n");

        if (suspended_tasks > tasks)
            throw std::invalid_argument(
                "suspended tasks must be smaller than tasks\n");

        std::uint64_t const os_thread_count = get_os_thread_count();

        ///////////////////////////////////////////////////////////////////////
        stage_worker_function stage_worker;

        if ("static-balanced-stackbased" == distribution)
            stage_worker = &stage_worker_static_balanced_stackbased;
        else if ("static-imbalanced" == distribution)
            stage_worker = &stage_worker_static_imbalanced;
        else if ("round-robin" == distribution)
            stage_worker = &stage_worker_round_robin;
        else
            throw std::invalid_argument(
                "invalid distribution type specified (valid options are "
                "\"static-balanced-stackbased\", \"static-imbalanced\" or "
                "\"round-robin\")"
                );

        ///////////////////////////////////////////////////////////////////////
        std::uint64_t tasks_per_feeder = 0;
        //std::uint64_t total_tasks = 0;
        std::uint64_t suspended_tasks_per_feeder = 0;
        std::uint64_t total_suspended_tasks = 0;

        if ("strong" == scaling)
        {
            if (tasks % os_thread_count)
                throw std::invalid_argument(
                    "tasks must be cleanly divisible by OS-thread count\n");

            if (suspended_tasks % os_thread_count)
                throw std::invalid_argument(
                    "suspended tasks must be cleanly divisible by OS-thread "
                    "count\n");

            tasks_per_feeder = tasks / os_thread_count;
            //total_tasks      = tasks;
            suspended_tasks_per_feeder = suspended_tasks / os_thread_count;
            total_suspended_tasks      = suspended_tasks;
        }
        else if ("weak" == scaling)
        {
            tasks_per_feeder = tasks;
            //total_tasks      = tasks * os_thread_count;
            suspended_tasks_per_feeder = suspended_tasks;
            total_suspended_tasks      = suspended_tasks * os_thread_count;
        }
        else
            throw std::invalid_argument(
                "invalid scaling type specified (valid options are \"strong\" "
                "or \"weak\")");

        ///////////////////////////////////////////////////////////////////////
        if (suspended_tasks != 0)
        {
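            // Dividing both per-feeder counts by their gcd yields the shortest
            // repeating suspend/no-suspend pattern: out of every
            // (no_suspend_step + suspend_step) tasks a feeder spawns,
            // suspend_step are expected to suspend (how the staging functions
            // consume these two values is not shown here).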
            std::uint64_t gcd = boost::math::gcd(tasks_per_feeder
                                                 , suspended_tasks_per_feeder);

            suspend_step = suspended_tasks_per_feeder / gcd;
            // We check earlier to make sure that there are never more
            // suspended tasks than tasks requested.
            no_suspend_step = (tasks_per_feeder / gcd) - suspend_step;
        }

        ///////////////////////////////////////////////////////////////////////
        std::vector<std::string> counter_shortnames;
        std::vector<std::string> counters;
        if (vm.count("counter"))
        {
            std::vector<std::string> raw_counters =
                vm["counter"].as<std::vector<std::string> >();

            for (std::uint64_t i = 0; i < raw_counters.size(); ++i)
            {
                std::vector<std::string> entry;
                boost::algorithm::split(entry, raw_counters[i],
                    boost::algorithm::is_any_of(","),
                    boost::algorithm::token_compress_on);

                HPX_ASSERT(entry.size() == 2);

                counter_shortnames.push_back(entry[0]);
                counters.push_back(entry[1]);
            }
        }

        std::shared_ptr<hpx::util::activate_counters> ac;
        if (!counters.empty())
            ac.reset(new hpx::util::activate_counters(counters));

        ///////////////////////////////////////////////////////////////////////
        // Start the clock.
        high_resolution_timer t;

        if (ac)
            ac->reset_counters();

        // This needs to stay here; we may have suspended as recently as the
        // performance counter reset (which is called just before the staging
        // function).
        std::uint64_t const num_thread = hpx::get_worker_thread_num();

        for (std::uint64_t i = 0; i < os_thread_count; ++i)
        {
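            // The current worker thread feeds its own share directly (see the
            // call to stage_workers() right after this loop), so no feeder
            // thread is registered for it.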
            if (num_thread == i) continue;

            register_work(hpx::util::bind(&stage_workers
                                    , i
                                    , tasks_per_feeder
                                    , stage_worker
                                      )
                , "stage_workers"
                , hpx::threads::pending
                , hpx::threads::thread_priority_normal
                , i
                  );
        }

        stage_workers(num_thread, tasks_per_feeder, stage_worker);

        double warmup_estimate = t.elapsed();

        // Schedule a low-priority thread; when it is executed, it checks to
        // make sure all the tasks (which are normal priority) have been
        // executed, and then it signals the barrier below so that hpx_main can
        // stop the clock.
        hpx::lcos::local::barrier finished(2);
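        // The barrier has two participants: this hpx_main thread and the
        // low-priority wait_for_tasks thread registered below.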

        register_work(hpx::util::bind(&wait_for_tasks
                                , std::ref(finished)
                                , total_suspended_tasks
                                 )
            , "wait_for_tasks", hpx::threads::pending
            , hpx::threads::thread_priority_low);

        finished.wait();

        // Stop the clock
        double time_elapsed = t.elapsed();

        print_results(os_thread_count, time_elapsed, warmup_estimate
                    , counter_shortnames, ac);
    }

    if (suspended_tasks != 0)
        // Force termination of all suspended tasks.
        hpx::get_runtime().get_thread_manager().abort_all_suspended_threads();

    return finalize();
}
int hpx_main(
    variables_map& vm
)
{
    if (vm.count("no-header"))
        header = false;

    // delay is specified in microseconds; convert it to seconds
    delay_sec = delay * 1.0E-6;

    std::size_t num_os_threads = hpx::get_os_thread_count();

    int num_executors = vm["executors"].as<int>();
    if (num_executors <= 0)
        throw std::invalid_argument("number of executors to use must be larger than 0");

    if (std::size_t(num_executors) > num_os_threads)
        throw std::invalid_argument("number of executors to use must not be larger than the number of OS threads");

    std::size_t num_cores_per_executor = vm["cores"].as<int>();

    if ((num_executors - 1) * num_cores_per_executor > num_os_threads)
        throw std::invalid_argument("number of cores per executor should not cause oversubscription");

    if (0 == tasks)
        throw std::invalid_argument("count of 0 tasks specified\n");

    // Reset performance counters (if specified on command line)
    reset_active_counters();

    // Start the clock.
    high_resolution_timer t;

    // create the executor instances
    using hpx::threads::executors::local_priority_queue_executor;

    {
        std::vector<local_priority_queue_executor> executors;
        for (std::size_t i = 0; i != std::size_t(num_executors); ++i)
        {
            // make sure we don't oversubscribe the cores, the last executor will
            // be bound to the remaining number of cores
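            // For example (hypothetical numbers): with 10 OS threads, 3
            // executors and 4 cores per executor, the last executor would be
            // created with the remaining 10 - 2*4 = 2 cores.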
            if ((i + 1) * num_cores_per_executor > num_os_threads)
            {
                HPX_ASSERT(i == num_executors - 1);
                num_cores_per_executor = num_os_threads - i * num_cores_per_executor;
            }
            executors.push_back(local_priority_queue_executor(num_cores_per_executor));
        }

        t.restart();

        for (boost::uint64_t i = 0; i < tasks; ++i)
            executors[i % num_executors].add(HPX_STD_BIND(&invoke_worker_timed, delay_sec));

        // destructors of executors will wait for all tasks to finish executing
    }

    // Stop the clock
    double time_elapsed = t.elapsed();

    // Stop Performance Counters
    stop_active_counters();

    print_results(get_os_thread_count(), time_elapsed);

    return finalize();
}