// Benchmark entry point: distributes `tasks` timed worker invocations
// round-robin over a set of local_priority_queue_executor instances and
// reports the elapsed wall-clock time.
//
// Reads from `vm`:
//   "no-header" - if present, clears the file-level `header` flag
//   "executors" - number of executor instances to create (must be > 0 and
//                 not larger than the number of OS threads)
//   "cores"     - number of cores requested per executor (must be > 0)
//
// Also reads the file-level `delay` and `tasks` values and writes
// `delay_sec`.
//
// Throws std::invalid_argument if any of the settings above is out of range.
// Returns whatever finalize() returns.
int hpx_main(
    variables_map& vm
)
{
    if (vm.count("no-header"))
        header = false;

    // delay in seconds
    delay_sec = delay * 1.0E-6;

    std::size_t const num_os_threads = hpx::get_os_thread_count();

    // Validate the signed command line value before converting it to an
    // unsigned type, otherwise a negative value would wrap around.
    int const requested_executors = vm["executors"].as<int>();
    if (requested_executors <= 0)
        throw std::invalid_argument("number of executors to use must be larger than 0");

    std::size_t const num_executors = std::size_t(requested_executors);
    if (num_executors > num_os_threads)
        throw std::invalid_argument("number of executors to use must be smaller than number of OS threads");

    // Same for the number of cores: reject zero and negative values while
    // the value is still signed.
    int const requested_cores = vm["cores"].as<int>();
    if (requested_cores <= 0)
        throw std::invalid_argument("number of cores per executor must be larger than 0");

    std::size_t const num_cores_per_executor = std::size_t(requested_cores);

    // All executors except the last one get the full number of cores; the
    // last one gets whatever remains. Using '>=' guarantees that at least one
    // core remains for the last executor (with '>' it could end up with zero).
    if ((num_executors - 1) * num_cores_per_executor >= num_os_threads)
        throw std::invalid_argument("number of cores per executor should not cause oversubscription");

    if (0 == tasks)
        throw std::invalid_argument("count of 0 tasks specified\n");

    // Reset performance counters (if specified on command line)
    reset_active_counters();

    // Start the clock.
    high_resolution_timer t;

    // create the executor instances
    using hpx::threads::executors::local_priority_queue_executor;

    {
        std::vector<local_priority_queue_executor> executors;
        executors.reserve(num_executors);

        for (std::size_t i = 0; i != num_executors; ++i)
        {
            // make sure we don't oversubscribe the cores, the last executor will
            // be bound to the remaining number of cores
            std::size_t cores_for_executor = num_cores_per_executor;
            if ((i + 1) * num_cores_per_executor > num_os_threads)
            {
                HPX_ASSERT(i == num_executors - 1);
                cores_for_executor = num_os_threads - i * num_cores_per_executor;
            }
            executors.push_back(local_priority_queue_executor(cores_for_executor));
        }

        // Restart the timer so that executor construction is not measured.
        t.restart();

        // Hand out the tasks to the executors in round-robin order.
        for (boost::uint64_t i = 0; i < tasks; ++i)
            executors[i % num_executors].add(HPX_STD_BIND(&invoke_worker_timed, delay_sec));

        // destructors of executors will wait for all tasks to finish executing
    }

    // Stop the clock
    double time_elapsed = t.elapsed();

    // Stop Performance Counters
    stop_active_counters();

    print_results(get_os_thread_count(), time_elapsed);

    return finalize();
}