/// Assign one processing unit (PU) to every worker thread by handing out
/// cores round-robin: consecutive threads land on consecutive cores, and a
/// core receives its next PU index only after all other cores were visited.
///
/// \param t           topology used to look up PU numbers and to build masks
/// \param affinities  one mask per worker thread; every entry must be empty
///                    on entry and is filled in by this function
/// \param used_cores  offset added to every core index passed to \a t
/// \param max_cores   upper bound on the number of cores to distribute over
/// \param num_pus     resized to the number of threads; receives the value
///                    of t.get_pu_number() for every thread
/// \param ec          receives bad_parameter (or the error is thrown,
///                    depending on HPX_THROWS_IF) if a mask was already set
///                    or if there is no core to distribute the threads over
void decode_scatter_distribution(hwloc_topology& t,
        std::vector<mask_type>& affinities,
        std::size_t used_cores, std::size_t max_cores,
        std::vector<std::size_t>& num_pus, error_code& ec)
    {
        std::size_t const num_threads = affinities.size();
        std::size_t const num_cores =
            (std::min)(max_cores, t.get_number_of_cores());

        num_pus.resize(num_threads);

        // Nothing to distribute.
        if (num_threads == 0)
            return;

        // Without this guard there is no core to hand out and the
        // distribution could never make progress.
        if (num_cores == 0)
        {
            HPX_THROWS_IF(ec, bad_parameter, "decode_scatter_distribution",
                boost::str(boost::format("cannot distribute %1% worker "
                    "thread(s): no cores are available") % num_threads));
            return;
        }

        for (std::size_t num_thread = 0; num_thread != num_threads;
            ++num_thread)
        {
            if (any(affinities[num_thread]))
            {
                HPX_THROWS_IF(ec, bad_parameter, "decode_scatter_distribution",
                    boost::str(boost::format("affinity mask for thread %1% has "
                        "already been set") % num_thread));
                return;
            }

            // Round-robin over the cores: thread k uses core k % num_cores
            // and is the (k / num_cores)-th thread placed on that core.
            std::size_t const num_core = num_thread % num_cores;
            std::size_t const num_pu = num_thread / num_cores;

            num_pus[num_thread] =
                t.get_pu_number(num_core + used_cores, num_pu);
            affinities[num_thread] =
                t.init_thread_affinity_mask(num_core + used_cores, num_pu);
        }
    }
    /// Assign one processing unit (PU) to every worker thread such that the
    /// threads are spread evenly over the cores while consecutive worker
    /// thread numbers stay on the same core.
    ///
    /// \param t           topology used to look up PU numbers and to build
    ///                    masks
    /// \param affinities  one mask per worker thread; every entry must be
    ///                    empty on entry and is filled in by this function
    /// \param used_cores  offset added to every core index passed to \a t
    /// \param max_cores   upper bound on the number of cores to distribute
    ///                    over
    /// \param num_pus     resized to the number of threads; receives the
    ///                    value of t.get_pu_number() for every thread
    /// \param ec          receives bad_parameter (or the error is thrown,
    ///                    depending on HPX_THROWS_IF) if a mask was already
    ///                    set or if there is no core to distribute over
    void decode_balanced_distribution(hwloc_topology& t,
        std::vector<mask_type>& affinities,
        std::size_t used_cores, std::size_t max_cores,
        std::vector<std::size_t>& num_pus, error_code& ec)
    {
        std::size_t const num_threads = affinities.size();
        std::size_t const num_cores =
            (std::min)(max_cores, t.get_number_of_cores());

        num_pus.resize(num_threads);

        // Nothing to distribute.
        if (num_threads == 0)
            return;

        // Without this guard there is no core to hand out and the
        // distribution could never make progress.
        if (num_cores == 0)
        {
            HPX_THROWS_IF(ec, bad_parameter,
                "decode_balanced_distribution",
                boost::str(boost::format(
                    "cannot distribute %1% worker thread(s): "
                    "no cores are available"
                ) % num_threads));
            return;
        }

        // Number of used pus per core: every core gets the same base share
        // and the first (num_threads % num_cores) cores take one more. This
        // makes sure that we occupy all the available cores.
        std::size_t const base_pus_per_core = num_threads / num_cores;
        std::size_t const cores_with_extra_pu = num_threads % num_cores;

        // Iterate over the cores and assigned pus per core so that we have
        // consecutive worker thread numbers on each core.
        std::size_t num_thread = 0;
        for (std::size_t num_core = 0; num_core != num_cores; ++num_core)
        {
            std::size_t const pus_on_core = base_pus_per_core +
                (num_core < cores_with_extra_pu ? 1 : 0);

            for (std::size_t num_pu = 0; num_pu != pus_on_core; ++num_pu)
            {
                if (any(affinities[num_thread]))
                {
                    HPX_THROWS_IF(ec, bad_parameter,
                        "decode_balanced_distribution",
                        boost::str(boost::format(
                            "affinity mask for thread %1% has "
                            "already been set"
                        ) % num_thread));
                    return;
                }

                num_pus[num_thread] =
                    t.get_pu_number(num_core + used_cores, num_pu);
                affinities[num_thread] = t.init_thread_affinity_mask(
                    num_core + used_cores, num_pu);
                ++num_thread;
            }
        }
    }