static hpx::future<FwdIter> call(ExPolicy policy, FwdIter first,
    std::size_t count, F1 && f1, std::size_t chunk_size)
{
    typedef typename ExPolicy::executor_type executor_type;
    typedef typename hpx::parallel::executor_traits<executor_type>
        executor_traits;
    typedef typename hpx::util::tuple<FwdIter, std::size_t> tuple;

    FwdIter last = first;
    std::advance(last, count);

    std::vector<hpx::future<Result> > inititems, workitems;
    std::list<boost::exception_ptr> errors;
    std::vector<tuple> shape;

    try {
        // estimate a chunk size based on the number of cores used
        shape = get_bulk_iteration_shape(policy, inititems, f1,
            first, count, chunk_size);

        workitems.reserve(shape.size());

        using hpx::util::bind;
        using hpx::util::functional::invoke_fused;
        using hpx::util::placeholders::_1;

        workitems = executor_traits::async_execute(
            policy.executor(),
            bind(invoke_fused(), std::forward<F1>(f1), _1),
            shape);
    }
    catch (std::bad_alloc const&) {
        return hpx::make_exceptional_future<FwdIter>(
            boost::current_exception());
    }
    catch (...) {
        errors.push_back(boost::current_exception());
    }

    // wait for all tasks to finish
    return hpx::dataflow(
        [last, errors](
            std::vector<hpx::future<Result> > && r1,
            std::vector<hpx::future<Result> > && r2
        ) mutable -> FwdIter
        {
            detail::handle_local_exceptions<ExPolicy>::call(r1, errors);
            detail::handle_local_exceptions<ExPolicy>::call(r2, errors);
            return last;
        },
        std::move(inititems), std::move(workitems));
}
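The call above fans each element of the iteration shape out to one task and fused-invokes f1 with the members of each (iterator, count) tuple. The sketch below is illustrative only: it uses std::pair and a plain sequential loop as stand-ins for hpx::util::tuple, invoke_fused and executor_traits::async_execute, none of which appear here.

// Minimal stand-alone sketch of the per-chunk fused invocation (not HPX API).
#include <cstddef>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

int main()
{
    std::vector<int> data(10);
    std::iota(data.begin(), data.end(), 0);

    using iterator = std::vector<int>::iterator;
    using chunk = std::pair<iterator, std::size_t>;   // (first, count)

    // shape as produced by get_bulk_iteration_shape: one tuple per chunk
    std::vector<chunk> shape = {
        { data.begin(),     4 },
        { data.begin() + 4, 4 },
        { data.begin() + 8, 2 }
    };

    // f1 operates on one chunk; async_execute would spawn one task per
    // element of 'shape' and invoke f1 with the tuple's members
    auto f1 = [](iterator first, std::size_t count)
    {
        long sum = 0;
        for (std::size_t i = 0; i != count; ++i)
            sum += first[i];
        return sum;
    };

    for (chunk const& c : shape)
        std::cout << "chunk sum: " << f1(c.first, c.second) << '\n';
}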
// requires traits::is_future<Future>
std::size_t get_static_chunk_size(ExPolicy policy,
    std::vector<Future>& workitems, F1 && f1,
    FwdIter& first, std::size_t& count, std::size_t chunk_size)
{
    if (chunk_size == 0)
    {
        chunk_size = policy.get_chunk_size();
        if (chunk_size == 0)
        {
            typedef typename ExPolicy::executor_type executor_type;
            std::size_t const cores = executor_traits<executor_type>::
                os_thread_count(policy.executor());

            if (count > 100 * cores)
                chunk_size = auto_chunk_size(workitems, f1, first, count);

            if (chunk_size == 0)
                chunk_size = (count + cores - 1) / cores;
        }
    }
    return chunk_size;
}
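For reference, here is a sketch of the decision cascade above under stated assumptions: static_chunk_size below is a hypothetical helper that only reproduces the fallback arithmetic, not the executor traits or the timing-based auto_chunk_size probe.

// Hypothetical helper, for illustration only.
#include <cstddef>
#include <iostream>

std::size_t static_chunk_size(std::size_t requested, std::size_t count,
    std::size_t cores)
{
    // 1. an explicitly requested chunk size (argument or policy) wins
    if (requested != 0)
        return requested;

    // 2. otherwise split the work evenly, one chunk per core
    //    (ceiling division, so the last chunk may be smaller)
    return (count + cores - 1) / cores;
}

int main()
{
    std::cout << static_chunk_size(0, 1000, 8) << '\n';   // 125
    std::cout << static_chunk_size(0, 10, 4)   << '\n';   // 3 -> chunks 3,3,3,1
    std::cout << static_chunk_size(16, 1000, 8) << '\n';  // 16 (explicit)
}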
// requires traits::is_future<Future>
std::vector<hpx::util::tuple<FwdIter, std::size_t> >
get_bulk_iteration_shape(ExPolicy policy, std::vector<Future>& workitems,
    F1 && f1, FwdIter& first, std::size_t& count, std::size_t chunk_size)
{
    typedef typename ExPolicy::executor_parameters_type parameters_type;
    typedef executor_parameter_traits<parameters_type> traits;
    typedef hpx::util::tuple<FwdIter, std::size_t> tuple_type;

    typedef typename ExPolicy::executor_type executor_type;
    std::size_t const cores = executor_traits<executor_type>::
        processing_units_count(policy.executor(), policy.parameters());

    bool variable_chunk_sizes = traits::variable_chunk_size(
        policy.parameters(), policy.executor());

    std::vector<tuple_type> shape;

    if (!variable_chunk_sizes || chunk_size != 0)
    {
        if (chunk_size == 0)
        {
            auto test_function =
                [&]() -> std::size_t
                {
                    std::size_t test_chunk_size = count / 100;
                    if (test_chunk_size == 0)
                        return 0;

                    add_ready_future(workitems, f1, first, test_chunk_size);

                    std::advance(first, test_chunk_size);
                    count -= test_chunk_size;

                    return test_chunk_size;
                };

            chunk_size = traits::get_chunk_size(policy.parameters(),
                policy.executor(), test_function, count);
        }

        if (chunk_size == 0)
            chunk_size = (count + cores - 1) / cores;

        shape.reserve(count / chunk_size + 1);
        while (count != 0)
        {
            std::size_t chunk = (std::min)(chunk_size, count);
            shape.push_back(hpx::util::make_tuple(first, chunk));
            count -= chunk;
            std::advance(first, chunk);
        }
    }
    else
    {
        while (count != 0)
        {
            chunk_size = traits::get_chunk_size(
                policy.parameters(), policy.executor(),
                [](){ return 0; }, count);

            if (chunk_size == 0)
                chunk_size = (count + cores - 1) / cores;

            std::size_t chunk = (std::min)(chunk_size, count);
            shape.push_back(hpx::util::make_tuple(first, chunk));
            count -= chunk;
            std::advance(first, chunk);
        }
    }
    return shape;
}
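To make the produced shape concrete, here is a self-contained sketch of the fixed-chunk-size splitting loop; std::pair stands in for hpx::util::tuple and the numbers are purely illustrative.

// Splitting 10 elements with chunk_size == 4 (not HPX API).
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <utility>
#include <vector>

int main()
{
    std::vector<int> data(10);
    auto first = data.begin();
    std::size_t count = data.size();
    std::size_t const chunk_size = 4;

    std::vector<std::pair<std::vector<int>::iterator, std::size_t> > shape;
    shape.reserve(count / chunk_size + 1);

    while (count != 0)
    {
        std::size_t chunk = (std::min)(chunk_size, count);
        shape.push_back(std::make_pair(first, chunk));
        count -= chunk;
        std::advance(first, chunk);
    }

    // prints: 0:4 4:4 8:2  (offset into the range : chunk length)
    for (auto const& t : shape)
        std::cout << std::distance(data.begin(), t.first)
                  << ':' << t.second << ' ';
    std::cout << '\n';
}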
typename util::detail::algorithm_result<ExPolicy, OutIter>::type
set_operation(ExPolicy policy,
    RanIter1 first1, RanIter1 last1, RanIter2 first2, RanIter2 last2,
    OutIter dest, F && f, Combiner && combiner, SetOp && setop)
{
    typedef typename std::iterator_traits<RanIter1>::difference_type
        difference_type1;
    typedef typename std::iterator_traits<RanIter2>::difference_type
        difference_type2;

    // allocate intermediate buffers
    difference_type1 len1 = std::distance(first1, last1);
    difference_type2 len2 = std::distance(first2, last2);

    typedef typename set_operations_buffer<OutIter>::type buffer_type;
    boost::shared_array<buffer_type> buffer(
        new buffer_type[combiner(len1, len2)]);

    typedef typename ExPolicy::executor_type executor_type;
    std::size_t cores = executor_information_traits<executor_type>::
        processing_units_count(policy.executor(), policy.parameters());

    std::size_t step = (len1 + cores - 1) / cores;
    boost::shared_array<set_chunk_data> chunks(new set_chunk_data[cores]);

    // fill the buffer piecewise
    return parallel::util::partitioner<ExPolicy, OutIter, void>::call(
        policy, chunks.get(), cores,
        // first step, applied to all partitions
        [=](set_chunk_data* curr_chunk, std::size_t part_size)
        {
            HPX_ASSERT(part_size == 1);

            // find start in sequence 1
            std::size_t start1 = (curr_chunk - chunks.get()) * step;
            std::size_t end1 = (std::min)(start1 + step, std::size_t(len1));

            bool first_partition = (start1 == 0);
            bool last_partition = (end1 == std::size_t(len1));

            // all but the last chunk require special handling
            if (!last_partition)
            {
                // this chunk will be handled by the next one if all
                // elements of this partition are equal
                if (!f(first1[start1], first1[end1 + 1]))
                    return;

                // move backwards to find the earliest element which is
                // equal to the last element of the current chunk
                while (end1 != 0 && !f(first1[end1 - 1], first1[end1]))
                    --end1;
            }

            // move backwards to find the earliest element which is equal
            // to the first element of the current chunk
            while (start1 != 0 && !f(first1[start1 - 1], first1[start1]))
                --start1;

            // find start and end in sequence 2
            std::size_t start2 = 0;
            if (!first_partition)
            {
                start2 = std::lower_bound(
                        first2, first2 + len2, first1[start1], f
                    ) - first2;
            }

            std::size_t end2 = len2;
            if (!last_partition)
            {
                end2 = std::lower_bound(
                        first2 + start2, first2 + len2, first1[end1], f
                    ) - first2;
            }

            // perform the requested set-operation into the proper place
            // of the intermediate buffer
            curr_chunk->start = combiner(start1, start2);
            auto buffer_dest = buffer.get() + curr_chunk->start;
            curr_chunk->len = setop(first1 + start1, first1 + end1,
                    first2 + start2, first2 + end2, buffer_dest, f
                ) - buffer_dest;
        },
        // second step, executed after all partitions are done running
        [buffer, chunks, cores, dest](std::vector<future<void> >&&) -> OutIter
        {
            // accumulate the real lengths
            set_chunk_data* chunk = chunks.get();
            chunk->start_index = 0;
            for (size_t i = 1; i != cores; ++i)
            {
                set_chunk_data* curr_chunk = chunk++;
                chunk->start_index =
                    curr_chunk->start_index + curr_chunk->len;
            }

            // finally, copy data to destination
            parallel::util::foreach_partitioner<
                hpx::parallel::parallel_execution_policy
            >::call(par, chunks.get(), cores,
                [buffer, dest](
                    set_chunk_data* chunk, std::size_t, std::size_t)
                {
                    std::copy(buffer.get() + chunk->start,
                        buffer.get() + chunk->start + chunk->len,
                        dest + chunk->start_index);
                },
                [](set_chunk_data* last) -> set_chunk_data* { return last; });

            return dest;
        });
}
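The bookkeeping in the second step is an exclusive prefix sum over the per-chunk result lengths: each chunk's start_index is the sum of the lengths produced by all preceding chunks, which tells the final copy where that chunk's output belongs in the destination. A minimal sketch follows; the chunk_data struct is redefined locally for illustration and is not the HPX set_chunk_data type.

// Stand-alone sketch of the start_index accumulation (illustrative only).
#include <cstddef>
#include <iostream>
#include <vector>

struct chunk_data
{
    std::size_t len;          // number of elements this chunk produced
    std::size_t start_index;  // where they go in the final output
};

int main()
{
    std::vector<chunk_data> chunks = { {3, 0}, {5, 0}, {2, 0}, {4, 0} };

    chunk_data* chunk = chunks.data();
    chunk->start_index = 0;
    for (std::size_t i = 1; i != chunks.size(); ++i)
    {
        chunk_data* curr_chunk = chunk++;
        chunk->start_index = curr_chunk->start_index + curr_chunk->len;
    }

    // prints: 0 3 8 10 -- the exclusive prefix sum of the lengths
    for (chunk_data const& c : chunks)
        std::cout << c.start_index << ' ';
    std::cout << '\n';
}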