/// Maps the buffer into the host address space. /// /// \see_opencl_ref{clEnqueueMapBuffer} void map(cl_map_flags flags, command_queue &queue) { BOOST_ASSERT(m_mapped_ptr == 0); m_mapped_ptr = queue.enqueue_map_buffer( m_buffer, flags, 0, m_buffer.size() ); }
inline DeviceIterator copy_to_device_map(HostIterator first, HostIterator last, DeviceIterator result, command_queue &queue, const wait_list &events) { typedef typename std::iterator_traits<DeviceIterator>::value_type value_type; typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type; size_t count = iterator_range_size(first, last); if(count == 0){ return result; } size_t offset = result.get_index(); // map result buffer to host value_type *pointer = static_cast<value_type*>( queue.enqueue_map_buffer( result.get_buffer(), CL_MAP_WRITE, offset * sizeof(value_type), count * sizeof(value_type), events ) ); // copy [first; last) to result buffer std::copy(first, last, pointer); // unmap result buffer boost::compute::event unmap_event = queue.enqueue_unmap_buffer( result.get_buffer(), static_cast<void*>(pointer) ); unmap_event.wait(); return result + static_cast<difference_type>(count); }
InputIterator find_extrema_with_reduce(InputIterator first, InputIterator last, ::boost::compute::less< typename std::iterator_traits< InputIterator >::value_type > compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; typedef typename std::iterator_traits<InputIterator>::value_type input_type; const context &context = queue.get_context(); const device &device = queue.get_device(); // Getting information about used queue and device const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); const size_t count = detail::iterator_range_size(first, last); std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + type_name<input_type>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // get preferred work group size and preferred number // of work groups per compute unit size_t work_group_size = parameters->get(cache_key, "wgsize", 256); size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64); // calculate work group size and number of work groups work_group_size = (std::min)(max_work_group_size, work_group_size); size_t work_groups_no = compute_units_no * work_groups_per_cu; work_groups_no = (std::min)( work_groups_no, static_cast<size_t>(std::ceil(float(count) / work_group_size)) ); // phase I: finding candidates for extremum // device buffors for extremum candidates and their indices // each work-group computes its candidate // zero-copy buffers are used to eliminate copying data back to host vector<input_type, ::boost::compute::pinned_allocator<input_type> > candidates(work_groups_no, context); vector<uint_, ::boost::compute::pinned_allocator <uint_> > candidates_idx(work_groups_no, context); // finding candidates for first extremum and their indices find_extrema_with_reduce( first, count, candidates.begin(), candidates_idx.begin(), work_groups_no, work_group_size, compare, find_minimum, queue ); // phase II: finding extremum from among the candidates // mapping candidates and their indices to host input_type* candidates_host_ptr = static_cast<input_type*>( queue.enqueue_map_buffer( candidates.get_buffer(), command_queue::map_read, 0, work_groups_no * sizeof(input_type) ) ); uint_* candidates_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( candidates_idx.get_buffer(), command_queue::map_read, 0, work_groups_no * sizeof(uint_) ) ); input_type* i = candidates_host_ptr; uint_* idx = candidates_idx_host_ptr; uint_* extremum_idx = idx; input_type extremum = *candidates_host_ptr; i++; idx++; // find extremum (serial) from among the candidates on host if(!find_minimum) { while(idx != (candidates_idx_host_ptr + work_groups_no)) { input_type next = *i; bool compare_result = next > extremum; bool equal = next == extremum; extremum = compare_result ? next : extremum; extremum_idx = compare_result ? idx : extremum_idx; extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; idx++, i++; } } else { while(idx != (candidates_idx_host_ptr + work_groups_no)) { input_type next = *i; bool compare_result = next < extremum; bool equal = next == extremum; extremum = compare_result ? next : extremum; extremum_idx = compare_result ? idx : extremum_idx; extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; idx++, i++; } } return first + static_cast<difference_type>(*extremum_idx); }
InputIterator find_extrema_with_reduce(InputIterator first, InputIterator last, Compare compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; typedef typename std::iterator_traits<InputIterator>::value_type input_type; const context &context = queue.get_context(); const device &device = queue.get_device(); // Getting information about used queue and device const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); const size_t count = detail::iterator_range_size(first, last); std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + type_name<input_type>(); // load parameters boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); // get preferred work group size and preferred number // of work groups per compute unit size_t work_group_size = parameters->get(cache_key, "wgsize", 256); size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100); // calculate work group size and number of work groups work_group_size = (std::min)(max_work_group_size, work_group_size); size_t work_groups_no = compute_units_no * work_groups_per_cu; work_groups_no = (std::min)( work_groups_no, static_cast<size_t>(std::ceil(float(count) / work_group_size)) ); // phase I: finding candidates for extremum // device buffors for extremum candidates and their indices // each work-group computes its candidate vector<input_type> candidates(work_groups_no, context); vector<uint_> candidates_idx(work_groups_no, context); // finding candidates for first extremum and their indices find_extrema_with_reduce( first, count, candidates.begin(), candidates_idx.begin(), work_groups_no, work_group_size, compare, find_minimum, queue ); // phase II: finding extremum from among the candidates // zero-copy buffers for final result (value and index) vector<input_type, ::boost::compute::pinned_allocator<input_type> > result(1, context); vector<uint_, ::boost::compute::pinned_allocator<uint_> > result_idx(1, context); // get extremum from among the candidates find_extrema_with_reduce( candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(), result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue ); // mapping extremum index to host uint_* result_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( result_idx.get_buffer(), command_queue::map_read, 0, sizeof(uint_) ) ); return first + static_cast<difference_type>(*result_idx_host_ptr); }
inline InputIterator find_extrema_on_cpu(InputIterator first, InputIterator last, Compare compare, const bool find_minimum, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type input_type; typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; size_t count = iterator_range_size(first, last); const device &device = queue.get_device(); const uint_ compute_units = queue.get_device().compute_units(); boost::shared_ptr<parameter_cache> parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_find_extrema_cpu_" + boost::lexical_cast<std::string>(sizeof(input_type)); // for inputs smaller than serial_find_extrema_threshold // serial_find_extrema algorithm is used uint_ serial_find_extrema_threshold = parameters->get( cache_key, "serial_find_extrema_threshold", 16384 * sizeof(input_type) ); serial_find_extrema_threshold = (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units)); const context &context = queue.get_context(); if(count < serial_find_extrema_threshold) { return serial_find_extrema(first, last, compare, find_minimum, queue); } meta_kernel k("find_extrema_on_cpu"); buffer output(context, sizeof(input_type) * compute_units); buffer output_idx( context, sizeof(uint_) * compute_units, buffer::read_write | buffer::alloc_host_ptr ); size_t count_arg = k.add_arg<uint_>("count"); size_t output_arg = k.add_arg<input_type *>(memory_object::global_memory, "output"); size_t output_idx_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output_idx"); k << "uint block = " << "(uint)ceil(((float)count)/get_global_size(0));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << "uint value_index = index;\n" << k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << "index++;\n" << "while(index < end){\n" << k.decl<input_type>("candidate") << " = " << first[k.var<uint_>("index")] << ";\n" << "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << "bool compare = " << compare(k.var<input_type>("candidate"), k.var<input_type>("value")) << ";\n" << "#else\n" << "bool compare = " << compare(k.var<input_type>("value"), k.var<input_type>("candidate")) << ";\n" << "#endif\n" << "value = compare ? candidate : value;\n" << "value_index = compare ? index : value_index;\n" << "index++;\n" << "}\n" << "output[get_global_id(0)] = value;\n" << "output_idx[get_global_id(0)] = value_index;\n"; size_t global_work_size = compute_units; std::string options; if(!find_minimum){ options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; } kernel kernel = k.compile(context, options); kernel.set_arg(count_arg, static_cast<uint_>(count)); kernel.set_arg(output_arg, output); kernel.set_arg(output_idx_arg, output_idx); queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0); buffer_iterator<input_type> result = serial_find_extrema( make_buffer_iterator<input_type>(output), make_buffer_iterator<input_type>(output, global_work_size), compare, find_minimum, queue ); uint_* output_idx_host_ptr = static_cast<uint_*>( queue.enqueue_map_buffer( output_idx, command_queue::map_read, 0, global_work_size * sizeof(uint_) ) ); difference_type extremum_idx = static_cast<difference_type>(*(output_idx_host_ptr + result.get_index())); return first + extremum_idx; }