template<class T>
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue)
{
    // Skip double-precision types on devices without cl_khr_fp64 support.
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    // fill a small vector and check every element
    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    // fill a larger vector and spot-check both ends
    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    // fill only the second half and check the boundary
    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
template<class T>
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue)
{
    // Skip double-precision types on devices without cl_khr_fp64 support.
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill_n<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());

    // fill the whole vector
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    // fill the first three elements
    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    // fill two elements starting at the second
    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    // fill the whole vector again
    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
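// Illustrative Boost.Test driver for the two helpers above; `queue` is assumed to be
// provided by a fixture that sets up the OpenCL context, and the argument values are
// arbitrary.
BOOST_AUTO_TEST_CASE(fill_float)
{
    test_fill(0.5f, 1.0f, 2.5f, queue);
    test_fill_n(0.5f, 1.0f, 2.5f, queue);
}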
/// Select best launch configuration for the given shared memory requirements.
void config(const boost::compute::command_queue &queue,
            std::function<size_t(size_t)> smem)
{
    boost::compute::device dev = queue.get_device();

    size_t ws;

    if ( is_cpu(queue) ) {
        ws = 1;
    } else {
        // Select workgroup size that would fit into the device.
        ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (ws > max_ws) || (smem(ws) > max_smem) )
            ws /= 2;
    }

    config(num_workgroups(queue), ws);
}
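// Illustrative use of config() above, assuming `kernel_wrapper` is a hypothetical
// name for the class providing these members: the functor maps a candidate workgroup
// size to the bytes of local memory the kernel needs at that size, here one double
// per work-item.
inline void configure_reduction(kernel_wrapper &krn,
                                const boost::compute::command_queue &queue)
{
    krn.config(queue, [](size_t ws) { return ws * sizeof(cl_double); });
}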
template<class T>
void tune_accumulate(const compute::vector<T>& data,
                     const size_t trials,
                     compute::command_queue& queue)
{
    boost::shared_ptr<compute::detail::parameter_cache> params =
        compute::detail::parameter_cache::get_global_cache(queue.get_device());

    const std::string cache_key =
        std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();

    // candidate values for threads-per-block (tpb) and values-per-thread (vpt)
    const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
    const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8,
                                    9, 10, 11, 12, 13, 14, 15, 16 };

    double min_time = std::numeric_limits<double>::max();
    compute::uint_ best_tpb = 0;
    compute::uint_ best_vpt = 0;

    // exhaustively time every parameter combination
    for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
        params->set(cache_key, "tpb", tpbs[i]);
        for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
            params->set(cache_key, "vpt", vpts[j]);

            try {
                const double t = perf_accumulate(data, trials, queue);
                if(t < min_time){
                    best_tpb = tpbs[i];
                    best_vpt = vpts[j];
                    min_time = t;
                }
            }
            catch(compute::opencl_error&){
                // invalid parameters for this device, skip
            }
        }
    }

    // store optimal parameters
    params->set(cache_key, "tpb", best_tpb);
    params->set(cache_key, "vpt", best_vpt);
}
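// Illustrative driver for tune_accumulate() (the function name, data size, and the
// `context` parameter are assumptions): tune once to populate the parameter cache,
// then time the reduction again with the cached "tpb"/"vpt" values.
void run_tuned_accumulate(compute::context& context, compute::command_queue& queue)
{
    compute::vector<float> data(16 << 20, context);
    compute::fill(data.begin(), data.end(), 1.0f, queue);
    queue.finish();

    tune_accumulate(data, 10, queue);                  // writes tuned "tpb"/"vpt"
    std::cout << perf_accumulate(data, 10, queue)      // runs with tuned parameters
              << std::endl;
}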
/// The preferred multiple of workgroup size for this kernel.
size_t preferred_work_group_size_multiple(const boost::compute::command_queue &q) const {
    return K.get_work_group_info<size_t>(q.get_device(),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE);
}
/// The size in bytes of shared memory per block available for this kernel.
size_t max_shared_memory_per_block(const boost::compute::command_queue &q) const {
    boost::compute::device d = q.get_device();

    return d.local_memory_size() -
        K.get_work_group_info<cl_ulong>(d, CL_KERNEL_LOCAL_MEM_SIZE);
}
/// The maximum number of threads per block, beyond which a launch of the kernel would fail.
size_t max_threads_per_block(const boost::compute::command_queue &q) const {
    return K.get_work_group_info<size_t>(q.get_device(), CL_KERNEL_WORK_GROUP_SIZE);
}
/// Standard number of workgroups to launch on a device.
static inline size_t num_workgroups(const boost::compute::command_queue &q) {
    // This is a simple heuristic-based estimate. A more advanced technique may
    // be employed later.
    return 8 * q.get_device().compute_units();
}
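/// Sketch of how the heuristics above combine into a launch (a hypothetical member,
/// shown for illustration only): the global size is the chosen workgroup size times
/// the standard number of workgroups, enqueued with a zero offset.
void launch(boost::compute::command_queue &q, size_t ws) const {
    q.enqueue_1d_range_kernel(K, 0, ws * num_workgroups(q), ws);
}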