inline void test_fill(T v1, T v2, T v3, bc::command_queue queue)
{
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());

    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
        InputRange const& initial_delta, OutputRange& result_prev_delta,
        boost::compute::command_queue& queue) {
    gpu_vector delta(initial_delta.begin(), initial_delta.end(), queue);
    gpu_vector prev_delta(queue.get_context());
    for(int i = layers.size()-1; i >= 0; --i) {
        auto& l = layers.at(i);
        prev_delta.resize(::neu::layer::whole_input_size(l), queue);
        auto prev_delta_range = range::to_range(prev_delta);
#ifdef NEU_BENCHMARK_ENABLE
        boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
        l.backward(range::to_range(delta), prev_delta_range, queue);
#ifdef NEU_BENCHMARK_ENABLE
        queue.finish();
        std::cout << "layer" << i << "\tbackward\t"
            << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
        delta.swap(prev_delta);
    }
    range::copy(delta, result_prev_delta, queue);
}
decltype(auto) matrix_transpose(InputRange const& input, OutputRange& output,
        int row_size, int col_size, boost::compute::command_queue& queue) {
    NEU_ASSERT(row_size*col_size == range::distance(input));
    static auto transpose_kernel = neu::make_kernel(
        neu::layer::impl::matrix_transpose_kernel_source,
        "matrix_transpose", queue.get_context());
    transpose_kernel.set_args(
        range::get_buffer(input),
        static_cast<cl_int>(range::get_begin_index(input)),
        range::get_buffer(output),
        static_cast<cl_int>(range::get_begin_index(output)),
        static_cast<cl_int>(row_size),
        static_cast<cl_int>(col_size));
    std::size_t global[2] = {
        static_cast<std::size_t>(((col_size-1)/32+1)*32),
        static_cast<std::size_t>(((row_size-1)/32+1)*32)
    };
    std::size_t local[2] = {
        static_cast<std::size_t>(32),
        static_cast<std::size_t>(32)
    };
    queue.enqueue_nd_range_kernel(transpose_kernel, 2, nullptr, global, local);
}
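// Hedged usage sketch for matrix_transpose() above (not part of the original
// source): transpose a small matrix held in a gpu_vector, using the neu range
// helpers that appear elsewhere in this file; the sizes are invented.
//
//   gpu_vector a(3*4, queue.get_context());
//   gpu_vector at(4*3, queue.get_context());
//   auto at_range = range::to_range(at);
//   matrix_transpose(range::to_range(a), at_range, 3, 4, queue);
//   queue.finish();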
void perf_random_number_engine(const size_t size, const size_t trials,
        compute::command_queue& queue)
{
    typedef typename Engine::result_type T;

    // create random number engine
    Engine engine(queue);

    // create vector on the device
    std::cout << "size = " << size << std::endl;
    compute::vector<T> vector(size, queue.get_context());

    // generate random numbers
    perf_timer t;
    for(size_t i = 0; i < trials; i++){
        t.start();
        engine.generate(vector.begin(), vector.end(), queue);
        queue.finish();
        t.stop();
    }

    // print result
    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
    std::cout << "rate: " << perf_rate<T>(size, t.min_time()) << " MB/s" << std::endl;
}
static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
        int batch_size, InputRange const& initial_input, OutputRange& result_output,
        boost::compute::command_queue& queue) {
    gpu_vector input(initial_input.begin(), initial_input.end(), queue);
    gpu_vector output(queue.get_context());
    int i = 0;
    for(auto& l : layers) {
        output.resize(::neu::layer::output_dim(l)*batch_size, queue);
        auto output_range = range::to_range(output);
#ifdef NEU_BENCHMARK_ENABLE
        boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
        l.test_forward(batch_size, range::to_range(input), output_range, queue);
#ifdef NEU_BENCHMARK_ENABLE
        queue.finish();
        std::cout << "layer" << i << "\ttest_forward\t"
            << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
        input.swap(output);
        ++i;
    }
    range::copy(input, result_output, queue);
}
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue)
{
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill_n<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());

    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
void read(boost::compute::command_queue q, size_t offset, size_t size,
        T *host, bool blocking = false) const
{
    if (size) {
        if (blocking) {
            q.enqueue_read_buffer(
                    buffer, sizeof(T) * offset, sizeof(T) * size, host);
        } else {
            q.enqueue_read_buffer_async(
                    buffer, sizeof(T) * offset, sizeof(T) * size, host);
        }
    }
}
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue)
{
    bc::vector<T> vector(4, queue.get_context());

    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue)
{
    bc::vector<T> vector(4, queue.get_context());

    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
device_vector(const boost::compute::command_queue &q, size_t n,
        const T *host = 0, mem_flags flags = MEM_READ_WRITE)
{
    if (host && !(flags & CL_MEM_USE_HOST_PTR))
        flags |= CL_MEM_COPY_HOST_PTR;

    if (n)
        buffer = boost::compute::buffer(q.get_context(), n * sizeof(T),
                flags, static_cast<void*>(const_cast<T*>(host)));
}
void generate(OutputIterator first, OutputIterator last, Function op,
        boost::compute::command_queue &queue)
{
    boost::compute::vector<T> tmp(std::distance(first, last), queue.get_context());

    BOOST_COMPUTE_FUNCTION(T, max_random, (const T x),
    {
        if(get_global_id(0) < 1)
            return (ValueType) MAX_RANDOM;
        else
            return (ValueType) 0;
    });
mapped_array map(boost::compute::command_queue q) const {
    return mapped_array(
            static_cast<T*>(
                q.enqueue_map_buffer(
                    buffer, CL_MAP_READ, 0, size() * sizeof(T))
                ),
            buffer_unmapper(q, buffer)
            );
}
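// Hedged usage sketch for the device_vector members above (dvec, n and T stand
// for an instance of the surrounding class, an element count, and its value
// type; none of these names come from the original source):
//
//   std::vector<T> host(n);
//   dvec.read(queue, 0, n, host.data(), /*blocking=*/true); // copy back to the host
//   auto view = dvec.map(queue);                            // or map the buffer for direct access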
void saxpy(const int num, bool gen = true, int iter = 0)
{
    static compute::device gpu;
    static compute::context context;
    static compute::command_queue queue;
    static compute::vector<T> x;
    static compute::vector<T> y;
    static compute::vector<T> res;
    static T alpha = 3.5;

    using compute::lambda::_1;
    using compute::lambda::_2;

    if (gen) {
        gpu = compute::system::default_device();
        context = compute::context(gpu);
        queue = compute::command_queue(context, gpu);

        x = compute::vector<T>(num, context);
        std::vector<T> h_x(num);
        std::generate(h_x.begin(), h_x.end(), rand);
        compute::copy(h_x.begin(), h_x.end(), x.begin(), queue);

        y = compute::vector<T>(num, context);
        std::vector<T> h_y(num);
        std::generate(h_y.begin(), h_y.end(), rand);
        compute::copy(h_y.begin(), h_y.end(), y.begin(), queue);

        res = compute::vector<T>(num, context);
        queue.finish();
    }

    for (int i = 0; i < iter; i++) {
        compute::transform(x.begin(), x.end(), y.begin(), res.begin(),
                alpha * _1 + _2, queue);
    }
    queue.finish();
}
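// Minimal driver sketch for the saxpy() benchmark above, assuming it is a
// function template over T; the warm-up/measure call pattern and the problem
// size are assumptions, not part of the original.
int main()
{
    const int num = 1 << 20;
    saxpy<float>(num, true, 0);    // allocate and fill the device vectors once
    saxpy<float>(num, false, 100); // reuse them for 100 timed transform calls
    return 0;
}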
double perf_accumulate(const compute::vector<T>& data,
        const size_t trials, compute::command_queue& queue)
{
    perf_timer t;
    for(size_t trial = 0; trial < trials; trial++){
        t.start();
        compute::accumulate(data.begin(), data.end(), T(0), queue);
        queue.finish();
        t.stop();
    }
    return t.min_time();
}
static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
        boost::compute::command_queue& queue) {
    int i = 0;
    for(auto& l : layers) {
#ifdef NEU_BENCHMARK_ENABLE
        boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
        l.update(queue);
#ifdef NEU_BENCHMARK_ENABLE
        queue.finish();
        std::cout << "layer" << i << "\tupdate\t"
            << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
        ++i;
    }
}
void test_copy_if_odd(compute::command_queue &queue)
{
    // create input and output vectors on the device
    const compute::context &context = queue.get_context();
    compute::vector<int> input(PERF_N, context);
    compute::vector<int> output(PERF_N, context);

    // generate random numbers between 1 and 10
    compute::default_random_engine rng(queue);
    compute::uniform_int_distribution<int> d(1, 10);
    d.generate(input.begin(), input.end(), rng, queue);

    BOOST_COMPUTE_FUNCTION(bool, is_odd, (int x),
    {
        return x & 1;
    });
// tesselates a sphere with radius, phi_slices, and theta_slices. returns
// a shared opencl/opengl buffer containing the vertex data.
compute::opengl_buffer tesselate_sphere(float radius,
                                        size_t phi_slices,
                                        size_t theta_slices,
                                        compute::command_queue &queue)
{
    using compute::dim;

    const compute::context &context = queue.get_context();

    const size_t vertex_count = phi_slices * theta_slices;

    // create opengl buffer
    GLuint vbo;
    vtkgl::GenBuffersARB(1, &vbo);
    vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, vbo);
    vtkgl::BufferDataARB(vtkgl::ARRAY_BUFFER,
                         sizeof(float) * 4 * vertex_count,
                         NULL,
                         vtkgl::STREAM_DRAW);
    vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, 0);

    // create shared opengl/opencl buffer
    compute::opengl_buffer vertex_buffer(context, vbo);

    // tesselate_sphere kernel source
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void tesselate_sphere(float radius,
                                       uint phi_slices,
                                       uint theta_slices,
                                       __global float4 *vertex_buffer)
        {
            const uint phi_i = get_global_id(0);
            const uint theta_i = get_global_id(1);

            const float phi = phi_i * 2.f * M_PI_F / phi_slices;
            const float theta = theta_i * 2.f * M_PI_F / theta_slices;

            float4 v;
            v.x = radius * cos(theta) * cos(phi);
            v.y = radius * cos(theta) * sin(phi);
            v.z = radius * sin(theta);
            v.w = 1.f;

            vertex_buffer[phi_i*phi_slices+theta_i] = v;
        }
    );
/**
 * If the VEXCL_CACHE_KERNELS macro is defined, then program binaries are cached
 * in the filesystem and reused in subsequent runs.
 */
inline boost::compute::program build_sources(
        const boost::compute::command_queue &queue,
        const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    return boost::compute::program::build_with_source(source, queue.get_context(),
            options + " " + get_compile_options(queue));
}
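// Hedged usage sketch for build_sources(): the kernel source and the name
// make_scale_kernel are invented for illustration; only program::create_kernel()
// comes from Boost.Compute itself.
inline boost::compute::kernel make_scale_kernel(const boost::compute::command_queue &queue)
{
    const std::string src =
        "kernel void scale(global float *x, float a, uint n) {\n"
        "    uint i = get_global_id(0);\n"
        "    if (i < n) x[i] *= a;\n"
        "}\n";
    // build (or fetch from the cache) and extract the kernel by name
    return build_sources(queue, src).create_kernel("scale");
}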
inline void box_filter_image(const compute::image2d &input,
                             compute::image2d &output,
                             compute::uint_ box_height,
                             compute::uint_ box_width,
                             compute::command_queue &queue)
{
    using compute::dim;

    const compute::context &context = queue.get_context();

    // simple box filter kernel source
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void box_filter(__read_only image2d_t input,
                                 __write_only image2d_t output,
                                 uint box_height,
                                 uint box_width)
        {
            int x = get_global_id(0);
            int y = get_global_id(1);
            int h = get_image_height(input);
            int w = get_image_width(input);
            int k = box_width;
            int l = box_height;

            if(x < k/2 || y < l/2 || x >= w-(k/2) || y >= h-(l/2)){
                write_imagef(output, (int2)(x, y), (float4)(0, 0, 0, 1));
            }
            else {
                const sampler_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

                float4 sum = { 0, 0, 0, 0 };
                for(int i = 0; i < k; i++){
                    for(int j = 0; j < l; j++){
                        sum += read_imagef(input, sampler, (int2)(x+i-k, y+j-l));
                    }
                }
                sum /= (float) k * l;
                float4 value = (float4)( sum.x, sum.y, sum.z, 1.f );
                write_imagef(output, (int2)(x, y), value);
            }
        }
/// Select best launch configuration for the given shared memory requirements.
void config(const boost::compute::command_queue &queue,
        std::function<size_t(size_t)> smem)
{
    boost::compute::device dev = queue.get_device();

    size_t ws;

    if ( is_cpu(queue) ) {
        ws = 1;
    } else {
        // Select workgroup size that would fit into the device.
        ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (ws > max_ws) || (smem(ws) > max_smem) )
            ws /= 2;
    }

    config(num_workgroups(queue), ws);
}
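// Hedged usage sketch for config() above: `krn` stands for an instance of
// whatever kernel wrapper class these members belong to (a name invented here);
// the lambda maps a candidate workgroup size to the bytes of local memory the
// kernel would need at that size, one double per work-item in this example.
//
//   krn.config(queue, [](size_t wgs) { return wgs * sizeof(double); });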
void tune_accumulate(const compute::vector<T>& data,
        const size_t trials, compute::command_queue& queue)
{
    boost::shared_ptr<compute::detail::parameter_cache> params =
        compute::detail::parameter_cache::get_global_cache(queue.get_device());

    const std::string cache_key =
        std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();

    const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
    const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8,
                                    9, 10, 11, 12, 13, 14, 15, 16 };

    double min_time = std::numeric_limits<double>::max();
    compute::uint_ best_tpb = 0;
    compute::uint_ best_vpt = 0;

    for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
        params->set(cache_key, "tpb", tpbs[i]);
        for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
            params->set(cache_key, "vpt", vpts[j]);

            try {
                const double t = perf_accumulate(data, trials, queue);
                if(t < min_time){
                    best_tpb = tpbs[i];
                    best_vpt = vpts[j];
                    min_time = t;
                }
            }
            catch(compute::opencl_error&){
                // invalid parameters for this device, skip
            }
        }
    }

    // store optimal parameters
    params->set(cache_key, "tpb", best_tpb);
    params->set(cache_key, "vpt", best_vpt);
}
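// Minimal driver sketch (an assumption, not part of the original benchmark):
// fill a device vector with random values, tune the reduction parameters, then
// report the best accumulate time found afterwards. min_time() is treated as
// nanoseconds, as in the other perf snippets above.
void run_accumulate_benchmark(size_t size, size_t trials, compute::command_queue& queue)
{
    compute::vector<int> data(size, queue.get_context());
    compute::default_random_engine rng(queue);
    rng.generate(data.begin(), data.end(), queue);

    tune_accumulate(data, trials, queue);
    std::cout << "time: " << perf_accumulate(data, trials, queue) / 1e6
              << " ms" << std::endl;
}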
vector(vector_expression<E, gpu_tag> const& e, boost::compute::command_queue& queue)
: m_storage(e().size(), queue.get_context())
, m_queue(&queue)
{
    assign(*this, e);
}
void medianFilter2D_wrapper(compute::command_queue queue,
        boost::compute::program foo_program,
        compute::buffer gpu_in, compute::buffer gpu_out, compute::buffer gpu_histogram,
        int heightImage, int widthImage, int implementation)
{
    try {
        boost::compute::kernel foo_kernel;
        switch(implementation) {
            case 1:
                std::cout << "running naive median filter" << std::endl;
                foo_kernel = foo_program.create_kernel("MedianFilter2D");
                break;
            case 2:
                std::cout << "running histogram median filter" << std::endl;
                foo_kernel = foo_program.create_kernel("MedianFilter2D_histogram");
                break;
            case 3:
                std::cout << "running median filter with partial selection" << std::endl;
                foo_kernel = foo_program.create_kernel("MedianFilter2D_partial");
                break;
            case 4:
                std::cout << "running median filter with forgetful selection" << std::endl;
                foo_kernel = foo_program.create_kernel("MedianFilter2D_forgetful");
                break;
            case 5:
                std::cout << "running median filter with fast histogram" << std::endl;
                foo_kernel = foo_program.create_kernel("histogram2d");
                break;
        }

        if(implementation != 5) {
            // arguments for the 2D median filter kernels
            foo_kernel.set_arg(0, gpu_in);
            foo_kernel.set_arg(1, gpu_out);
            foo_kernel.set_arg(2, sizeof(int), &widthImage);
            foo_kernel.set_arg(3, sizeof(int), &heightImage);
            // foo_kernel.set_arg(4, sizeof(unsigned int), &window_size);

            // launch the kernel over the whole image
            const size_t offset[] = { 0, 0 };
            const size_t bounds[] = { static_cast<size_t>(heightImage),
                                      static_cast<size_t>(widthImage) };

            timer kernel_timer1;
            queue.enqueue_nd_range_kernel(foo_kernel, 2, offset, bounds, 0);
            double time_elapsed1 = kernel_timer1.elapsed();
            printf("total time elapsed for the kernel implementation %d is %f \n",
                    implementation, time_elapsed1);
        } else {
            // arguments for the fast-histogram kernel
            foo_kernel.set_arg(0, gpu_in);
            foo_kernel.set_arg(1, gpu_out);
            foo_kernel.set_arg(2, gpu_histogram);
            foo_kernel.set_arg(3, sizeof(int), &widthImage);
            foo_kernel.set_arg(4, sizeof(int), &heightImage);
            // foo_kernel.set_arg(4, sizeof(unsigned int), &window_size);

            // launch a 1D range of heightImage work-items
            timer kernel_timer;
            queue.enqueue_1d_range_kernel(foo_kernel, 0, heightImage, 0);
            double time_elapsed = kernel_timer.elapsed();
            printf("total time elapsed for the kernel implementation %d is %f \n",
                    implementation, time_elapsed);
        }
    }
    catch(boost::compute::opencl_error &e) {
        std::cout << "something went wrong with kernel execution: "
                  << e.what() << std::endl;
    }
}
size_t preferred_work_group_size_multiple(const boost::compute::command_queue &q) const {
    return K.get_work_group_info<size_t>(q.get_device(),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE);
}
/// The size in bytes of shared memory per block available for this kernel.
size_t max_shared_memory_per_block(const boost::compute::command_queue &q) const {
    boost::compute::device d = q.get_device();

    return d.local_memory_size() -
        K.get_work_group_info<cl_ulong>(d, CL_KERNEL_LOCAL_MEM_SIZE);
}
/// The maximum number of threads per block, beyond which a launch of the kernel would fail.
size_t max_threads_per_block(const boost::compute::command_queue &q) const {
    return K.get_work_group_info<size_t>(q.get_device(), CL_KERNEL_WORK_GROUP_SIZE);
}
/// Standard number of workgroups to launch on a device.
static inline size_t num_workgroups(const boost::compute::command_queue &q) {
    // This is a simple heuristic-based estimate. A more advanced technique may
    // be employed later.
    return 8 * q.get_device().compute_units();
}
/// \brief Constructor of a vector with a default queue
///
/// Note that for all operations in which this vector appears on the left-hand side,
/// the kernels are enqueued on the supplied queue in case of a multi-queue setup.
vector(boost::compute::command_queue& queue = boost::compute::system::default_queue())
: m_storage(queue.get_context()), m_queue(&queue){}
/// Enqueue the kernel to the specified command queue.
void operator()(boost::compute::command_queue q) {
    q.enqueue_nd_range_kernel(K, 3, NULL, g_size.dim, w_size.dim);
    argpos = 0;
}
/// \brief Constructor of a vector with a predefined size
/// By default, its elements are uninitialized.
/// \param size initial size of the vector
/// \param queue the opencl queue to be used by this vector
explicit vector(size_type size,
        boost::compute::command_queue& queue = boost::compute::system::default_queue())
: m_storage(size, queue.get_context()), m_queue(&queue){}
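// Hedged usage sketch for the two constructors above (the element type and the
// size are invented for illustration; `vector` is the surrounding class template):
//
//   boost::compute::command_queue queue = boost::compute::system::default_queue();
//   vector<float> a;              // empty vector bound to the default queue
//   vector<float> b(1024, queue); // 1024 uninitialized elements on an explicit queue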