kernel compile(const context &context, const std::string &options = std::string()) { // generate the program source std::string source = this->source(); // generate cache key std::string cache_key = detail::sha1(source); // try to look the program up in the cache boost::shared_ptr<program_cache> cache = get_program_cache(context); ::boost::compute::program program = cache->get(cache_key); // build the program if it was not in the cache if(!program.get()){ program = ::boost::compute::program::build_with_source( source, context, options ); cache->insert(cache_key, program); } // create kernel ::boost::compute::kernel kernel = program.create_kernel(name()); // bind stored args for(size_t i = 0; i < m_stored_args.size(); i++){ const detail::meta_kernel_stored_arg &arg = m_stored_args[i]; if(arg.m_size != 0){ kernel.set_arg(i, arg.m_size, arg.m_value); } } // bind buffer args for(size_t i = 0; i < m_stored_buffers.size(); i++){ const detail::meta_kernel_buffer_info &bi = m_stored_buffers[i]; kernel.set_arg(bi.index, bi.m_mem); } return kernel; }
inline void reduce_on_gpu(InputIterator first, InputIterator last, buffer_iterator<T> result, Function function, command_queue &queue) { const device &device = queue.get_device(); detail::meta_kernel k("reduce"); k.add_arg<T*>("__global const","input"); k.add_arg<const uint_>("offset"); k.add_arg<const uint_>("count"); k.add_arg<T*>("__global","output"); k.add_arg<const uint_>("output_offset"); k << k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" << "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" << k.decl<const uint_>("lid") << " = get_local_id(0);\n" << "__local " << type_name<T>() << " scratch[TPB];\n" << // private reduction k.decl<T>("sum") << " = 0;\n" << "for(uint i = 0; i < VPT; i++){\n" << " if(block_offset + lid + i*TPB < count){\n" << " sum = sum + block[lid+i*TPB]; \n" << " }\n" << "}\n" << "scratch[lid] = sum;\n"; // discrimination on vendor name if(is_nvidia_device(device)) k << ReduceBody<T,true>::body(); else k << ReduceBody<T,false>::body(); k << // write sum to output "if(lid == 0){\n" << " output[output_offset + get_group_id(0)] = scratch[0];\n" << "}\n"; uint_ vpt = 8; uint_ tpb = 128; size_t count = std::distance(first, last); const context &context = queue.get_context(); boost::shared_ptr<program_cache> cache = get_program_cache(context); std::string cache_key = std::string("boost_reduce_on_gpu_") + type_name<T>(); program reduce_program = cache->get(cache_key); if(!reduce_program.get()){ // create reduce program std::stringstream options; options << "-DT=" << type_name<T>() << " -DVPT=" << vpt << " -DTPB=" << tpb; reduce_program = program::build_with_source(k.source(), context, options.str()); cache->insert(cache_key, reduce_program); } // create reduce kernel kernel reduce_kernel(reduce_program, "reduce"); // first pass, reduce from input to ping buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T)); initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue); // update count after initial reduce count = std::ceil(float(count) / vpt / tpb); // middle pass(es), reduce between ping and pong const buffer *input_buffer = &ping; buffer pong(context, count / vpt / tpb * sizeof(T)); const buffer *output_buffer = &pong; if(count > vpt * tpb){ while(count > vpt * tpb){ reduce_kernel.set_arg(0, *input_buffer); reduce_kernel.set_arg(1, uint_(0)); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, *output_buffer); reduce_kernel.set_arg(4, uint_(0)); size_t work_size = std::ceil(float(count) / vpt); if(work_size % tpb != 0){ work_size += tpb - work_size % tpb; } queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); std::swap(input_buffer, output_buffer); count = std::ceil(float(count) / vpt / tpb); } } // final pass, reduce from ping/pong to result reduce_kernel.set_arg(0, *input_buffer); reduce_kernel.set_arg(1, uint_(0)); reduce_kernel.set_arg(2, uint_(count)); reduce_kernel.set_arg(3, result.get_buffer()); reduce_kernel.set_arg(4, uint_(result.get_index())); queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb); }