std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   return factor::find_grid_optimal_factor<size_t>(
      q.device().max_threads_per_block(), q.device().max_block_size(),
      grid_size);
}
/// Select best launch configuration for the given shared memory requirements.
void config(const command_queue &q, std::function<size_t(size_t)> smem) {
    // Select workgroup size that would fit into the device.
    size_t ws = q.device().max_threads_per_block() / 2;

    size_t max_ws   = max_threads_per_block(q);
    size_t max_smem = max_shared_memory_per_block(q);

    // Reduce workgroup size until it satisfies resource requirements:
    while( (ws > max_ws) || (smem(ws) > max_smem) )
        ws /= 2;

    config(num_workgroups(q), ws);
}
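A minimal usage sketch of the overload above: the caller passes a functor that reports how many bytes of shared memory a workgroup of a given size would need, and the backend halves the workgroup size until both limits are satisfied. `K` (an already-built backend kernel) and `queue` are assumed to exist; `double` stands in for whatever element type the kernel actually reduces over.

```cpp
// Hypothetical caller: one double of shared memory per work-item.
K.config(queue, [](size_t wgs) {
    return wgs * sizeof(double);
});
```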
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().build(q.device()).binary;
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.
   info.work_dim = grid_size.size();
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);
   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
   exec.unbind();
}
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().binary(q.device());
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   q.pipe->launch_grid(q.pipe,
                       pad_vector(q, block_size, 1).data(),
                       pad_vector(q, reduced_grid_size, 1).data(),
                       find(name_equals(_name), m.syms).offset,
                       exec.input.data());

   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);

   exec.unbind();
}
template<typename V>
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   std::vector<uint> w { v.begin(), v.end() };
   w.resize(q.device().max_block_size().size(), x);
   return w;
}
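An illustrative sketch of what the padding does, assuming a device whose max_block_size() reports three dimensions (the common case); `q` is assumed to be a valid command_queue:

```cpp
// A one-dimensional block size is padded with the fill value so that
// launch_grid() always receives one component per supported dimension.
std::vector<size_t> block = { 256 };
auto padded = pad_vector(q, block, 1);
// padded == { 256, 1, 1 }
```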
/// The size in bytes of shared memory per block available for this kernel.
size_t max_shared_memory_per_block(const command_queue &q) const {
    return q.device().max_shared_memory_per_block() - shared_size_bytes();
}
/// Standard number of workgroups to launch on a device.
static inline size_t num_workgroups(const command_queue &q) {
    return 8 * q.device().multiprocessor_count();
}
/// Create and build a program from source string.
inline vex::backend::program build_sources(
        const command_queue &queue, const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    std::string compile_options = options + " " + get_compile_options(queue);

    queue.context().set_current();

    auto cc = queue.device().compute_capability();
    std::ostringstream ccstr;
    ccstr << std::get<0>(cc) << std::get<1>(cc);

    sha1_hasher sha1;
    sha1.process(source)
        .process(queue.device().name())
        .process(compile_options)
        .process(ccstr.str())
        ;

    std::string hash = static_cast<std::string>(sha1);

    // Write source to a .cu file
    std::string basename = program_binaries_path(hash, true) + "kernel";
    std::string ptxfile  = basename + ".ptx";

    if ( !boost::filesystem::exists(ptxfile) ) {
        std::string cufile = basename + ".cu";

        {
            std::ofstream f(cufile);
            f << source;
        }

        // Compile the source to ptx.
        std::ostringstream cmdline;
        cmdline
            << "nvcc -ptx -O3"
            << " -arch=sm_" << std::get<0>(cc) << std::get<1>(cc)
            << " " << compile_options
            << " -o " << ptxfile
            << " " << cufile;

        if (0 != system(cmdline.str().c_str()) ) {
#ifndef VEXCL_SHOW_KERNELS
            std::cerr << source << std::endl;
#endif
            vex::detail::print_backtrace();
            throw std::runtime_error("nvcc invocation failed");
        }
    }

    // Load the compiled ptx.
    CUmodule prg;
    cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) );

    return program(queue.context(), prg);
}
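A minimal usage sketch for the builder above, assuming a valid `vex::backend::command_queue` named `queue`; the kernel source and the extra `-use_fast_math` option are only illustrative. The first call shells out to nvcc and caches the PTX under the hash-derived path; subsequent calls with the same source, device, and options reuse the cached file.

```cpp
// Hypothetical caller: compile (or reuse the cached PTX of) a small kernel.
std::string source = R"(
extern "C" __global__ void scale(int n, double a, double *x) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
)";

auto prg = vex::backend::build_sources(queue, source, "-use_fast_math");
```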
/// Create command queue on the same context and device as the given one.
inline command_queue duplicate_queue(const command_queue &q) {
    return command_queue(q.context(), q.device(), q.flags());
}
/// Returns id of the device associated with the given queue.
inline device_id get_device_id(const command_queue &q) {
    return q.device().raw();
}
/// Returns device associated with the given queue.
inline device get_device(const command_queue &q) {
    return q.device();
}