/// Compute a block size for launching this kernel over \a grid_size on the
/// device of queue \a q.
///
/// The device's per-block thread limit and its maximum block dimensions are
/// handed to factor::find_grid_optimal_factor(), whose result is returned
/// unchanged.
std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   auto &&dev = q.device();
   const auto threads_limit = dev.max_threads_per_block();
   const auto block_limit = dev.max_block_size();

   return factor::find_grid_optimal_factor<size_t>(
      threads_limit, block_limit, grid_size);
}
Example #2
0
        /// Select best launch configuration for the given shared memory requirements.
        ///
        /// \param q    Queue whose device limits bound the configuration.
        /// \param smem Callable returning the amount of shared memory required
        ///             for a given workgroup size; its result is compared
        ///             against max_shared_memory_per_block(q).
        ///
        /// The workgroup size starts at half of the device's maximum threads
        /// per block and is halved until it fits both max_threads_per_block(q)
        /// and max_shared_memory_per_block(q). Halving stops once the size
        /// reaches one, so the search always terminates even if no size
        /// satisfies the limits. The result is forwarded to
        /// config(num_workgroups(q), ws).
        void config(const command_queue &q, std::function<size_t(size_t)> smem) {
            // Select workgroup size that would fit into the device.
            size_t ws = q.device().max_threads_per_block() / 2;

            const size_t max_ws   = max_threads_per_block(q);
            const size_t max_smem = max_shared_memory_per_block(q);

            // Reduce workgroup size until it satisfies resource requirements.
            // The (ws > 1) guard prevents an endless loop: without it, ws
            // would reach 0 and stay there (0 / 2 == 0) whenever smem() keeps
            // exceeding max_smem.
            while( (ws > 1) && ((ws > max_ws) || (smem(ws) > max_smem)) )
                ws /= 2;

            config(num_workgroups(q), ws);
        }
Example #3
0
/// Launch this kernel on queue \a q.
///
/// Binds the execution context and all kernel resources (samplers, sampler
/// views, compute resources and global buffers) to the queue's pipe, fills a
/// pipe_grid_info structure, calls launch_grid(), and then resets every
/// binding again before issuing a global-buffer memory barrier.
///
/// \param q            Queue whose pipe receives the launch.
/// \param grid_offset  Forwarded to exec.bind().
/// \param grid_size    Grid extent; its element count becomes info.work_dim.
/// \param block_size   Block extent; also used to reduce \a grid_size.
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   // Binary of the program built for the queue's device; used below to look
   // up the entry offset of this kernel by name.
   const auto m = program().build(q.device()).binary;
   // grid_size combined with block_size through divides() -- presumably the
   // element-wise quotient, i.e. the number of blocks per dimension
   // (NOTE(review): confirm against map()/divides()).
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   // Compute state returned by bind(); it is handed to bind_compute_state().
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.
   // Each entry of exec.g_handles is used as an index into exec.input; the
   // address of that element is passed on to set_global_binding().
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   // Bind the compute state and every resource the kernel uses.
   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.
   info.work_dim = grid_size.size();
   // Block and grid vectors are padded with 1 before being copied into the
   // info arrays.
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   // Entry point: offset of the symbol whose name equals _name.
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

   // Reset the bindings made above by passing NULL for the same slot counts.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);

   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
   exec.unbind();
}
/// Launch this kernel on queue \a q.
///
/// Binds the execution context and all kernel resources (samplers, sampler
/// views, compute resources and global buffers) to the queue's pipe, calls
/// launch_grid() with the block size, reduced grid size, entry offset and
/// input buffer, and then resets every binding again.
///
/// \param q            Queue whose pipe receives the launch.
/// \param grid_offset  Forwarded to exec.bind().
/// \param grid_size    Grid extent, reduced by \a block_size before launch.
/// \param block_size   Block extent passed to launch_grid().
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   // Program binary for the queue's device; used below to look up the entry
   // offset of this kernel by name.
   const auto m = program().binary(q.device());
   // grid_size combined with block_size through divides() -- presumably the
   // element-wise quotient, i.e. the number of blocks per dimension
   // (NOTE(review): confirm against map()/divides()).
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   // Compute state returned by bind(); it is handed to bind_compute_state().
   void *st = exec.bind(&q, grid_offset);

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.
   // Each entry of exec.g_handles is used as an index into exec.input; the
   // address of that element is passed on to set_global_binding().
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   // Bind the compute state and every resource the kernel uses.
   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Block and grid vectors are padded with 1; the entry point is the offset
   // of the symbol whose name equals _name.
   q.pipe->launch_grid(q.pipe,
                       pad_vector(q, block_size, 1).data(),
                       pad_vector(q, reduced_grid_size, 1).data(),
                       find(name_equals(_name), m.syms).offset,
                       exec.input.data());

   // Reset the bindings made above by passing NULL for the same slot counts.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);
   exec.unbind();
}
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   // Copy the elements of \a v into a vector of uint, then resize the copy to
   // the number of entries reported by the device's max_block_size(): missing
   // trailing entries are filled with \a x, surplus entries are dropped.
   // NOTE(review): V is not declared in this block; presumably it is a
   // template parameter declared on a line outside this excerpt -- confirm.
   std::vector<uint> padded(v.begin(), v.end());
   const auto dims = q.device().max_block_size().size();
   padded.resize(dims, x);
   return padded;
}
Example #6
0
 /// Shared memory, in bytes, that a block of this kernel may still use:
 /// the device's per-block limit minus shared_size_bytes().
 size_t max_shared_memory_per_block(const command_queue &q) const {
     const auto device_limit = q.device().max_shared_memory_per_block();
     return device_limit - shared_size_bytes();
 }
Example #7
0
 /// Default workgroup count for a launch: eight per multiprocessor of the
 /// queue's device.
 static inline size_t num_workgroups(const command_queue &q) {
     const auto multiprocessors = q.device().multiprocessor_count();
     return 8 * multiprocessors;
 }
Example #8
0
/// Create and build a program from source string.
/**
 * The source is compiled to PTX by invoking nvcc and the resulting file is
 * loaded with cuModuleLoad(). The PTX file lives under a path derived from
 * the SHA1 hash of the source, the device name, the compile options and the
 * compute capability; nvcc is only invoked when that file does not exist yet.
 *
 * The source is echoed to std::cout when VEXCL_SHOW_KERNELS is defined as a
 * macro or set as an environment variable.
 *
 * \param queue   Queue providing the context and the target device.
 * \param source  Kernel source code.
 * \param options Extra compiler options; the result of
 *                get_compile_options(queue) is appended to them.
 * \return Program constructed from the queue's context and the loaded module.
 * \throws std::runtime_error if the nvcc command returns a non-zero status.
 */
inline vex::backend::program build_sources(
        const command_queue &queue, const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    std::string compile_options = options + " " + get_compile_options(queue);

    queue.context().set_current();

    // Compute capability as concatenated digits (major followed by minor);
    // used both in the cache hash and in the -arch flag.
    auto cc = queue.device().compute_capability();
    std::ostringstream ccstr;
    ccstr << std::get<0>(cc) << std::get<1>(cc);

    // Everything that influences the generated PTX goes into the hash.
    sha1_hasher sha1;
    sha1.process(source)
        .process(queue.device().name())
        .process(compile_options)
        .process(ccstr.str())
        ;

    std::string hash = static_cast<std::string>(sha1);

    // Location of the cached PTX file for this hash.
    std::string basename = program_binaries_path(hash, true) + "kernel";
    std::string ptxfile  = basename + ".ptx";

    if ( !boost::filesystem::exists(ptxfile) ) {
        std::string cufile = basename + ".cu";

        // Write source to a .cu file; the scope closes the stream before
        // nvcc reads the file.
        {
            std::ofstream f(cufile);
            f << source;
        }

        // Compile the source to ptx. Both paths are quoted so that a cache
        // directory containing spaces is passed to nvcc as a single argument.
        std::ostringstream cmdline;
        cmdline
            << "nvcc -ptx -O3"
            << " -arch=sm_" << ccstr.str()
            << " " << compile_options
            << " -o \"" << ptxfile << "\" \"" << cufile << "\"";
        if (0 != system(cmdline.str().c_str()) ) {
#ifndef VEXCL_SHOW_KERNELS
            // The source was not necessarily printed above; show it to help
            // diagnose the compilation failure.
            std::cerr << source << std::endl;
#endif

            vex::detail::print_backtrace();
            throw std::runtime_error("nvcc invocation failed");
        }
    }

    // Load the compiled ptx.
    CUmodule prg;
    cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) );

    return program(queue.context(), prg);
}
Example #9
0
/// Build a new command queue that shares the context, device and flags of
/// the queue passed in.
inline command_queue duplicate_queue(const command_queue &q) {
    auto &&ctx = q.context();
    auto &&dev = q.device();
    const auto flags = q.flags();

    return command_queue(ctx, dev, flags);
}
Example #10
0
/// Get the raw id of the device the given queue is bound to.
inline device_id get_device_id(const command_queue &q) {
    auto &&dev = q.device();
    return dev.raw();
}
Example #11
0
/// Get the device the given queue is bound to.
inline device get_device(const command_queue &q) {
    device dev = q.device();
    return dev;
}