/// Allocates memory buffer on the device associated with the given queue. device_vector(const command_queue &q, size_t n) : n(n) { if (n) { q.context().set_current(); CUdeviceptr ptr; cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) ); buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() ); } }
/// Copies data from host memory to device. void write(const command_queue &q, size_t offset, size_t size, const T *host, bool blocking = false) const { (void)blocking; if (size) { q.context().set_current(); cuda_check( cuMemcpyHtoD(raw() + offset * sizeof(T), host, size * sizeof(T)) ); } }
/// Copies data from device to host memory. void read(const command_queue &q, size_t offset, size_t size, T *host, bool blocking = false) const { (void)blocking; if (size) { q.context().set_current(); cuda_check( cuMemcpyDtoH(host, raw() + offset * sizeof(T), size * sizeof(T)) ); } }
/// Constructs a hard event for \a command issued on queue \a q.
///
/// The base event is built from the queue's context, the dependency list
/// \a deps, the result of profile(q, action) and an empty callback. The
/// fence starts out as NULL. When profiling is enabled on the queue, the
/// queueing timestamp is recorded. Finally the event is handed to
/// q.sequence() and trigger() is called, in that order.
hard_event::hard_event(command_queue &q, cl_command_type command,
                       const ref_vector<event> &deps, action action) :
   event(q.context(), deps, profile(q, action), [](event &) {}),
   _queue(q),
   _command(command),
   _fence(NULL) {
   if (q.profiling_enabled()) {
      _time_queued = timestamp::current(q);
   }

   q.sequence(*this);
   trigger();
}
/// Constructor. Extracts a backend::kernel instance from backend::program. kernel(const command_queue &queue, const program &P, const std::string &name, std::function<size_t(size_t)> smem ) : ctx(queue.context()), P(P), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, smem); }
/// Constructor. Creates a backend::kernel instance from source. kernel(const command_queue &queue, const std::string &src, const std::string &name, std::function<size_t(size_t)> smem, const std::string &options = "" ) : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, smem); }
/// Constructor. Extracts a backend::kernel instance from backend::program. kernel(const command_queue &queue, const program &P, const std::string &name, size_t smem_per_thread = 0 ) : ctx(queue.context()), P(P), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; }); }
/// Constructor. Creates a backend::kernel instance from source. kernel(const command_queue &queue, const std::string &src, const std::string &name, size_t smem_per_thread = 0, const std::string &options = "" ) : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0) { cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) ); config(queue, [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; }); }
device_vector(const command_queue &q, size_t n, const H *host = 0, mem_flags flags = MEM_READ_WRITE) : n(n) { (void)flags; if (n) { q.context().set_current(); CUdeviceptr ptr; cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) ); buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() ); if (host) { if (std::is_same<T, H>::value) write(q, 0, n, reinterpret_cast<const T*>(host), true); else write(q, 0, n, std::vector<T>(host, host + n).data(), true); } } }
/// Create and build a program from source string. inline vex::backend::program build_sources( const command_queue &queue, const std::string &source, const std::string &options = "" ) { #ifdef VEXCL_SHOW_KERNELS std::cout << source << std::endl; #else if (getenv("VEXCL_SHOW_KERNELS")) std::cout << source << std::endl; #endif std::string compile_options = options + " " + get_compile_options(queue); queue.context().set_current(); auto cc = queue.device().compute_capability(); std::ostringstream ccstr; ccstr << std::get<0>(cc) << std::get<1>(cc); sha1_hasher sha1; sha1.process(source) .process(queue.device().name()) .process(compile_options) .process(ccstr.str()) ; std::string hash = static_cast<std::string>(sha1); // Write source to a .cu file std::string basename = program_binaries_path(hash, true) + "kernel"; std::string ptxfile = basename + ".ptx"; if ( !boost::filesystem::exists(ptxfile) ) { std::string cufile = basename + ".cu"; { std::ofstream f(cufile); f << source; } // Compile the source to ptx. std::ostringstream cmdline; cmdline << "nvcc -ptx -O3" << " -arch=sm_" << std::get<0>(cc) << std::get<1>(cc) << " " << compile_options << " -o " << ptxfile << " " << cufile; if (0 != system(cmdline.str().c_str()) ) { #ifndef VEXCL_SHOW_KERNELS std::cerr << source << std::endl; #endif vex::detail::print_backtrace(); throw std::runtime_error("nvcc invocation failed"); } } // Load the compiled ptx. CUmodule prg; cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) ); return program(queue.context(), prg); }
/// Create command queue on the same context and device as the given one. inline command_queue duplicate_queue(const command_queue &q) { return command_queue(q.context(), q.device(), q.flags()); }
/// Returns context for the given queue. inline context get_context(const command_queue &q) { return q.context(); }
/// Returns raw context id for the given queue. inline context_id get_context_id(const command_queue &q) { return q.context().raw(); }
/// Binds the specified CUDA context to the calling CPU thread. inline void select_context(const command_queue &q) { q.context().set_current(); }