예제 #1
0
        /// Allocates memory buffer on the device associated with the given queue.
        device_vector(const command_queue &q, size_t n) : n(n) {
            if (n) {
                q.context().set_current();

                CUdeviceptr ptr;
                cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

                buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() );
            }
        }
예제 #2
0
        /// Copies data from host memory to device.
        void write(const command_queue &q, size_t offset, size_t size, const T *host,
                bool blocking = false) const
        {
            (void)blocking;

            if (size) {
                q.context().set_current();
                cuda_check( cuMemcpyHtoD(raw() + offset * sizeof(T), host, size * sizeof(T)) );
            }
        }
예제 #3
0
        /// Copies data from device to host memory.
        void read(const command_queue &q, size_t offset, size_t size, T *host,
                bool blocking = false) const
        {
            (void)blocking;

            if (size) {
                q.context().set_current();
                cuda_check( cuMemcpyDtoH(host, raw() + offset * sizeof(T), size * sizeof(T)) );
            }
        }
예제 #4
0
hard_event::hard_event(command_queue &q, cl_command_type command,
                       const ref_vector<event> &deps, action action) :
   event(q.context(), deps, profile(q, action), [](event &ev){}),
   _queue(q), _command(command), _fence(NULL) {
   if (q.profiling_enabled())
      _time_queued = timestamp::current(q);

   q.sequence(*this);
   trigger();
}
예제 #5
0
 /// Constructor. Extracts a backend::kernel instance from backend::program.
 kernel(const command_queue &queue,
        const program &P,
        const std::string &name,
        std::function<size_t(size_t)> smem
        )
     : ctx(queue.context()), P(P), smem(0)
 {
     cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
     config(queue, smem);
 }
예제 #6
0
 /// Constructor. Creates a backend::kernel instance from source.
 kernel(const command_queue &queue,
        const std::string &src, const std::string &name,
        std::function<size_t(size_t)> smem,
        const std::string &options = ""
        )
     : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0)
 {
     cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
     config(queue, smem);
 }
예제 #7
0
        /// Constructor. Extracts a backend::kernel instance from backend::program.
        kernel(const command_queue &queue,
               const program &P,
               const std::string &name,
               size_t smem_per_thread = 0
               )
            : ctx(queue.context()), P(P), smem(0)
        {
            cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );

            config(queue,
                    [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
        }
예제 #8
0
        /// Constructor. Creates a backend::kernel instance from source.
        kernel(const command_queue &queue,
               const std::string &src,
               const std::string &name,
               size_t smem_per_thread = 0,
               const std::string &options = ""
               )
            : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0)
        {
            cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );

            config(queue,
                    [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
        }
예제 #9
0
        device_vector(const command_queue &q, size_t n,
                const H *host = 0, mem_flags flags = MEM_READ_WRITE)
            : n(n)
        {
            (void)flags;

            if (n) {
                q.context().set_current();

                CUdeviceptr ptr;
                cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

                buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() );

                if (host) {
                    if (std::is_same<T, H>::value)
                        write(q, 0, n, reinterpret_cast<const T*>(host), true);
                    else
                        write(q, 0, n, std::vector<T>(host, host + n).data(), true);
                }
            }
        }
예제 #10
0
/// Create and build a program from source string.
inline vex::backend::program build_sources(
        const command_queue &queue, const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    std::string compile_options = options + " " + get_compile_options(queue);

    queue.context().set_current();

    auto cc = queue.device().compute_capability();
    std::ostringstream ccstr;
    ccstr << std::get<0>(cc) << std::get<1>(cc);

    sha1_hasher sha1;
    sha1.process(source)
        .process(queue.device().name())
        .process(compile_options)
        .process(ccstr.str())
        ;

    std::string hash = static_cast<std::string>(sha1);

    // Write source to a .cu file
    std::string basename = program_binaries_path(hash, true) + "kernel";
    std::string ptxfile  = basename + ".ptx";

    if ( !boost::filesystem::exists(ptxfile) ) {
        std::string cufile = basename + ".cu";

        {
            std::ofstream f(cufile);
            f << source;
        }

        // Compile the source to ptx.
        std::ostringstream cmdline;
        cmdline
            << "nvcc -ptx -O3"
            << " -arch=sm_" << std::get<0>(cc) << std::get<1>(cc)
            << " " << compile_options
            << " -o " << ptxfile << " " << cufile;
        if (0 != system(cmdline.str().c_str()) ) {
#ifndef VEXCL_SHOW_KERNELS
            std::cerr << source << std::endl;
#endif

            vex::detail::print_backtrace();
            throw std::runtime_error("nvcc invocation failed");
        }
    }

    // Load the compiled ptx.
    CUmodule prg;
    cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) );

    return program(queue.context(), prg);
}
예제 #11
0
/// Create command queue on the same context and device as the given one.
inline command_queue duplicate_queue(const command_queue &q) {
    return command_queue(q.context(), q.device(), q.flags());
}
예제 #12
0
/// Returns context for the given queue.
inline context get_context(const command_queue &q) {
    return q.context();
}
예제 #13
0
/// Returns raw context id for the given queue.
inline context_id get_context_id(const command_queue &q) {
    return q.context().raw();
}
예제 #14
0
/// Binds the specified CUDA context to the calling CPU thread.
inline void select_context(const command_queue &q) {
    q.context().set_current();
}