Beispiel #1
0
/// Creates a context and a command queue for every CUDA device accepted by the filter.
/**
 * \param filter       Callable invoked with each CUdevice; a device is used
 *                     only when the call yields a true value.
 * \param queue_flags  Passed unchanged to the command_queue constructor.
 * \return Pair of vectors; element i of the first vector is the context and
 *         element i of the second vector is the queue created for the same
 *         device.
 *
 * A device for which any step throws `error` is skipped silently, so the
 * result holds only devices that were set up completely.
 */
std::pair< std::vector<context>, std::vector<command_queue> >
queue_list(DevFilter &&filter, unsigned queue_flags = 0)
{
    cuda_check( do_init() );

    std::vector<context>       ctx;
    std::vector<command_queue> queue;

    int ndev;
    cuda_check( cuDeviceGetCount(&ndev) );

    for(int d = 0; d < ndev; ++d) {
        try {
            CUdevice dev;
            cuda_check( cuDeviceGet(&dev, d) );
            if (!filter(dev)) continue;

            // Both objects are built before either is stored, so the two
            // vectors cannot get out of step when queue creation throws.
            context       c(dev);
            command_queue q(c, dev, queue_flags);

            ctx.push_back(c);
            queue.push_back(q);
        } catch(const error&) {
            // Deliberate best effort: an unusable device is simply left out.
        }
    }

    // Hand the vectors over instead of copying every element into the pair.
    return std::make_pair(std::move(ctx), std::move(queue));
}
Beispiel #2
0
// Flattens NEList (one std::set<size_t> per entry) into two pinned host arrays:
//   NEListArray_pinned - all set elements, entry after entry, in set order;
//   NEListIndex_pinned - for entry i, the position in NEListArray_pinned where
//                        its elements start, plus one closing value equal to
//                        the total number of elements.
// NEListArray_size is set to the total number of elements.
// Both arrays are obtained with cudaHostAlloc(cudaHostAllocPortable); this
// function does not release them.
void Mesh::NEListToArray() {
    std::vector< std::set<size_t> >::const_iterator vec_it;
    std::set<size_t>::const_iterator set_it;
    size_t offset = 0;
    size_t index = 0;

    // The index array is sized from NNodes but written once per NEList entry
    // (plus the closing value), so a longer NEList would write past the end
    // of the allocation. Check this before anything is written.
    assert(NEList.size() <= static_cast<size_t>(NNodes));

    // First pass: count all elements to size the flat array.
    for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
      offset += vec_it->size();

    NEListArray_size = offset;
    
    cuda_check(cudaHostAlloc((void **)&NEListIndex_pinned, sizeof(size_t) * (NNodes+1), cudaHostAllocPortable));
    cuda_check(cudaHostAlloc((void **)&NEListArray_pinned, sizeof(size_t) * NEListArray_size, cudaHostAllocPortable));

    offset = 0;

    // Second pass: record each entry's start offset, then copy its elements.
    for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
    {
      NEListIndex_pinned[index++] = offset;

      for(set_it = vec_it->begin(); set_it != vec_it->end(); set_it++)
        NEListArray_pinned[offset++] = *set_it;
    }

    assert(index == NEList.size());
    // Closing value: one past the last element of the last entry.
    NEListIndex_pinned[index] = offset;
}
Beispiel #3
0
        /// Returns device compute capability as a tuple of major and minor version numbers.
        /**
         * Each number is read with cuDeviceGetAttribute() for `d`; the status
         * of either query is passed to cuda_check().
         */
        std::tuple<int, int> compute_capability() const {
            int cc_major;
            cuda_check( cuDeviceGetAttribute(
                        &cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, d) );

            int cc_minor;
            cuda_check( cuDeviceGetAttribute(
                        &cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, d) );

            return std::tuple<int, int>(cc_major, cc_minor);
        }
Beispiel #4
0
// Fills `data` with `size` floats from curandGenerateUniform() and, unless
// (a, b) == (0, 1), post-processes them with the tensor_scale kernel on the
// stream of the current dnnctx entry.
//   data - output buffer handed to cuRAND and to the kernel
//   size - number of floats to generate
//   a, b - forwarded to tensor_scale; presumably the bounds of the target
//          interval - confirm against the tensor_scale kernel
void Random<GPU>::uniform  (float *data, int size, const float  a, const float b) const
{ const int N = size;
  cuda_check (curandGenerateUniform (dnnctx[did_]->curand_, data, N));
  if (a != 0.f || b != 1.f)
  { // Braces keep the entire macro expansion under the condition, even if
    // XPU_KERNEL_LAUNCH expands to more than one statement.
    XPU_KERNEL_LAUNCH (tensor_scale, cuda_get_blocks(N), CUDA_NUM_THREADS, 0, dnnctx[did_]->stream_,
      N, data, a, b);
  }
}
Beispiel #5
0
        /// Allocates managed memory for n elements of type T.
        /**
         * \param q Queue whose context is made current before the allocation.
         * \param n Number of elements; n * sizeof(T) bytes are requested from
         *          cuMemAllocManaged() with CU_MEM_ATTACH_GLOBAL.
         */
        svm_vector(const backend::command_queue &q, size_t n) : n(n), q(q), p(NULL) {
            q.context().set_current();

            const size_t bytes = n * sizeof(T);

            CUdeviceptr managed;
            cuda_check( cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL) );

            // CUdeviceptr is converted to a typed pointer via size_t.
            p = reinterpret_cast<T*>(static_cast<size_t>(managed));
        }
Beispiel #6
0
        spmat_hyb(
                const command_queue &queue,
                int n, int m,
                const row_t *row_begin,
                const col_t *col_begin,
                const val_t *val_begin
                )
            : handle( cusparse_handle(queue) ),
              desc  ( create_description(), detail::deleter() ),
              mat   ( create_matrix(),      detail::deleter() )
        {
            cuda_check( cusparseSetMatType(desc.get(), CUSPARSE_MATRIX_TYPE_GENERAL) );
            cuda_check( cusparseSetMatIndexBase(desc.get(), CUSPARSE_INDEX_BASE_ZERO) );

            fill_matrix(queue, n, m, row_begin, col_begin, val_begin);
        }
Beispiel #7
0
        /// Creates a CUDA stream in the given context.
        /**
         * \param ctx   Context that is made current before the stream is created.
         * \param flags Passed unchanged to cuStreamCreate().
         * \return The newly created stream handle.
         */
        static CUstream create(const vex::backend::context &ctx, unsigned flags = 0) {
            ctx.set_current();

            CUstream stream;
            cuda_check( cuStreamCreate(&stream, flags) );
            return stream;
        }
Beispiel #8
0
/// Returns every CUDA device for which the filter yields a true value.
/**
 * \param filter Callable invoked with each CUdevice.
 *
 * A device for which the lookup or the filter throws `error` is skipped
 * silently.
 */
std::vector<device> device_list(DevFilter&& filter) {
    cuda_check( do_init() );

    int count;
    cuda_check( cuDeviceGetCount(&count) );

    std::vector<device> accepted;

    for(int ordinal = 0; ordinal < count; ++ordinal) {
        try {
            CUdevice handle;
            cuda_check( cuDeviceGet(&handle, ordinal) );

            if (filter(handle))
                accepted.push_back(handle);
        } catch(const error&) {
            // Best effort: leave out devices that cannot be queried.
        }
    }

    return accepted;
}
Beispiel #9
0
        /// Allocates memory buffer on the device associated with the given queue.
        device_vector(const command_queue &q, size_t n) : n(n) {
            if (n) {
                q.context().set_current();

                CUdeviceptr ptr;
                cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

                buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() );
            }
        }
Beispiel #10
0
/// Returns the cached cuSPARSE handle for the given queue, creating it on first use.
/**
 * Handles are kept in a function-local static object_cache. On a cache miss
 * the queue's context is selected, a new handle is created and bound to the
 * queue's stream with cusparseSetStream(), and the handle is stored in the
 * cache. The cache owns the handle through a shared_ptr whose deleter is
 * detail::deleter.
 *
 * NOTE(review): the cache type is parameterized with index_by_context, so a
 * second queue of the same context presumably gets the handle that is bound
 * to the first queue's stream; confirm that this is intended.
 */
inline cusparseHandle_t cusparse_handle(const command_queue &q) {
    typedef std::shared_ptr<std::remove_pointer<cusparseHandle_t>::type> smart_handle;
    typedef vex::detail::object_cache<vex::detail::index_by_context, smart_handle> cache_type;

    static cache_type cache;

    auto h = cache.find(q);

    if (h == cache.end()) {
        select_context(q);

        cusparseHandle_t handle;
        cuda_check( cusparseCreate(&handle) );

        // Take ownership immediately: if binding the stream fails and
        // cuda_check throws, the handle is destroyed instead of leaked.
        smart_handle owned(handle, detail::deleter());

        cuda_check( cusparseSetStream(owned.get(), q.raw()) );

        h = cache.insert(q, smart_handle(owned));
    }

    return h->second.get();
}
Beispiel #11
0
        /// Copies data from host memory to device.
        /**
         * \param q        Queue whose context is made current for the copy.
         * \param offset   Start position in the device buffer, in elements of T.
         * \param size     Number of elements to copy; zero makes this a no-op.
         * \param host     Source host pointer.
         * \param blocking Ignored; the copy always goes through cuMemcpyHtoD().
         */
        void write(const command_queue &q, size_t offset, size_t size, const T *host,
                bool blocking = false) const
        {
            (void)blocking;

            if (size == 0) return;

            q.context().set_current();

            const size_t bytes = size * sizeof(T);
            cuda_check( cuMemcpyHtoD(raw() + offset * sizeof(T), host, bytes) );
        }
Beispiel #12
0
        /// Copies data from device to host memory.
        /**
         * \param q        Queue whose context is made current for the copy.
         * \param offset   Start position in the device buffer, in elements of T.
         * \param size     Number of elements to copy; zero makes this a no-op.
         * \param host     Destination host pointer.
         * \param blocking Ignored; the copy always goes through cuMemcpyDtoH().
         */
        void read(const command_queue &q, size_t offset, size_t size, T *host,
                bool blocking = false) const
        {
            (void)blocking;

            if (size == 0) return;

            q.context().set_current();

            const size_t bytes = size * sizeof(T);
            cuda_check( cuMemcpyDtoH(host, raw() + offset * sizeof(T), bytes) );
        }
Beispiel #13
0
 /// Constructor. Extracts a backend::kernel instance from backend::program.
 /**
  * \param queue Queue that supplies the context and is passed to config().
  * \param P     Program in which the function is looked up.
  * \param name  Name of the function to fetch with cuModuleGetFunction().
  * \param smem  Callable passed to config(); it shadows the `smem` member,
  *              which itself is initialized to 0.
  */
 kernel(const command_queue &queue,
        const program &P,
        const std::string &name,
        std::function<size_t(size_t)> smem
        )
     : ctx(queue.context()),
       P(P),
       smem(0)
 {
     const char *entry_point = name.c_str();
     cuda_check( cuModuleGetFunction(&K, P.raw(), entry_point) );

     config(queue, smem);
 }
Beispiel #14
0
 /// Constructor. Creates a backend::kernel instance from source.
 /**
  * \param queue   Queue that supplies the context; also passed to
  *                build_sources() and config().
  * \param src     Source text passed to build_sources().
  * \param name    Name of the function to fetch with cuModuleGetFunction().
  * \param smem    Callable passed to config(); it shadows the `smem` member,
  *                which itself is initialized to 0.
  * \param options Passed to build_sources().
  */
 kernel(const command_queue &queue,
        const std::string &src, const std::string &name,
        std::function<size_t(size_t)> smem,
        const std::string &options = ""
        )
     : ctx(queue.context()),
       P(build_sources(queue, src, options)),
       smem(0)
 {
     const char *entry_point = name.c_str();
     cuda_check( cuModuleGetFunction(&K, P.raw(), entry_point) );

     config(queue, smem);
 }
Beispiel #15
0
// Copies the mesh data into pinned host memory.
// First runs NNListToArray() and NEListToArray(), then allocates one buffer
// per container (ENList, coords, metric, normals) with
// cudaHostAlloc(cudaHostAllocPortable) and copies the container contents into
// it. This function does not release the buffers.
void Mesh::pin_data() {
  NNListToArray();
  NEListToArray();

  size_t ENList_bytes = sizeof(size_t) * ENList.size();
  size_t coords_bytes = sizeof(float) * coords.size();
  size_t metric_bytes = sizeof(float) * metric.size();
  size_t normal_bytes = sizeof(float) * normals.size();
  
  cuda_check(cudaHostAlloc((void **)&ENList_pinned, ENList_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&coords_pinned, coords_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&metric_pinned, metric_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&normals_pinned, normal_bytes, cudaHostAllocPortable));
  
  // Taking &v[0] of an empty vector is undefined behaviour, so the copy is
  // skipped entirely when a container holds no data.
  if (ENList_bytes) memcpy(ENList_pinned, &ENList[0], ENList_bytes);
  if (coords_bytes) memcpy(coords_pinned, &coords[0], coords_bytes);
  if (metric_bytes) memcpy(metric_pinned, &metric[0], metric_bytes);
  if (normal_bytes) memcpy(normals_pinned, &normals[0], normal_bytes);
}
Beispiel #16
0
        spmat_crs(
                const command_queue &queue,
                int n, int m,
                const row_t *row_begin,
                const col_t *col_begin,
                const val_t *val_begin
                )
            : n(n), m(m), nnz(static_cast<unsigned>(row_begin[n] - row_begin[0])),
              handle( cusparse_handle(queue) ),
              desc  ( create_description(), detail::deleter() ),
              row(queue, n+1, row_begin),
              col(queue, nnz, col_begin + row_begin[0]),
              val(queue, nnz, val_begin + row_begin[0])
        {
            if (row_begin[0] != 0)
                vector<int>(queue, row) -= row_begin[0];

            cuda_check( cusparseSetMatType(desc.get(), CUSPARSE_MATRIX_TYPE_GENERAL) );
            cuda_check( cusparseSetMatIndexBase(desc.get(), CUSPARSE_INDEX_BASE_ZERO) );
        }
Beispiel #17
0
        /// Constructor. Extracts a backend::kernel instance from backend::program.
        /**
         * \param queue           Queue that supplies the context and is passed to config().
         * \param P               Program in which the function is looked up.
         * \param name            Name of the function to fetch with cuModuleGetFunction().
         * \param smem_per_thread Multiplied by the size argument that config()
         *                        hands to the callable built here.
         */
        kernel(const command_queue &queue,
               const program &P,
               const std::string &name,
               size_t smem_per_thread = 0
               )
            : ctx(queue.context()),
              P(P),
              smem(0)
        {
            const char *entry_point = name.c_str();
            cuda_check( cuModuleGetFunction(&K, P.raw(), entry_point) );

            auto smem_for_size = [smem_per_thread](size_t wgs) {
                return wgs * smem_per_thread;
            };

            config(queue, smem_for_size);
        }
Beispiel #18
0
        /// Double-precision matrix-vector product through cusparseDhybmv().
        /**
         * \param x      Input vector.
         * \param y      Output vector.
         * \param alpha  Scalar passed to cuSPARSE as alpha.
         * \param append Selects the beta scalar passed to cuSPARSE: 1 when
         *               true, 0 when false.
         */
        void mul(const device_vector<double> &x, device_vector<double> &y,
                 double alpha = 1, bool append = false) const
        {
            double beta = 0.0;
            if (append) beta = 1.0;

            cuda_check(
                    cusparseDhybmv(
                        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, desc.get(), mat.get(),
                        x.raw_ptr(),
                        &beta, y.raw_ptr())
                    );
        }
Beispiel #19
0
        /// Single-precision matrix-vector product through cusparseShybmv().
        /**
         * \param x      Input vector.
         * \param y      Output vector.
         * \param alpha  Scalar passed to cuSPARSE as alpha.
         * \param append Selects the beta scalar passed to cuSPARSE: 1 when
         *               true, 0 when false.
         */
        void mul(const device_vector<float> &x, device_vector<float> &y,
                 float alpha = 1, bool append = false) const
        {
            float beta = 0.0f;
            if (append) beta = 1.0f;

            cuda_check(
                    cusparseShybmv(
                        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        &alpha, desc.get(), mat.get(),
                        x.raw_ptr(),
                        &beta, y.raw_ptr())
                    );
        }
Beispiel #20
0
        /// Constructor. Creates a backend::kernel instance from source.
        /**
         * \param queue           Queue that supplies the context; also passed
         *                        to build_sources() and config().
         * \param src             Source text passed to build_sources().
         * \param name            Name of the function to fetch with cuModuleGetFunction().
         * \param smem_per_thread Multiplied by the size argument that config()
         *                        hands to the callable built here.
         * \param options         Passed to build_sources().
         */
        kernel(const command_queue &queue,
               const std::string &src,
               const std::string &name,
               size_t smem_per_thread = 0,
               const std::string &options = ""
               )
            : ctx(queue.context()),
              P(build_sources(queue, src, options)),
              smem(0)
        {
            const char *entry_point = name.c_str();
            cuda_check( cuModuleGetFunction(&K, P.raw(), entry_point) );

            auto smem_for_size = [smem_per_thread](size_t wgs) {
                return wgs * smem_per_thread;
            };

            config(queue, smem_for_size);
        }
Beispiel #21
0
        /// Uploads the given CSR arrays and converts them into `mat` with cusparseDcsr2hyb().
        /**
         * \param q   Queue used for the temporary device buffers.
         * \param n   Passed to cuSPARSE as the first dimension; `row` holds n + 1 entries.
         * \param m   Passed to cuSPARSE as the second dimension.
         * \param row Row offsets. They may start at a nonzero value, in which
         *            case the device copy is shifted to start at zero.
         * \param col Column indices; the entries used start at col[row[0]].
         * \param val Values; the entries used start at val[row[0]].
         *
         * The conversion uses CUSPARSE_HYB_PARTITION_AUTO with a width argument of 0.
         */
        void fill_matrix(const command_queue &q,
                int n, int m, const row_t *row, const col_t *col, const double *val)
        {
            // Only row[n] - row[0] entries follow position row[0] in col/val.
            // Copying row[n] entries from there would read past the end of
            // the host arrays whenever row[0] is not zero.
            const size_t nnz = static_cast<size_t>(row[n] - row[0]);

            device_vector<int>    r(q, n + 1, row);
            device_vector<int>    c(q, nnz,   col + row[0]);
            device_vector<double> v(q, nnz,   val + row[0]);

            // Rebase the row offsets on the device so that they start at zero.
            if (row[0] != 0) vector<int>(q, r) -= row[0];

            cuda_check(
                    cusparseDcsr2hyb(handle, n, m, desc.get(),
                        v.raw_ptr(), r.raw_ptr(), c.raw_ptr(), mat.get(), 0,
                        CUSPARSE_HYB_PARTITION_AUTO
                        )
                    );
        }
Beispiel #22
0
        /// Enqueue the kernel to the specified command queue.
        /**
         * Rebuilds the argument pointer list from `prm_pos` (each entry is an
         * offset into `stack`), launches `K` on the queue's stream with the
         * stored grid size, block size and shared memory size, and finally
         * calls reset().
         */
        void operator()(const command_queue &q) {
            prm_addr.clear();
            for(const auto &pos : prm_pos) {
                prm_addr.push_back(stack.data() + pos);
            }

            const unsigned grid_x = static_cast<unsigned>(g_size.x);
            const unsigned grid_y = static_cast<unsigned>(g_size.y);
            const unsigned grid_z = static_cast<unsigned>(g_size.z);

            const unsigned block_x = static_cast<unsigned>(w_size.x);
            const unsigned block_y = static_cast<unsigned>(w_size.y);
            const unsigned block_z = static_cast<unsigned>(w_size.z);

            cuda_check(
                    cuLaunchKernel(
                        K,
                        grid_x, grid_y, grid_z,
                        block_x, block_y, block_z,
                        static_cast<unsigned>(smem),
                        q.raw(),
                        prm_addr.data(),
                        0
                        )
                    );

            reset();
        }
Beispiel #23
0
        device_vector(const command_queue &q, size_t n,
                const H *host = 0, mem_flags flags = MEM_READ_WRITE)
            : n(n)
        {
            (void)flags;

            if (n) {
                q.context().set_current();

                CUdeviceptr ptr;
                cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

                buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)), detail::deleter() );

                if (host) {
                    if (std::is_same<T, H>::value)
                        write(q, 0, n, reinterpret_cast<const T*>(host), true);
                    else
                        write(q, 0, n, std::vector<T>(host, host + n).data(), true);
                }
            }
        }
Beispiel #24
0
// Fills `data` with `size` floats from curandGenerateNormal(), using mean `mu`
// and standard deviation `sigma`. `sigma` must be positive (enforced by CHECK).
void Random<GPU>::gaussian (float *data, int size, const float mu, const float sigma) const
{
  CHECK (sigma > 0.f);

  cuda_check (curandGenerateNormal (dnnctx[did_]->curand_, data, size, mu, sigma));
}
Beispiel #25
0
// Passes `seed` to curandSetPseudoRandomGeneratorSeed() for the cuRAND
// generator of the current dnnctx entry.
void Random<GPU>::set_seed (int seed)
{
  cuda_check (curandSetPseudoRandomGeneratorSeed (dnnctx[did_]->curand_, seed));
}
Beispiel #26
0
// Calls cudaHostUnregister() on the data_, pred_ and label_ buffers, in that order.
void DataBuffer<DT>::page_unlk ()
{
  cuda_check (cudaHostUnregister (data_.dptr));
  cuda_check (cudaHostUnregister (pred_.dptr));
  cuda_check (cudaHostUnregister (label_.dptr));
}
Beispiel #27
0
// Calls cudaHostRegister() with cudaHostRegisterPortable on the data_, pred_
// and label_ buffers, in that order. Each size comes from the buffer's size_d().
void DataBuffer<DT>::page_lock ()
{
  cuda_check (cudaHostRegister (data_.dptr, data_.size_d(), cudaHostRegisterPortable));
  cuda_check (cudaHostRegister (pred_.dptr, pred_.size_d(), cudaHostRegisterPortable));
  cuda_check (cudaHostRegister (label_.dptr, label_.size_d(), cudaHostRegisterPortable));
}
Beispiel #28
0
 /// Destroys a cuSPARSE HYB matrix object; the resulting status goes through cuda_check().
 static void dispose(cusparseHybMat_t handle)
 {
     cuda_check(
             cusparseDestroyHybMat(handle)
             );
 }
Beispiel #29
0
 /// Destroys a cuSPARSE matrix descriptor; the resulting status goes through cuda_check().
 static void dispose(cusparseMatDescr_t handle)
 {
     cuda_check(
             cusparseDestroyMatDescr(handle)
             );
 }
Beispiel #30
0
 /// Destroys a cuSPARSE library handle; the resulting status goes through cuda_check().
 static void dispose(cusparseHandle_t handle)
 {
     cuda_check(
             cusparseDestroy(handle)
             );
 }