/// Builds lists of contexts and command queues for every device accepted by the filter.
template <class DevFilter>
std::pair< std::vector<context>, std::vector<command_queue> >
queue_list(DevFilter &&filter, unsigned queue_flags = 0) {
    cuda_check( do_init() );

    std::vector<context>       ctx;
    std::vector<command_queue> queue;

    int ndev;
    cuda_check( cuDeviceGetCount(&ndev) );

    for(int d = 0; d < ndev; ++d) {
        try {
            CUdevice dev;
            cuda_check( cuDeviceGet(&dev, d) );
            if (!filter(dev)) continue;

            context c(dev);
            command_queue q(c, dev, queue_flags);

            ctx.push_back(c);
            queue.push_back(q);
        } catch(const error&) {
            // Skip devices that fail to initialize.
        }
    }

    return std::make_pair(ctx, queue);
}
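// Usage sketch: the filter is any callable taking a CUdevice and returning
// bool, which is all the loop above requires. The trivial filter below
// accepts every device.
//
//     auto chain = queue_list([](CUdevice) { return true; });
//     std::vector<context>       ctx   = chain.first;
//     std::vector<command_queue> queue = chain.second;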
void Mesh::NEListToArray() {
    std::vector< std::set<size_t> >::const_iterator vec_it;
    std::set<size_t>::const_iterator set_it;
    size_t offset = 0;
    size_t index = 0;

    // First pass: count the total number of adjacency entries.
    for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
        offset += vec_it->size();

    NEListArray_size = offset;

    cuda_check(cudaHostAlloc((void **)&NEListIndex_pinned,
        sizeof(size_t) * (NNodes+1), cudaHostAllocPortable));
    cuda_check(cudaHostAlloc((void **)&NEListArray_pinned,
        sizeof(size_t) * NEListArray_size, cudaHostAllocPortable));

    // Second pass: flatten the per-node sets into a CSR-style index/array pair.
    offset = 0;
    for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++) {
        NEListIndex_pinned[index++] = offset;

        for(set_it = vec_it->begin(); set_it != vec_it->end(); set_it++)
            NEListArray_pinned[offset++] = *set_it;
    }

    assert(index == NEList.size());
    NEListIndex_pinned[index] = offset;
}
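// Illustration (not part of the original source): the flattening above is the
// usual CSR layout. For NEList = { {1,2}, {0,2}, {0,1} } the index array comes
// out as [0, 2, 4] with the closing entry NEListIndex[3] = 6, and the flat
// array is [1, 2, 0, 2, 0, 1]. A minimal host-only sketch:

#include <vector>
#include <set>
#include <cassert>

static void flatten_example() {
    std::vector< std::set<size_t> > nel = { {1,2}, {0,2}, {0,1} };
    std::vector<size_t> index, array;

    for(const auto &s : nel) {
        index.push_back(array.size());           // start of this node's entries
        array.insert(array.end(), s.begin(), s.end());
    }
    index.push_back(array.size());               // closing offset, as NEListIndex[NNodes]

    assert(index == (std::vector<size_t>{0, 2, 4, 6}));
    assert(array == (std::vector<size_t>{1, 2, 0, 2, 0, 1}));
}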
/// Returns device compute capability as a tuple of major and minor version numbers.
std::tuple<int, int> compute_capability() const {
    int major, minor;

    cuda_check( cuDeviceGetAttribute(&major,
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, d) );
    cuda_check( cuDeviceGetAttribute(&minor,
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, d) );

    return std::make_tuple(major, minor);
}
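// Usage sketch (assumes a device instance named dev exposing the member
// above): reject devices older than compute capability 3.5.
//
//     int major, minor;
//     std::tie(major, minor) = dev.compute_capability();
//     if (std::make_tuple(major, minor) < std::make_tuple(3, 5))
//         continue; // too old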
void Random<GPU>::uniform(float *data, int size, const float a, const float b) const {
    const int N = size;

    // curandGenerateUniform fills the buffer with samples in (0, 1];
    // rescale only when the requested range differs from that default.
    cuda_check(curandGenerateUniform(dnnctx[did_]->curand_, data, N));

    if (a != 0.f || b != 1.f)
        XPU_KERNEL_LAUNCH(tensor_scale, cuda_get_blocks(N), CUDA_NUM_THREADS, 0,
            dnnctx[did_]->stream_, N, data, a, b);
}
svm_vector(const backend::command_queue &q, size_t n) : n(n), q(q), p(NULL) {
    q.context().set_current();

    CUdeviceptr dptr;
    // Managed (unified) memory is addressable from both host and device.
    cuda_check( cuMemAllocManaged(&dptr, n * sizeof(T), CU_MEM_ATTACH_GLOBAL) );

    p = reinterpret_cast<T*>(static_cast<size_t>(dptr));
}
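// Construction sketch (assumes a valid command_queue q; the accessor for the
// raw pointer is not shown in this excerpt): allocates 1024 floats of managed
// memory. Host access after a kernel launch requires the usual unified-memory
// synchronization on the queue.
//
//     svm_vector<float> x(q, 1024);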
spmat_hyb(
    const command_queue &queue, int n, int m,
    const row_t *row_begin, const col_t *col_begin, const val_t *val_begin
    )
    : handle( cusparse_handle(queue) ),
      desc  ( create_description(), detail::deleter() ),
      mat   ( create_matrix(),      detail::deleter() )
{
    cuda_check( cusparseSetMatType(desc.get(), CUSPARSE_MATRIX_TYPE_GENERAL) );
    cuda_check( cusparseSetMatIndexBase(desc.get(), CUSPARSE_INDEX_BASE_ZERO) );

    fill_matrix(queue, n, m, row_begin, col_begin, val_begin);
}
static CUstream create(const vex::backend::context &ctx, unsigned flags = 0) {
    ctx.set_current();

    CUstream s;
    cuda_check( cuStreamCreate(&s, flags) );

    return s;
}
/// Builds a list of devices accepted by the given filter.
template <class DevFilter>
std::vector<device> device_list(DevFilter&& filter) {
    cuda_check( do_init() );

    std::vector<device> device;

    int ndev;
    cuda_check( cuDeviceGetCount(&ndev) );

    for(int d = 0; d < ndev; ++d) {
        try {
            CUdevice dev;
            cuda_check( cuDeviceGet(&dev, d) );
            if (!filter(dev)) continue;
            device.push_back(dev);
        } catch(const error&) {
            // Skip devices that fail to initialize.
        }
    }

    return device;
}
/// Allocates memory buffer on the device associated with the given queue.
device_vector(const command_queue &q, size_t n) : n(n) {
    if (n) {
        q.context().set_current();

        CUdeviceptr ptr;
        cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

        buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)),
            detail::deleter());
    }
}
/// Returns a cuSPARSE handle for the given queue, creating and caching it on first use.
inline cusparseHandle_t cusparse_handle(const command_queue &q) {
    typedef std::shared_ptr<std::remove_pointer<cusparseHandle_t>::type> smart_handle;
    typedef vex::detail::object_cache<vex::detail::index_by_context, smart_handle> cache_type;

    static cache_type cache;

    auto h = cache.find(q);
    if (h == cache.end()) {
        select_context(q);

        cusparseHandle_t handle;
        cuda_check( cusparseCreate(&handle) );
        cuda_check( cusparseSetStream(handle, q.raw()) );

        h = cache.insert(q, smart_handle(handle, detail::deleter()));
    }

    return h->second.get();
}
/// Copies data from host memory to device.
void write(const command_queue &q, size_t offset, size_t size,
        const T *host, bool blocking = false) const
{
    (void)blocking; // cuMemcpyHtoD is synchronous with respect to the host.

    if (size) {
        q.context().set_current();
        cuda_check( cuMemcpyHtoD(raw() + offset * sizeof(T), host, size * sizeof(T)) );
    }
}
/// Copies data from device to host memory.
void read(const command_queue &q, size_t offset, size_t size,
        T *host, bool blocking = false) const
{
    (void)blocking; // cuMemcpyDtoH is synchronous with respect to the host.

    if (size) {
        q.context().set_current();
        cuda_check( cuMemcpyDtoH(host, raw() + offset * sizeof(T), size * sizeof(T)) );
    }
}
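// Round-trip sketch (assumes a valid command_queue q and the device_vector
// members defined above). Offsets are in elements, not bytes.
//
//     std::vector<float> host(1024, 1.0f);
//     device_vector<float> d(q, host.size());
//     d.write(q, 0, host.size(), host.data());
//     d.read (q, 0, host.size(), host.data());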
/// Constructor. Extracts a backend::kernel instance from backend::program.
kernel(const command_queue &queue, const program &P, const std::string &name,
        std::function<size_t(size_t)> smem
        )
    : ctx(queue.context()), P(P), smem(0)
{
    cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
    config(queue, smem);
}
/// Constructor. Creates a backend::kernel instance from source.
kernel(const command_queue &queue, const std::string &src, const std::string &name,
        std::function<size_t(size_t)> smem, const std::string &options = ""
        )
    : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0)
{
    cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
    config(queue, smem);
}
void Mesh::pin_data() {
    NNListToArray();
    NEListToArray();

    size_t ENList_bytes = sizeof(size_t) * ENList.size();
    size_t coords_bytes = sizeof(float)  * coords.size();
    size_t metric_bytes = sizeof(float)  * metric.size();
    size_t normal_bytes = sizeof(float)  * normals.size();

    // Page-locked (pinned) host buffers enable fast, asynchronous transfers.
    cuda_check(cudaHostAlloc((void **)&ENList_pinned,  ENList_bytes, cudaHostAllocPortable));
    cuda_check(cudaHostAlloc((void **)&coords_pinned,  coords_bytes, cudaHostAllocPortable));
    cuda_check(cudaHostAlloc((void **)&metric_pinned,  metric_bytes, cudaHostAllocPortable));
    cuda_check(cudaHostAlloc((void **)&normals_pinned, normal_bytes, cudaHostAllocPortable));

    memcpy(ENList_pinned,  &ENList[0],  ENList_bytes);
    memcpy(coords_pinned,  &coords[0],  coords_bytes);
    memcpy(metric_pinned,  &metric[0],  metric_bytes);
    memcpy(normals_pinned, &normals[0], normal_bytes);
}
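// Why pin: page-locked buffers let cudaMemcpyAsync overlap transfers with
// kernel execution. A sketch, where stream s and device allocation d_coords
// (of at least coords_bytes) are hypothetical and not part of this excerpt:
//
//     cuda_check(cudaMemcpyAsync(d_coords, coords_pinned, coords_bytes,
//                                cudaMemcpyHostToDevice, s));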
spmat_crs(
    const command_queue &queue, int n, int m,
    const row_t *row_begin, const col_t *col_begin, const val_t *val_begin
    )
    : n(n), m(m), nnz(static_cast<unsigned>(row_begin[n] - row_begin[0])),
      handle( cusparse_handle(queue) ),
      desc  ( create_description(), detail::deleter() ),
      row(queue, n+1, row_begin),
      col(queue, nnz, col_begin + row_begin[0]),
      val(queue, nnz, val_begin + row_begin[0])
{
    // Shift row pointers so that the device copy is zero-based.
    if (row_begin[0] != 0)
        vector<int>(queue, row) -= row_begin[0];

    cuda_check( cusparseSetMatType(desc.get(), CUSPARSE_MATRIX_TYPE_GENERAL) );
    cuda_check( cusparseSetMatIndexBase(desc.get(), CUSPARSE_INDEX_BASE_ZERO) );
}
/// Constructor. Extracts a backend::kernel instance from backend::program.
kernel(const command_queue &queue, const program &P, const std::string &name,
        size_t smem_per_thread = 0
        )
    : ctx(queue.context()), P(P), smem(0)
{
    cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
    config(queue, [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
}
/// y = alpha * A * x, or y += alpha * A * x when append is true (double precision).
void mul(const device_vector<double> &x, device_vector<double> &y,
        double alpha = 1, bool append = false) const
{
    double beta = append ? 1.0 : 0.0;

    cuda_check( cusparseDhybmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                &alpha, desc.get(), mat.get(), x.raw_ptr(), &beta, y.raw_ptr()
                ) );
}
/// y = alpha * A * x, or y += alpha * A * x when append is true (single precision).
void mul(const device_vector<float> &x, device_vector<float> &y,
        float alpha = 1, bool append = false) const
{
    float beta = append ? 1.0f : 0.0f;

    cuda_check( cusparseShybmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                &alpha, desc.get(), mat.get(), x.raw_ptr(), &beta, y.raw_ptr()
                ) );
}
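// Usage sketch (assumes an assembled spmat_hyb A and device vectors x, y of
// matching dimensions):
//
//     A.mul(x, y);             // y  =       A * x
//     A.mul(x, y, 0.5, true);  // y += 0.5 * A * x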
/// Constructor. Creates a backend::kernel instance from source.
kernel(const command_queue &queue, const std::string &src, const std::string &name,
        size_t smem_per_thread = 0, const std::string &options = ""
        )
    : ctx(queue.context()), P(build_sources(queue, src, options)), smem(0)
{
    cuda_check( cuModuleGetFunction(&K, P.raw(), name.c_str()) );
    config(queue, [smem_per_thread](size_t wgs){ return wgs * smem_per_thread; });
}
void fill_matrix(const command_queue &q, int n, int m,
        const row_t *row, const col_t *col, const double *val)
{
    // The number of nonzeros is row[n] - row[0]; using row[n] alone would
    // over-read the host arrays when the row pointers are not zero-based.
    device_vector<int>    r(q, n + 1, row);
    device_vector<int>    c(q, row[n] - row[0], col + row[0]);
    device_vector<double> v(q, row[n] - row[0], val + row[0]);

    // Shift row pointers so that the device copy is zero-based.
    if (row[0] != 0) vector<int>(q, r) -= row[0];

    cuda_check( cusparseDcsr2hyb(handle, n, m, desc.get(),
                v.raw_ptr(), r.raw_ptr(), c.raw_ptr(), mat.get(),
                0, CUSPARSE_HYB_PARTITION_AUTO
                ) );
}
/// Enqueue the kernel to the specified command queue.
void operator()(const command_queue &q) {
    // Gather the addresses of the kernel parameters marshalled into the stack.
    prm_addr.clear();
    for(auto p = prm_pos.begin(); p != prm_pos.end(); ++p)
        prm_addr.push_back(stack.data() + *p);

    cuda_check( cuLaunchKernel(K,
                static_cast<unsigned>(g_size.x),
                static_cast<unsigned>(g_size.y),
                static_cast<unsigned>(g_size.z),
                static_cast<unsigned>(w_size.x),
                static_cast<unsigned>(w_size.y),
                static_cast<unsigned>(w_size.z),
                static_cast<unsigned>(smem),
                q.raw(), prm_addr.data(), 0
                ) );

    reset();
}
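// Launch sketch: build a kernel from source and enqueue it. The push_arg()
// name is an assumption (it is not shown in this excerpt); the constructor
// and operator() are the ones defined above.
//
//     const char *src =
//         "extern \"C\" __global__ void scale(int n, float *x) {\n"
//         "  int i = blockDim.x * blockIdx.x + threadIdx.x;\n"
//         "  if (i < n) x[i] *= 2.0f;\n"
//         "}\n";
//     kernel scale(q, src, "scale");
//     scale.push_arg(n);        // hypothetical parameter-marshalling call
//     scale.push_arg(x.raw());
//     scale(q);                 // enqueues cuLaunchKernel as shown above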
device_vector(const command_queue &q, size_t n, const H *host = 0,
        mem_flags flags = MEM_READ_WRITE) : n(n)
{
    (void)flags;

    if (n) {
        q.context().set_current();

        CUdeviceptr ptr;
        cuda_check( cuMemAlloc(&ptr, n * sizeof(T)) );

        buffer.reset(reinterpret_cast<char*>(static_cast<size_t>(ptr)),
            detail::deleter());

        if (host) {
            if (std::is_same<T, H>::value)
                write(q, 0, n, reinterpret_cast<const T*>(host), true);
            else
                // Convert element-by-element through a temporary host buffer.
                write(q, 0, n, std::vector<T>(host, host + n).data(), true);
        }
    }
}
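// Sketch of the converting path above: double host data is narrowed into a
// float device buffer via the temporary std::vector<T>. Assumes a valid
// command_queue q.
//
//     std::vector<double> h(256, 3.14);
//     device_vector<float> d(q, h.size(), h.data());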
void Random<GPU>::gaussian(float *data, int size, const float mu, const float sigma) const {
    CHECK(sigma > 0.f);
    // Note: curandGenerateNormal requires an even sample count for
    // pseudo-random generators.
    cuda_check(curandGenerateNormal(dnnctx[did_]->curand_, data, size, mu, sigma));
}
void Random<GPU>::set_seed(int seed) {
    cuda_check(curandSetPseudoRandomGeneratorSeed(dnnctx[did_]->curand_, seed));
}
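// Usage sketch (assumes rng is a Random<GPU> instance and d_buf is a device
// pointer with room for n floats, n even as curandGenerateNormal requires):
//
//     rng.set_seed(1234);
//     rng.gaussian(d_buf, n, 0.f, 1.f);   // N(0, 1) samples
//     rng.uniform (d_buf, n, -1.f, 1.f);  // uniform samples rescaled by tensor_scale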
template <typename DT>
void DataBuffer<DT>::page_unlk() {
    cuda_check(cudaHostUnregister( data_.dptr));
    cuda_check(cudaHostUnregister( pred_.dptr));
    cuda_check(cudaHostUnregister(label_.dptr));
}
template <typename DT>
void DataBuffer<DT>::page_lock() {
    cuda_check(cudaHostRegister( data_.dptr,  data_.size_d(), cudaHostRegisterPortable));
    cuda_check(cudaHostRegister( pred_.dptr,  pred_.size_d(), cudaHostRegisterPortable));
    cuda_check(cudaHostRegister(label_.dptr, label_.size_d(), cudaHostRegisterPortable));
}
static void dispose(cusparseHybMat_t handle) {
    cuda_check( cusparseDestroyHybMat(handle) );
}

static void dispose(cusparseMatDescr_t handle) {
    cuda_check( cusparseDestroyMatDescr(handle) );
}

static void dispose(cusparseHandle_t handle) {
    cuda_check( cusparseDestroy(handle) );
}