/* cuDNN: set tensor dimensions */
void setTensorDesc(cudnnTensorDescriptor_t& tensorDesc,
                   cudnnTensorFormat_t& tensorFormat,
                   cudnnDataType_t& dataType,
                   int n, int c, int h, int w) {
#if SIMPLE_TENSOR_DESCRIPTOR
    /* cuDNN: set 4D tensor */
    checkCUDNN(cudnnSetTensor4dDescriptor(tensorDesc, tensorFormat, dataType,
                                          n, c, h, w));
#elif defined(ND_TENSOR_DESCRIPTOR)
    const int nDim = 4;
    int dimA[nDim] = {n, c, h, w};
    int strideA[nDim] = {c*h*w, h*w, w, 1};
    checkCUDNN(cudnnSetTensorNdDescriptor(tensorDesc, dataType, nDim, dimA, strideA));
#else
    checkCUDNN(cudnnSetTensor4dDescriptorEx(tensorDesc, dataType, n, c, h, w,
                                            c*h*w, h*w, w, 1));
#endif
}
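All three branches above describe the same fully packed NCHW layout; only the entry point differs. As a quick sanity check (a minimal sketch, not part of the original sample; printTensorDesc is a hypothetical helper and checkCUDNN is assumed to be the usual error-checking macro), the resulting descriptor can be read back with cudnnGetTensorNdDescriptor:

#include <stdio.h>
#include <cudnn.h>

/* Hypothetical helper: dump the dims/strides stored in a tensor descriptor
 * so the three code paths above can be compared directly. */
void printTensorDesc(cudnnTensorDescriptor_t tensorDesc) {
    cudnnDataType_t dataType;
    int nbDims;
    int dimA[4], strideA[4];
    checkCUDNN(cudnnGetTensorNdDescriptor(tensorDesc, 4, &dataType,
                                          &nbDims, dimA, strideA));
    for (int i = 0; i < nbDims; ++i)
        printf("dim[%d]=%d stride[%d]=%d\n", i, dimA[i], i, strideA[i]);
}

For a packed NCHW tensor every branch should report strides {c*h*w, h*w, w, 1}.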
template <typename Dtype>
inline void setTensorNdDesc(cudnnTensorDescriptor_t* desc,
                            const int_tp total_dims,
                            const int_tp* shape, const int_tp* stride) {
  // Pad to at least 4 dimensions: cuDNN rejects smaller Nd descriptors.
  int_tp cudnn_dims = std::max(total_dims, (int_tp)4);
  int_tp padding = std::max((int_tp)0, cudnn_dims - total_dims);
  std::vector<int> shape_int(cudnn_dims);
  std::vector<int> stride_int(cudnn_dims);
  for (int_tp i = cudnn_dims - 1; i >= 0; --i) {
    if (i < padding) {
      // Padded leading dimension: size 1, stride spanning everything to its right.
      shape_int[i] = 1;
      stride_int[i] = shape_int[i + 1] * stride_int[i + 1];
    } else {
      shape_int[i] = shape[i - padding];
      stride_int[i] = stride[i - padding];
    }
  }
  const int* shape_ptr = &shape_int[0];
  const int* stride_ptr = &stride_int[0];
  CUDNN_CHECK(cudnnSetTensorNdDescriptor(*desc, dataType<Dtype>::type,
                                         cudnn_dims, shape_ptr, stride_ptr));
}
CuDnnTensorDescriptor(size_t hiddenSize, size_t miniBatch, size_t numLayers)
    : m_tensorDesc(nullptr)
{
    cudnnDataType_t dataType = CuDnnTensor::GetDataType<ElemType>();
    // Fully packed, column-major style layout: the hidden dimension varies fastest.
    int dimA[3] = { (int)hiddenSize, (int)miniBatch, (int)numLayers };
    int strideA[3] = { 1, dimA[0], dimA[0] * dimA[1] };
    CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensorDesc));
    CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensorDesc, dataType, 3, dimA, strideA));
}
template <typename T>
void PoolBC01CuDNN<T>::fprop(const T *imgs, int *imgs_shape, T *poolout) {
  // Only rebuild the descriptors when the input shape actually changes.
  bool new_shape = false;
  int n_imgs_dims = n_img_dims + 2;
  for (int i = 0; i < n_imgs_dims; ++i) {
    if (this->imgs_shape[i] != imgs_shape[i]) {
      new_shape = true;
      break;
    }
  }
  if (new_shape) {
    for (int i = 0; i < n_imgs_dims; ++i) {
      this->imgs_shape[i] = imgs_shape[i];
    }
    int imgs_strides[n_imgs_dims];
    array_strides(n_imgs_dims, imgs_shape, imgs_strides);
    CUDNN_CHECK(cudnnSetTensorNdDescriptor(
        imgs_desc, CUDNN_DATA_FLOAT, n_imgs_dims, imgs_shape, imgs_strides));
    CUDNN_CHECK(cudnnSetPoolingNdDescriptor(
        pool_desc, pool_mode, n_img_dims, win_shape, padding, strides));
    // Standard pooling output size: (in + 2*pad - window) / stride + 1.
    int poolout_shape[n_imgs_dims];
    poolout_shape[0] = imgs_shape[0];
    poolout_shape[1] = imgs_shape[1];
    for (int i = 0; i < n_img_dims; ++i) {
      poolout_shape[i+2] = (imgs_shape[i+2] + 2*padding[i] - win_shape[i])
                           / strides[i] + 1;
    }
    int poolout_strides[n_imgs_dims];
    array_strides(n_imgs_dims, poolout_shape, poolout_strides);
    CUDNN_CHECK(cudnnSetTensorNdDescriptor(
        poolout_desc, CUDNN_DATA_FLOAT, n_imgs_dims, poolout_shape,
        poolout_strides));
  }
  CUDNN_CHECK(cudnnPoolingForward(
      CUDNN::handle(), pool_desc, &CUDNN::one, imgs_desc, imgs,
      &CUDNN::zero, poolout_desc, poolout));
}
static int c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc,
                                 size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor_for_conv");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  /* PyGpuArray strides are in bytes; cuDNN wants them in elements. */
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i) / ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Tensors can't be smaller than 3d for cuDNN, so we pad the
   * descriptor if they are. */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }

  /* Only relevant for grouped convolution, i.e. when groups > 1. */
  dims[1] = dims[1] / groups;

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
template <class ElemType>
void CuDnnRNNExecutor<ElemType>::SetDescriptors(size_t dim,
                                                const vector<size_t>& numSequencesForFrame,
                                                vector<cudnnTensorDescriptor_t>& descriptors)
{
    for (size_t i = 0; i < numSequencesForFrame.size(); i++)
    {
        if (descriptors.size() <= i)
        {
            descriptors.push_back(cudnnTensorDescriptor_t());
            CUDNN_CALL(cudnnCreateTensorDescriptor(&descriptors[i]));
        }

        // These dimensions are what cuDNN expects: the minibatch dimension, the data
        // dimension, and the number 1 (because each descriptor describes one frame of data).
        int dims[3] = { (int)numSequencesForFrame[i], (int)dim, 1 };
        int strides[3] = { dims[2] * dims[1], dims[2], 1 };
        CUDNN_CALL(cudnnSetTensorNdDescriptor(descriptors[i], m_dataType, 3, dims, strides));
    }
}
static int c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[5], dims[5], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = PyGpuArray_STRIDE(var, i) ?
      PyGpuArray_STRIDE(var, i) / ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
void cudnn_util::set_tensor_descriptor(
    cudnnTensorDescriptor_t tensor_desc,
    const layer_configuration_specific& config,
    unsigned int entry_count)
{
    std::vector<int> tensor_dimensions(config.dimension_sizes.size() + 2);
    tensor_dimensions[0] = entry_count;
    tensor_dimensions[1] = config.feature_map_count;
    for (int i = 0; i < static_cast<int>(config.dimension_sizes.size()); ++i)
        tensor_dimensions[i + 2] = config.dimension_sizes[config.dimension_sizes.size() - 1 - i];

    std::vector<int> tensor_strides(tensor_dimensions.size());
    tensor_strides.back() = 1;
    for (int i = static_cast<int>(tensor_strides.size()) - 2; i >= 0; --i)
        tensor_strides[i] = tensor_strides[i + 1] * tensor_dimensions[i + 1];

    cudnn_safe_call(cudnnSetTensorNdDescriptor(
        tensor_desc,
        CUDNN_DATA_FLOAT,
        static_cast<int>(tensor_dimensions.size()),
        &tensor_dimensions[0],
        &tensor_strides[0]));
}
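Most of the snippets above compute packed strides the same way: the innermost stride is 1 and each stride to the left is the product of everything to its right. As a minimal standalone sketch of that shared pattern (set_packed_nd_descriptor is a hypothetical helper, not taken from any of the projects above; only the cuDNN calls are real API):

#include <cudnn.h>

/* Hypothetical helper: build a fully packed Nd tensor descriptor from a
 * dimension list, using the innermost-stride-of-1 recurrence shown above.
 * Assumes 1 <= nb_dims <= CUDNN_DIM_MAX and a descriptor already created
 * with cudnnCreateTensorDescriptor. */
static cudnnStatus_t set_packed_nd_descriptor(cudnnTensorDescriptor_t desc,
                                              cudnnDataType_t data_type,
                                              const int *dims, int nb_dims) {
    int strides[CUDNN_DIM_MAX];
    strides[nb_dims - 1] = 1;
    for (int i = nb_dims - 2; i >= 0; --i)
        strides[i] = strides[i + 1] * dims[i + 1];
    return cudnnSetTensorNdDescriptor(desc, data_type, nb_dims, dims, strides);
}

For example, dims {n, c, d, h, w} yield strides {c*d*h*w, d*h*w, h*w, w, 1}, which is exactly what cudnn_util::set_tensor_descriptor above computes for a three-dimensional feature map.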
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc, PyGpuArrayObject *w,
                PyGpuArrayObject *x, PyGpuArrayObject *hx,
                PyGpuArrayObject *cx, gpudata **reserve,
                PyGpuArrayObject **y, PyGpuArrayObject **hy,
                PyGpuArrayObject **cy, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;

  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;

  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  /* This is early to match the cuda_exit() in the fail label. */
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep input descriptor: (miniBatch, inputSize, 1), fully packed. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep output descriptor: (miniBatch, hiddenSizeDir, 1), fully packed. */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  /* Every timestep has the same shape, so reuse one descriptor per array. */
  xl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength, xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

fail:
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}