// Creates a cuDNN pooling descriptor in *pool and configures it for 2-D
// pooling.
//
//   pool               receives the newly created descriptor
//   mode               pooling mode handed to cudnnSetPooling2dDescriptor
//   h, w               forwarded as the two window-size arguments
//   pad_h, pad_w       forwarded as the two padding arguments
//   stride_h, stride_w forwarded as the two stride arguments
//
// Both cuDNN calls go through CUDNN_CHECK.
inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool,
                              cudnnPoolingMode_t mode,
                              int h, int w,
                              int pad_h, int pad_w,
                              int stride_h, int stride_w) {
  // Step 1: allocate the descriptor object.
  CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool));

  // Step 2: fill it in with the requested geometry.
  cudnnPoolingDescriptor_t& created = *pool;
  CUDNN_CHECK(cudnnSetPooling2dDescriptor(
      created, mode,
      h, w,
      pad_h, pad_w,
      stride_h, stride_w));
}
// Creates a cuDNN tensor descriptor in *desc and initialises it as a 4-D
// tensor with explicit strides.
//
//   desc    receives the newly created descriptor
//   size    supplies the extents through num()/channels()/height()/width()
//   stride  supplies the strides through nstride()/cstride()/hstride()/
//           wstride()
//
// The element type is dataType<Dtype>::type. Both cuDNN calls go through
// CUDNN_CHECK.
inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc, Size size,
                               Stride stride) {
  CUDNN_CHECK(cudnnCreateTensorDescriptor(desc));

  // Extents, in the order cudnnSetTensor4dDescriptorEx takes them.
  const int n = size.num();
  const int c = size.channels();
  const int h = size.height();
  const int w = size.width();

  // Matching strides.
  const int n_stride = stride.nstride();
  const int c_stride = stride.cstride();
  const int h_stride = stride.hstride();
  const int w_stride = stride.wstride();

  CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(
      *desc, dataType<Dtype>::type,
      n, c, h, w,
      n_stride, c_stride, h_stride, w_stride));
}
// Constructs a convolution helper with fixed padding and stride.
//
// The padding/stride arguments are stored as given. Every cached shape field
// (n_imgs, n_channels, n_filters, img_h, img_w, filter_h, filter_w) and
// workspace_size start at 0. The four cuDNN descriptors are only created
// here; this constructor does not configure them.
ConvBC01CuDNN<T>::ConvBC01CuDNN(int pad_y, int pad_x, int stride_y,
                                int stride_x)
    : pad_y(pad_y),
      pad_x(pad_x),
      stride_y(stride_y),
      stride_x(stride_x),
      n_imgs(0),
      n_channels(0),
      n_filters(0),
      img_h(0),
      img_w(0),
      filter_h(0),
      filter_w(0),
      workspace_size(0) {
  // Tensor descriptors: input images and convolution output.
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&imgs_desc));
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&convout_desc));

  // Filter bank and convolution operation descriptors.
  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filters_desc));
  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
}
inline void createFilterDesc(cudnnFilterDescriptor_t* desc, int n, int c, int h, int w) { CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); #if CUDNN_VERSION_MIN(5, 0, 0) CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type, CUDNN_TENSOR_NCHW, n, c, h, w)); #else CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type, CUDNN_TENSOR_NCHW, n, c, h, w)); #endif }
inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, int pad_h, int pad_w, int stride_h, int stride_w) { #if CUDNN_VERSION_MIN(6, 0, 0) CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION, dataType<Dtype>::type)); #else CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); #endif }
// Forward convolution of `imgs` with `filters`, writing the result to
// `convout`.
//
//   imgs, filters, convout   data pointers passed straight to
//                            cudnnConvolutionForward
//   n_imgs, n_channels,
//   img_h, img_w             image batch shape
//   n_filters, filter_h,
//   filter_w                 filter bank shape (shares n_channels)
//
// Descriptors are cached between calls: each one is only reconfigured when
// the shape it depends on differs from the shape seen on the previous call.
// Whenever either changes, the convolution descriptor, output descriptor,
// forward algorithm and workspace size are recomputed as well.
//
// NOTE(review): every descriptor is set up with CUDNN_DATA_FLOAT regardless
// of the template type T.
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs,
                             int n_channels, int n_filters, int img_h,
                             int img_w, int filter_h, int filter_w,
                             T *convout) {
  // Evaluate BOTH staleness tests before touching any cached value.
  // n_channels is shared by the image and the filter descriptor; if the image
  // branch cached the new channel count first, the filter test below would
  // compare the new value against itself and a channel-only change would
  // leave filters_desc configured for the old channel count.
  const bool imgs_changed = n_imgs != this->n_imgs
                            || n_channels != this->n_channels
                            || img_h != this->img_h
                            || img_w != this->img_w;
  const bool filters_changed = n_filters != this->n_filters
                               || n_channels != this->n_channels
                               || filter_h != this->filter_h
                               || filter_w != this->filter_w;

  if (imgs_changed) {
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        n_imgs, n_channels, img_h, img_w
    ));
    this->n_imgs = n_imgs;
    this->img_h = img_h;
    this->img_w = img_w;
  }

  if (filters_changed) {
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filters_desc, CUDNN_DATA_FLOAT,
        n_filters, n_channels, filter_h, filter_w
    ));
    this->n_filters = n_filters;
    this->filter_h = filter_h;
    this->filter_w = filter_w;
  }

  if (imgs_changed || filters_changed) {
    // Only cache the channel count once every descriptor that depends on it
    // has been updated.
    this->n_channels = n_channels;

    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION
    ));

    // Ask cuDNN for the output shape and describe the output tensor with it.
    int n, c, h, w;
    CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w
    ));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w
    ));

    // Pick a forward algorithm under WORKSPACE_LIMIT and record how much
    // scratch memory it needs.
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, WORKSPACE_LIMIT,
        &fwd_algo
    ));
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        fwd_algo, &workspace_size
    ));
  }

  // A workspace is only requested when the chosen algorithm needs one.
  void *workspace = NULL;
  if (workspace_size > 0) {
    workspace = CUDA::buffer(workspace_size);
  }

  CUDNN_CHECK(cudnnConvolutionForward(
      CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters,
      conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero,
      convout_desc, convout
  ));
}
// Backward pass of the convolution.
//
//   imgs, filters   forward-pass inputs
//   convout_d       gradient with respect to the convolution output
//   imgs_d          destination for the input gradient, or null to skip it
//   filters_d       destination for the filter gradient, or null to skip it
//
// Uses the member descriptors (imgs_desc, filters_desc, convout_desc,
// conv_desc) as they currently are; this function does not configure them.
void ConvBC01CuDNN<T>::bprop(const T* imgs, const T* filters,
                             const T *convout_d, T *imgs_d, T *filters_d) {
  // Filter gradient (optional).
  if (filters_d) {
    CUDNN_CHECK(cudnnConvolutionBackwardFilter(
        CUDNN::handle(),
        &CUDNN::one,
        imgs_desc, imgs,
        convout_desc, convout_d,
        conv_desc,
        &CUDNN::zero,
        filters_desc, filters_d
    ));
  }

  // Input gradient (optional).
  if (!imgs_d) {
    return;
  }
  CUDNN_CHECK(cudnnConvolutionBackwardData(
      CUDNN::handle(),
      &CUDNN::one,
      filters_desc, filters,
      convout_desc, convout_d,
      conv_desc,
      &CUDNN::zero,
      imgs_desc, imgs_d
  ));
}
// One-time setup of the cuDNN convolution layer.
//
// After running the base-class setup this:
//   * allocates group_ * CUDNN_STREAMS_PER_GROUP CUDA streams and cuDNN
//     handles and binds each handle to its stream,
//   * resets the workspace pointer and size,
//   * computes the per-group weight and bias offsets,
//   * creates the filter descriptor for one group,
//   * creates one bottom/top tensor descriptor and one convolution descriptor
//     per bottom blob,
//   * creates the bias descriptor when bias_term_ is set,
//   * sets handles_setup_ to true.
//
// `top` is only forwarded to the base-class setup.
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);

  // Initialize CUDA streams and cuDNN.
  const int num_streams = this->group_ * CUDNN_STREAMS_PER_GROUP;
  stream_ = new cudaStream_t[num_streams];
  handle_ = new cudnnHandle_t[num_streams];

  // No workspace has been allocated yet.
  workspaceSizeInBytes = 0;
  workspace = NULL;

  for (int g = 0; g < num_streams; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (size_t i = 0; i < bottom.size(); ++i) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);

    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);

    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
// Backward pass of the pooling operation.
//
//   imgs       forward-pass input
//   poolout    forward-pass output
//   poolout_d  gradient with respect to the pooling output
//   imgs_d     destination for the gradient with respect to the input
//
// Uses the member descriptors pool_desc, imgs_desc and poolout_desc as they
// currently are; this function does not configure them.
void PoolBC01CuDNN<T>::bprop(const T *imgs, const T* poolout,
                             const T *poolout_d, T *imgs_d) {
  CUDNN_CHECK(cudnnPoolingBackward(
      CUDNN::handle(),
      pool_desc,
      &CUDNN::one,
      poolout_desc, poolout,      // forward output
      poolout_desc, poolout_d,    // gradient of the output
      imgs_desc, imgs,            // forward input
      &CUDNN::zero,
      imgs_desc, imgs_d           // gradient of the input (written)
  ));
}
// Configures an existing N-d tensor descriptor from a shape/stride pair.
//
//   desc        descriptor to configure (must already exist)
//   total_dims  number of entries in `shape` and `stride`
//   shape       extent of each axis
//   stride      stride of each axis
//
// The descriptor is always given at least 4 dimensions. When total_dims < 4,
// leading axes of extent 1 are prepended; each prepended axis gets the stride
// shape[next] * stride[next] of the axis that follows it. Values are
// converted from int_tp to int before being handed to cuDNN.
//
// When total_dims <= 0 there is no real axis to derive a stride from, so the
// innermost padded axis uses stride 1 (previously this case read one element
// past the end of the temporary vectors).
inline void setTensorNdDesc(cudnnTensorDescriptor_t* desc,
                            const int_tp total_dims, const int_tp* shape,
                            const int_tp* stride) {
  // Pad to at least 4 dimensions
  int_tp cudnn_dims = std::max(total_dims, (int_tp)4);
  int_tp padding = std::max((int_tp)0, cudnn_dims - total_dims);

  std::vector<int> shape_int(cudnn_dims);
  std::vector<int> stride_int(cudnn_dims);

  // Walk from the innermost axis outwards so that a padded axis can derive
  // its stride from the already-filled axis to its right.
  for (int_tp i = cudnn_dims - 1; i >= 0; --i) {
    if (i < padding) {
      shape_int[i] = 1;
      // The last axis has no right-hand neighbour; fall back to unit stride
      // instead of indexing out of bounds.
      stride_int[i] = (i + 1 < cudnn_dims)
          ? shape_int[i + 1] * stride_int[i + 1]
          : 1;
    } else {
      shape_int[i] = shape[i - padding];
      stride_int[i] = stride[i - padding];
    }
  }

  const int* shape_ptr = &shape_int[0];
  const int* stride_ptr = &stride_int[0];

  CUDNN_CHECK(
      cudnnSetTensorNdDescriptor(*desc, dataType<Dtype>::type, cudnn_dims,
                                 shape_ptr, stride_ptr));
}
void Activation::compute_gpu(const vector<bool>& add) { DTYPE alpha = 1.; DTYPE beta = add[0] ? 1. : 0.; CUDNN_CHECK(cudnnActivationForward(cudnn_handle(), activation_mode_, &alpha, bottom_desc_, inputs_[0]->gpu_data(), &beta, top_desc_, outputs_[0]->mutable_gpu_data())); }
// One-time setup of the N-d cuDNN pooling layer.
//
// After running the base-class setup this copies the kernel shape, stride
// and padding of each spatial axis into local mutable int buffers, creates
// the cuDNN handle and the bottom/top tensor descriptors, creates the N-d
// pooling descriptor (which also fills in mode_), and sets handles_setup_.
//
// The per-axis buffers are std vectors rather than variable-length arrays,
// which are not part of standard C++.
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);

  // kernel shape
  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
  // stride
  const int* stride_data = this->stride_.cpu_data();
  // padding
  const int* pad_data = this->pad_.cpu_data();

  // Mutable copies, one entry per spatial axis.
  vector<int> kernel_shape(kernel_shape_data,
                           kernel_shape_data + this->num_spatial_axes_);
  vector<int> stride(stride_data, stride_data + this->num_spatial_axes_);
  vector<int> pad(pad_data, pad_data + this->num_spatial_axes_);

  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createPoolingNdDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->num_spatial_axes_,
      kernel_shape.data(), pad.data(), stride.data());
  handles_setup_ = true;
}
void Softmax::compute_gpu(const vector<bool>& add) { DTYPE alpha = 1.; DTYPE beta = add[0] ? 1. : 0.; CUDNN_CHECK(cudnnSoftmaxForward(cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, softmax_mode_, &alpha, bottom_desc_, inputs_[0]->gpu_data(), &beta, top_desc_, outputs_[0]->mutable_gpu_data())); }
// Binds this context to `device_id` and lazily creates the backend handles
// that were compiled in.
//
// The device id is stored and SwitchDevice() is called first. Then, for each
// enabled backend, the handle is created only if it is still null:
//   USE_CUDA    cuBLAS handle    -> blas_handle_
//   USE_CUDNN   cuDNN handle     -> cudnn_handle_
//   USE_NNPACK  nnp_initialize() must succeed, then a thread pool is created
//               with pthreadpool_create(0) -> nnpack_handle_
// Each freshly created handle is verified with CHECK_NOTNULL.
void Context::Init(int device_id) {
  device_id_ = device_id;
  SwitchDevice();

#ifdef USE_CUDA
  // cuBLAS
  if (blas_handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate((cublasHandle_t*)&blas_handle_));
    CHECK_NOTNULL(blas_handle_);
  }
#endif  // USE_CUDA

#ifdef USE_CUDNN
  // cuDNN
  if (cudnn_handle_ == nullptr) {
    CUDNN_CHECK(cudnnCreate((cudnnHandle_t*)&cudnn_handle_));
    CHECK_NOTNULL(cudnn_handle_);
  }
#endif  // USE_CUDNN

#ifdef USE_NNPACK
  // NNPACK
  if (nnpack_handle_ == nullptr) {
    CHECK_EQ(nnp_initialize(), nnp_status_success);
    nnpack_handle_ = pthreadpool_create(0);
    CHECK_NOTNULL(nnpack_handle_);
  }
#endif  // USE_NNPACK
}
// One-time setup of the cuDNN LRN layer.
//
// After running the base-class setup this creates the cuDNN handle, the LRN
// descriptor and the bottom/top tensor descriptors, sets handles_setup_, and
// caches local_size / alpha / beta / k from the layer's lrn_param() into
// size_ / alpha_ / beta_ / k_.
void CuDNNLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  LRNLayer<Dtype>::LayerSetUp(bottom, top);

  // cuDNN objects: library handle, LRN descriptor, data descriptors.
  CUDNN_CHECK(cudnnCreate(&handle_));
  CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  handles_setup_ = true;

  // Normalisation hyper-parameters taken from the layer configuration.
  size_  = this->layer_param().lrn_param().local_size();
  alpha_ = this->layer_param().lrn_param().alpha();
  beta_  = this->layer_param().lrn_param().beta();
  k_     = this->layer_param().lrn_param().k();
}
// Translates a PoolingParameter pool method into a cuDNN pooling mode,
// stores it in *mode, then creates the pooling descriptor in *conv and
// configures it with the window size (h, w) and stride (stride_h, stride_w).
//
//   MAX -> CUDNN_POOLING_MAX
//   AVE -> CUDNN_POOLING_AVERAGE
// Any other method is reported through LOG(FATAL).
inline void createPoolingDesc(cudnnPoolingDescriptor_t* conv,
                              PoolingParameter_PoolMethod poolmethod,
                              cudnnPoolingMode_t* mode,
                              int h, int w, int stride_h, int stride_w) {
  if (poolmethod == PoolingParameter_PoolMethod_MAX) {
    *mode = CUDNN_POOLING_MAX;
  } else if (poolmethod == PoolingParameter_PoolMethod_AVE) {
    *mode = CUDNN_POOLING_AVERAGE;
  } else {
    LOG(FATAL) << "Unknown pooling method.";
  }

  CUDNN_CHECK(cudnnCreatePoolingDescriptor(conv));
  CUDNN_CHECK(cudnnSetPoolingDescriptor(*conv, *mode,
                                        h, w,
                                        stride_h, stride_w));
}
// Translates a PoolingParameter pool method into a cuDNN pooling mode,
// stores it in *mode, then creates the pooling descriptor in *pool_desc and
// configures it as 2-D pooling with window (h, w), padding (pad_h, pad_w)
// and stride (stride_h, stride_w).
//
//   MAX -> CUDNN_POOLING_MAX
//   AVE -> CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
// Any other method is reported through LOG(FATAL).
inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
                              PoolingParameter_PoolMethod poolmethod,
                              cudnnPoolingMode_t* mode,
                              int h, int w,
                              int pad_h, int pad_w,
                              int stride_h, int stride_w) {
  if (poolmethod == PoolingParameter_PoolMethod_MAX) {
    *mode = CUDNN_POOLING_MAX;
  } else if (poolmethod == PoolingParameter_PoolMethod_AVE) {
    *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
  } else {
    LOG(FATAL) << "Unknown pooling method.";
  }

  CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
  CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode,
                                          h, w,
                                          pad_h, pad_w,
                                          stride_h, stride_w));
}
// Forward pooling of `imgs` into `poolout`.
//
//   imgs        input data
//   imgs_shape  shape of the input; n_img_dims + 2 entries are read
//   poolout     output buffer
//
// The descriptors are cached: they are only rebuilt when `imgs_shape`
// differs from the shape remembered from the previous call. On a rebuild the
// output extent of spatial axis i is
//   (imgs_shape[i+2] + 2*padding[i] - win_shape[i]) / strides[i] + 1
// while the first two axes are copied unchanged from the input.
//
// NOTE(review): tensor descriptors are set with CUDNN_DATA_FLOAT regardless
// of the template type T.
void PoolBC01CuDNN<T>::fprop(const T *imgs, int *imgs_shape, T *poolout) {
  int n_imgs_dims = n_img_dims + 2;

  // Find the first axis whose extent differs from the cached shape.
  int first_diff = 0;
  while (first_diff < n_imgs_dims
         && this->imgs_shape[first_diff] == imgs_shape[first_diff]) {
    ++first_diff;
  }

  if (first_diff < n_imgs_dims) {
    // Remember the new shape.
    for (int d = 0; d < n_imgs_dims; ++d) {
      this->imgs_shape[d] = imgs_shape[d];
    }

    // Input tensor descriptor.
    int imgs_strides[n_imgs_dims];
    array_strides(n_imgs_dims, imgs_shape, imgs_strides);
    CUDNN_CHECK(cudnnSetTensorNdDescriptor(
        imgs_desc, CUDNN_DATA_FLOAT, n_imgs_dims, imgs_shape, imgs_strides
    ));

    // Pooling descriptor.
    CUDNN_CHECK(cudnnSetPoolingNdDescriptor(
        pool_desc, pool_mode, n_img_dims, win_shape, padding, strides
    ));

    // Output tensor descriptor: leading two axes carried over, spatial axes
    // shrunk by the pooling window.
    int poolout_shape[n_imgs_dims];
    poolout_shape[0] = imgs_shape[0];
    poolout_shape[1] = imgs_shape[1];
    for (int d = 0; d < n_img_dims; ++d) {
      poolout_shape[d + 2] =
          (imgs_shape[d + 2] + 2*padding[d] - win_shape[d]) / strides[d] + 1;
    }
    int poolout_strides[n_imgs_dims];
    array_strides(n_imgs_dims, poolout_shape, poolout_strides);
    CUDNN_CHECK(cudnnSetTensorNdDescriptor(
        poolout_desc, CUDNN_DATA_FLOAT, n_imgs_dims, poolout_shape,
        poolout_strides
    ));
  }

  CUDNN_CHECK(cudnnPoolingForward(
      CUDNN::handle(),
      pool_desc,
      &CUDNN::one,
      imgs_desc, imgs,
      &CUDNN::zero,
      poolout_desc, poolout
  ));
}
// One-time setup of the cuDNN softmax layer: runs the base-class setup,
// creates the cuDNN handle and the bottom/top 4-D tensor descriptors, then
// sets handles_setup_.
void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);

  // cuDNN library handle.
  CUDNN_CHECK(cudnnCreate(&handle_));

  // Descriptors for the input and output blobs (created, not yet shaped).
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);

  handles_setup_ = true;
}
// One-time setup of the cuDNN ReLU layer: runs the base-class setup, creates
// the cuDNN handle and the bottom/top N-d tensor descriptors, then sets
// handles_setup_.
void CuDNNReLULayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  ReLULayer<Dtype>::LayerSetUp(bottom, top);

  // cuDNN library handle.
  CUDNN_CHECK(cudnnCreate(&handle_));

  // Descriptors for the input and output blobs.
  cudnn::createTensorNdDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorNdDesc<Dtype>(&top_desc_);

  handles_setup_ = true;
}
// Re-shapes the cuDNN descriptors after the blobs changed size.
//
// Runs the base-class Reshape, then sets both the bottom and the top tensor
// descriptor to (bottom[0]->num(), channels_, height_, width_) and
// re-applies size_ / alpha_ / beta_ / k_ to the LRN descriptor.
void CuDNNLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  LRNLayer<Dtype>::Reshape(bottom, top);

  // Input and output share the same shape.
  cudnn::setTensor4dDesc<Dtype>(
      &bottom_desc_,
      bottom[0]->num(), this->channels_, this->height_, this->width_);
  cudnn::setTensor4dDesc<Dtype>(
      &top_desc_,
      bottom[0]->num(), this->channels_, this->height_, this->width_);

  // Normalisation parameters.
  CUDNN_CHECK(cudnnSetLRNDescriptor(norm_desc_, size_, alpha_, beta_, k_));
}
// One-time setup of the cuDNN TanH layer: runs the base-class setup, creates
// the cuDNN handle, the bottom/top tensor descriptors and an activation
// descriptor for CUDNN_ACTIVATION_TANH, then sets handles_setup_.
void CuDNNTanHLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);

  // cuDNN library handle.
  CUDNN_CHECK(cudnnCreate(&handle_));

  // Descriptors for the input and output blobs.
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);

  // Activation descriptor selecting tanh.
  cudnn::createActivationDescriptor<Dtype>(&activ_desc_,
                                           CUDNN_ACTIVATION_TANH);

  handles_setup_ = true;
}
// One-time setup of the cuDNN ReLU layer.
//
// Runs the base-class setup, creates the cuDNN handle and the bottom/top
// 4-D tensor descriptors, then creates the activation descriptor and
// configures it as CUDNN_ACTIVATION_RELU with CUDNN_PROPAGATE_NAN and a
// coefficient of 0.0.
//
// Every cuDNN call is wrapped in CUDNN_CHECK so a failed descriptor
// creation or configuration is reported here, not at the first use of the
// descriptor. handles_setup_ is only set once all of the cuDNN objects
// exist.
void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  ReLULayer<Dtype>::LayerSetUp(bottom, top);

  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);

  // Activation descriptor; status codes must not be ignored.
  CUDNN_CHECK(cudnnCreateActivationDescriptor(&activation_desc_));
  CUDNN_CHECK(cudnnSetActivationDescriptor(activation_desc_,
      CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));

  handles_setup_ = true;
}
// Creates a cuDNN filter descriptor in *desc and initialises it as an N-d
// filter.
//
//   desc              receives the newly created descriptor
//   num_spatial_dims  number of entries read from `shape`
//   n, c              first two filter dimensions
//   shape             extents of the spatial dimensions
//
// The dimension array passed to cuDNN is {n, c, shape[0], ...,
// shape[num_spatial_dims - 1]}, converted to int, with element type
// dataType<Dtype>::type.
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
                             const int_tp num_spatial_dims,
                             const int_tp n, const int_tp c,
                             const int_tp* shape) {
  // Assemble the full dimension list: two leading dims plus spatial dims.
  std::vector<int> dims;
  dims.reserve(num_spatial_dims + 2);
  dims.push_back(n);
  dims.push_back(c);
  for (int_tp axis = 0; axis < num_spatial_dims; ++axis) {
    dims.push_back(shape[axis]);
  }
  const int* dims_ptr = &dims[0];

  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilterNdDescriptor(*desc, dataType<Dtype>::type,
                                         num_spatial_dims + 2, dims_ptr));
}
// Constructs a pooling helper for images with `n_img_dims` spatial
// dimensions.
//
//   n_img_dims  number of spatial dimensions; also the number of entries
//               read from win_shape, padding and strides
//   win_shape   pooling window extent per spatial dimension
//   padding     padding per spatial dimension
//   strides     stride per spatial dimension
//   pool_mode   POOL_MAX selects CUDNN_POOLING_MAX; anything else selects
//               CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
//
// Throws std::runtime_error when n_img_dims exceeds MAX_IMG_DIMS.
//
// The cached input shape is filled with -1 for its n_img_dims + 2 entries.
// The tensor and pooling descriptors are created but not configured here.
PoolBC01CuDNN<T>::PoolBC01CuDNN(int n_img_dims, int *win_shape, int *padding,
    int *strides, PoolMode pool_mode) : n_img_dims(n_img_dims) {
  // The limit applies to the number of *spatial* dimensions. The "+ 2" only
  // belongs to the full tensor shape (imgs_shape, filled below), not to this
  // bound: comparing against MAX_IMG_DIMS + 2 let through values that the
  // error message itself calls unsupported and that the loops below would
  // then use as array extents.
  // NOTE(review): assumes win_shape/padding/strides members hold
  // MAX_IMG_DIMS entries and imgs_shape holds MAX_IMG_DIMS + 2 — confirm
  // against the class declaration.
  if (n_img_dims > MAX_IMG_DIMS) {
    throw std::runtime_error("More than 3 image dimensions.");
  }

  for (int i = 0; i < n_img_dims; ++i) {
    this->win_shape[i] = win_shape[i];
    this->padding[i] = padding[i];
    this->strides[i] = strides[i];
  }

  // -1 marks "no shape seen yet".
  for (int i = 0; i < n_img_dims + 2; ++i) {
    imgs_shape[i] = -1;
  }

  this->pool_mode = pool_mode == POOL_MAX ? CUDNN_POOLING_MAX :
      CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;

  CUDNN_CHECK(cudnnCreateTensorDescriptor(&imgs_desc));
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&poolout_desc));
  CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc));
}
// One-time setup of the N-d cuDNN pooling layer: runs the base-class setup,
// creates the cuDNN handle and the bottom/top tensor descriptors, creates
// the N-d pooling descriptor from the configured pool method, kernel shape,
// padding and stride (which also fills in mode_), then sets handles_setup_.
void CuDNNPoolingLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);

  // cuDNN library handle.
  CUDNN_CHECK(cudnnCreate(&handle_));

  // Descriptors for the input and output blobs.
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);

  // Pooling descriptor.
  cudnn::createNdPoolingDesc<Dtype>(
      &pooling_desc_,
      this->layer_param_.pooling_param().pool(),
      &mode_,
      this->kernel_shape_,
      this->pad_,
      this->stride_);

  handles_setup_ = true;
}
// One-time setup of the cuDNN TanH layer: runs the base-class setup, creates
// the cuDNN handle, then creates the bottom and top 4-D tensor descriptors,
// both shaped like bottom[0] (num, channels, height, width).
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);

  // cuDNN library handle.
  CUDNN_CHECK(cudnnCreate(&handle_));

  // Shape of the first input blob; the output uses the same shape.
  const int num      = bottom[0]->num();
  const int channels = bottom[0]->channels();
  const int height   = bottom[0]->height();
  const int width    = bottom[0]->width();

  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_,
                                   num, channels, height, width);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_,
                                   num, channels, height, width);
}
// Backward pass of the convolution, using the preselected backward
// algorithms and a shared workspace.
//
//   imgs, filters   forward-pass inputs
//   convout_d       gradient with respect to the convolution output
//   imgs_d          destination for the input gradient, or null to skip it
//   filters_d       destination for the filter gradient, or null to skip it
//
// A workspace of workspace_size bytes is obtained from CUDA::buffer when
// workspace_size is positive; otherwise a null workspace is passed. Uses the
// member descriptors and bwd_filters_algo / bwd_imgs_algo as they currently
// are; this function does not configure them.
void ConvBC01CuDNN<T>::bprop(const T* imgs, const T* filters,
                             const T *convout_d, T *imgs_d, T *filters_d) {
  void *workspace = workspace_size > 0
      ? static_cast<void *>(CUDA::buffer(workspace_size))
      : static_cast<void *>(NULL);

  // Filter gradient (optional).
  if (filters_d) {
    CUDNN_CHECK(cudnnConvolutionBackwardFilter(
        CUDNN::handle(),
        &CUDNN::one,
        imgs_desc, imgs,
        convout_desc, convout_d,
        conv_desc,
        bwd_filters_algo,
        workspace, workspace_size,
        &CUDNN::zero,
        filters_desc, filters_d
    ));
  }

  // Input gradient (optional).
  if (!imgs_d) {
    return;
  }
  CUDNN_CHECK(cudnnConvolutionBackwardData(
      CUDNN::handle(),
      &CUDNN::one,
      filters_desc, filters,
      convout_desc, convout_d,
      conv_desc,
      bwd_imgs_algo,
      workspace, workspace_size,
      &CUDNN::zero,
      imgs_desc, imgs_d
  ));
}
// One-time setup of the 2-D cuDNN pooling layer.
//
// Runs the base-class setup, requires pad_h_ and pad_w_ to be 0 (CHECK_EQ),
// creates the cuDNN handle and the bottom/top 4-D tensor descriptors,
// creates the pooling descriptor from the configured pool method, kernel
// size and stride (which also fills in mode_), then sets handles_setup_.
void CuDNNPoolingLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);

  // This code path has no way to pass padding on, so insist on none.
  CHECK_EQ(this->pad_h_, 0);
  CHECK_EQ(this->pad_w_, 0);

  // cuDNN library handle.
  CUDNN_CHECK(cudnnCreate(&handle_));

  // Descriptors for the input and output blobs.
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);

  // Pooling descriptor.
  cudnn::createPoolingDesc<Dtype>(
      &pooling_desc_,
      this->layer_param_.pooling_param().pool(),
      &mode_,
      this->kernel_h_, this->kernel_w_,
      this->stride_h_, this->stride_w_);

  handles_setup_ = true;
}
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { TanHLayer<Dtype>::LayerSetUp(bottom, top); // initialize cuDNN CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc<Dtype>(&bottom_desc_); cudnn::createTensor4dDesc<Dtype>(&top_desc_); #if CUDNN_VERSION_MIN(5, 0, 0) cudnnCreateActivationDescriptor(&activation_desc_); cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN, 0); #endif handles_setup_ = true; }