/// Forward convolution via cuDNN.
///
/// The cuDNN descriptors are cached between calls: they are only rebuilt when
/// the shapes passed in differ from the shapes stored on this object. Whenever
/// either input descriptor is rebuilt, the convolution descriptor, the output
/// descriptor, the forward algorithm and the workspace size are recomputed.
///
/// @param imgs       input images, described to cuDNN as an NCHW tensor of
///                   shape (n_imgs, n_channels, img_h, img_w)
/// @param filters    filter bank, described to cuDNN with dimensions
///                   (n_filters, n_channels, filter_h, filter_w)
/// @param n_imgs     number of images
/// @param n_channels number of input channels (shared by images and filters)
/// @param n_filters  number of filters
/// @param img_h      image height
/// @param img_w      image width
/// @param filter_h   filter height
/// @param filter_w   filter width
/// @param convout    output buffer handed to cudnnConvolutionForward; its
///                   shape is whatever cudnnGetConvolution2dForwardOutputDim
///                   reports for the current descriptors
///
/// NOTE(review): descriptors are always created with CUDNN_DATA_FLOAT
/// regardless of T -- confirm that T is only ever instantiated as float.
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs,
    int n_channels, int n_filters, int img_h, int img_w, int filter_h,
    int filter_w, T *convout) {
  // Both staleness checks must be evaluated BEFORE any cached member is
  // overwritten. n_channels is shared by the two descriptors; if the image
  // branch stored the new channel count first, the filter check would compare
  // n_channels against itself and leave filters_desc describing the old
  // channel count.
  const bool imgs_changed = n_imgs != this->n_imgs
      || n_channels != this->n_channels
      || img_h != this->img_h
      || img_w != this->img_w;
  const bool filters_changed = n_filters != this->n_filters
      || n_channels != this->n_channels
      || filter_h != this->filter_h
      || filter_w != this->filter_w;

  if (imgs_changed) {
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        n_imgs, n_channels, img_h, img_w
    ));
    this->n_imgs = n_imgs;
    this->n_channels = n_channels;
    this->img_h = img_h;
    this->img_w = img_w;
  }

  if (filters_changed) {
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filters_desc, CUDNN_DATA_FLOAT,
        n_filters, n_channels, filter_h, filter_w
    ));
    this->n_filters = n_filters;
    this->n_channels = n_channels;
    this->filter_h = filter_h;
    this->filter_w = filter_w;
  }

  if (imgs_changed || filters_changed) {
    // Everything derived from the input descriptors has to be refreshed.
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION
    ));

    // Let cuDNN compute the output shape for the current configuration.
    int n, c, h, w;
    CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w
    ));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w
    ));

    // Pick a forward algorithm whose workspace fits within WORKSPACE_LIMIT,
    // then query how much workspace that algorithm actually needs.
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, WORKSPACE_LIMIT,
        &fwd_algo
    ));
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        fwd_algo, &workspace_size
    ));
  }

  // Only request a scratch buffer when the chosen algorithm needs one.
  void *workspace = NULL;
  if (workspace_size > 0) {
    workspace = CUDA::buffer(workspace_size);
  }

  // CUDNN::one / CUDNN::zero are passed as the alpha / beta scaling factors.
  CUDNN_CHECK(cudnnConvolutionForward(
      CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters,
      conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero,
      convout_desc, convout
  ));
}
/// Forward convolution via cuDNN, with algorithm selection by benchmarking.
///
/// The cuDNN descriptors are cached between calls: they are only rebuilt when
/// the shapes passed in differ from the shapes stored on this object. Whenever
/// either input descriptor is rebuilt, this method also
///   - rebuilds the convolution and output descriptors,
///   - asks cuDNN's "Find" routines for the forward, backward-data and
///     backward-filter algorithms and stores the first result of each in
///     fwd_algo, bwd_imgs_algo and bwd_filters_algo,
///   - sets workspace_size to the largest workspace any of the three needs.
///
/// @param imgs       input images, described to cuDNN as an NCHW tensor of
///                   shape (n_imgs, n_channels, img_h, img_w)
/// @param filters    filter bank, described to cuDNN with dimensions
///                   (n_filters, n_channels, filter_h, filter_w)
/// @param n_imgs     number of images
/// @param n_channels number of input channels (shared by images and filters)
/// @param n_filters  number of filters
/// @param img_h      image height
/// @param img_w      image width
/// @param filter_h   filter height
/// @param filter_w   filter width
/// @param convout    output buffer handed to cudnnConvolutionForward; its
///                   shape is whatever cudnnGetConvolution2dForwardOutputDim
///                   reports for the current descriptors
/// @throws std::runtime_error if any of the cuDNN "Find" calls returns zero
///                   candidate algorithms
///
/// NOTE(review): descriptors are always created with CUDNN_DATA_FLOAT
/// regardless of T -- confirm that T is only ever instantiated as float.
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs,
    int n_channels, int n_filters, int img_h, int img_w, int filter_h,
    int filter_w, T *convout) {
  // Both staleness checks must be evaluated BEFORE any cached member is
  // overwritten. n_channels is shared by the two descriptors; if the image
  // branch stored the new channel count first, the filter check would compare
  // n_channels against itself and leave filters_desc describing the old
  // channel count.
  const bool imgs_changed = n_imgs != this->n_imgs
      || n_channels != this->n_channels
      || img_h != this->img_h
      || img_w != this->img_w;
  const bool filters_changed = n_filters != this->n_filters
      || n_channels != this->n_channels
      || filter_h != this->filter_h
      || filter_w != this->filter_w;

  if (imgs_changed) {
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        n_imgs, n_channels, img_h, img_w
    ));
    this->n_imgs = n_imgs;
    this->n_channels = n_channels;
    this->img_h = img_h;
    this->img_w = img_w;
  }

  if (filters_changed) {
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filters_desc, CUDNN_DATA_FLOAT,
        n_filters, n_channels, filter_h, filter_w
    ));
    this->n_filters = n_filters;
    this->n_channels = n_channels;
    this->filter_h = filter_h;
    this->filter_w = filter_w;
  }

  if (imgs_changed || filters_changed) {
    // Everything derived from the input descriptors has to be refreshed.
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION
    ));

    // Let cuDNN compute the output shape for the current configuration.
    int n, c, h, w;
    CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w
    ));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w
    ));

    // Upper bound on the number of candidates requested from each Find call.
    const int n_requestedAlgo = 10;
    int n_returnedAlgo = 0;

    // Forward algorithm: take the first candidate cuDNN reports.
    cudnnConvolutionFwdAlgoPerf_t fwd_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        n_requestedAlgo, &n_returnedAlgo, fwd_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionFwdAlgoPerf_t found");
    }
    fwd_algo = fwd_algo_perf[0].algo;

    // Backward-data (gradient w.r.t. images) algorithm.
    cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
        CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc,
        n_requestedAlgo, &n_returnedAlgo, bwd_data_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionBwdDataAlgoPerf_t found");
    }
    bwd_imgs_algo = bwd_data_algo_perf[0].algo;

    // Backward-filter (gradient w.r.t. filters) algorithm.
    cudnnConvolutionBwdFilterAlgoPerf_t bwd_filters_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
        CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc,
        n_requestedAlgo, &n_returnedAlgo, bwd_filters_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionBwdFilterAlgoPerf_t found");
    }
    bwd_filters_algo = bwd_filters_algo_perf[0].algo;

    // workspace_size is a single member, so it is sized for the most
    // demanding of the three selected algorithms.
    size_t fwd_workspace_size;
    size_t bwd_imgs_workspace_size;
    size_t bwd_filters_workspace_size;
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        fwd_algo, &fwd_workspace_size
    ));
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc,
        bwd_imgs_algo, &bwd_imgs_workspace_size
    ));
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc,
        bwd_filters_algo, &bwd_filters_workspace_size
    ));
    workspace_size = std::max(fwd_workspace_size, bwd_imgs_workspace_size);
    workspace_size = std::max(workspace_size, bwd_filters_workspace_size);
  }

  // Only request a scratch buffer when the selected algorithms need one.
  void *workspace = NULL;
  if (workspace_size > 0) {
    workspace = CUDA::buffer(workspace_size);
  }

  // CUDNN::one / CUDNN::zero are passed as the alpha / beta scaling factors.
  CUDNN_CHECK(cudnnConvolutionForward(
      CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters,
      conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero,
      convout_desc, convout
  ));
}