void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs, int n_channels, int n_filters, int img_h, int img_w, int filter_h, int filter_w, T *convout) { bool set_conv_desc = false; if (n_imgs != this->n_imgs || n_channels != this->n_channels || img_h != this->img_h || img_w != this->img_w) { CUDNN_CHECK(cudnnSetTensor4dDescriptor( imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n_imgs, n_channels, img_h, img_w )); this->n_imgs = n_imgs; this->n_channels = n_channels; this->img_h = img_h; this->img_w = img_w; set_conv_desc = true; } if (n_filters != this->n_filters || n_channels != this->n_channels || filter_h != this->filter_h || filter_w != this->filter_w) { CUDNN_CHECK(cudnnSetFilter4dDescriptor( filters_desc, CUDNN_DATA_FLOAT, n_filters, n_channels, filter_h, filter_w )); this->n_filters = n_filters; this->n_channels = n_channels; this->filter_h = filter_h; this->filter_w = filter_w; set_conv_desc = true; } if (set_conv_desc) { CUDNN_CHECK(cudnnSetConvolution2dDescriptor( conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION )); int n, c, h, w; CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim( conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w )); CUDNN_CHECK(cudnnSetTensor4dDescriptor( convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w )); CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, WORKSPACE_LIMIT, &fwd_algo )); CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc, fwd_algo, &workspace_size )); } void *workspace = NULL; if (workspace_size > 0) { workspace = CUDA::buffer(workspace_size); } CUDNN_CHECK(cudnnConvolutionForward( CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters, conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero, convout_desc, convout )); }
void cudnn_convolutional_setup(layer *l) { cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); #if CUDNN_MAJOR >= 6 cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); #else cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); #endif #if CUDNN_MAJOR >= 7 cudnnSetConvolutionGroupCount(l->convDesc, l->groups); #else if(l->groups > 1){ error("CUDNN < 7 doesn't support groups, please upgrade!"); } #endif cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); }
void cudnn_convolutional_setup(layer *l) { cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); //cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); }
void CuDNNConvolutionLayer<Dtype>::Reshape( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { ConvolutionLayer<Dtype>::Reshape(bottom, top); CHECK_EQ(2, this->num_spatial_axes_) << "CuDNNConvolution input must have 2 spatial axes " << "(e.g., height and width). " << "Use 'engine: CAFFE' for general ND convolution."; bottom_offset_ = this->bottom_dim_ / this->group_; top_offset_ = this->top_dim_ / this->group_; const int height = bottom[0]->shape(this->channel_axis_ + 1); const int width = bottom[0]->shape(this->channel_axis_ + 2); const int height_out = top[0]->shape(this->channel_axis_ + 1); const int width_out = top[0]->shape(this->channel_axis_ + 2); const int* pad_data = this->pad_.cpu_data(); const int pad_h = pad_data[0]; const int pad_w = pad_data[1]; const int* stride_data = this->stride_.cpu_data(); const int stride_h = stride_data[0]; const int stride_w = stride_data[1]; // Specify workspace limit for kernels directly until we have a // planning strategy and a rewrite of Caffe's GPU memory mangagement size_t workspace_limit_bytes, total_memory; gpu_memory::getInfo(&workspace_limit_bytes, &total_memory); for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i], this->num_, this->channels_ / this->group_, height, width, this->channels_ * height * width, height * width, width, 1); cudnn::setTensor4dDesc<Dtype>(&top_descs_[i], this->num_, this->num_output_ / this->group_, height_out, width_out, this->num_output_ * this->out_spatial_dim_, this->out_spatial_dim_, width_out, 1); cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i], filter_desc_, pad_h, pad_w, stride_h, stride_w); if (!this->IsForwardPassed() || !this->IsBackwardPassed()) { continue; } // choose forward and backward algorithms + workspace(s) CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(), bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, &fwd_algo_[i])); CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(), bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], fwd_algo_[i], &(workspace_fwd_sizes_[i]))); // // choose backward algorithm for filter CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( Caffe::cudnn_handle(), bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, &bwd_filter_algo_[i]) ); // get workspace for backwards filter algorithm CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( Caffe::cudnn_handle(), bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i])); // choose backward algo for data CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( Caffe::cudnn_handle(), filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, &bwd_data_algo_[i])); // get workspace size CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( Caffe::cudnn_handle(), filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); } // Tensor descriptor for bias. if (this->bias_term_) { cudnn::setTensor4dDesc<Dtype>(&bias_desc_, 1, this->num_output_ / this->group_, 1, 1); } }
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PARAMS_TYPE* params) { PyGpuContextObject *c = input->context; void *alpha_p; void *beta_p; float af = alpha, bf = beta; cudnnStatus_t err = CUDNN_STATUS_SUCCESS; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) { PyErr_SetString(PyExc_ValueError, "Number of filters must be divisible by number of groups"); return 1; } switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)α beta_p = (void *)β break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)⁡ beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } if (params->inplace) { Py_XDECREF(*output); *output = om; Py_INCREF(*output); } else { if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; } if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { int err2 = GpuArray_memset(&(*output)->ga, 0); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv could not fill the output with zeros: %d", err2); return 1; } return 0; } if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) return 1; if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1) return 1; size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; cudnnConvolutionFwdAlgo_t algo = params->conv_algo; #ifdef DEBUG char algorithm_name[128]; #endif cuda_enter(c->ctx); if (params->choose_algo) { if (!params->choose_once) { reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } } if (!reuse_algo) { size_t free; int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU"); cuda_exit(c->ctx); return 1; } // Guess 4Mb if the info is not available if (free == 0) free = 4 * 1024 * 1024; if (params->choose_time) { int count; cudnnConvolutionFwdAlgoPerf_t choice; gpudata *tmpmem; tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); if (tmpmem == NULL) { PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); return -1; } // We don't sync the buffer as we don't care about the values. err = cudnnFindConvolutionForwardAlgorithmEx( params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), 1, &count, &choice, *(void **)tmpmem, free); gpudata_release(tmpmem); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #ifdef DEBUG if (count == 0) { PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found"); return 1; } else if (choice.status != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting best-timed FWD algo: %s", cudnnGetErrorString(choice.status)); return 1; } // Else, count is necessarly 1 for current implementation. #endif } else { err = cudnnGetConvolutionForwardAlgorithm( params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } } prev_algo = algo; } else { algo = prev_algo; } #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; // NB: This is printed only when algorithm is chosen at runtime. if (reuse_algo) fprintf(stderr, "(reused %s)\n", algorithm_name); else fprintf(stderr, "(using %s)\n", algorithm_name); #endif if (params->choose_once) { reuse_algo = 1; } else { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } } } /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ if (PyGpuArray_NDIM(input) == 5 && !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) { #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1. // Issue should be resolved for cuDNN > V6.0. if (cudnnGetVersion() < 6100 && algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM && PyGpuArray_DIM(input, 0) > 65536) { #ifdef DEBUG fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM " "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n"); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. The tiled-FFT implementation // does not support strides. // If the chosen implementation is FFT or tiled-FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't. // The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters /* NB: TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1. New conditions apply and may depend on number of dimensions (2D or 3D) e.g. for FFT_TILING. TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms? */ if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int dilation[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } else { // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING if (stride[0] != 1 || stride[1] != 1) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } } { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err == CUDNN_STATUS_NOT_SUPPORTED) { // Fallback to none algo if not supported #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s error getting worksize: " "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); for ( int g = 0; g < params->num_groups; g++) { err = cudnnConvolutionForward( params->handle, alpha_p, APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g, APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); } if (worksize != 0) gpudata_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
void convolution_layer_updater_cuda::enqueue_test( unsigned int offset_input_entry_id, cudaStream_t stream_id, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data, const std::vector<cuda_linear_buffer_device_smart_ptr>& data, const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom, const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers, std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects, unsigned int entry_count, bool force_deterministic) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_safe_call(cudnnSetTensor4dDescriptor( input_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, input_configuration_specific.feature_map_count, (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1, input_configuration_specific.dimension_sizes[0])); cudnn_safe_call(cudnnSetTensor4dDescriptor( output_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, output_configuration_specific.feature_map_count, (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1, output_configuration_specific.dimension_sizes[0])); { cudnnConvolutionFwdAlgo_t algo; cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm( cuda_config->get_cudnn_handle(), input_data_desc, weights_desc, convolution_desc, output_data_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, additional_buffers[0]->get_size(), &algo)); float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnConvolutionForward( cuda_config->get_cudnn_handle(), &alpha, input_data_desc, (const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id, weights_desc, *data[0], convolution_desc, algo, *additional_buffers[0], additional_buffers[0]->get_size(), &beta, output_data_desc, *output_neurons_buffer)); } { float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), CUDNN_ADD_SAME_C, &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_neurons_buffer)); } }
int APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, CudaNdarray *om, cudnnConvolutionDescriptor_t desc, float alpha, float beta, CudaNdarray **output) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size\n"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; int nb_dim = CudaNdarray_NDIM(input); #ifdef CONV_INPLACE Py_XDECREF(*output); *output = om; Py_INCREF(*output); #else if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0) return 1; if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om)) return 1; #endif if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) return 1; { size_t worksize; void *workspace; cudnnConvolutionFwdAlgo_t chosen_algo; if (CHOOSE_ALGO) { // A new convolution implementation should be selected, based either on // timing or heuristics if in one of the two following cases : // - The implementation should only be chosen during the first execution // of an apply node and this is the first execution of the apply node. // - The implementation should be chosen as often as necessary and the // shapes of the inputs differ from the last time an implementation // was chosen. bool reuse_previous_algo; if (CHOOSE_ALGO_ONCE) { // Only choose a new implementation of none has been chosen before. reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set); } else { // Reuse the previous implementation if the inputs and the kernels // have the same shapes as they had when the previous implementation // was selected bool same_shapes = true; for (int i = 0; (i < nb_dim) && same_shapes; i++) { same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] == APPLY_SPECIFIC(previous_input_shape)[i]); same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] == APPLY_SPECIFIC(previous_kerns_shape)[i]); } reuse_previous_algo = same_shapes; } // If the previously choosen implementation can't be reused, select a // new one based on the shapes of the current inputs if (!reuse_previous_algo) { // Obtain a convolution algorithm appropriate for the input and kernel // shapes. Either by choosing one according to heuristics or by making // cuDNN time every implementation and choose the best one. if (CHOOSE_ALGO_TIME) { // Time the different implementations to choose the best one int requestedCount = 1; int count; cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf; err = cudnnFindConvolutionForwardAlgorithm(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), requestedCount, &count, &choosen_algo_perf); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error selecting convolution algo: %s", cudnnGetErrorString(err)); return 1; } chosen_algo = choosen_algo_perf.algo; } else { // The implementation should be chosen using heuristics based on the // input shapes and the amount of memory available. // Get the amount of available memory size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess){ cudaGetLastError(); fprintf(stderr, "Error when trying to find the memory information" " on the GPU: %s\n", cudaGetErrorString(err2)); return 1; } // Use heuristics to choose the implementation err = cudnnGetConvolutionForwardAlgorithm(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &chosen_algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error selecting convolution algo: %s", cudnnGetErrorString(err)); return 1; } } // Store the shapes of the inputs and kernels as well as the chosen // algorithm for future use. APPLY_SPECIFIC(previous_algo) = chosen_algo; APPLY_SPECIFIC(previous_algo_set) = true; for (int i = 0; i < nb_dim; i++) { APPLY_SPECIFIC(previous_input_shape)[i] = CudaNdarray_HOST_DIMS(input)[i]; APPLY_SPECIFIC(previous_kerns_shape)[i] = CudaNdarray_HOST_DIMS(kerns)[i]; } } else { // Reuse the previously chosen convolution implementation chosen_algo = APPLY_SPECIFIC(previous_algo); } } else { chosen_algo = CONV_ALGO; } // The FFT implementation (only in V3 and onward) does not support strides, // 1x1 filters or inputs with a spatial dimension larger than 1024. // The tiled-FFT implementation (only in V4 onward) does not support // strides. // If the chosen implementation is FFT or tiled-FFT, validate that it can // be used on the current data and default on a safe implementation if it // can't. // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are // defined only for 2d-filters if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error getting convolution properties: %s", cudnnGetErrorString(err)); return 1; } // Extract the spatial size of the filters int filter_h = CudaNdarray_HOST_DIMS(kerns)[2]; int filter_w = CudaNdarray_HOST_DIMS(kerns)[3]; // Extract the spatial size of the input int input_h = CudaNdarray_HOST_DIMS(input)[2]; int input_w = CudaNdarray_HOST_DIMS(input)[3]; // Ensure that the selected implementation supports the requested // convolution. Fall back to a safe implementation otherwise. if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 || input_w > 1024 || (filter_h == 1 && filter_w == 1)) { chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } else { // chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING if (stride[0] != 1 || stride[1] != 1) { chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } } err = cudnnGetConvolutionForwardWorkspaceSize(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), chosen_algo, &worksize); if (err == CUDNN_STATUS_NOT_SUPPORTED) { // Fallback to none algo if not supported // TODO: Print a warning chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; err = cudnnGetConvolutionForwardWorkspaceSize(_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), chosen_algo, &worksize); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error getting worksize: %s", cudnnGetErrorString(err)); return 1; } workspace = get_work_mem(worksize); if (workspace == NULL && worksize != 0) return 1; err = cudnnConvolutionForward( _handle, (void *)&alpha, APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input), APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns), desc, chosen_algo, workspace, worksize, (void *)&beta, APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output)); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
void resize_convolutional_layer(convolutional_layer *l, int w, int h) { l->w = w; l->h = h; int out_w = convolutional_out_width(*l); int out_h = convolutional_out_height(*l); l->out_w = out_w; l->out_h = out_h; l->outputs = l->out_h * l->out_w * l->out_c; l->inputs = l->w * l->h * l->c; l->output = realloc(l->output, l->batch*out_h * out_w * l->n*sizeof(float)); l->delta = realloc(l->delta, l->batch*out_h * out_w * l->n*sizeof(float)); #ifdef GPU cuda_free(l->delta_gpu); cuda_free(l->output_gpu); l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n); l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n); #ifdef CUDNN cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); int padding = l->pad ? l->size/2 : 0; cudnnSetConvolution2dDescriptor(l->convDesc, padding, padding, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->filterDesc, l->convDesc, l->dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l->filterDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dfilterDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); #endif #endif l->workspace_size = get_workspace_size(*l); }
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor) { int i; convolutional_layer l = {0}; l.type = CONVOLUTIONAL; l.h = h; l.w = w; l.c = c; l.n = n; l.binary = binary; l.xnor = xnor; l.batch = batch; l.stride = stride; l.size = size; l.pad = pad; l.batch_normalize = batch_normalize; l.filters = calloc(c*n*size*size, sizeof(float)); l.filter_updates = calloc(c*n*size*size, sizeof(float)); l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c)); for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1); int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); l.out_h = out_h; l.out_w = out_w; l.out_c = n; l.outputs = l.out_h * l.out_w * l.out_c; l.inputs = l.w * l.h * l.c; l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float)); if(binary){ l.binary_filters = calloc(c*n*size*size, sizeof(float)); l.cfilters = calloc(c*n*size*size, sizeof(char)); l.scales = calloc(n, sizeof(float)); } if(xnor){ l.binary_filters = calloc(c*n*size*size, sizeof(float)); l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); } if(batch_normalize){ l.scales = calloc(n, sizeof(float)); l.scale_updates = calloc(n, sizeof(float)); for(i = 0; i < n; ++i){ l.scales[i] = 1; } l.mean = calloc(n, sizeof(float)); l.variance = calloc(n, sizeof(float)); l.rolling_mean = calloc(n, sizeof(float)); l.rolling_variance = calloc(n, sizeof(float)); } #ifdef GPU l.filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size); l.biases_gpu = cuda_make_array(l.biases, n); l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); l.scales_gpu = cuda_make_array(l.scales, n); l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n); l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); if(binary){ l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); } if(xnor){ l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, n); l.variance_gpu = cuda_make_array(l.variance, n); l.rolling_mean_gpu = cuda_make_array(l.mean, n); l.rolling_variance_gpu = cuda_make_array(l.variance, n); l.mean_delta_gpu = cuda_make_array(l.mean, n); l.variance_delta_gpu = cuda_make_array(l.variance, n); l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.filterDesc); cudnnCreateTensorDescriptor(&l.dsrcTensorDesc); cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dfilterDesc); cudnnCreateConvolutionDescriptor(&l.convDesc); cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); int padding = l.pad ? l.size/2 : 0; cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l.srcTensorDesc, l.filterDesc, l.convDesc, l.dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l.fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l.filterDesc, l.ddstTensorDesc, l.convDesc, l.dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l.bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l.srcTensorDesc, l.ddstTensorDesc, l.convDesc, l.dfilterDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l.bf_algo); #endif #endif l.workspace_size = get_workspace_size(l); l.activation = activation; fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n); return l; }
SaberStatus VenderConv2DActPooling<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>::\ create(const std::vector<DataTensor_in *>& inputs, std::vector<DataTensor_out *>& outputs, ConvActivePoolingParam<OpTensor>& param, Context<NV> &ctx) { if (!(ctx == this->_ctx)) { if (_handle != NULL) { CUDNN_CHECK(cudnnDestroy(_handle)); } this->_ctx = ctx; cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); CUDNN_CHECK(cudnnCreate(&_handle)); CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); } int input_num = inputs[0]->num(); int input_channel = inputs[0]->channel(); int input_height = inputs[0]->height(); int input_width = inputs[0]->width(); int output_channel = outputs[0]->channel(); int output_height = outputs[0]->height(); int output_width = outputs[0]->width(); { _inner_shape = inputs[0]->shape(); _inner_shape[0] = input_num; _inner_shape[1] = param.conv_param.weight()->num(); int kernel_exten = param.conv_param.dilation_h * (param.conv_param.weight()->height() - 1) + 1; int output_dim = (input_height + 2 * param.conv_param.pad_h - kernel_exten) / param.conv_param.stride_h + 1; _inner_shape[2] = output_dim; kernel_exten = param.conv_param.dilation_w * (param.conv_param.weight()->width() - 1) + 1; output_dim = (input_width + 2 * param.conv_param.pad_w - kernel_exten) / param.conv_param.stride_w + 1; _inner_shape[3] = output_dim; _inner_tensor.re_alloc(_inner_shape); } int kernel_h = param.conv_param.weight()->height(); int kernel_w = param.conv_param.weight()->width(); int filter_dim_a[] = {output_channel, input_channel / param.conv_param.group, kernel_h, kernel_w}; cudnn::setNDFilterDesc<OpDataType>(&_filter_desc, param.conv_param.weight()->dims(), filter_dim_a, CUDNN_TENSOR_NCHW); Shape in_stride = inputs[0]->get_stride(); Shape inner_stride = _inner_tensor.get_stride(); Shape out_stride = outputs[0]->get_stride(); int dim_a[] = {input_num, input_channel, input_height, input_width}; int dim_inner[] = {_inner_shape[0], _inner_shape[1], _inner_shape[2], _inner_shape[3]}; int dim_b[] = {input_num, output_channel, output_height, output_width}; cudnn::setTensorNdDesc<InDataType >(&_input_descs, inputs[0]->dims(), dim_a, &in_stride[0]); cudnn::setTensorNdDesc<InDataType >(&_inner_descs, 4, dim_inner, &inner_stride[0]); cudnn::setTensorNdDesc<InDataType>(&_output_descs, outputs[0]->dims(), dim_b, &out_stride[0]); int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w}; int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w}; int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w}; cudnn::setConvolutionNdDesc<OpDataType >(&_conv_descs, inputs[0]->dims() - 2, pad_a, filter_stride_a, dilation_a); // set activation descriptor if (param.has_activation) { cudnn::set_activation_des<OpDataType>(&_active_descs, param.activation_param.active); } if (param.has_pooling) { int windowHeight[] = {param.pooling_param.window_h, param.pooling_param.window_w}; int padding[] = {param.pooling_param.pad_h, param.pooling_param.pad_w}; int stride[] = {param.pooling_param.stride_h, param.pooling_param.stride_w}; cudnn::set_nd_pooling_des<OpDataType >(&_pooling_descs, param.pooling_param.pooling_type, _inner_tensor.dims() - 2, windowHeight, padding,stride); } // true: use tensor core // false: disable tensor core cudnn::set_math_type<OpDataType>(&_conv_descs, _use_tensor_core); cudnn::set_group_count<OpDataType>(&_conv_descs, param.conv_param.group); // Get fastest implement of cudnn // set up algo and workspace size if (param.conv_param.group == inputs[0]->channel() && \ inputs[0]->channel() == outputs[0]->channel()) { _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;//CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } else { CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle, \ _input_descs, _filter_desc, _conv_descs, _inner_descs, \ _preference, _workspace_limit_bytes, &_fwd_algo)); } CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle, _input_descs, _filter_desc, _conv_descs, _inner_descs, _fwd_algo, &_workspace_fwd_sizes)); if (_workspace_fwd_sizes > _workspaceSizeInBytes) { _workspaceSizeInBytes = _workspace_fwd_sizes; if (_workspaceData != NULL) { cudaFree(_workspaceData); } cudaMalloc(&_workspaceData, _workspaceSizeInBytes); _workspace = reinterpret_cast<char*>(_workspaceData); } if (param.conv_param.bias()->size()> 0) { int dim_bias[] = {1, output_channel, 1, 1}; int stride_bias[] = {output_channel, 1, 1, 1}; cudnn::setTensorNdDesc<OpDataType >(&_bias_desc, 4, dim_bias, stride_bias); } return SaberSuccess; }
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)α beta_p = (void *)β break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)⁡ beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*output); *output = om; Py_INCREF(*output); #else if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; #endif if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) return 1; cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO /* Static variables are only initialized once so this will not * reset the previous algo every time */ static int reuse_algo = 0; static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_kern_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionFwdAlgoPerf_t choice; err = cudnnFindConvolutionForwardAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionForwardAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } #endif #endif /* These two algos are not supported for 3d conv */ if (PyGpuArray_NDIM(input) == 5 && (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } } #endif #if CUDNN_VERSION < 3000 /* cuDNN before v3 does not support kernels larger than input even * if appropriate padding is selected. */ for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) { if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) { PyErr_SetString(PyExc_RuntimeError, "the current version " "of CuDNN does not support kernels larger than the " "inputs in any spatial dimension, even if the inputs " "are padded such that the padded inputs are larger " "than the kernels. Update your installation of CuDNN " "to V3 or more recent to solve the issue."); cuda_exit(c->ctx); return 1; } } #endif { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } err = cudnnConvolutionForward( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); if (worksize != 0) c->ops->buffer_release(workspace); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
void CuDNNConvolutionLayer<Dtype>::Reshape( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { ConvolutionLayer<Dtype>::Reshape(bottom, top); CHECK_EQ(2, this->num_spatial_axes_) << "CuDNNConvolution input must have 2 spatial axes " << "(e.g., height and width). " << "Use 'engine: CAFFE' for general ND convolution."; bottom_offset_ = this->bottom_dim_ / this->group_; top_offset_ = this->top_dim_ / this->group_; const int_tp height = bottom[0]->shape(this->channel_axis_ + 1); const int_tp width = bottom[0]->shape(this->channel_axis_ + 2); const int_tp height_out = top[0]->shape(this->channel_axis_ + 1); const int_tp width_out = top[0]->shape(this->channel_axis_ + 2); const int_tp* pad_data = this->pad_.cpu_data(); const int_tp pad_h = pad_data[0]; const int_tp pad_w = pad_data[1]; const int_tp* stride_data = this->stride_.cpu_data(); const int_tp stride_h = stride_data[0]; const int_tp stride_w = stride_data[1]; // Specify workspace limit for kernels directly until we have a // planning strategy and a rewrite of Caffe's GPU memory mangagement uint_tp workspace_limit_bytes = 8*1024*1024; for (int_tp i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i], this->num_, this->channels_ / this->group_, height, width, this->channels_ * height * width, height * width, width, 1); cudnn::setTensor4dDesc<Dtype>(&top_descs_[i], this->num_, this->num_output_ / this->group_, height_out, width_out, this->num_output_ * this->out_spatial_dim_, this->out_spatial_dim_, width_out, 1); cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i], filter_desc_, pad_h, pad_w, stride_h, stride_w); // choose forward and backward algorithms + workspace(s) CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0], bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, &fwd_algo_[i])); CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0], bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i], fwd_algo_[i], &(workspace_fwd_sizes_[i]))); // choose backward algorithm for filter CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0], bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, &bwd_filter_algo_[i]) ); // get workspace for backwards filter algorithm CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0], bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i])); // choose backward algo for data CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0], filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, workspace_limit_bytes, &bwd_data_algo_[i])); // get workspace size CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0], filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); } // reduce over all workspace sizes to get a maximum to allocate / reallocate uint_tp total_workspace_fwd = 0; uint_tp total_workspace_bwd_data = 0; uint_tp total_workspace_bwd_filter = 0; for (uint_tp i = 0; i < bottom.size(); i++) { total_workspace_fwd = std::max(total_workspace_fwd, workspace_fwd_sizes_[i]); total_workspace_bwd_data = std::max(total_workspace_bwd_data, workspace_bwd_data_sizes_[i]); total_workspace_bwd_filter = std::max(total_workspace_bwd_filter, workspace_bwd_filter_sizes_[i]); } // get max over all operations uint_tp max_workspace = std::max(total_workspace_fwd, total_workspace_bwd_data); max_workspace = std::max(max_workspace, total_workspace_bwd_filter); // ensure all groups have enough workspace uint_tp total_max_workspace = max_workspace * (this->group_ * CUDNN_STREAMS_PER_GROUP); // this is the total amount of storage needed over all groups + streams if (total_max_workspace > workspaceSizeInBytes) { LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; workspaceSizeInBytes = total_max_workspace; // free the existing workspace and allocate a new (larger) one cudaFree(this->workspaceData); cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes); if (err != cudaSuccess) { // force zero memory path for (int_tp i = 0; i < bottom.size(); i++) { workspace_fwd_sizes_[i] = 0; workspace_bwd_filter_sizes_[i] = 0; workspace_bwd_data_sizes_[i] = 0; fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; } // NULL out all workspace pointers for (int_tp g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { workspace[g] = NULL; } // NULL out underlying data workspaceData = NULL; workspaceSizeInBytes = 0; } // if we succeed in the allocation, set pointer aliases for workspaces for (int_tp g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace; } } // Tensor descriptor for bias. if (this->bias_term_) { cudnn::setTensor4dDesc<Dtype>(&bias_desc_, 1, this->num_output_ / this->group_, 1, 1); } }