void cudnn_convolutional_setup(layer *l)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);

    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
#if CUDNN_MAJOR >= 6
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
#else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
#endif

#if CUDNN_MAJOR >= 7
    cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
#else
    if(l->groups > 1){
        error("CUDNN < 7 doesn't support groups, please upgrade!");
    }
#endif

    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo);
}
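/* Hedged addition, not part of the Darknet function above: cuDNN 8 removed
 * cudnnGetConvolutionForwardAlgorithm() and its backward siblings, so the three
 * algorithm queries above do not compile there. A minimal sketch of how the
 * forward algorithm could be picked inside cudnn_convolutional_setup() behind a
 * further version guard, reusing the same descriptors (the backward algorithms
 * would be handled analogously with cudnnGetConvolutionBackwardDataAlgorithm_v7
 * and cudnnGetConvolutionBackwardFilterAlgorithm_v7):
 */
#if CUDNN_MAJOR >= 8
    {
        int returned = 0;
        cudnnConvolutionFwdAlgoPerf_t perf;
        /* Ask the heuristic for its single best candidate and keep its algo. */
        cudnnGetConvolutionForwardAlgorithm_v7(cudnn_handle(),
                l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc,
                1, &returned, &perf);
        if (returned > 0) l->fw_algo = perf.algo;
    }
#endif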
void cudnn_convolutional_setup(layer *l)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    //cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);

    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo);
}
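/* Hedged sketch, not from either Darknet revision above: every cuDNN call in
 * these setup routines discards its cudnnStatus_t return value. A small
 * checking wrapper (the name CHECK_CUDNN is an assumption, not an existing
 * Darknet macro) makes descriptor or algorithm-selection failures visible
 * immediately instead of surfacing later as wrong results:
 */
#include <stdio.h>
#include <stdlib.h>
#include <cudnn.h>

#define CHECK_CUDNN(call)                                               \
    do {                                                                \
        cudnnStatus_t s_ = (call);                                      \
        if (s_ != CUDNN_STATUS_SUCCESS) {                               \
            fprintf(stderr, "cuDNN error %s at %s:%d\n",                \
                    cudnnGetErrorString(s_), __FILE__, __LINE__);       \
            exit(1);                                                    \
        }                                                               \
    } while (0)

/* Usage, e.g.:
 * CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW,
 *         CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w));
 */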
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_EQ(2, this->num_spatial_axes_)
      << "CuDNNConvolution input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1);
  const int width = bottom[0]->shape(this->channel_axis_ + 2);
  const int height_out = top[0]->shape(this->channel_axis_ + 1);
  const int width_out = top[0]->shape(this->channel_axis_ + 2);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];

  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory management.
  size_t workspace_limit_bytes, total_memory;
  gpu_memory::getInfo(&workspace_limit_bytes, &total_memory);

  for (int i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
        this->num_,
        this->channels_ / this->group_, height, width,
        this->channels_ * height * width,
        height * width, width, 1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
        this->num_,
        this->num_output_ / this->group_, height_out, width_out,
        this->num_output_ * this->out_spatial_dim_,
        this->out_spatial_dim_, width_out, 1);
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
        filter_desc_, pad_h, pad_w, stride_h, stride_w);

    if (!this->IsForwardPassed() || !this->IsBackwardPassed()) {
      continue;
    }

    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &fwd_algo_[i]));

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
        fwd_algo_[i], &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
        Caffe::cudnn_handle(),
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        Caffe::cudnn_handle(),
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
        Caffe::cudnn_handle(),
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        Caffe::cudnn_handle(),
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]));
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
        1, this->num_output_ / this->group_, 1, 1);
  }
}
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
                        CudaNdarray *km,
                        cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **kerns)
{
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  int nb_dim = CudaNdarray_NDIM(output);

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
    return 1;
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
    return 1;
#endif

  if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdFilterAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, if in one of the two following cases:
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the kernels and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected.
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                          APPLY_SPECIFIC(previous_input_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs.
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the input and output
        // shapes, either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choosing the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one.
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdFilterAlgoPerf_t chosen_algo_perf;
          err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
                                                            APPLY_SPECIFIC(input),
                                                            APPLY_SPECIFIC(output),
                                                            desc,
                                                            APPLY_SPECIFIC(kerns),
                                                            requestedCount,
                                                            &count,
                                                            &chosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = chosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on
          // the shapes of the inputs and the amount of memory available.

          // Get the amount of available memory.
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess) {
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation.
          err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
                                                           APPLY_SPECIFIC(input),
                                                           APPLY_SPECIFIC(output),
                                                           desc,
                                                           APPLY_SPECIFIC(kerns),
                                                           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                                                           free,
                                                           &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the inputs and kernels as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_input_shape)[i] =
            CudaNdarray_HOST_DIMS(input)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] =
            CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation.
        chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }

    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used
    // on the current data and default to a safe implementation if it can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor.
      int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
      cudnnConvolutionMode_t mode;
      err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
                                            &stride_v, &stride_h,
                                            &upscale_x, &upscale_y,
                                            &mode);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradW: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial size of the filters.
      int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3];

      // Extract the spatial size of the input.
      int input_h = CudaNdarray_HOST_DIMS(input)[2];
      int input_w = CudaNdarray_HOST_DIMS(input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
          input_w > 1024 || (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      }
    }

    // Infer required workspace size from the chosen implementation.
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
                                                         APPLY_SPECIFIC(input),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(kerns),
                                                         chosen_algo,
                                                         &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradW: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Allocate workspace for the convolution.
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    // Perform the convolution.
    err = cudnnConvolutionBackwardFilter_v3(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
  }
#else
  err = cudnnConvolutionBackwardFilter(
    _handle,
    (void *)&alpha,
    APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
    APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
    desc,
    (void *)&beta,
    APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnConvGradW: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
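/* Hedged note, not part of the Theano code above: cudnnConvolutionBackwardFilter_v3
 * only exists in cuDNN 3/4; from cuDNN 5 onward the unsuffixed function takes the
 * algorithm and workspace arguments itself. Under that assumption, the call inside
 * the CUDNN_VERSION >= 3000 branch above would instead read:
 */
err = cudnnConvolutionBackwardFilter(
  _handle,
  (void *)&alpha,
  APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
  APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
  desc,
  chosen_algo,
  workspace, worksize,
  (void *)&beta,
  APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));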
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PyGpuContextObject *c)
{
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
#endif

  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;

    err = cudnnFindConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess) {
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError,
                   "Error when trying to find the memory "
                   "information on the GPU: %s\n",
                   cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif

#endif

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can be used on the
  // current data and default to a safe implementation if it can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters.
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
      PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
    desc, APPLY_SPECIFIC(kerns), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardFilter_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo,
    worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->output = realloc(l->output, l->batch*out_h * out_w * l->n*sizeof(float));
    l->delta  = realloc(l->delta,  l->batch*out_h * out_w * l->n*sizeof(float));

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu =  cuda_make_array(l->delta,  l->batch*out_h*out_w*l->n);
    l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
#ifdef CUDNN
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
    int padding = l->pad ? l->size/2 : 0;
    cudnnSetConvolution2dDescriptor(l->convDesc, padding, padding, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->filterDesc, l->convDesc, l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->filterDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
{
    int i;
    convolutional_layer l = {0};
    l.type = CONVOLUTIONAL;

    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = pad;
    l.batch_normalize = batch_normalize;

    l.filters = calloc(c*n*size*size, sizeof(float));
    l.filter_updates = calloc(c*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));

    if(binary){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.cfilters = calloc(c*n*size*size, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        l.scale_updates = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
    }

#ifdef GPU
    l.filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size);

    l.biases_gpu = cuda_make_array(l.biases, n);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

    l.scales_gpu = cuda_make_array(l.scales, n);
    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

    if(binary){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    }
    if(xnor){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
        l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
    }

    if(batch_normalize){
        l.mean_gpu = cuda_make_array(l.mean, n);
        l.variance_gpu = cuda_make_array(l.variance, n);

        l.rolling_mean_gpu = cuda_make_array(l.mean, n);
        l.rolling_variance_gpu = cuda_make_array(l.variance, n);

        l.mean_delta_gpu = cuda_make_array(l.mean, n);
        l.variance_delta_gpu = cuda_make_array(l.variance, n);

        l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
    }
#ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.srcTensorDesc);
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnCreateFilterDescriptor(&l.filterDesc);
    cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
    cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
    cudnnCreateFilterDescriptor(&l.dfilterDesc);
    cudnnCreateConvolutionDescriptor(&l.convDesc);
    cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w);
    cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
    cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size);

    cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w);
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
    cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size);
    int padding = l.pad ? l.size/2 : 0;
    cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l.srcTensorDesc, l.filterDesc, l.convDesc, l.dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l.fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l.filterDesc, l.ddstTensorDesc, l.convDesc, l.dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l.bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l.srcTensorDesc, l.ddstTensorDesc, l.convDesc, l.dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l.bf_algo);
#endif
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);

    return l;
}
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_EQ(2, this->num_spatial_axes_)
      << "CuDNNConvolution input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const int_tp height = bottom[0]->shape(this->channel_axis_ + 1);
  const int_tp width = bottom[0]->shape(this->channel_axis_ + 2);
  const int_tp height_out = top[0]->shape(this->channel_axis_ + 1);
  const int_tp width_out = top[0]->shape(this->channel_axis_ + 2);
  const int_tp* pad_data = this->pad_.cpu_data();
  const int_tp pad_h = pad_data[0];
  const int_tp pad_w = pad_data[1];
  const int_tp* stride_data = this->stride_.cpu_data();
  const int_tp stride_h = stride_data[0];
  const int_tp stride_w = stride_data[1];

  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory management.
  uint_tp workspace_limit_bytes = 8*1024*1024;

  for (int_tp i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
        this->num_,
        this->channels_ / this->group_, height, width,
        this->channels_ * height * width,
        height * width, width, 1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
        this->num_,
        this->num_output_ / this->group_, height_out, width_out,
        this->num_output_ * this->out_spatial_dim_,
        this->out_spatial_dim_, width_out, 1);
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
        filter_desc_, pad_h, pad_w, stride_h, stride_w);

    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &fwd_algo_[i]));

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
        fwd_algo_[i], &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_filter_algo_[i]));

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
        filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
        bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]));
  }

  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  uint_tp total_workspace_fwd = 0;
  uint_tp total_workspace_bwd_data = 0;
  uint_tp total_workspace_bwd_filter = 0;

  for (uint_tp i = 0; i < bottom.size(); i++) {
    total_workspace_fwd        = std::max(total_workspace_fwd,
                                          workspace_fwd_sizes_[i]);
    total_workspace_bwd_data   = std::max(total_workspace_bwd_data,
                                          workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                          workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  uint_tp max_workspace = std::max(total_workspace_fwd,
                                   total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  uint_tp total_max_workspace = max_workspace *
                                (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    LOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int_tp i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int_tp g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    for (int_tp g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
        1, this->num_output_ / this->group_, 1, 1);
  }
}
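// Hedged sketch, not part of either Reshape() above: the first Caffe variant in
// this section derives workspace_limit_bytes from the free device memory reported
// by the driver (via gpu_memory::getInfo) instead of the fixed 8 MB used here.
// With plain CUDA runtime calls that choice would look roughly like this, under
// the assumption that the caller tolerates workspaces up to all currently free
// memory:
size_t free_bytes = 0, total_bytes = 0;
CUDA_CHECK(cudaMemGetInfo(&free_bytes, &total_bytes));
uint_tp workspace_limit_bytes = free_bytes;  // replaces the 8*1024*1024 constant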