// Stores the convolution hyper-parameters; the cuDNN descriptors are created
// here and configured later, once the input/filter shapes are known.
template <typename T>
ConvBC01CuDNN<T>::ConvBC01CuDNN(int pad_y, int pad_x, int stride_y, int stride_x)
    : pad_y(pad_y), pad_x(pad_x), stride_y(stride_y), stride_x(stride_x),
      n_imgs(0), n_channels(0), n_filters(0), img_h(0), img_w(0),
      filter_h(0), filter_w(0), workspace_size(0) {
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&imgs_desc));
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&convout_desc));
  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filters_desc));
  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
}
template <typename Dtype>
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    int n, int c, int h, int w) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
#if CUDNN_VERSION_MIN(5, 0, 0)
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      CUDNN_TENSOR_NCHW, n, c, h, w));
#else
  CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type,
      CUDNN_TENSOR_NCHW, n, c, h, w));
#endif
}
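/* Illustrative sketch, not taken from any of the projects above: how a
 * createFilterDesc-style helper is typically paired with tensor and
 * convolution descriptors to query the convolution output shape. Assumes
 * <cudnn.h>, the CUDNN_CHECK macro from the helper above, cuDNN v6+
 * (cudnnSetConvolution2dDescriptor takes a compute type), and
 * dataType<float>::type resolving to CUDNN_DATA_FLOAT. */
static void query_conv_output_shape_example() {
  cudnnTensorDescriptor_t input_desc;
  cudnnFilterDescriptor_t filter_desc;
  cudnnConvolutionDescriptor_t conv_desc;

  // NCHW input: batch 8, 3 channels, 32x32 images.
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
  CUDNN_CHECK(cudnnSetTensor4dDescriptor(input_desc, CUDNN_TENSOR_NCHW,
      CUDNN_DATA_FLOAT, 8, 3, 32, 32));

  // 64 filters of shape 3x3x3, i.e. what createFilterDesc<float> would build.
  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc));
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT,
      CUDNN_TENSOR_NCHW, 64, 3, 3, 3));

  // pad 1, stride 1, dilation 1: "same" geometry for a 3x3 kernel.
  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, 1, 1, 1, 1, 1, 1,
      CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

  int n, c, h, w;
  CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(conv_desc, input_desc,
      filter_desc, &n, &c, &h, &w));  // expect 8 x 64 x 32 x 32 here

  CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc));
  CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc));
  CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc));
}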
convolution_layer_updater_cuda::convolution_layer_updater_cuda()
    : input_data_desc(0)
    , output_data_desc(0)
    , weights_desc(0)
    , convolution_desc(0)
    , bias_desc(0)
{
    cudnn_safe_call(cudnnCreateTensorDescriptor(&input_data_desc));
    cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc));
    cudnn_safe_call(cudnnCreateFilterDescriptor(&weights_desc));
    cudnn_safe_call(cudnnCreateConvolutionDescriptor(&convolution_desc));
    cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc));
}
static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
  cudnnStatus_t err;

  err = cudnnCreateFilterDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create filter descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  if (c_set_filter(var, *desc, 1) != 0) {
    cudnnDestroyFilterDescriptor(*desc);
    return -1;
  }
  return 0;
}
template <typename Dtype>
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
                             const int_tp num_spatial_dims,
                             const int_tp n, const int_tp c,
                             const int_tp* shape) {
  // cuDNN expects plain int dimensions ordered as {n, c, spatial...}.
  std::vector<int> shape_int(num_spatial_dims + 2);
  shape_int[0] = n;
  shape_int[1] = c;
  for (int_tp i = 0; i < num_spatial_dims; ++i) {
    shape_int[2 + i] = shape[i];
  }
  const int* shape_ptr = &shape_int[0];
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilterNdDescriptor(*desc, dataType<Dtype>::type,
                                         num_spatial_dims + 2, shape_ptr));
}
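/* Illustrative sketch, not part of the original file: building a descriptor
 * for a 3-D (volumetric) convolution filter through the Nd helper above.
 * Assumes int_tp is the project's signed index typedef and Dtype = float. */
static void make_3d_filter_desc_example() {
  cudnnFilterDescriptor_t filter_desc;
  const int_tp spatial[3] = {3, 3, 3};  // depth x height x width
  // 32 filters over 16 input channels -> filter dims {32, 16, 3, 3, 3}.
  createFilterDesc<float>(&filter_desc, 3, 32, 16, spatial);
  CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc));
}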
template <typename Dtype>
inline void createFilterDesc(cudnnFilterDescriptor_t* desc, Size size) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  // The four-argument form without a layout parameter is the pre-v5
  // cudnnSetFilter4dDescriptor signature.
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      size.num(), size.channels(), size.height(), size.width()));
}
template <typename Dtype>
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    int n, int c, int h, int w) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      n, c, h, w));
}
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
{
    int i;
    convolutional_layer l = {0};
    l.type = CONVOLUTIONAL;

    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = pad;
    l.batch_normalize = batch_normalize;

    l.filters = calloc(c*n*size*size, sizeof(float));
    l.filter_updates = calloc(c*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));

    if(binary){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.cfilters = calloc(c*n*size*size, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        l.scale_updates = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
    }

#ifdef GPU
    l.filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size);

    l.biases_gpu = cuda_make_array(l.biases, n);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

    l.scales_gpu = cuda_make_array(l.scales, n);
    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

    if(binary){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    }
    if(xnor){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
        l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
    }

    if(batch_normalize){
        l.mean_gpu = cuda_make_array(l.mean, n);
        l.variance_gpu = cuda_make_array(l.variance, n);

        l.rolling_mean_gpu = cuda_make_array(l.mean, n);
        l.rolling_variance_gpu = cuda_make_array(l.variance, n);

        l.mean_delta_gpu = cuda_make_array(l.mean, n);
        l.variance_delta_gpu = cuda_make_array(l.variance, n);

        l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
    }
#ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.srcTensorDesc);
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnCreateFilterDescriptor(&l.filterDesc);
    cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
    cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
    cudnnCreateFilterDescriptor(&l.dfilterDesc);
    cudnnCreateConvolutionDescriptor(&l.convDesc);

    cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w);
    cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
    cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size);

    cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w);
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
    cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size);

    int padding = l.pad ? l.size/2 : 0;
    cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l.srcTensorDesc,
            l.filterDesc,
            l.convDesc,
            l.dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l.fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l.filterDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l.bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l.bf_algo);
#endif
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h, w, c, n, out_h, out_w, n);

    return l;
}
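/* Illustrative sketch, not from darknet itself: once l.fw_algo has been chosen
 * above, a forward pass typically sizes a scratch workspace for that algorithm
 * and then runs the convolution with it. cudnn_handle(), l.srcTensorDesc,
 * l.filterDesc, l.convDesc, l.dstTensorDesc and the _gpu pointers are assumed
 * to be the fields set up in make_convolutional_layer(); input_gpu stands in
 * for whatever device buffer feeds the layer. */
static void forward_with_workspace_sketch(convolutional_layer l, float *input_gpu, void *workspace, size_t workspace_size)
{
    float one = 1, zero = 0;
    size_t needed = 0;

    /* How much scratch memory does the chosen forward algorithm need? */
    cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.filterDesc,
            l.convDesc,
            l.dstTensorDesc,
            l.fw_algo,
            &needed);
    if(needed > workspace_size) return;   /* caller must provide enough scratch */

    /* y = conv(x, w): alpha = 1, beta = 0 overwrites the output buffer. */
    cudnnConvolutionForward(cudnn_handle(),
            &one,
            l.srcTensorDesc, input_gpu,
            l.filterDesc, l.filters_gpu,
            l.convDesc,
            l.fw_algo,
            workspace, workspace_size,
            &zero,
            l.dstTensorDesc, l.output_gpu);
}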
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
{
    int i;
    convolutional_layer l = {0};
    l.type = CONVOLUTIONAL;

    l.groups = groups;
    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = padding;
    l.batch_normalize = batch_normalize;

    l.weights = calloc(c/groups*n*size*size, sizeof(float));
    l.weight_updates = calloc(c/groups*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    l.nweights = c/groups*n*size*size;
    l.nbiases = n;

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c/l.groups));
    //scale = .02;
    //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
    for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal();
    int out_w = convolutional_out_width(l);
    int out_h = convolutional_out_height(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*l.outputs, sizeof(float));
    l.delta  = calloc(l.batch*l.outputs, sizeof(float));

    l.forward = forward_convolutional_layer;
    l.backward = backward_convolutional_layer;
    l.update = update_convolutional_layer;
    if(binary){
        l.binary_weights = calloc(l.nweights, sizeof(float));
        l.cweights = calloc(l.nweights, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_weights = calloc(l.nweights, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        l.scale_updates = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.mean_delta = calloc(n, sizeof(float));
        l.variance_delta = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
        l.x = calloc(l.batch*l.outputs, sizeof(float));
        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
    }
    if(adam){
        l.m = calloc(l.nweights, sizeof(float));
        l.v = calloc(l.nweights, sizeof(float));
        l.bias_m = calloc(n, sizeof(float));
        l.scale_m = calloc(n, sizeof(float));
        l.bias_v = calloc(n, sizeof(float));
        l.scale_v = calloc(n, sizeof(float));
    }

#ifdef GPU
    l.forward_gpu = forward_convolutional_layer_gpu;
    l.backward_gpu = backward_convolutional_layer_gpu;
    l.update_gpu = update_convolutional_layer_gpu;

    if(gpu_index >= 0){
        if (adam) {
            l.m_gpu = cuda_make_array(l.m, l.nweights);
            l.v_gpu = cuda_make_array(l.v, l.nweights);
            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
        }

        l.weights_gpu = cuda_make_array(l.weights, l.nweights);
        l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);

        l.biases_gpu = cuda_make_array(l.biases, n);
        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

        l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

        if(binary){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
        }
        if(xnor){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
        }

        if(batch_normalize){
            l.mean_gpu = cuda_make_array(l.mean, n);
            l.variance_gpu = cuda_make_array(l.variance, n);

            l.rolling_mean_gpu = cuda_make_array(l.mean, n);
            l.rolling_variance_gpu = cuda_make_array(l.variance, n);

            l.mean_delta_gpu = cuda_make_array(l.mean, n);
            l.variance_delta_gpu = cuda_make_array(l.variance, n);

            l.scales_gpu = cuda_make_array(l.scales, n);
            l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
            l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        }
#ifdef CUDNN
        cudnnCreateTensorDescriptor(&l.normTensorDesc);
        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
        cudnnCreateFilterDescriptor(&l.weightDesc);
        cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
        cudnnCreateFilterDescriptor(&l.dweightDesc);
        cudnnCreateConvolutionDescriptor(&l.convDesc);
        cudnn_convolutional_setup(&l);
#endif
    }
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);

    return l;
}
static PyObject *conv_dfilter_buffers(PyObject *self, PyObject *args)
{
    cudaError_t err;
    cudnnStatus_t status;
    int PAD, gpu_ind, filters_ind, imgs_ind, conv_out_ind, out_ind, stream_ind;

    if (!PyArg_ParseTuple(args, "iiiiiii", &filters_ind, &imgs_ind, &conv_out_ind, &out_ind, &PAD, &stream_ind, &gpu_ind))
        return NULL;

    if(filters_ind >= N_BUFFERS || filters_ind < 0 || imgs_ind >= N_BUFFERS || imgs_ind < 0 ||
            conv_out_ind >= N_BUFFERS || conv_out_ind < 0 || out_ind >= N_BUFFERS || out_ind < 0){
        printf("invalid buffer index\n");
        return NULL;
    }

    if(gpu_ind < 0 || gpu_ind > N_GPUS){
        printf("invalid gpu index %i\n", gpu_ind);
        return NULL;
    }

    if(stream_ind < 0 || stream_ind > N_ALT_STREAMS){
        printf("invalid stream index %i\n", stream_ind);
        return NULL;
    }

    if(data_buffers[gpu_ind][filters_ind] == NULL || data_buffers[gpu_ind][imgs_ind] == NULL ||
            data_buffers[gpu_ind][conv_out_ind] == NULL){
        printf("one or more buffers not initialized on this gpu\n");
        return NULL;
    }

    if(filter_flags[gpu_ind][filters_ind] == 0 || filter_flags[gpu_ind][imgs_ind] == 1 ||
            filter_flags[gpu_ind][conv_out_ind] == 1){
        printf("one or more buffers was not initialized correctly (filters where a tensor was expected, or vice versa)\n");
        return NULL;
    }

    cudaSetDevice(gpu_ind); CHECK_CUDA_ERR
    cudaStreamSynchronize(streams[gpu_ind]); // make sure the inputs are in the buffers first
    cudnnSetStream(handle, alt_streams[gpu_ind][stream_ind]);

    int n_filters = data_dims[0][gpu_ind][filters_ind];
    int n_channels = data_dims[1][gpu_ind][filters_ind];
    int filter_sz = data_dims[2][gpu_ind][filters_ind];

    if(data_buffers[gpu_ind][out_ind] == NULL){ // allocate output
        status = cudnnCreateFilterDescriptor(&desc_filters[gpu_ind][out_ind]); ERR_CHECK
        status = cudnnSetFilterDescriptor(desc_filters[gpu_ind][out_ind], dataType, n_filters, n_channels, filter_sz, filter_sz); ERR_CHECK

        err = cudaMalloc((void**) &data_buffers[gpu_ind][out_ind], n_filters*n_channels*filter_sz*filter_sz * DATA_TYPE_SZ); MALLOC_ERR_CHECK

        data_dims[0][gpu_ind][out_ind] = n_filters;
        data_dims[1][gpu_ind][out_ind] = n_channels;
        data_dims[2][gpu_ind][out_ind] = filter_sz;
        data_dims[3][gpu_ind][out_ind] = filter_sz;

        filter_flags[gpu_ind][out_ind] = 1;
    }else if(filter_flags[gpu_ind][out_ind] == 0 || data_dims[0][gpu_ind][out_ind] != n_filters ||
            data_dims[1][gpu_ind][out_ind] != n_channels || data_dims[2][gpu_ind][out_ind] != filter_sz ||
            data_dims[3][gpu_ind][out_ind] != filter_sz){
        // make sure the output buffer is of the correct size and was created as a filter
        printf("output buffer size does not match the output of this function and/or was initialized as a tensor, %s %i\n", __FILE__, __LINE__);
        return NULL;
    }

    //---------------------------------------
    // Set descriptors
    //---------------------------------------
    status = cudnnSetConvolutionDescriptor(convDesc, desc_buffers[gpu_ind][imgs_ind], desc_filters[gpu_ind][out_ind],
            PAD, PAD, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION); ERR_CHECK

    //---------------------------------------
    // Query output layout
    //---------------------------------------
    int n_imgs_out, n_filters_out, conv_out_sz_x, conv_out_sz_y;
    status = cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD, &n_imgs_out, &n_filters_out,
            &conv_out_sz_x, &conv_out_sz_y); ERR_CHECK

    //--------------------------------------
    // check that the predicted output shape matches the given conv_out buffer
    //--------------------------------------
    if(n_imgs_out*n_filters_out*conv_out_sz_x*conv_out_sz_y != data_dims[0][gpu_ind][conv_out_ind]*data_dims[1][gpu_ind][conv_out_ind]*
            data_dims[2][gpu_ind][conv_out_ind]*data_dims[3][gpu_ind][conv_out_ind]){
        printf("predicted conv output not matching given input %s %i\n", __FILE__, __LINE__);
        printf("%i %i\n", n_imgs_out*n_filters_out*conv_out_sz_x*conv_out_sz_y, data_dims[0][gpu_ind][conv_out_ind]*data_dims[1][gpu_ind][conv_out_ind]*
                data_dims[2][gpu_ind][conv_out_ind]*data_dims[3][gpu_ind][conv_out_ind]);
        printf("%i %i\n", n_imgs_out, data_dims[0][gpu_ind][conv_out_ind]);
        printf("%i %i\n", n_filters_out, data_dims[1][gpu_ind][conv_out_ind]);
        printf("%i %i\n", conv_out_sz_x, data_dims[2][gpu_ind][conv_out_ind]);
        printf("%i %i\n", conv_out_sz_y, data_dims[3][gpu_ind][conv_out_ind]);
        //return NULL;
    }

    //--------------------------------------
    // Convolution
    //--------------------------------------
    status = cudnnConvolutionBackwardFilter(handle, desc_buffers[gpu_ind][imgs_ind], data_buffers[gpu_ind][imgs_ind],
            desc_buffers[gpu_ind][conv_out_ind], data_buffers[gpu_ind][conv_out_ind], convDesc,
            desc_filters[gpu_ind][out_ind], data_buffers[gpu_ind][out_ind], CUDNN_RESULT_NO_ACCUMULATE); ERR_CHECK

    cudnnSetStream(handle, NULL);
    cudaSetDevice(0); CHECK_CUDA_ERR

    Py_INCREF(Py_None);
    return Py_None;
}
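/* Reference sketch, not part of the original extension: with unit dilation,
 * the expected convolution output size in each spatial dimension is
 *   out = (in + 2*pad - filter)/stride + 1   (integer division),
 * which is what the output-dimension query above reports. A helper like this
 * can be used to sanity-check the conv_out buffer dimensions on the host
 * before launching. */
static int conv_out_dim(int in, int filter, int pad, int stride)
{
    return (in + 2*pad - filter)/stride + 1;
}
/* Example: a 32x32 input, 5x5 filter, PAD = 2, stride 1 gives
 * conv_out_dim(32, 5, 2, 1) == 32. */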
THFloatTensor *cudnn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input)
{
    int kW = module->SpatialConvolution.kW;
    int kH = module->SpatialConvolution.kH;
    int dW = module->SpatialConvolution.dW;
    int dH = module->SpatialConvolution.dH;
    int padW = module->SpatialConvolution.padW;
    int padH = module->SpatialConvolution.padH;
    int nInputPlane = module->SpatialConvolution.nInputPlane;
    int nOutputPlane = module->SpatialConvolution.nOutputPlane;
    THFloatTensor *weight = module->SpatialConvolution.weight;
    THFloatTensor *bias = module->SpatialConvolution.bias;
    THFloatTensor *output = module->output;
    int sizes[4];
    int pad[2], filterStride[2], upscale[2];
    cudnnTensorDescriptor_t dinput, dbias, doutput;
    cudnnConvolutionDescriptor_t dconv;
    cudnnFilterDescriptor_t dweight;
    float one = 1, zero = 0;
    size_t reqwssize;
    static void *ws;
    static size_t wssize;
    static const int alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

    pad[0] = padH;
    pad[1] = padW;
    filterStride[0] = dH;
    filterStride[1] = dW;
    upscale[0] = 1;
    upscale[1] = 1;

    if(input->nDimension <= 2)
    {
        // Here we use the SpatialConvolution module to perform a linear transformation
        errcheck(cudnnCreateTensorDescriptor(&dinput));
        if(input->nDimension == 1)
            errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, 1, input->size[0], 1, 1));
        else
            errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, input->size[0], input->size[1], 1, 1));
    } else errcheck(THcudnn_TensorDescriptor(&dinput, input));

    errcheck(cudnnCreateFilterDescriptor(&dweight));
    errcheck(cudnnSetFilter4dDescriptor(dweight, floattype, nOutputPlane, nInputPlane, kH, kW));
    errcheck(cudnnCreateTensorDescriptor(&dbias));
    errcheck(cudnnSetTensor4dDescriptor(dbias, CUDNN_TENSOR_NCHW, floattype, 1, bias->size[0], 1, 1));
    errcheck(cudnnCreateConvolutionDescriptor(&dconv));
    errcheck(cudnnSetConvolutionNdDescriptor(dconv, 2, pad, filterStride, upscale, CUDNN_CROSS_CORRELATION, floattype));
    errcheck(cudnnGetConvolutionNdForwardOutputDim(dconv, dinput, dweight, 4, sizes));
    THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]);
    errcheck(THcudnn_TensorDescriptor(&doutput, output));

    if(alg == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_GEMM ||
        alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)
    {
        errcheck(cudnnGetConvolutionForwardWorkspaceSize(THcudnn_getHandle(), dinput, dweight, dconv, doutput, alg, &reqwssize));
        if(reqwssize > wssize)
        {
            // Grow the cached workspace; free the old buffer so it is not leaked.
            if(ws)
                cudaFree(ws);
            wssize = reqwssize;
            errcheck(cudaMalloc(&ws, reqwssize));
        }
    }

    errcheck(cudnnConvolutionForward(THcudnn_getHandle(), &one, dinput, THFloatTensor_data(input),
        dweight, THFloatTensor_data(weight), dconv, alg, ws, wssize, &zero, doutput, THFloatTensor_data(output)));
    errcheck(cudnnAddTensor_v3(THcudnn_getHandle(), &one, dbias, THFloatTensor_data(bias),
        &one, doutput, THFloatTensor_data(output)));

    cudnnDestroyTensorDescriptor(dinput);
    cudnnDestroyFilterDescriptor(dweight);
    cudnnDestroyTensorDescriptor(dbias);
    cudnnDestroyTensorDescriptor(doutput);
    cudnnDestroyConvolutionDescriptor(dconv);
    return output;
}