Example #1
ConvBC01CuDNN<T>::ConvBC01CuDNN(int pad_y, int pad_x, int stride_y,
    int stride_x) : pad_y(pad_y), pad_x(pad_x), stride_y(stride_y),
    stride_x(stride_x), n_imgs(0), n_channels(0), n_filters(0), img_h(0),
    img_w(0), filter_h(0), filter_w(0), workspace_size(0) {
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&imgs_desc));
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&convout_desc));
  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filters_desc));
  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
}
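Every CUDNN_CHECK call site in these examples relies on an error-checking wrapper that the snippets themselves do not show. A minimal sketch of such a macro, assuming the usual Caffe-style convention (the project's actual wrapper may differ):

#include <cstdio>
#include <cstdlib>
#include <cudnn.h>

// Minimal sketch: abort with the cuDNN error string if the wrapped call
// does not return CUDNN_STATUS_SUCCESS.
#define CUDNN_CHECK(call)                                              \
  do {                                                                 \
    cudnnStatus_t status = (call);                                     \
    if (status != CUDNN_STATUS_SUCCESS) {                              \
      std::fprintf(stderr, "cuDNN error: %s\n",                        \
                   cudnnGetErrorString(status));                       \
      std::abort();                                                    \
    }                                                                  \
  } while (0)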
Example #2
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    int n, int c, int h, int w) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
#if CUDNN_VERSION_MIN(5, 0, 0)
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      CUDNN_TENSOR_NCHW, n, c, h, w));
#else
  CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type,
      CUDNN_TENSOR_NCHW, n, c, h, w));
#endif
}
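The CUDNN_VERSION_MIN guard above is a Caffe convention, not part of cuDNN itself; it is built from the CUDNN_VERSION constant shipped in cudnn.h, roughly as:

// Assumed definition (Caffe's device_alternate.hpp uses this form): true
// when the installed cuDNN is at least major.minor.patch.
#define CUDNN_VERSION_MIN(major, minor, patch) \
    (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))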
Example #3
convolution_layer_updater_cuda::convolution_layer_updater_cuda()
	: input_data_desc(0)
	, output_data_desc(0)
	, weights_desc(0)
	, convolution_desc(0)
	, bias_desc(0)
{
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc));
	cudnn_safe_call(cudnnCreateFilterDescriptor(&weights_desc));
	cudnn_safe_call(cudnnCreateConvolutionDescriptor(&convolution_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc));
}
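Each cudnnCreate*Descriptor call in this constructor acquires a resource, so a matching destructor is implied even though it is not shown. A hypothetical teardown would release the descriptors with the corresponding destroy calls:

// Hypothetical matching destructor (not in the original listing): release
// every descriptor created in the constructor.
convolution_layer_updater_cuda::~convolution_layer_updater_cuda()
{
	cudnnDestroyTensorDescriptor(bias_desc);
	cudnnDestroyConvolutionDescriptor(convolution_desc);
	cudnnDestroyFilterDescriptor(weights_desc);
	cudnnDestroyTensorDescriptor(output_data_desc);
	cudnnDestroyTensorDescriptor(input_data_desc);
}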
Example #4
static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateFilterDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  if (c_set_filter(var, *desc, 1) != 0) {
    cudnnDestroyFilterDescriptor(*desc);
    return -1;
  }
  return 0;
}
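Note the ownership contract this example establishes: on success the caller owns *desc and must destroy it; on failure the function has already destroyed the descriptor and set a Python exception. A hypothetical call site (var supplied from elsewhere):

/* Hypothetical caller sketch. */
cudnnFilterDescriptor_t fdesc;
if (c_make_filter(var, &fdesc) != 0)
    return NULL;  /* Python error already set by c_make_filter */
/* ... use fdesc in cuDNN calls ... */
cudnnDestroyFilterDescriptor(fdesc);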
Example #5
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    const int_tp num_spatial_dims,
    const int_tp n, const int_tp c, const int_tp* shape) {

  std::vector<int> shape_int(num_spatial_dims + 2);

  shape_int[0] = n;
  shape_int[1] = c;

  for (int_tp i = 0; i < num_spatial_dims; ++i) {
    shape_int[2+i] = shape[i];
  }

  const int* shape_ptr = &shape_int[0];

  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilterNdDescriptor(*desc, dataType<Dtype>::type,
                                         num_spatial_dims + 2,
                                         shape_ptr));
}
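Unlike the 4-d variants, this helper takes only the spatial extents in shape and prepends n and c itself. A hypothetical call for a volumetric (3-d) filter bank, assuming the surrounding template parameter Dtype is float:

// Hypothetical call site: 64 filters over 3 channels with a 3x3x3 kernel,
// so cuDNN receives the 5 dims {64, 3, 3, 3, 3} = {n, c, d, h, w}.
cudnnFilterDescriptor_t desc;
const int_tp kernel_shape[] = {3, 3, 3};
createFilterDesc<float>(&desc, 3, 64, 3, kernel_shape);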
Example #6
inline void createFilterDesc(cudnnFilterDescriptor_t* desc, Size size) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
          size.num(), size.channels(), size.height(), size.width()));
}
Example #7
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    int n, int c, int h, int w) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      n, c, h, w));
}
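Examples #6 and #7 (and the Nd variant in Example #5) use the pre-v5 cuDNN signature, which had no tensor-format parameter. For reference, the cuDNN v5+ spelling of the same 4-d call, assuming the default NCHW layout, is exactly what Example #2 already shows behind its version guard:

// cuDNN v5+ form of the call above.
CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
    CUDNN_TENSOR_NCHW, n, c, h, w));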
Example #8
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
{
    int i;
    convolutional_layer l = {0};
    l.type = CONVOLUTIONAL;

    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = pad;
    l.batch_normalize = batch_normalize;

    l.filters = calloc(c*n*size*size, sizeof(float));
    l.filter_updates = calloc(c*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));

    if(binary){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.cfilters = calloc(c*n*size*size, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        l.scale_updates = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
    }

#ifdef GPU
    l.filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size);

    l.biases_gpu = cuda_make_array(l.biases, n);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

    l.scales_gpu = cuda_make_array(l.scales, n);
    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

    if(binary){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    }
    if(xnor){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
        l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
    }

    if(batch_normalize){
        l.mean_gpu = cuda_make_array(l.mean, n);
        l.variance_gpu = cuda_make_array(l.variance, n);

        l.rolling_mean_gpu = cuda_make_array(l.mean, n);
        l.rolling_variance_gpu = cuda_make_array(l.variance, n);

        l.mean_delta_gpu = cuda_make_array(l.mean, n);
        l.variance_delta_gpu = cuda_make_array(l.variance, n);

        l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
    }
#ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.srcTensorDesc);
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnCreateFilterDescriptor(&l.filterDesc);
    cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
    cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
    cudnnCreateFilterDescriptor(&l.dfilterDesc);
    cudnnCreateConvolutionDescriptor(&l.convDesc);
    cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); 
    cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
    cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); 

    cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); 
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
    cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); 
    int padding = l.pad ? l.size/2 : 0;
    cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l.srcTensorDesc,
            l.filterDesc,
            l.convDesc,
            l.dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l.fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l.filterDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l.bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l.bf_algo);
#endif
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);

    return l;
}
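get_workspace_size is not reproduced in this listing. On darknet's CPU path it sizes the workspace for one image's im2col column buffer; a hedged sketch of that branch (the cuDNN branch instead takes the maximum of the cudnnGetConvolution*WorkspaceSize queries for the three algorithms chosen above):

/* Sketch of the non-cuDNN branch of darknet's get_workspace_size. */
static size_t get_workspace_size_sketch(convolutional_layer l)
{
    return (size_t)l.out_h * l.out_w * l.size * l.size * l.c * sizeof(float);
}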
Example #9
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
{
    int i;
    convolutional_layer l = {0};
    l.type = CONVOLUTIONAL;

    l.groups = groups;
    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = padding;
    l.batch_normalize = batch_normalize;

    l.weights = calloc(c/groups*n*size*size, sizeof(float));
    l.weight_updates = calloc(c/groups*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    l.nweights = c/groups*n*size*size;
    l.nbiases = n;

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c/l.groups));
    //scale = .02;
    //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
    for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal();
    int out_w = convolutional_out_width(l);
    int out_h = convolutional_out_height(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*l.outputs, sizeof(float));
    l.delta  = calloc(l.batch*l.outputs, sizeof(float));

    l.forward = forward_convolutional_layer;
    l.backward = backward_convolutional_layer;
    l.update = update_convolutional_layer;
    if(binary){
        l.binary_weights = calloc(l.nweights, sizeof(float));
        l.cweights = calloc(l.nweights, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_weights = calloc(l.nweights, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        l.scale_updates = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.mean_delta = calloc(n, sizeof(float));
        l.variance_delta = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
        l.x = calloc(l.batch*l.outputs, sizeof(float));
        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
    }
    if(adam){
        l.m = calloc(l.nweights, sizeof(float));
        l.v = calloc(l.nweights, sizeof(float));
        l.bias_m = calloc(n, sizeof(float));
        l.scale_m = calloc(n, sizeof(float));
        l.bias_v = calloc(n, sizeof(float));
        l.scale_v = calloc(n, sizeof(float));
    }

#ifdef GPU
    l.forward_gpu = forward_convolutional_layer_gpu;
    l.backward_gpu = backward_convolutional_layer_gpu;
    l.update_gpu = update_convolutional_layer_gpu;

    if(gpu_index >= 0){
        if (adam) {
            l.m_gpu = cuda_make_array(l.m, l.nweights);
            l.v_gpu = cuda_make_array(l.v, l.nweights);
            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
        }

        l.weights_gpu = cuda_make_array(l.weights, l.nweights);
        l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);

        l.biases_gpu = cuda_make_array(l.biases, n);
        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

        l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

        if(binary){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
        }
        if(xnor){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
        }

        if(batch_normalize){
            l.mean_gpu = cuda_make_array(l.mean, n);
            l.variance_gpu = cuda_make_array(l.variance, n);

            l.rolling_mean_gpu = cuda_make_array(l.mean, n);
            l.rolling_variance_gpu = cuda_make_array(l.variance, n);

            l.mean_delta_gpu = cuda_make_array(l.mean, n);
            l.variance_delta_gpu = cuda_make_array(l.variance, n);

            l.scales_gpu = cuda_make_array(l.scales, n);
            l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
            l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        }
#ifdef CUDNN
        cudnnCreateTensorDescriptor(&l.normTensorDesc);
        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
        cudnnCreateFilterDescriptor(&l.weightDesc);
        cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
        cudnnCreateFilterDescriptor(&l.dweightDesc);
        cudnnCreateConvolutionDescriptor(&l.convDesc);
        cudnn_convolutional_setup(&l);
#endif
    }
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    //fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);

    return l;
}
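A hypothetical call constructing a typical detection-network layer with this newer, grouped signature (LEAKY is one of darknet's ACTIVATION values):

/* Hypothetical call: batch of 4 RGB 416x416 images, 64 filters, 1 group,
 * 3x3 kernel, stride 1, 1 pixel of padding, batch norm on,
 * no binary/xnor/adam. */
convolutional_layer l = make_convolutional_layer(4, 416, 416, 3, 64, 1,
                                                 3, 1, 1, LEAKY, 1, 0, 0, 0);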
Example #10
static PyObject *conv_dfilter_buffers(PyObject *self, PyObject *args)  {
	cudaError_t err;
	cudnnStatus_t status;
	
	int PAD, gpu_ind, filters_ind, imgs_ind, conv_out_ind, out_ind, stream_ind;
	
	if (!PyArg_ParseTuple(args, "iiiiiii", &filters_ind, &imgs_ind, &conv_out_ind, &out_ind, &PAD, &stream_ind, &gpu_ind)) 
		return NULL;
	
	if(filters_ind >= N_BUFFERS || filters_ind < 0 || imgs_ind >= N_BUFFERS || imgs_ind < 0 || 
		conv_out_ind >= N_BUFFERS || conv_out_ind < 0 || out_ind >= N_BUFFERS || out_ind < 0){
		printf("invalid buffer index\n");
		return NULL;
	}
	
	if(gpu_ind < 0 || gpu_ind >= N_GPUS){
		printf("invalid gpu index %i\n", gpu_ind);
		return NULL;
	}
	
	if(stream_ind < 0 || stream_ind >= N_ALT_STREAMS){
		printf("invalid stream index %i\n", stream_ind);
		return NULL;
	}
	
	if(data_buffers[gpu_ind][filters_ind] == NULL || data_buffers[gpu_ind][imgs_ind] == NULL || 
		data_buffers[gpu_ind][conv_out_ind] == NULL){
			printf("one or more buffers not initialized on this gpu\n");
			return NULL;
	}
	
	if(filter_flags[gpu_ind][filters_ind] == 0 || filter_flags[gpu_ind][imgs_ind] == 1 ||
		filter_flags[gpu_ind][conv_out_ind] == 1){
			printf("one or more buffers was not initialized correctly, filters when should be tensor or vice versa\n");
			return NULL;
	}
	
	cudaSetDevice(gpu_ind); CHECK_CUDA_ERR
	cudaStreamSynchronize(streams[gpu_ind]); // make sure the inputs are in the buffers first
	cudnnSetStream(handle, alt_streams[gpu_ind][stream_ind]);
		
	int n_filters = data_dims[0][gpu_ind][filters_ind];
	int n_channels = data_dims[1][gpu_ind][filters_ind];
	int filter_sz = data_dims[2][gpu_ind][filters_ind];
	
	if(data_buffers[gpu_ind][out_ind] == NULL){ // allocate output
		status = cudnnCreateFilterDescriptor(&desc_filters[gpu_ind][out_ind]);  ERR_CHECK
		status = cudnnSetFilterDescriptor(desc_filters[gpu_ind][out_ind], dataType, n_filters, n_channels, filter_sz, filter_sz);  ERR_CHECK
		err = cudaMalloc((void**) &data_buffers[gpu_ind][out_ind], n_filters*n_channels*filter_sz*filter_sz * DATA_TYPE_SZ); MALLOC_ERR_CHECK
		
		data_dims[0][gpu_ind][out_ind] = n_filters;
		data_dims[1][gpu_ind][out_ind] = n_channels;
		data_dims[2][gpu_ind][out_ind] = filter_sz;
		data_dims[3][gpu_ind][out_ind] = filter_sz;
		
		filter_flags[gpu_ind][out_ind] = 1;
	}else if(filter_flags[gpu_ind][out_ind] == 0 || data_dims[0][gpu_ind][out_ind] != n_filters || 
		data_dims[1][gpu_ind][out_ind] != n_channels || data_dims[2][gpu_ind][out_ind] != filter_sz || 
		data_dims[3][gpu_ind][out_ind] != filter_sz){ // make sure output buffer is of correct size
			printf("output buffer size is not matching output of this function and/or initialized as a tensor, %s %i\n", __FILE__, __LINE__);
			return NULL;
	}
	
	//---------------------------------------
	// Set descriptors
	//---------------------------------------
	status = cudnnSetConvolutionDescriptor(convDesc, desc_buffers[gpu_ind][imgs_ind], desc_filters[gpu_ind][out_ind], PAD, PAD, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION);  ERR_CHECK

	//---------------------------------------
	// Query output layout
	//---------------------------------------
	int n_imgs_out, n_filters_out, conv_out_sz_x, conv_out_sz_y;
	status = cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD, &n_imgs_out, &n_filters_out, &conv_out_sz_x, &conv_out_sz_y);    ERR_CHECK

	//--------------------------------------
	// set filter and image values
	//--------------------------------------
	if(n_imgs_out*n_filters_out*conv_out_sz_x*conv_out_sz_y != data_dims[0][gpu_ind][conv_out_ind]*data_dims[1][gpu_ind][conv_out_ind]*
		data_dims[2][gpu_ind][conv_out_ind]*data_dims[3][gpu_ind][conv_out_ind]){
		printf("predicted conv output not matching given input %s %i\n", __FILE__, __LINE__);
		printf("%i %i\n", n_imgs_out*n_filters_out*conv_out_sz_x*conv_out_sz_x, data_dims[0][gpu_ind][conv_out_ind]*data_dims[1][gpu_ind][conv_out_ind]*
		data_dims[2][gpu_ind][conv_out_ind]*data_dims[3][gpu_ind][conv_out_ind]);
		printf("%i %i\n", n_imgs_out, data_dims[0][gpu_ind][conv_out_ind]);
		printf("%i %i\n", n_filters_out, data_dims[1][gpu_ind][conv_out_ind]);
		printf("%i %i\n", conv_out_sz_x, data_dims[2][gpu_ind][conv_out_ind]);
		printf("%i %i\n", conv_out_sz_y, data_dims[3][gpu_ind][conv_out_ind]);
		//return NULL;
	}
	
	//--------------------------------------
	// Convolution
	//--------------------------------------
	status = cudnnConvolutionBackwardFilter(handle, desc_buffers[gpu_ind][imgs_ind], data_buffers[gpu_ind][imgs_ind],
		desc_buffers[gpu_ind][conv_out_ind], data_buffers[gpu_ind][conv_out_ind], convDesc, 
		desc_filters[gpu_ind][out_ind], data_buffers[gpu_ind][out_ind], CUDNN_RESULT_NO_ACCUMULATE);  ERR_CHECK

	cudnnSetStream(handle, NULL);
	cudaSetDevice(0); CHECK_CUDA_ERR
	
	Py_INCREF(Py_None);
	return Py_None;
}
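This example targets the legacy cuDNN R1 API: cudnnSetFilterDescriptor, cudnnGetOutputTensor4dDim, and the CUDNN_RESULT_NO_ACCUMULATE accumulate flag were all removed in later releases. In cuDNN v5+ the same backward-filter step is expressed with alpha/beta blending scalars plus an explicit algorithm and workspace; a hedged sketch, with all descriptor and buffer names as placeholders:

/* Modern (cuDNN v5+) equivalent of the backward-filter call above;
 * beta = 0 reproduces CUDNN_RESULT_NO_ACCUMULATE. */
const float alpha = 1.0f, beta = 0.0f;
cudnnConvolutionBackwardFilter(handle, &alpha,
	srcDesc, srcData,            /* input images */
	diffDesc, diffData,          /* gradient w.r.t. conv output */
	convDesc,
	algo, workspace, workspaceSize,
	&beta, dwDesc, dwData);      /* gradient w.r.t. filters */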
Example #11
THFloatTensor *cudnn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input)
{
	int kW = module->SpatialConvolution.kW;
	int kH = module->SpatialConvolution.kH;
	int dW = module->SpatialConvolution.dW;
	int dH = module->SpatialConvolution.dH;
	int padW = module->SpatialConvolution.padW;
	int padH = module->SpatialConvolution.padH;
	int nInputPlane  = module->SpatialConvolution.nInputPlane;
	int nOutputPlane = module->SpatialConvolution.nOutputPlane;

	THFloatTensor *weight = module->SpatialConvolution.weight;
	THFloatTensor *bias = module->SpatialConvolution.bias;
	THFloatTensor *output = module->output;

	int sizes[4];
	int pad[2], filterStride[2], upscale[2];
	cudnnTensorDescriptor_t dinput, dbias, doutput;
	cudnnConvolutionDescriptor_t dconv;
	cudnnFilterDescriptor_t dweight;
	float one = 1, zero = 0;
	size_t reqwssize;
	static void *ws;
	static size_t wssize;
	static const int alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

	pad[0] = padH;
	pad[1] = padW;
	filterStride[0] = dH;
	filterStride[1] = dW;
	upscale[0] = 1;
	upscale[1] = 1;

	if(input->nDimension <= 2)
	{
		// Here we use the SpatialConvolution module to perform a linear transformation
		errcheck(cudnnCreateTensorDescriptor(&dinput));
		if(input->nDimension == 1)
			errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, 1, input->size[0], 1, 1));
		else errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, input->size[0], input->size[1], 1, 1));
	} else errcheck(THcudnn_TensorDescriptor(&dinput, input));
	errcheck(cudnnCreateFilterDescriptor(&dweight));
	errcheck(cudnnSetFilter4dDescriptor(dweight, floattype, nOutputPlane, nInputPlane, kH, kW));
	errcheck(cudnnCreateTensorDescriptor(&dbias));
	errcheck(cudnnSetTensor4dDescriptor(dbias, CUDNN_TENSOR_NCHW, floattype, 1, bias->size[0], 1, 1));
	errcheck(cudnnCreateConvolutionDescriptor(&dconv));
	errcheck(cudnnSetConvolutionNdDescriptor(dconv, 2, pad, filterStride, upscale, CUDNN_CROSS_CORRELATION, floattype));
	errcheck(cudnnGetConvolutionNdForwardOutputDim(dconv, dinput, dweight, 4, sizes));
	THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]);
	errcheck(THcudnn_TensorDescriptor(&doutput, output));
	if(alg == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_GEMM ||
		alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)
	{
		errcheck(cudnnGetConvolutionForwardWorkspaceSize(THcudnn_getHandle(), dinput, dweight, dconv, doutput, alg, &reqwssize));
		if(reqwssize > wssize)
		{
			wssize = reqwssize;
			if(ws)
				errcheck(cudaFree(ws)); /* release the old, smaller workspace before growing */
			errcheck(cudaMalloc(&ws, reqwssize));
		}
	}
	errcheck(cudnnConvolutionForward(THcudnn_getHandle(), &one, dinput, THFloatTensor_data(input),
		dweight, THFloatTensor_data(weight), dconv, alg, ws, wssize, &zero,
		doutput, THFloatTensor_data(output)));
	errcheck(cudnnAddTensor_v3(THcudnn_getHandle(), &one, dbias, THFloatTensor_data(bias),
		&one, doutput, THFloatTensor_data(output)));
	cudnnDestroyTensorDescriptor(dinput);
	cudnnDestroyFilterDescriptor(dweight);
	cudnnDestroyTensorDescriptor(dbias);
	cudnnDestroyTensorDescriptor(doutput);
	cudnnDestroyConvolutionDescriptor(dconv);
	return output;
}
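One portability note on this example: cudnnAddTensor_v3 is a cuDNN v3-era name; in cuDNN v5 and later the same alpha/beta bias addition is spelled simply cudnnAddTensor:

/* cuDNN v5+ spelling of the bias addition above (same semantics). */
errcheck(cudnnAddTensor(THcudnn_getHandle(), &one, dbias,
	THFloatTensor_data(bias), &one, doutput, THFloatTensor_data(output)));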