Exemplo n.º 1
0
void cudnn_convolutional_setup(layer *l)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 

    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); 
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); 
    #if CUDNN_MAJOR >= 6
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
    #else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    #endif

    #if CUDNN_MAJOR >= 7
    cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
    #else
    if(l->groups > 1){
        error("CUDNN < 7 doesn't support groups, please upgrade!");
    }
    #endif

    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);
}
Exemplo n.º 2
0
static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
		    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  cudnnStatus_t err = cudnnSetFilter4dDescriptor(
    desc, CUDNN_DATA_FLOAT,
    CudaNdarray_HOST_DIMS(var)[0],
    CudaNdarray_HOST_DIMS(var)[1],
    CudaNdarray_HOST_DIMS(var)[2],
    CudaNdarray_HOST_DIMS(var)[3]
    );
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set filter descriptor: %s."
		 " dims= %d %d %d %d",
		 cudnnGetErrorString(err),
		 CudaNdarray_HOST_DIMS(var)[0],
		 CudaNdarray_HOST_DIMS(var)[1],
		 CudaNdarray_HOST_DIMS(var)[2],
		 CudaNdarray_HOST_DIMS(var)[3]);
    return -1;
  }
  return 0;
}
Exemplo n.º 3
0
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs,
    int n_channels, int n_filters, int img_h, int img_w, int filter_h,
    int filter_w, T *convout) {
  bool set_conv_desc = false;
  if (n_imgs != this->n_imgs || n_channels != this->n_channels ||
      img_h != this->img_h || img_w != this->img_w) {
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n_imgs, n_channels,
        img_h, img_w
    ));
    this->n_imgs = n_imgs;
    this->n_channels = n_channels;
    this->img_h = img_h;
    this->img_w = img_w;
    set_conv_desc = true;
  }
  if (n_filters != this->n_filters || n_channels != this->n_channels ||
      filter_h != this->filter_h || filter_w != this->filter_w) {
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filters_desc, CUDNN_DATA_FLOAT, n_filters, n_channels, filter_h,
        filter_w
    ));
    this->n_filters = n_filters;
    this->n_channels = n_channels;
    this->filter_h = filter_h;
    this->filter_w = filter_w;
    set_conv_desc = true;
  }
  if (set_conv_desc) {
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION
    ));
    int n, c, h, w;
    CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(
      conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w
    ));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w
    ));
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, WORKSPACE_LIMIT,
        &fwd_algo
    ));
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        fwd_algo, &workspace_size
    ));

  }
  void *workspace = NULL;
  if (workspace_size > 0) {
    workspace = CUDA::buffer(workspace_size);
  }
  CUDNN_CHECK(cudnnConvolutionForward(
      CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters,
      conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero,
      convout_desc, convout
  ));
}
Exemplo n.º 4
0
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    int n, int c, int h, int w) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
#if CUDNN_VERSION_MIN(5, 0, 0)
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      CUDNN_TENSOR_NCHW, n, c, h, w));
#else
  CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type,
      CUDNN_TENSOR_NCHW, n, c, h, w));
#endif
}
Exemplo n.º 5
0
void cudnn_convolutional_setup(layer *l)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
    //cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);

    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);
}
		void convolution_layer_updater_cuda::updater_configured()
		{
			nnforge_shared_ptr<const convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const convolution_layer>(layer_schema);

			window_sizes = layer_derived->window_sizes;

			zero_padding = layer_derived->left_zero_padding;
			for(int i = 0; i < window_sizes.size(); ++i)
			{
				if (zero_padding[i] != layer_derived->right_zero_padding[i])
					throw neural_network_exception("cuDNN is not able to run convolution when left and right padding sizes don't match");
			}

			cudnn_safe_call(cudnnSetFilter4dDescriptor(
				weights_desc,
				CUDNN_DATA_FLOAT,
				output_configuration_specific.feature_map_count,
				input_configuration_specific.feature_map_count,
				(window_sizes.size() > 1) ? window_sizes[1] : 1,
				window_sizes[0]));

			cudnn_safe_call(cudnnSetTensor4dDescriptor(
				bias_desc,
				CUDNN_TENSOR_NCHW,
				CUDNN_DATA_FLOAT,
				1,
				output_configuration_specific.feature_map_count,
				1,
				1));

			cudnn_safe_call(cudnnSetConvolution2dDescriptor(
				convolution_desc,
				(zero_padding.size() > 1) ? zero_padding[1] : 1,
				zero_padding[0],
				1,
				1,
				1,
				1,
				CUDNN_CROSS_CORRELATION));
		}
Exemplo n.º 7
0
inline void createFilterDesc(cudnnFilterDescriptor_t* desc, Size size) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
          size.num(), size.channels(), size.height(), size.width()));
}
Exemplo n.º 8
0
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
    int n, int c, int h, int w) {
  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
      n, c, h, w));
}
Exemplo n.º 9
0
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs,
    int n_channels, int n_filters, int img_h, int img_w, int filter_h,
    int filter_w, T *convout) {
  bool set_conv_desc = false;
  if (n_imgs != this->n_imgs || n_channels != this->n_channels ||
      img_h != this->img_h || img_w != this->img_w) {
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n_imgs, n_channels,
        img_h, img_w
    ));
    this->n_imgs = n_imgs;
    this->n_channels = n_channels;
    this->img_h = img_h;
    this->img_w = img_w;
    set_conv_desc = true;
  }
  if (n_filters != this->n_filters || n_channels != this->n_channels ||
      filter_h != this->filter_h || filter_w != this->filter_w) {
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filters_desc, CUDNN_DATA_FLOAT, n_filters, n_channels, filter_h,
        filter_w
    ));
    this->n_filters = n_filters;
    this->n_channels = n_channels;
    this->filter_h = filter_h;
    this->filter_w = filter_w;
    set_conv_desc = true;
  }
  if (set_conv_desc) {
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION
    ));
    int n, c, h, w;
    CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(
      conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w
    ));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w
    ));
    const int n_requestedAlgo = 10;
    int n_returnedAlgo;
    cudnnConvolutionFwdAlgoPerf_t fwd_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        n_requestedAlgo, &n_returnedAlgo, fwd_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionFwdAlgoPerf_t found");
    }

    fwd_algo = fwd_algo_perf[0].algo;
    cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
        CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc,
        n_requestedAlgo, &n_returnedAlgo, bwd_data_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionBwdDataAlgoPerf_t found");
    }

    bwd_imgs_algo = bwd_data_algo_perf[0].algo;
    cudnnConvolutionBwdFilterAlgoPerf_t bwd_filters_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
        CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc,
        n_requestedAlgo, &n_returnedAlgo, bwd_filters_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionBwdFilterAlgoPerf_t found");
    }
    bwd_filters_algo = bwd_filters_algo_perf[0].algo;
    size_t fwd_workspace_size;
    size_t bwd_imgs_workspace_size;
    size_t bwd_filters_workspace_size;
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        fwd_algo, &fwd_workspace_size
    ));
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc,
        bwd_imgs_algo, &bwd_imgs_workspace_size
    ));
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc,
        bwd_filters_algo, &bwd_filters_workspace_size
    ));
    workspace_size = std::max(fwd_workspace_size, bwd_imgs_workspace_size);
    workspace_size = std::max(workspace_size, bwd_filters_workspace_size);
  }
  void *workspace = NULL;
  if (workspace_size > 0) {
    workspace = CUDA::buffer(workspace_size);
  }
  CUDNN_CHECK(cudnnConvolutionForward(
      CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters,
      conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero,
      convout_desc, convout
  ));
}
Exemplo n.º 10
0
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->output = realloc(l->output,
            l->batch*out_h * out_w * l->n*sizeof(float));
    l->delta  = realloc(l->delta,
            l->batch*out_h * out_w * l->n*sizeof(float));

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu =     cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
    l->output_gpu =    cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
#ifdef CUDNN
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetFilter4dDescriptor(l->filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
    int padding = l->pad ? l->size/2 : 0;
    cudnnSetConvolution2dDescriptor(l->convDesc, padding, padding, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->filterDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->filterDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
Exemplo n.º 11
0
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
{
    int i;
    convolutional_layer l = {0};
    l.type = CONVOLUTIONAL;

    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = pad;
    l.batch_normalize = batch_normalize;

    l.filters = calloc(c*n*size*size, sizeof(float));
    l.filter_updates = calloc(c*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));

    if(binary){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.cfilters = calloc(c*n*size*size, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        l.scale_updates = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
    }

#ifdef GPU
    l.filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size);

    l.biases_gpu = cuda_make_array(l.biases, n);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);

    l.scales_gpu = cuda_make_array(l.scales, n);
    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

    if(binary){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
    }
    if(xnor){
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
        l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
    }

    if(batch_normalize){
        l.mean_gpu = cuda_make_array(l.mean, n);
        l.variance_gpu = cuda_make_array(l.variance, n);

        l.rolling_mean_gpu = cuda_make_array(l.mean, n);
        l.rolling_variance_gpu = cuda_make_array(l.variance, n);

        l.mean_delta_gpu = cuda_make_array(l.mean, n);
        l.variance_delta_gpu = cuda_make_array(l.variance, n);

        l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
    }
#ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.srcTensorDesc);
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnCreateFilterDescriptor(&l.filterDesc);
    cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
    cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
    cudnnCreateFilterDescriptor(&l.dfilterDesc);
    cudnnCreateConvolutionDescriptor(&l.convDesc);
    cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); 
    cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
    cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); 

    cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); 
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
    cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); 
    int padding = l.pad ? l.size/2 : 0;
    cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l.srcTensorDesc,
            l.filterDesc,
            l.convDesc,
            l.dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l.fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l.filterDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l.bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l.bf_algo);
#endif
#endif
    l.workspace_size = get_workspace_size(l);
    l.activation = activation;

    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);

    return l;
}
Exemplo n.º 12
0
THFloatTensor *cudnn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input)
{
	int kW = module->SpatialConvolution.kW;
	int kH = module->SpatialConvolution.kH;
	int dW = module->SpatialConvolution.dW;
	int dH = module->SpatialConvolution.dH;
	int padW = module->SpatialConvolution.padW;
	int padH = module->SpatialConvolution.padH;
	int nInputPlane  = module->SpatialConvolution.nInputPlane;
	int nOutputPlane = module->SpatialConvolution.nOutputPlane;

	THFloatTensor *weight = module->SpatialConvolution.weight;
	THFloatTensor *bias = module->SpatialConvolution.bias;
	THFloatTensor *output = module->output;

	int sizes[4];
	int pad[2], filterStride[2], upscale[2];
	cudnnTensorDescriptor_t dinput, dbias, doutput;
	cudnnConvolutionDescriptor_t dconv;
	cudnnFilterDescriptor_t dweight;
	float one = 1, zero = 0;
	size_t reqwssize;
	static void *ws;
	static size_t wssize;
	static const int alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

	pad[0] = padH;
	pad[1] = padW;
	filterStride[0] = dH;
	filterStride[1] = dW;
	upscale[0] = 1;
	upscale[1] = 1;

	if(input->nDimension <= 2)
	{
		// Here we use the SpatialConvolution module to perform a linear transformation
		errcheck(cudnnCreateTensorDescriptor(&dinput));
		if(input->nDimension == 1)
			errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, 1, input->size[0], 1, 1));
		else errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, input->size[0], input->size[1], 1, 1));
	} else errcheck(THcudnn_TensorDescriptor(&dinput, input));
	errcheck(cudnnCreateFilterDescriptor(&dweight));
	errcheck(cudnnSetFilter4dDescriptor(dweight, floattype, nOutputPlane, nInputPlane, kH, kW));
	errcheck(cudnnCreateTensorDescriptor(&dbias));
	errcheck(cudnnSetTensor4dDescriptor(dbias, CUDNN_TENSOR_NCHW, floattype, 1, bias->size[0], 1, 1));
	errcheck(cudnnCreateConvolutionDescriptor(&dconv));
	errcheck(cudnnSetConvolutionNdDescriptor(dconv, 2, pad, filterStride, upscale, CUDNN_CROSS_CORRELATION, floattype));
	errcheck(cudnnGetConvolutionNdForwardOutputDim(dconv, dinput, dweight, 4, sizes));
	THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]);
	errcheck(THcudnn_TensorDescriptor(&doutput, output));
	if(alg == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_GEMM ||
		alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)
	{
		errcheck(cudnnGetConvolutionForwardWorkspaceSize(THcudnn_getHandle(), dinput, dweight, dconv, doutput, alg, &reqwssize));
		if(reqwssize > wssize)
		{
			wssize = reqwssize;
			errcheck(cudaMalloc(&ws, reqwssize));
		}			
	}
	errcheck(cudnnConvolutionForward(THcudnn_getHandle(), &one, dinput, THFloatTensor_data(input),
		dweight, THFloatTensor_data(weight), dconv, alg, ws, wssize, &zero,
		doutput, THFloatTensor_data(output)));
	errcheck(cudnnAddTensor_v3(THcudnn_getHandle(), &one, dbias, THFloatTensor_data(bias),
		&one, doutput, THFloatTensor_data(output)));
	cudnnDestroyTensorDescriptor(dinput);
	cudnnDestroyFilterDescriptor(dweight);
	cudnnDestroyTensorDescriptor(dbias);
	cudnnDestroyTensorDescriptor(doutput);
	cudnnDestroyConvolutionDescriptor(dconv);
	return output;
}