예제 #1
0
파일: cudnn.cpp 프로젝트: bebee/cudarray
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs,
    int n_channels, int n_filters, int img_h, int img_w, int filter_h,
    int filter_w, T *convout) {
  bool set_conv_desc = false;
  if (n_imgs != this->n_imgs || n_channels != this->n_channels ||
      img_h != this->img_h || img_w != this->img_w) {
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n_imgs, n_channels,
        img_h, img_w
    ));
    this->n_imgs = n_imgs;
    this->n_channels = n_channels;
    this->img_h = img_h;
    this->img_w = img_w;
    set_conv_desc = true;
  }
  if (n_filters != this->n_filters || n_channels != this->n_channels ||
      filter_h != this->filter_h || filter_w != this->filter_w) {
    CUDNN_CHECK(cudnnSetFilter4dDescriptor(
        filters_desc, CUDNN_DATA_FLOAT, n_filters, n_channels, filter_h,
        filter_w
    ));
    this->n_filters = n_filters;
    this->n_channels = n_channels;
    this->filter_h = filter_h;
    this->filter_w = filter_w;
    set_conv_desc = true;
  }
  if (set_conv_desc) {
    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
        conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION
    ));
    int n, c, h, w;
    CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim(
      conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w
    ));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(
        convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w
    ));
    const int n_requestedAlgo = 10;
    int n_returnedAlgo;
    cudnnConvolutionFwdAlgoPerf_t fwd_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        n_requestedAlgo, &n_returnedAlgo, fwd_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionFwdAlgoPerf_t found");
    }

    fwd_algo = fwd_algo_perf[0].algo;
    cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
        CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc,
        n_requestedAlgo, &n_returnedAlgo, bwd_data_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionBwdDataAlgoPerf_t found");
    }

    bwd_imgs_algo = bwd_data_algo_perf[0].algo;
    cudnnConvolutionBwdFilterAlgoPerf_t bwd_filters_algo_perf[n_requestedAlgo];
    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
        CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc,
        n_requestedAlgo, &n_returnedAlgo, bwd_filters_algo_perf
    ));
    if (n_returnedAlgo == 0) {
      throw std::runtime_error("No cudnnConvolutionBwdFilterAlgoPerf_t found");
    }
    bwd_filters_algo = bwd_filters_algo_perf[0].algo;
    size_t fwd_workspace_size;
    size_t bwd_imgs_workspace_size;
    size_t bwd_filters_workspace_size;
    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
        CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc,
        fwd_algo, &fwd_workspace_size
    ));
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
        CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc,
        bwd_imgs_algo, &bwd_imgs_workspace_size
    ));
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc,
        bwd_filters_algo, &bwd_filters_workspace_size
    ));
    workspace_size = std::max(fwd_workspace_size, bwd_imgs_workspace_size);
    workspace_size = std::max(workspace_size, bwd_filters_workspace_size);
  }
  void *workspace = NULL;
  if (workspace_size > 0) {
    workspace = CUDA::buffer(workspace_size);
  }
  CUDNN_CHECK(cudnnConvolutionForward(
      CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters,
      conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero,
      convout_desc, convout
  ));
}
예제 #2
0
파일: dnn_gw.c 프로젝트: aalmah/Theano
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
                        CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **kerns) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                   "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  int nb_dim = CudaNdarray_NDIM(output);

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
    return 1;
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
    return 1;
#endif

  if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdFilterAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {

      // A new convolution implementation should be selected, based either on
      // timing or heuristics, if in one of the two following cases :
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation of none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the the kernels and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
            same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                            APPLY_SPECIFIC(previous_input_shape)[i]);
            same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                            APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously choosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the input and output
        // shapes. Either by choosing one according to heuristics or by making
        // CuDNN time every implementation and choose the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdFilterAlgoPerf_t choosen_algo_perf;
          err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
                                                            APPLY_SPECIFIC(input),
                                                            APPLY_SPECIFIC(output),
                                                            desc,
                                                            APPLY_SPECIFIC(kerns),
                                                            requestedCount,
                                                            &count,
                                                            &choosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: "
                         "%s", cudnnGetErrorString(err));
            return 1;
          }

          chosen_algo = choosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on the
          // shapes of the inputs and the amount of memory available.

          // Get the amount of available memory
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess){
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
                                                           APPLY_SPECIFIC(input),
                                                           APPLY_SPECIFIC(output),
                                                           desc,
                                                           APPLY_SPECIFIC(kerns),
                                                           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                                                           free,
                                                           &chosen_algo);

          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the inputs and kernels as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
            APPLY_SPECIFIC(previous_input_shape)[i] =
                                            CudaNdarray_HOST_DIMS(input)[i];
            APPLY_SPECIFIC(previous_output_shape)[i] =
                                            CudaNdarray_HOST_DIMS(output)[i];
        }

      }
      else
      {
        // Reuse the previously chosen convlution implementation
        chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
      }
    }
    else
    {
        chosen_algo = CONV_ALGO;
    }

    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used
    // on the current data and default on a safe implementation if it
    // can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4)
    {

      // Extract the properties of the convolution descriptor
      int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
      cudnnConvolutionMode_t mode;
      err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
                                            &stride_v, &stride_h,
                                            &upscale_x, &upscale_y,
                                            &mode);

      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradW: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial size of the filters
      int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3];

      // Extract the spatial size of the input
      int input_h = CudaNdarray_HOST_DIMS(input)[2];
      int input_w = CudaNdarray_HOST_DIMS(input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
          input_w > 1024 || (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      }
    }

    // Infer required workspace size from the chosen implementation
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
                                                         APPLY_SPECIFIC(input),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(kerns),
                                                         chosen_algo,
                                                         &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradW: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Allocate workspace for the convolution
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    // Perform the convolution
    err = cudnnConvolutionBackwardFilter_v3(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));

  }
#else
  err = cudnnConvolutionBackwardFilter(
    _handle,
    (void *)&alpha,
    APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
    APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
    desc,
    (void *)&beta,
    APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
예제 #3
0
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
#endif

  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;

    err = cudnnFindConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess){
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                   "information on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif

#endif

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
      PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(kerns), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
      cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardFilter_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}