Ejemplo n.º 1
0
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
		    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }

  int dims[8];
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Filters can't be less than 3d so we pad */
  for (unsigned int i = nd; i < 3; i++)
    dims[i] = 1;
  dims[0] = dims[0] / groups;

  if (nd < 3)
    nd = 3;

    err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set filter descriptor: %s.",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
Ejemplo n.º 2
0
static int
c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Tensors can't be smaller than 3d for cudnn so we pad the
   * descriptor if they are */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }
  //only for grouped convolution i.e when groups > 1
  dims[1] = dims[1] / groups;
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set tensorNd descriptor: %s",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
Ejemplo n.º 3
0
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  cudnnDataType_t dt;
  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
		    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }

  int dims[5];
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set filter descriptor: %s.",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
Ejemplo n.º 4
0
static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[5], dims[5], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = PyGpuArray_STRIDE(var, i) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set tensorNd descriptor: %s",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
Ejemplo n.º 5
0
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
                         PyGpuArrayObject **output,
                         PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
		    "images and kernel must have the same stack size");
    return 1;
  }
  if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
		    "Number of filters must be divisible by number of groups");
    return 1;
  }

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

  if (params->inplace) {
    Py_XDECREF(*output);
    *output = om;
    Py_INCREF(*output);
  } else {
    if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                           om->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*output, om))
      return 1;
  }

  if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
    int err2 = GpuArray_memset(&(*output)->ga, 0);
    if (err2 != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConv could not fill the output with zeros: %d", err2);
        return 1;
    }
    return 0;
  }

  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
    return 1;
  size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
  size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;

  cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
  #ifdef DEBUG
  char algorithm_name[128];
  #endif

  cuda_enter(c->ctx);

  if (params->choose_algo) {
    if (!params->choose_once) {
      reuse_algo = 1;
      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(input, i) == prev_img_dims[i]);
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      }
    }

    if (!reuse_algo) {
      size_t free;

      int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
      if (err2 != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                     "memory information on the GPU");
        cuda_exit(c->ctx);
        return 1;
      }

      // Guess 4Mb if the info is not available
      if (free == 0) free = 4 * 1024 * 1024;

      if (params->choose_time) {
        int count;
        cudnnConvolutionFwdAlgoPerf_t choice;
        gpudata *tmpmem;

        tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          return -1;
        }
        // We don't sync the buffer as we don't care about the values.
        err = cudnnFindConvolutionForwardAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
          1, &count, &choice, *(void **)tmpmem,
          free);
        gpudata_release(tmpmem);

        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
        algo = choice.algo;

        #ifdef DEBUG
        if (count == 0) {
            PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
            return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "error getting best-timed FWD algo: %s",
                         cudnnGetErrorString(choice.status));
            return 1;
        } // Else, count is necessarly 1 for current implementation.
        #endif

      } else {
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
          desc, APPLY_SPECIFIC(output),
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            cuda_exit(c->ctx);
            return 1;
          }
      }
      prev_algo = algo;
    } else {
      algo = prev_algo;
    }

    #ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
    // NB: This is printed only when algorithm is chosen at runtime.
    if (reuse_algo)
        fprintf(stderr, "(reused %s)\n", algorithm_name);
    else
        fprintf(stderr, "(using %s)\n", algorithm_name);
    #endif

    if (params->choose_once) {
      reuse_algo = 1;
    } else {
      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
        prev_img_dims[i] = PyGpuArray_DIM(input, i);
        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      }
    }
  }

  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
  {
    #ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
    #endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  if (cudnnGetVersion() < 6100 &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536)
  {
    #ifdef DEBUG
    fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
                    "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
    #endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
  TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
  New conditions apply and may depend on number of dimensions (2D or 3D)
  e.g. for FFT_TILING.
  TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                             dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  {
    size_t worksize;
    gpudata *workspace;
    err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);

    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported

      #ifdef DEBUG
      if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
      fprintf(stderr, "(%s error getting worksize: "
                      "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
      #endif

      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

      err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    algo,
                                                    &worksize);
    }

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    for ( int g = 0; g < params->num_groups; g++) {
    err = cudnnConvolutionForward(
      params->handle,
      alpha_p,
      APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
      APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
    }

    if (worksize != 0)
      gpudata_release(workspace);

    cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  }
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
		 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
Ejemplo n.º 6
0
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
    if (PyGpuArray_NDIM(bottom) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3],
                PyGpuArray_STRIDES(bottom)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(weight) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3],
                PyGpuArray_STRIDES(weight)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(top) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3],
                PyGpuArray_STRIDES(top)[4]);
        return NULL;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, slices)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    const size_t kD = PyGpuArray_DIMS(weight)[4];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorr3dMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    const size_t dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
    const size_t topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    const size_t topDepth  = (bottomDepth + 2*padD - dil_kD) / dD + 1;
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3] ||
            topDepth != PyGpuArray_DIMS(top)[4]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        return NULL;
    }

    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }

    // Get the max threads per blocks
    size_t max_threads_dim;
    err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
    if (err != GA_NO_ERROR){
        PyErr_Format(PyExc_RuntimeError,
                     "Could not fetch max_threads_dim.");
        return NULL;
    }

    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
    col_dim[1] = topHeight * topWidth * topDepth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }

    // Define some useful variables
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
    const DTYPE_INPUT_0 one = 1.0f;
    const DTYPE_INPUT_0 zero = 0.0f;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im3d2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(max_threads_dim,
                           bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                           bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                           padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                N_, M_, K_, one,
                                col->ga.data, 0, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                top->ga.data, n * top_stride, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // valid convolution: im3col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(max_threads_dim,
                           bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                           bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                           padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                K_, M_, N_, one,
                                col->ga.data, 0, N_,
                                top->ga.data, n * top_stride, N_,
                                (n == 0) ? zero : one,
                                weight->ga.data, 0, K_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, one,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                col->ga.data, 0, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
            // col2im3d back to the data
            err = col2im3d(max_threads_dim,
                           col->ga.data, nChannels,
                           bottomHeight, bottomWidth, bottomDepth,
                           kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                           dH, dW, dD, bottom->ga.data, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    // Free temporary columns
    Py_DECREF(col);

    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
Ejemplo n.º 7
0
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolGrad: rank error");
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolGrad: failed to allocate memory");
      return 1;
    }

  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }

    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);

    if (ndims == 2) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], INC_PAD, SUM_MODE,
                                         (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
Ejemplo n.º 8
0
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
#endif

  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;

    err = cudnnFindConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess){
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                   "information on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif

#endif

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
      PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(kerns), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
      cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardFilter_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
Ejemplo n.º 9
0
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
                              PyGpuArrayObject **S,
                              PyGpuArrayObject **U, // may be NULL
                              PyGpuArrayObject **VT, // may be NULL
                              PARAMS_TYPE* params) {
  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaMatrixInverse: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    return 1;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = std::min(M, N);

  if (MAGMA_SUCCESS !=  magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (MAGMA_SUCCESS !=  magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

  if (compute_uv) {
    if (params->full_matrices) {
      jobz = MagmaAllVec;
    } else {
      jobz = MagmaSomeVec;
    }
    M_U  = (jobz == MagmaAllVec ? M : K);
    N_VT = (jobz == MagmaAllVec ? N : K);
    ldu = M;
    ldv = N_VT;

    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  } else {
    jobz = MagmaNoVec;
    ldu = M;
    ldv = N;
  }

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);

  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (compute_uv) {
    u_dims[0] = N; u_dims[1] = N_VT;
    if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
               cudaMemcpyDeviceToDevice);

    vt_dims[0] = M_U; vt_dims[1] = M;
    if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
               cudaMemcpyDeviceToDevice);
  }
  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
  cuda_exit(params->context->ctx);
  return res;
}
Ejemplo n.º 10
0
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
                         PyGpuArrayObject **output,
                         PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
		    "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*output);
  *output = om;
  Py_INCREF(*output);
#else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
#endif

  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO
  /* Static variables are only initialized once so this will not
   * reset the previous algo every time */
  static int reuse_algo = 0;
  static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_kern_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
    err = cudnnFindConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                   "memory information on the GPU: %s\n",
                   cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
  }
#endif

#endif

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

#if CUDNN_VERSION > 3000
  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          upscale, &mode);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    }
  }
#endif

#if CUDNN_VERSION < 3000
  /* cuDNN before v3 does not support kernels larger than input even
   * if appropriate padding is selected. */
  for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
    if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
      PyErr_SetString(PyExc_RuntimeError, "the current version "
                      "of CuDNN does not support kernels larger than the "
                      "inputs in any spatial dimension, even if the inputs "
                      "are padded such that the padded inputs are larger "
                      "than the kernels. Update your installation of CuDNN "
                      "to V3 or more recent to solve the issue.");
      cuda_exit(c->ctx);
      return 1;
    }
  }
#endif

  {
    size_t worksize;
    gpudata *workspace;
    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    err = cudnnConvolutionForward(
      APPLY_SPECIFIC(_handle),
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

    if (worksize != 0)
      c->ops->buffer_release(workspace);
  }
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
		 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
Ejemplo n.º 11
0
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1], v_dims[2];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);

  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }

  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    return -1;
  }

  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);

  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  // query for workspace size
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh:  %d off-diagonal elements of an didn't converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (params->compute_v) {
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
Ejemplo n.º 12
0
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!IGNORE_BORDER && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }

  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;

    if (ndims == 2) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2], (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
Ejemplo n.º 13
0
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
    if (PyGpuArray_NDIM(bottom) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3],
                PyGpuArray_STRIDES(bottom)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(weight) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3],
                PyGpuArray_STRIDES(weight)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(top) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3],
                PyGpuArray_STRIDES(top)[4]);
        return NULL;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, slices)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    const size_t kD = PyGpuArray_DIMS(weight)[4];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorr3dMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    const size_t dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const size_t topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
    const size_t topDepthNoDD  = (bottomDepth + 2*padD - dil_kD);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const size_t topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    const size_t topDepth  = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3] ||
            topDepth != PyGpuArray_DIMS(top)[4]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        return NULL;
    }

    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }

    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
    col_dim[1] = topHeight * topWidth * topDepth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }

    // Define some useful variables
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im3d2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(
              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
              padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            switch (col->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            default:
              err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM forward encountered an error running gemm.");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im3col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(
              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
              padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            switch (col->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            default:
              err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad weights encountered an error running gemm.");
                Py_DECREF(col);
                return NULL;
            }
        }
        if (batchSize == 0) {
            err = GpuArray_memset(&weight->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
          // gemm into columns
          switch (top->ga.typecode) {
          case GA_FLOAT:
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                0,
                                col->ga.data, 0, N_);
            break;
          case GA_DOUBLE:
            err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                0,
                                col->ga.data, 0, N_);
            break;
          case GA_HALF:
            err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                0,
                                col->ga.data, 0, N_);
            break;
          default:
            err = GA_UNSUPPORTED_ERROR;
          }
          if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuCorr3dMM grad inputs encountered an error running gemm.");
            Py_DECREF(col);
            return NULL;
          }
          // col2im3d back to the data
          err = col2im3d(col->ga.data, nChannels,
                         bottomHeight, bottomWidth, bottomDepth,
                         kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                         dH, dW, dD, bottom->ga.data, n * bottom_stride);
          if (err != GA_NO_ERROR) {
            Py_DECREF(col);
            return NULL;
          }
        }
    }
    // Free temporary columns
    Py_DECREF(col);

    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
Ejemplo n.º 14
0
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **input,
                        PyGpuContextObject *c) {
    cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
    float af = alpha, bf = beta;
    void *alpha_p;
    void *beta_p;

    if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
        PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
                        "stack size");
        return 1;
    }

    if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
        return 1;
    if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
        return 1;

    switch (im->ga.typecode) {
    case GA_DOUBLE:
        alpha_p = (void *)&alpha;
        beta_p = (void *)&beta;
        break;
    case GA_FLOAT:
    case GA_HALF:
        alpha_p = (void *)&af;
        beta_p = (void *)&bf;
        break;
    default:
        PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
        return 1;
    }

#ifdef CONV_INPLACE
    Py_XDECREF(*input);
    *input = im;
    Py_INCREF(*input);
#else
    if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
                           im->ga.typecode, GA_C_ORDER, c) != 0)
        return 1;
    if (beta != 0.0 && pygpu_move(*input, im))
        return 1;
#endif

    if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
        return 1;

    cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;

    cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
    static int reuse_algo = 0;
    static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;

#ifndef CHOOSE_ONCE
    static size_t prev_kern_dims[5] = {0};
    static size_t prev_top_dims[5] = {0};

    reuse_algo = 1;
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(output, i) == prev_top_dims[i]);
    }
#endif

    if (!reuse_algo) {
#ifdef CHOOSE_TIME
        int count;
        cudnnConvolutionBwdDataAlgoPerf_t choice;

        err = cudnnFindConvolutionBackwardDataAlgorithm(
                  APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
                  APPLY_SPECIFIC(kerns), 1, &count, &choice);

        if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            cuda_exit(c->ctx);
            return 1;
        }

        algo = choice.algo;
#else
        size_t free = 0, total = 0;
        cudaError_t err2 = cudaMemGetInfo(&free, &total);
        if (err2 != cudaSuccess) {
            cudaGetLastError();
            PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
                         "information on the GPU: %s\n", cudaGetErrorString(err2));
            cuda_exit(c->ctx);
            return 1;
        }

        err = cudnnGetConvolutionBackwardDataAlgorithm(
                  APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
                  desc, APPLY_SPECIFIC(kerns),
                  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
        if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            cuda_exit(c->ctx);
            return 1;
        }
#endif
        prev_algo = algo;
    } else {
        algo = prev_algo;
    }

#ifdef CHOOSE_ONCE
    reuse_algo = 1;
#else
    for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
        prev_top_dims[i] = PyGpuArray_DIM(output, i);
    }
#endif

#endif

#if CUDNN_VERSION > 3000
    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
        int nd;
        int pad[2];
        int stride[2];
        int upscale[2];
        cudnnConvolutionMode_t mode;
        err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                              upscale, &mode);
        if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "error getting convolution properties: %s",
                         cudnnGetErrorString(err));
            cuda_exit(c->ctx);
            return 1;
        }

        if (stride[0] != 1 || stride[1] != 1 ||
                PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
                (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
            algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
    }
#endif

    size_t worksize;
    gpudata *workspace;

    err = cudnnGetConvolutionBackwardDataWorkspaceSize(
              APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
              APPLY_SPECIFIC(input), algo, &worksize);

    if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                     cudnnGetErrorString(err));
        cuda_exit(c->ctx);
        return 1;
    }

    if (worksize != 0) {
        workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
        if (workspace == NULL) {
            PyErr_SetString(PyExc_RuntimeError,
                            "Could not allocate working memory");
            cuda_exit(c->ctx);
            return 1;
        }
    }

    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnConvolutionBackwardData_v3(
              APPLY_SPECIFIC(_handle),
              alpha_p,
              APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
              APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
              desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
              beta_p,
              APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));

    if (worksize != 0)
        c->ops->buffer_release(workspace);

    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    cuda_exit(c->ctx);

    if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                     cudnnGetErrorString(err));
        return 1;
    }
    return 0;
}