/* Exemplo n.º 1 */
/* Unroll a 3-D image (volume) into the column buffer `data_col` so that a
 * 3-D convolution can be computed as a matrix multiply (im2col for volumes).
 * One GPU thread is launched per (channel, output-position) element; each
 * thread copies a single-channel patch.  Dispatches to the dilated kernel
 * only when some dilation factor differs from 1.
 * Returns the gpuarray error code from the launch; on failure a Python
 * RuntimeError is also set. */
int im3d2col(const size_t max_threads_dim,
    gpudata * data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
    const size_t pad_h, const size_t pad_w, const size_t pad_d,
    const size_t stride_h, const size_t stride_w, const size_t stride_d,
    gpudata * data_col) {
  // We are going to launch channels * height_col * width_col * depth_col
  // kernels, each kernel responsible for copying a single-channel grid.
  // Effective kernel extent once dilation gaps are included.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  size_t dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
  size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  size_t num_kernels = channels * height_col * width_col * depth_col;
  size_t threads_per_block = max_threads_dim;
  // Ceiling division so every thread index < num_kernels is covered.
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_im3d2col_kernel_call(
      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
      pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
      width_col, depth_col, data_col);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_im3d2col_kernel: %s.",
                     GpuKernel_error(&k_dilated_im3d2col_kernel, err));
    }
  }
  else{
    err = im3d2col_kernel_call(
      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
      data_col);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: im3d2col_kernel: %s.",
                     GpuKernel_error(&k_im3d2col_kernel, err));
    }
  }
  return err;
}
/* Exemplo n.º 2 */
/* Test op: build an n x m identity-like matrix on the GPU.
 * Reads the scalar shape from host arrays `n` and `m`, allocates a zeroed
 * C-order GPU array into *z, then launches the k_eye kernel to set the
 * diagonal.  Returns 0 on success, -1 with a Python exception set on
 * failure. */
int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
                              PyGpuArrayObject **z, PyGpuContextObject *ctx) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  void *args[3];
  int err;

  /* The two inputs are 0-d/1-element host arrays holding the shape. */
  dims[0] = ((DTYPE_INPUT_0 *)PyArray_DATA(n))[0];
  dims[1] = ((DTYPE_INPUT_1 *)PyArray_DATA(m))[0];

  /* Release any previous output and allocate a fresh zero-filled array;
     the kernel only needs to write the diagonal. */
  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims,
                   TYPECODE,
                   GA_C_ORDER,
                   ctx, Py_None);
  if (*z == NULL)
    return -1;

  args[0] = (*z)->ga.data;
  args[1] = &dims[0];
  args[2] = &dims[1];
  ls = 1;
  gs = 256;
  /* The k_eye name comes from the kernel declaration above. */
  err = GpuKernel_call(&k_eye, 1, &ls, &gs, 0, args);
  if (err != GA_NO_ERROR) {
    /* Fixed: message previously read "n%lu" (missing '='). */
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error: kEye: %s. n=%lu, m=%lu.",
                 GpuKernel_error(&k_eye, err),
                 (unsigned long)dims[0], (unsigned long)dims[1]);
    return -1;
  }
  return 0;
}
/* Exemplo n.º 3 */
/* Fold the column buffer `data_col` back into the 3-D image `data_im`
 * (the inverse of im3d2col).  One GPU thread is launched per element of the
 * bottom (image) tensor; each thread accumulates every column entry that
 * overlaps it, which avoids atomic adds.  Returns the gpuarray launch
 * status and sets a Python RuntimeError on failure. */
int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t patch_h, const size_t patch_w, const size_t patch_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
    const size_t pad_h, const size_t pad_w, const size_t pad_d,
    const size_t stride_h, const size_t stride_w, const size_t stride_d,
    gpudata * data_im, const size_t data_im_offset) {
  /* Patch extent once dilation gaps are counted. */
  const size_t span_h = (patch_h - 1) * dilation_h + 1;
  const size_t span_w = (patch_w - 1) * dilation_w + 1;
  const size_t span_d = (patch_d - 1) * dilation_d + 1;
  const size_t out_h = (height + 2 * pad_h - span_h) / stride_h + 1;
  const size_t out_w = (width + 2 * pad_w - span_w) / stride_w + 1;
  const size_t out_d = (depth + 2 * pad_d - span_d) / stride_d + 1;
  /* One thread per bottom-tensor element (see note above on atomics). */
  const size_t total = channels * height * width * depth;
  size_t block_dim = max_threads_dim;
  size_t grid_dim = (total + block_dim - 1) / block_dim;
  int status;

  if (dilation_h == 1 && dilation_w == 1 && dilation_d == 1) {
    status = col2im3d_kernel_call(
      1, &grid_dim, &block_dim, 0,
      total, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
      out_h, out_w, out_d, data_im, data_im_offset);
    if (status != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im3d_kernel: %s.",
                   GpuKernel_error(&k_col2im3d_kernel, status));
    }
  } else {
    status = dilated_col2im3d_kernel_call(
      1, &grid_dim, &block_dim, 0,
      total, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, out_h, out_w, out_d,
      data_im, data_im_offset);
    if (status != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im3d_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im3d_kernel, status));
    }
  }
  return status;
}
/* Exemplo n.º 4 */
/* Unroll a 2-D image into the column buffer `data_col` so a convolution
 * becomes a matrix multiply.  One GPU thread per (channel, out_row, out_col)
 * triple; each thread copies one single-channel patch.  Returns the
 * gpuarray launch status; a Python RuntimeError is set on failure. */
int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w,
    const size_t stride_h, const size_t stride_w,
    gpudata * data_col) {
  /* Kernel extent once dilation gaps are counted. */
  const size_t span_h = (kernel_h - 1) * dilation_h + 1;
  const size_t span_w = (kernel_w - 1) * dilation_w + 1;
  const size_t out_h = (height + 2 * pad_h - span_h) / stride_h + 1;
  const size_t out_w = (width + 2 * pad_w - span_w) / stride_w + 1;
  size_t total = channels * out_h * out_w;
  int status;

  if (dilation_h == 1 && dilation_w == 1) {
    status = im2col_kernel_scall(
      1, &total, 0,
      total, data_im, data_im_offset, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, out_h,
      out_w, data_col);
    if (status != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im2col_kernel: %s.",
                   GpuKernel_error(&k_im2col_kernel, status));
    }
  } else {
    status = dilated_im2col_kernel_scall(
      1, &total, 0,
      total, data_im, data_im_offset, height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, out_h,
      out_w, data_col);
    if (status != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im2col_kernel, status));
    }
  }
  return status;
}
/* Exemplo n.º 5 */
/* Gradient of average pooling: given input `x` and output gradient `gz`,
 * compute the input gradient into *gx (same shape as x).  Supports 2-D and
 * 3-D pooling (ndims taken from the window-size array `ws`).
 * Returns 0 on success, 1 with a Python exception set on failure.
 * NOTE(review): the three validation messages previously said
 * "GpuMaxPoolGrad" — a copy-paste from the max-pool op; fixed to match the
 * kernel-error messages below. */
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuAveragePoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error");
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuAveragePoolGrad: failed to allocate memory");
      return 1;
    }

  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    /* size_t counter avoids a signed/unsigned comparison with ndims. */
    for(size_t i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }

    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);

    if (ndims == 2) {
      /* One thread per element of the input gradient. */
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], INC_PAD, SUM_MODE,
                                         (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
    /* NOTE(review): ndims outside {2,3} silently succeeds with an
       untouched (zero-initialized) *gx — presumably callers only pass
       2 or 3; confirm upstream validation. */
  }
  return 0;
}
/* Exemplo n.º 6 */
/* R-operator of max pooling: given input `x` and evaluation point `ex`,
 * compute the pooled result into *z.  Supports 2-D and 3-D pooling
 * (ndims from the window-size array `ws`); padding requires
 * ignore_border=True.  Returns 0 on success, 1 with a Python exception set
 * on failure. */
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3];
  /* Batch and channel dimensions pass through unchanged. */
  z_dims[0] = x_dims[0];
  z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  /* size_t counter avoids a signed/unsigned comparison with ndims. */
  for (size_t i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!IGNORE_BORDER && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }

  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;

    if (ndims == 2) {
      /* One thread per element of the pooled output. */
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2], (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        /* Fixed: previously reported the 2-D kernel handle here. */
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}