Example #1
int GpuArray_fdump(FILE *fd, const GpuArray *a) {
  char *buf, *p;
  size_t s = GpuArray_ITEMSIZE(a);
  size_t k;
  unsigned int i;
  int err;

  for (i = 0; i < a->nd; i++)
    s *= a->dimensions[i];

  buf = malloc(s);
  if (buf == NULL)
    return GA_MEMORY_ERROR;

  err = GpuArray_read(buf, s, a);
  if (err != GA_NO_ERROR) {
    free(buf);
    return err;
  }

  p = buf;
  k = 0;
  while (s) {
    fprintf(fd, "[%" SPREFIX "u] = ", k);
    switch (a->typecode) {
    case GA_UINT:
      fprintf(fd, "%u", *(unsigned int *)p);
      break;
    case GA_LONG:
      fprintf(fd, "%lld", (long long)*(int64_t *)p);
      break;
    case GA_FLOAT:
      fprintf(fd, "%f", *(float *)p);
      break;
    case GA_SSIZE:
      fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p);
      break;
    default:
      free(buf);
      fprintf(fd, "<unsupported data type %d>\n", a->typecode);
      return GA_UNSUPPORTED_ERROR;
    }
    s -= gpuarray_get_elsize(a->typecode);
    p += gpuarray_get_elsize(a->typecode);
    k++;
    fprintf(fd, "\n");
  }
  free(buf);
  return GA_NO_ERROR;
}
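A minimal caller sketch for the dump routine above, assuming the libgpuarray header <gpuarray/array.h> and an already-allocated and filled GpuArray; the wrapper name is illustrative.
#include <stdio.h>
#include <gpuarray/array.h>  /* assumed libgpuarray header declaring GpuArray */

/* Hypothetical caller: dump the contents of an existing device array to stderr. */
static int dump_to_stderr(const GpuArray *arr) {
  int err = GpuArray_fdump(stderr, arr);
  if (err != GA_NO_ERROR)
    fprintf(stderr, "dump failed with error code %d\n", err);
  return err;
}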
Example #2
int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf,
                            int typecode, unsigned int nd, const size_t *dims,
                            const ssize_t *strides) {
  char *base = (char *)buf;
  size_t offset = 0;
  size_t size = gpuarray_get_elsize(typecode);
  gpudata *b;
  int err;
  unsigned int i;

  for (i = 0; i < nd; i++) {
    if (dims[i] == 0) {
      size = 0;
      base = (char *)buf;
      break;
    }

    if (strides[i] < 0)
      base += (dims[i]-1) * strides[i];
    else
      size += (dims[i]-1) * strides[i];
  }
  offset = (char *)buf - base;
  size += offset;

  b = gpudata_alloc(ctx, size, base, GA_BUFFER_INIT, &err);
  if (b == NULL) return err;

  err = GpuArray_fromdata(a, b, offset, typecode, nd, dims, strides, 1);
  gpudata_release(b);
  return err;
}
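A sketch of how the copy-from-host helper above might be called for a small row-major float matrix; the gpucontext is assumed to have been created elsewhere and the names are illustrative.
#include <gpuarray/array.h>   /* assumed libgpuarray headers */
#include <gpuarray/buffer.h>

/* Hypothetical caller: upload a 2x3 float matrix stored row-major on the host. */
static int upload_matrix(GpuArray *a, gpucontext *ctx) {
  static float host[2][3] = {{1, 2, 3}, {4, 5, 6}};
  size_t dims[2] = {2, 3};
  ssize_t strides[2] = {3 * sizeof(float), sizeof(float)};
  return GpuArray_copy_from_host(a, ctx, host, GA_FLOAT, 2, dims, strides);
}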
Example #3
int GpuArray_setarray(GpuArray *a, const GpuArray *v) {
  GpuArray tv;
  size_t sz;
  ssize_t *strs;
  unsigned int i, off;
  int err = GA_NO_ERROR;
  int simple_move = 1;

  if (a->nd < v->nd)
    return GA_VALUE_ERROR;

  if (!GpuArray_ISWRITEABLE(a))
    return GA_VALUE_ERROR;
  if (!GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(a))
    return GA_UNALIGNED_ERROR;

  off = a->nd - v->nd;

  for (i = 0; i < v->nd; i++) {
    if (v->dimensions[i] != a->dimensions[i+off]) {
      if (v->dimensions[i] != 1)
	return GA_VALUE_ERROR;
      else
	simple_move = 0;
    }
  }

  if (simple_move && GpuArray_ISONESEGMENT(a) && GpuArray_ISONESEGMENT(v) &&
      GpuArray_ISFORTRAN(a) == GpuArray_ISFORTRAN(v) &&
      a->typecode == v->typecode &&
      a->nd == v->nd) {
    sz = gpuarray_get_elsize(a->typecode);
    for (i = 0; i < a->nd; i++) sz *= a->dimensions[i];
    return gpudata_move(a->data, a->offset, v->data, v->offset, sz);
  }

  strs = calloc(a->nd, sizeof(ssize_t));
  if (strs == NULL)
    return GA_MEMORY_ERROR;

  for (i = off; i < a->nd; i++) {
    if (v->dimensions[i-off] == a->dimensions[i]) {
      strs[i] = v->strides[i-off];
    }
  }

  memcpy(&tv, v, sizeof(GpuArray));
  tv.nd = a->nd;
  tv.dimensions = a->dimensions;
  tv.strides = strs;
  if (tv.nd != 0)
    GpuArray_fix_flags(&tv);
  err = ga_extcopy(a, &tv);
  free(strs);
  return err;
}
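A sketch of the broadcasting assignment that GpuArray_setarray implements: a length-3 vector written into every row of a 4x3 destination. Both arrays are assumed to live in the same context and to have the same typecode; the names are illustrative.
/* Hypothetical caller: broadcast a length-3 row vector into each row of a 4x3 array. */
static int fill_rows(GpuArray *dst4x3, const GpuArray *row3) {
  /* dst4x3 has shape (4, 3) and row3 has shape (3,); the trailing dimension
     matches, so the missing leading dimension is broadcast and every row of
     dst4x3 receives a copy of row3 (via the zero-stride path above). */
  return GpuArray_setarray(dst4x3, row3);
}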
Example #4
static int
c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor_for_conv");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Tensors can't be smaller than 3d for cudnn so we pad the
   * descriptor if they are */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }
  // only has an effect for grouped convolution, i.e. when groups > 1
  dims[1] = dims[1] / groups;
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set tensorNd descriptor: %s",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
Example #5
static int get_type_flags(int typecode) {
  int flags = 0;
  if (typecode == GA_DOUBLE || typecode == GA_CDOUBLE)
    flags |= GA_USE_DOUBLE;
  if (typecode == GA_HALF)
    flags |= GA_USE_HALF;
  if (typecode == GA_CFLOAT || typecode == GA_CDOUBLE)
    flags |= GA_USE_COMPLEX;
  if (gpuarray_get_elsize(typecode) < 4)
    flags |= GA_USE_SMALL;
  return flags;
}
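The flag helper above is typically consulted when compiling element-wise kernels: the returned mask says which optional backend features a data type requires. A small illustration, kept in the same translation unit since get_type_flags is static; the expected values follow directly from the checks above.
static void type_flags_examples(void) {
  /* GA_HALF needs the half-float extension and, with a 2-byte element
     size (< 4), the small-type path as well. */
  int half_flags = get_type_flags(GA_HALF);       /* GA_USE_HALF | GA_USE_SMALL */

  /* GA_CDOUBLE needs double and complex support. */
  int cdouble_flags = get_type_flags(GA_CDOUBLE); /* GA_USE_DOUBLE | GA_USE_COMPLEX */

  (void)half_flags;
  (void)cdouble_flags;
}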
Example #6
static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[5], dims[5], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = PyGpuArray_STRIDE(var, i) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
		 "Could not set tensorNd descriptor: %s",
		 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
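A sketch of how the descriptor setter above could be driven from the same generated module, assuming cudnnCreateTensorDescriptor is used to create the descriptor; the wrapper name is illustrative.
/* Hypothetical caller: create a descriptor and fill it from a PyGpuArrayObject. */
static int describe_array(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
  cudnnStatus_t err = cudnnCreateTensorDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return c_set_tensorNd(var, *desc);
}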
Example #7
int GpuArray_transfer(GpuArray *res, const GpuArray *a) {
  size_t sz;
  unsigned int i;

  if (!GpuArray_ISONESEGMENT(res))
    return GA_UNSUPPORTED_ERROR;
  if (!GpuArray_ISONESEGMENT(a))
    return GA_UNSUPPORTED_ERROR;

  if (res->typecode != a->typecode)
    return GA_UNSUPPORTED_ERROR;

  sz = gpuarray_get_elsize(a->typecode);
  for (i = 0; i < a->nd; i++) sz *= a->dimensions[i];

  return gpudata_transfer(res->data, res->offset, a->data, a->offset, sz);
}
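A caller sketch for the transfer routine above; since GpuArray_transfer only validates contiguity and typecode, a shape check is added on the caller side. Both arrays are assumed to be one-segment and to live in contexts created elsewhere; the wrapper name is illustrative.
/* Hypothetical caller: transfer src into dst after checking that the shapes match. */
static int checked_transfer(GpuArray *dst, const GpuArray *src) {
  unsigned int i;
  if (dst->nd != src->nd)
    return GA_VALUE_ERROR;
  for (i = 0; i < src->nd; i++)
    if (dst->dimensions[i] != src->dimensions[i])
      return GA_VALUE_ERROR;
  return GpuArray_transfer(dst, src);
}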
/**
 * \brief NCCL implementation of \ref gpucomm_reduce_scatter.
 */
static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                          size_t offdest, size_t count, int typecode,
                          int opcode, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  int ndev = 0;
  size_t resc_size;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(get_count(comm, &ndev));
  GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode,
                              opcode, comm, &datatype, &op));
  if (dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
  resc_size = count * gpuarray_get_elsize(typecode);
  if ((dest->sz - offdest) < resc_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");
  assert(!(offdest > dest->sz));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc),
                                            (void *)(dest->ptr + offdest), count,
                                            datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
/**
 * \brief Helper function to check for restrictions on `gpudata` to be used in
 * nccl collective operations.
 */
static inline int check_restrictions(gpudata *src, size_t offsrc,
                                     gpudata *dest, size_t offdest,
                                     size_t count, int typecode,
                                     int opcode, gpucomm *comm,
                                     ncclDataType_t *datatype,
                                     ncclRedOp_t *op) {
  size_t op_size;
  // Check if count is larger than INT_MAX
  // TODO remove when/if nccl adapts to size_t
  if (count > INT_MAX)
    return error_set(comm->ctx->err, GA_XLARGE_ERROR, "Count too large for int");
  // src, dest and comm must refer to the same context
  if (src->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "source and comm context differ");
  if (dest != NULL && dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
  // typecode must correspond to a valid ncclDataType_t
  if (datatype != NULL) {
    *datatype = convert_data_type(typecode);
    if (*datatype == ncclNumTypes)
      return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
  }
  // opcode must correspond to a valid ncclRedOp_t
  if (op != NULL) {
    *op = convert_reduce_op(opcode);
    if (*op == ncclNumOps)
      return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
  }
  // offsets must not be larger than gpudata's size itself
  // (else out of alloc-ed mem scope)
  assert(!(offsrc > src->sz));
  assert(!(dest != NULL && offdest > dest->sz));
  // size to operate upon must be able to fit inside the gpudata (incl offsets)
  op_size = count * gpuarray_get_elsize(typecode);
  if ((src->sz - offsrc) < op_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "source too small for operation");
  if (dest != NULL && (dest->sz - offdest) < op_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");
  return GA_NO_ERROR;
}
Example #10
int GpuArray_move(GpuArray *dst, const GpuArray *src) {
  size_t sz;
  unsigned int i;
  if (!GpuArray_ISWRITEABLE(dst))
    return GA_VALUE_ERROR;
  if (!GpuArray_ISALIGNED(src) || !GpuArray_ISALIGNED(dst))
    return GA_UNALIGNED_ERROR;
  if (src->nd != dst->nd)
    return GA_VALUE_ERROR;
  for (i = 0; i < src->nd; i++) {
    if (src->dimensions[i] != dst->dimensions[i])
      return GA_VALUE_ERROR;
  }
  if (!GpuArray_ISONESEGMENT(dst) || !GpuArray_ISONESEGMENT(src) ||
      GpuArray_ISFORTRAN(dst) != GpuArray_ISFORTRAN(src) ||
      dst->typecode != src->typecode) {
    return ga_extcopy(dst, src);
  }
  sz = gpuarray_get_elsize(dst->typecode);
  for (i = 0; i < dst->nd; i++) sz *= dst->dimensions[i];
  return gpudata_move(dst->data, dst->offset, src->data, src->offset, sz);
}
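A sketch that combines the move routine above with GpuArray_empty (shown in a later example): allocate a destination of the same shape and type in a given context, then copy into it. The context argument and names are illustrative.
/* Hypothetical caller: make a C-ordered device copy of src into *dst. */
static int device_copy(GpuArray *dst, const GpuArray *src, gpucontext *ctx) {
  int err = GpuArray_empty(dst, ctx, src->typecode, src->nd,
                           src->dimensions, GA_C_ORDER);
  if (err != GA_NO_ERROR)
    return err;
  err = GpuArray_move(dst, src);
  if (err != GA_NO_ERROR)
    GpuArray_clear(dst);  /* release the freshly allocated buffer on failure */
  return err;
}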
Example #11
static inline int gen_extcopy_kernel(const extcopy_args *a,
				     cuda_context *ctx, gpukernel **v,
				     size_t nEls) {
  strb sb = STRB_STATIC_INIT;
  int res = GA_SYS_ERROR;
  int flags = GA_USE_PTX;
  unsigned int bits = sizeof(void *)*8;
  int types[2];
  const char *in_t, *in_ld_t;
  const char *out_t, *out_ld_t;
  const char *rmod;

  in_t = map_t(a->itype);
  out_t = map_t(a->otype);
  /* Since float16 ('f16') is not a fully-supported type we need to use
     it as b16 (basically uint16) for read and write operations. */
  if (a->itype == GA_HALF)
    in_ld_t = "b16";
  else
    in_ld_t = in_t;
  if (a->otype == GA_HALF)
    out_ld_t = "b16";
  else
    out_ld_t = out_t;
  rmod = get_rmod(a->itype, a->otype);
  if (in_t == NULL || out_t == NULL) return GA_DEVSUP_ERROR;

  strb_appendf(&sb, ELEM_HEADER_PTX, "4.1", ctx->bin_id,
               bits, bits, bits, bits, in_t, out_t, bits,
               bits, bits, bits, bits, nEls, bits, bits);

  cuda_perdim_ptx(&sb, a->ind, a->idims, a->istr, "a_p", bits);
  cuda_perdim_ptx(&sb, a->ond, a->odims, a->ostr, "b_p", bits);

  strb_appendf(&sb, "ld.param.u%u rp1, [a_data];\n"
	       "cvt.s%u.s%u rp2, a_p;\n"
	       "add.s%u rp1, rp1, rp2;\n"
	       "ld.global.%s tmpa, [rp1+%" SPREFIX "u];\n"
	       "cvt%s.%s.%s tmpb, tmpa;\n"
	       "ld.param.u%u rp1, [b_data];\n"
	       "cvt.s%u.s%u rp2, b_p;\n"
	       "add.s%u rp1, rp1, rp2;\n"
	       "st.global.%s [rp1+%" SPREFIX "u], tmpb;\n", bits,
	       bits, bits,
	       bits,
	       in_ld_t, a->ioff,
	       rmod, out_t, in_t,
	       bits,
	       bits, bits,
	       bits,
	       out_ld_t, a->ooff);

  strb_appendf(&sb, ELEM_FOOTER_PTX, bits, bits, nEls);

  if (strb_error(&sb))
    goto fail;

  if (a->itype == GA_DOUBLE || a->otype == GA_DOUBLE ||
      a->itype == GA_CDOUBLE || a->otype == GA_CDOUBLE) {
    flags |= GA_USE_DOUBLE;
  }

  if (a->otype == GA_HALF || a->itype == GA_HALF) {
    flags |= GA_USE_HALF;
  }

  if (gpuarray_get_elsize(a->otype) < 4 || gpuarray_get_elsize(a->itype) < 4) {
    /* Should check for non-mod4 strides too */
    flags |= GA_USE_SMALL;
  }

  if (a->otype == GA_CFLOAT || a->itype == GA_CFLOAT ||
      a->otype == GA_CDOUBLE || a->itype == GA_CDOUBLE) {
    flags |= GA_USE_COMPLEX;
  }

  types[0] = types[1] = GA_BUFFER;
  res = GA_NO_ERROR;
  *v = cuda_newkernel(ctx, 1, (const char **)&sb.s, &sb.l, "extcpy",
                      2, types, flags, &res, NULL);
 fail:
  strb_clear(&sb);
  return res;
}
Example #12
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
    if (PyGpuArray_NDIM(bottom) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3],
                PyGpuArray_STRIDES(bottom)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(weight) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3],
                PyGpuArray_STRIDES(weight)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(top) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3],
                PyGpuArray_STRIDES(top)[4]);
        return NULL;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, slices)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    const size_t kD = PyGpuArray_DIMS(weight)[4];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorr3dMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    const size_t dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
    const size_t topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    const size_t topDepth  = (bottomDepth + 2*padD - dil_kD) / dD + 1;
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3] ||
            topDepth != PyGpuArray_DIMS(top)[4]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        return NULL;
    }

    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }

    // Get the max threads per block
    size_t max_threads_dim;
    err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
    if (err != GA_NO_ERROR){
        PyErr_Format(PyExc_RuntimeError,
                     "Could not fetch max_threads_dim.");
        return NULL;
    }

    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
    col_dim[1] = topHeight * topWidth * topDepth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }

    // Define some useful variables
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
    const DTYPE_INPUT_0 one = 1.0f;
    const DTYPE_INPUT_0 zero = 0.0f;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im3d2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(max_threads_dim,
                           bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                           bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                           padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                N_, M_, K_, one,
                                col->ga.data, 0, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                top->ga.data, n * top_stride, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // valid convolution: im3d2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(max_threads_dim,
                           bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                           bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                           padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                K_, M_, N_, one,
                                col->ga.data, 0, N_,
                                top->ga.data, n * top_stride, N_,
                                (n == 0) ? zero : one,
                                weight->ga.data, 0, K_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, one,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                col->ga.data, 0, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
            // col2im3d back to the data
            err = col2im3d(max_threads_dim,
                           col->ga.data, nChannels,
                           bottomHeight, bottomWidth, bottomDepth,
                           kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                           dH, dW, dD, bottom->ga.data, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    // Free temporary columns
    Py_DECREF(col);

    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
Example #13
int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode,
                   unsigned int nd, const size_t *dims, ga_order ord) {
  size_t size = gpuarray_get_elsize(typecode);
  unsigned int i;
  int res = GA_NO_ERROR;

  if (ord == GA_ANY_ORDER)
    ord = GA_C_ORDER;

  if (ord != GA_C_ORDER && ord != GA_F_ORDER)
    return GA_VALUE_ERROR;

  for (i = 0; i < nd; i++) {
    size_t d = dims[i];
    /* Check for overflow */
    if ((d >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) &&
	d > 0 && SIZE_MAX / d < size)
      return GA_VALUE_ERROR;
    size *= d;
  }

  a->data = gpudata_alloc(ctx, size, NULL, 0, &res);
  if (a->data == NULL) return res;
  a->nd = nd;
  a->offset = 0;
  a->typecode = typecode;
  a->dimensions = calloc(nd, sizeof(size_t));
  a->strides = calloc(nd, sizeof(ssize_t));
  /* F/C distinction comes later */
  a->flags = GA_BEHAVED;
  if (a->dimensions == NULL || a->strides == NULL) {
    GpuArray_clear(a);
    return GA_MEMORY_ERROR;
  }
  /* Mult will not overflow since calloc succeeded */
  memcpy(a->dimensions, dims, sizeof(size_t)*nd);

  size = gpuarray_get_elsize(typecode);
  /* mults will not overflow, checked on entry */
  switch (ord) {
  case GA_C_ORDER:
    for (i = nd; i > 0; i--) {
      a->strides[i-1] = size;
      size *= a->dimensions[i-1];
    }
    a->flags |= GA_C_CONTIGUOUS;
    break;
  case GA_F_ORDER:
    for (i = 0; i < nd; i++) {
      a->strides[i] = size;
      size *= a->dimensions[i];
    }
    a->flags |= GA_F_CONTIGUOUS;
    break;
  default:
    assert(0); /* cannot be reached */
  }

  if (a->nd <= 1)
    a->flags |= GA_F_CONTIGUOUS|GA_C_CONTIGUOUS;

  return GA_NO_ERROR;
}
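A sketch of allocating a small matrix with the function above; the gpucontext is assumed to exist already and the names are illustrative. When the array is no longer needed, GpuArray_clear releases its buffer and host-side metadata.
/* Hypothetical caller: allocate an uninitialized 128x256 float matrix in C order. */
static int make_matrix(GpuArray *a, gpucontext *ctx) {
  size_t dims[2] = {128, 256};
  return GpuArray_empty(a, ctx, GA_FLOAT, 2, dims, GA_C_ORDER);
}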
Example #14
int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
                             const size_t *newdims, ga_order ord) {
  ssize_t *newstrides;
  size_t *tmpdims;
  size_t np;
  size_t op;
  size_t newsize = 1;
  size_t oldsize = 1;
  unsigned int ni = 0;
  unsigned int oi = 0;
  unsigned int nj = 1;
  unsigned int oj = 1;
  unsigned int nk;
  unsigned int ok;
  unsigned int i;

  if (ord == GA_ANY_ORDER && GpuArray_ISFORTRAN(a) && a->nd > 1)
    ord = GA_F_ORDER;

  for (i = 0; i < a->nd; i++) {
    oldsize *= a->dimensions[i];
  }

  for (i = 0; i < nd; i++) {
    size_t d = newdims[i];
    /* Check for overflow */
    if ((d >= MUL_NO_OVERFLOW || newsize >= MUL_NO_OVERFLOW) &&
	d > 0 && SIZE_MAX / d < newsize)
      return GA_INVALID_ERROR;
    newsize *= d;
  }

  if (newsize != oldsize) return GA_INVALID_ERROR;

  /* If the source and desired layouts are the same, then just copy
     strides and dimensions */
  if ((ord == GA_C_ORDER && GpuArray_CHKFLAGS(a, GA_C_CONTIGUOUS)) ||
      (ord == GA_F_ORDER && GpuArray_CHKFLAGS(a, GA_F_CONTIGUOUS))) {
    goto do_final_copy;
  }

  newstrides = calloc(nd, sizeof(ssize_t));
  if (newstrides == NULL)
    return GA_MEMORY_ERROR;

  while (ni < nd && oi < a->nd) {
    np = newdims[ni];
    op = a->dimensions[oi];

    while (np != op) {
      if (np < op) {
        np *= newdims[nj++];
      } else {
        op *= a->dimensions[oj++];
      }
    }

    for (ok = oi; ok < oj - 1; ok++) {
      if (ord == GA_F_ORDER) {
        if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok])
          goto need_copy;
      } else {
        if (a->strides[ok] != (ssize_t)a->dimensions[ok+1]*a->strides[ok+1])
          goto need_copy;
      }
    }

    if (ord == GA_F_ORDER) {
      newstrides[ni] = a->strides[oi];
      for (nk = ni + 1; nk < nj; nk++) {
        newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
      }
    } else {
      newstrides[nj-1] = a->strides[oj-1];
      for (nk = nj-1; nk > ni; nk--) {
        newstrides[nk-1] = newstrides[nk]*newdims[nk];
      }
    }
    ni = nj++;
    oi = oj++;
  }

  /* Fixup trailing ones */
  if (ord == GA_F_ORDER) {
    for (i = nj-1; i < nd; i++) {
      newstrides[i] = newstrides[i-1] * newdims[i-1];
    }
  } else {
    for (i = nj-1; i < nd; i++) {
      newstrides[i] = gpuarray_get_elsize(a->typecode);
    }
  }

  /* We can reuse newstrides since it was allocated in this function.
     Can't do the same with newdims (which is a parameter). */
  tmpdims = calloc(nd, sizeof(size_t));
  if (tmpdims == NULL) {
    free(newstrides);
    return GA_MEMORY_ERROR;
  }
  memcpy(tmpdims, newdims, nd*sizeof(size_t));
  a->nd = nd;
  free(a->dimensions);
  free(a->strides);
  a->dimensions = tmpdims;
  a->strides = newstrides;

  goto fix_flags;
 need_copy:
  free(newstrides);
  return GA_COPY_ERROR;

 do_final_copy:
  tmpdims = calloc(nd, sizeof(size_t));
  newstrides = calloc(nd, sizeof(ssize_t));
  if (tmpdims == NULL || newstrides == NULL) {
    free(tmpdims);
    free(newstrides);
    return GA_MEMORY_ERROR;
  }
  memcpy(tmpdims, newdims, nd*sizeof(size_t));
  if (nd > 0) {
    if (ord == GA_F_ORDER) {
      newstrides[0] = gpuarray_get_elsize(a->typecode);
      for (i = 1; i < nd; i++) {
        newstrides[i] = newstrides[i-1] * tmpdims[i-1];
      }
    } else {
      newstrides[nd-1] = gpuarray_get_elsize(a->typecode);
      for (i = nd-1; i > 0; i--) {
        newstrides[i-1] = newstrides[i] * tmpdims[i];
      }
    }
  }
  free(a->dimensions);
  free(a->strides);
  a->nd = nd;
  a->dimensions = tmpdims;
  a->strides = newstrides;

 fix_flags:
  GpuArray_fix_flags(a);
  return GA_NO_ERROR;
}
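A caller sketch for the in-place reshape above: when the existing strides cannot express the new shape the function returns GA_COPY_ERROR, and the caller is expected to fall back to a copying reshape. The shapes and names below are illustrative.
/* Hypothetical caller: view a contiguous (6, 4) array as (3, 8). */
static int view_as_3x8(GpuArray *a) {
  size_t newdims[2] = {3, 8};
  int err = GpuArray_reshape_inplace(a, 2, newdims, GA_ANY_ORDER);
  if (err == GA_COPY_ERROR) {
    /* Strides are incompatible (e.g. a transposed view); a real caller
       would materialize a contiguous copy first and reshape that instead. */
  }
  return err;
}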
Example #15
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
    if (PyGpuArray_NDIM(bottom) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3],
                PyGpuArray_STRIDES(bottom)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(weight) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3],
                PyGpuArray_STRIDES(weight)[4]);
        return NULL;
    }

    if (PyGpuArray_NDIM(top) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3],
                PyGpuArray_STRIDES(top)[4]);
        return NULL;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, slices)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    const size_t kD = PyGpuArray_DIMS(weight)[4];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorr3dMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    const size_t dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const size_t topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
    const size_t topDepthNoDD  = (bottomDepth + 2*padD - dil_kD);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const size_t topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    const size_t topDepth  = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV_X
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3] ||
            topDepth != PyGpuArray_DIMS(top)[4]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        return NULL;
    }

    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }

    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
    col_dim[1] = topHeight * topWidth * topDepth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }

    // Define some useful variables
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im3d2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(
              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
              padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            switch (col->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            default:
              err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM forward encountered an error running gemm.");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im3d2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(
              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
              padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            switch (col->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            default:
              err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad weights encountered an error running gemm.");
                Py_DECREF(col);
                return NULL;
            }
        }
        if (batchSize == 0) {
            err = GpuArray_memset(&weight->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
          // gemm into columns
          switch (top->ga.typecode) {
          case GA_FLOAT:
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                0,
                                col->ga.data, 0, N_);
            break;
          case GA_DOUBLE:
            err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                0,
                                col->ga.data, 0, N_);
            break;
          case GA_HALF:
            err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                0,
                                col->ga.data, 0, N_);
            break;
          default:
            err = GA_UNSUPPORTED_ERROR;
          }
          if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuCorr3dMM grad inputs encountered an error running gemm.");
            Py_DECREF(col);
            return NULL;
          }
          // col2im3d back to the data
          err = col2im3d(col->ga.data, nChannels,
                         bottomHeight, bottomWidth, bottomDepth,
                         kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                         dH, dW, dD, bottom->ga.data, n * bottom_stride);
          if (err != GA_NO_ERROR) {
            Py_DECREF(col);
            return NULL;
          }
        }
    }
    // Free temporary columns
    Py_DECREF(col);

    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}