int GpuArray_fdump(FILE *fd, const GpuArray *a) {
  char *buf, *p;
  size_t s = GpuArray_ITEMSIZE(a);
  size_t k;
  unsigned int i;
  int err;

  for (i = 0; i < a->nd; i++)
    s *= a->dimensions[i];

  buf = malloc(s);
  if (buf == NULL)
    return GA_MEMORY_ERROR;

  err = GpuArray_read(buf, s, a);
  if (err != GA_NO_ERROR) {
    free(buf);
    return err;
  }

  p = buf;
  k = 0;
  while (s) {
    fprintf(fd, "[%" SPREFIX "u] = ", k);
    switch (a->typecode) {
    case GA_UINT:
      fprintf(fd, "%u", *(unsigned int *)p);
      break;
    case GA_LONG:
      fprintf(fd, "%lld", (long long)*(int64_t *)p);
      break;
    case GA_FLOAT:
      fprintf(fd, "%f", *(float *)p);
      break;
    case GA_SSIZE:
      fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p);
      break;
    default:
      free(buf);
      fprintf(fd, "<unsupported data type %d>\n", a->typecode);
      return GA_UNSUPPORTED_ERROR;
    }
    s -= gpuarray_get_elsize(a->typecode);
    p += gpuarray_get_elsize(a->typecode);
    k++;
    fprintf(fd, "\n");
  }
  free(buf);
  return GA_NO_ERROR;
}
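/* Usage sketch (illustrative, not part of the library): fill a small 1-D
 * float array on the device and dump it to stdout. Assumes `ctx` is an
 * already-initialized gpucontext and that GpuArray_write, the
 * host-to-device mirror of GpuArray_read, is available with the
 * signature (dst, src, src_sz). */
static int fdump_example(gpucontext *ctx) {
  GpuArray a;
  float host[4] = {0.f, 1.f, 2.f, 3.f};
  size_t dims[1] = {4};
  int err;

  err = GpuArray_empty(&a, ctx, GA_FLOAT, 1, dims, GA_C_ORDER);
  if (err != GA_NO_ERROR) return err;
  err = GpuArray_write(&a, host, sizeof(host));
  if (err == GA_NO_ERROR)
    err = GpuArray_fdump(stdout, &a);
  GpuArray_clear(&a);
  return err;
}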
int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf,
                            int typecode, unsigned int nd, const size_t *dims,
                            const ssize_t *strides) {
  char *base = (char *)buf;
  size_t offset = 0;
  size_t size = gpuarray_get_elsize(typecode);
  gpudata *b;
  int err;
  unsigned int i;

  /* Find the lowest address touched (base) and the total extent (size)
     of the strided host region. */
  for (i = 0; i < nd; i++) {
    if (dims[i] == 0) {
      size = 0;
      base = (char *)buf;
      break;
    }
    if (strides[i] < 0)
      base += (dims[i]-1) * strides[i];
    else
      size += (dims[i]-1) * strides[i];
  }
  offset = (char *)buf - base;
  size += offset;

  b = gpudata_alloc(ctx, size, base, GA_BUFFER_INIT, &err);
  if (b == NULL) return err;
  err = GpuArray_fromdata(a, b, offset, typecode, nd, dims, strides, 1);
  gpudata_release(b);
  return err;
}
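/* Usage sketch (illustrative): upload a reversed view of a host vector,
 * i.e. a 1-D array with a negative stride. Assumes `ctx` is a valid
 * gpucontext. Starting from &host[3], the function above walks back to
 * &host[0] as the base, allocates 16 bytes, and records an offset of 12. */
static int copy_reversed_example(GpuArray *out, gpucontext *ctx) {
  float host[4] = {0.f, 1.f, 2.f, 3.f};
  size_t dims[1] = {4};
  /* Start at the last element and step backwards. */
  ssize_t strides[1] = {-(ssize_t)sizeof(float)};
  return GpuArray_copy_from_host(out, ctx, &host[3], GA_FLOAT,
                                 1, dims, strides);
}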
int GpuArray_setarray(GpuArray *a, const GpuArray *v) {
  GpuArray tv;
  size_t sz;
  ssize_t *strs;
  unsigned int i, off;
  int err = GA_NO_ERROR;
  int simple_move = 1;

  if (a->nd < v->nd)
    return GA_VALUE_ERROR;
  if (!GpuArray_ISWRITEABLE(a))
    return GA_VALUE_ERROR;
  if (!GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(a))
    return GA_UNALIGNED_ERROR;

  off = a->nd - v->nd;
  for (i = 0; i < v->nd; i++) {
    if (v->dimensions[i] != a->dimensions[i+off]) {
      if (v->dimensions[i] != 1)
        return GA_VALUE_ERROR;
      else
        simple_move = 0;
    }
  }

  if (simple_move && GpuArray_ISONESEGMENT(a) && GpuArray_ISONESEGMENT(v) &&
      GpuArray_ISFORTRAN(a) == GpuArray_ISFORTRAN(v) &&
      a->typecode == v->typecode &&
      a->nd == v->nd) {
    sz = gpuarray_get_elsize(a->typecode);
    for (i = 0; i < a->nd; i++)
      sz *= a->dimensions[i];
    return gpudata_move(a->data, a->offset, v->data, v->offset, sz);
  }

  strs = calloc(a->nd, sizeof(ssize_t));
  if (strs == NULL)
    return GA_MEMORY_ERROR;
  /* Dimensions of size 1 (and the missing leading dimensions) keep the
     zero stride from calloc, so the value is broadcast along them. */
  for (i = off; i < a->nd; i++) {
    if (v->dimensions[i-off] == a->dimensions[i]) {
      strs[i] = v->strides[i-off];
    }
  }

  memcpy(&tv, v, sizeof(GpuArray));
  tv.nd = a->nd;
  tv.dimensions = a->dimensions;
  tv.strides = strs;
  if (tv.nd != 0)
    GpuArray_fix_flags(&tv);
  err = ga_extcopy(a, &tv);
  free(strs);
  return err;
}
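/* Usage sketch (illustrative): writing a (1, 3) array into a (4, 3) array
 * takes the non-simple path above, which gives the broadcast dimension a
 * stride of 0, so the single row is repeated over every row of `a`. Both
 * arrays are assumed valid and of the same typecode. */
static int setarray_broadcast_example(GpuArray *a /* (4, 3) */,
                                      const GpuArray *row /* (1, 3) */) {
  return GpuArray_setarray(a, row);
}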
static int c_set_tensor_for_conv(PyGpuArrayObject *var,
                                 cudnnTensorDescriptor_t desc,
                                 size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  /* cuDNN expects strides in elements, not bytes; dimensions of size 1
     or stride 0 get the contiguous default instead. */
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Tensors can't be smaller than 3d for cudnn so we pad the
   * descriptor if they are */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }

  // Only for grouped convolution, i.e. when groups > 1.
  dims[1] = dims[1] / groups;

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
static int get_type_flags(int typecode) {
  int flags = 0;
  if (typecode == GA_DOUBLE || typecode == GA_CDOUBLE)
    flags |= GA_USE_DOUBLE;
  if (typecode == GA_HALF)
    flags |= GA_USE_HALF;
  if (typecode == GA_CFLOAT || typecode == GA_CDOUBLE)
    flags |= GA_USE_COMPLEX;
  if (gpuarray_get_elsize(typecode) < 4)
    flags |= GA_USE_SMALL;
  return flags;
}
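/* Sanity-check sketch (illustrative, assumes <assert.h> is included):
 * GA_HALF has 2-byte elements, so it requests both the half-float and
 * the small-type code paths; plain GA_FLOAT needs no extra flags. */
static void get_type_flags_example(void) {
  assert(get_type_flags(GA_HALF) == (GA_USE_HALF | GA_USE_SMALL));
  assert(get_type_flags(GA_FLOAT) == 0);
}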
static int c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

  int strs[5], dims[5], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = PyGpuArray_STRIDE(var, i) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
int GpuArray_transfer(GpuArray *res, const GpuArray *a) {
  size_t sz;
  unsigned int i;
  if (!GpuArray_ISONESEGMENT(res))
    return GA_UNSUPPORTED_ERROR;
  if (!GpuArray_ISONESEGMENT(a))
    return GA_UNSUPPORTED_ERROR;
  if (res->typecode != a->typecode)
    return GA_UNSUPPORTED_ERROR;
  sz = gpuarray_get_elsize(a->typecode);
  for (i = 0; i < a->nd; i++)
    sz *= a->dimensions[i];
  return gpudata_transfer(res->data, res->offset, a->data, a->offset, sz);
}
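/* Usage sketch (illustrative): allocate a matching contiguous array in a
 * second context `ctx2` and transfer `src` into it. Both arrays satisfy
 * the one-segment and same-typecode requirements by construction. */
static int transfer_example(GpuArray *dst, const GpuArray *src,
                            gpucontext *ctx2) {
  int err = GpuArray_empty(dst, ctx2, src->typecode, src->nd,
                           src->dimensions, GA_C_ORDER);
  if (err != GA_NO_ERROR) return err;
  err = GpuArray_transfer(dst, src);
  if (err != GA_NO_ERROR) GpuArray_clear(dst);
  return err;
}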
/**
 * \brief NCCL implementation of \ref gpucomm_reduce_scatter.
 */
static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                          size_t offdest, size_t count, int typecode,
                          int opcode, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  int ndev = 0;
  size_t resc_size;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(get_count(comm, &ndev));
  GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode,
                              opcode, comm, &datatype, &op));

  if (dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination and comm context differ");
  resc_size = count * gpuarray_get_elsize(typecode);
  if ((dest->sz - offdest) < resc_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination too small for operation");
  assert(!(offdest > dest->sz));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc),
                                            (void *)(dest->ptr + offdest),
                                            count, datatype, op, comm->c,
                                            ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/**
 * \brief Helper function to check for restrictions on `gpudata` to be used
 * in nccl collective operations.
 */
static inline int check_restrictions(gpudata *src, size_t offsrc,
                                     gpudata *dest, size_t offdest,
                                     size_t count, int typecode, int opcode,
                                     gpucomm *comm, ncclDataType_t *datatype,
                                     ncclRedOp_t *op) {
  size_t op_size;
  // Check if count is larger than INT_MAX
  // TODO remove when/if nccl adapts to size_t
  if (count > INT_MAX)
    return error_set(comm->ctx->err, GA_XLARGE_ERROR,
                     "Count too large for int");
  // src, dest and comm must refer to the same context
  if (src->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "source and comm context differ");
  if (dest != NULL && dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination and comm context differ");
  // typecode must correspond to a valid ncclDataType_t
  if (datatype != NULL) {
    *datatype = convert_data_type(typecode);
    if (*datatype == ncclNumTypes)
      return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
  }
  // opcode must correspond to a valid ncclRedOp_t
  if (op != NULL) {
    *op = convert_reduce_op(opcode);
    if (*op == ncclNumOps)
      return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
  }
  // offsets must not be larger than gpudata's size itself
  // (else out of alloc-ed mem scope)
  assert(!(offsrc > src->sz));
  assert(!(dest != NULL && offdest > dest->sz));
  // size to operate upon must be able to fit inside the gpudata (incl offsets)
  op_size = count * gpuarray_get_elsize(typecode);
  if ((src->sz - offsrc) < op_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "source too small for operation");
  if (dest != NULL && (dest->sz - offdest) < op_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination too small for operation");
  return GA_NO_ERROR;
}
int GpuArray_move(GpuArray *dst, const GpuArray *src) {
  size_t sz;
  unsigned int i;
  if (!GpuArray_ISWRITEABLE(dst))
    return GA_VALUE_ERROR;
  if (!GpuArray_ISALIGNED(src) || !GpuArray_ISALIGNED(dst))
    return GA_UNALIGNED_ERROR;
  if (src->nd != dst->nd)
    return GA_VALUE_ERROR;
  for (i = 0; i < src->nd; i++) {
    if (src->dimensions[i] != dst->dimensions[i])
      return GA_VALUE_ERROR;
  }
  /* Fall back to the generic element-wise copy kernel unless both arrays
     are contiguous with the same layout and type. */
  if (!GpuArray_ISONESEGMENT(dst) || !GpuArray_ISONESEGMENT(src) ||
      GpuArray_ISFORTRAN(dst) != GpuArray_ISFORTRAN(src) ||
      dst->typecode != src->typecode) {
    return ga_extcopy(dst, src);
  }
  sz = gpuarray_get_elsize(dst->typecode);
  for (i = 0; i < dst->nd; i++)
    sz *= dst->dimensions[i];
  return gpudata_move(dst->data, dst->offset, src->data, src->offset, sz);
}
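/* Usage sketch (illustrative): `dst` is allocated F-order while `src` is
 * assumed C-order, so for nd > 1 the layouts differ and GpuArray_move
 * takes the element-wise ga_extcopy path instead of the flat
 * gpudata_move. */
static int move_layout_example(GpuArray *dst, const GpuArray *src,
                               gpucontext *ctx) {
  int err = GpuArray_empty(dst, ctx, src->typecode, src->nd,
                           src->dimensions, GA_F_ORDER);
  if (err != GA_NO_ERROR) return err;
  err = GpuArray_move(dst, src);
  if (err != GA_NO_ERROR) GpuArray_clear(dst);
  return err;
}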
static inline int gen_extcopy_kernel(const extcopy_args *a,
                                     cuda_context *ctx, gpukernel **v,
                                     size_t nEls) {
  strb sb = STRB_STATIC_INIT;
  int res = GA_SYS_ERROR;
  int flags = GA_USE_PTX;
  unsigned int bits = sizeof(void *)*8;
  int types[2];
  const char *in_t, *in_ld_t;
  const char *out_t, *out_ld_t;
  const char *rmod;

  in_t = map_t(a->itype);
  out_t = map_t(a->otype);
  /* Since float16 ('f16') is not a fully-supported type we need to use
     it as b16 (basically uint16) for read and write operations. */
  if (a->itype == GA_HALF)
    in_ld_t = "b16";
  else
    in_ld_t = in_t;
  if (a->otype == GA_HALF)
    out_ld_t = "b16";
  else
    out_ld_t = out_t;
  rmod = get_rmod(a->itype, a->otype);
  if (in_t == NULL || out_t == NULL)
    return GA_DEVSUP_ERROR;

  strb_appendf(&sb, ELEM_HEADER_PTX, "4.1", ctx->bin_id, bits, bits, bits,
               bits, in_t, out_t, bits, bits, bits, bits, bits, nEls,
               bits, bits);

  cuda_perdim_ptx(&sb, a->ind, a->idims, a->istr, "a_p", bits);
  cuda_perdim_ptx(&sb, a->ond, a->odims, a->ostr, "b_p", bits);

  strb_appendf(&sb, "ld.param.u%u rp1, [a_data];\n"
               "cvt.s%u.s%u rp2, a_p;\n"
               "add.s%u rp1, rp1, rp2;\n"
               "ld.global.%s tmpa, [rp1+%" SPREFIX "u];\n"
               "cvt%s.%s.%s tmpb, tmpa;\n"
               "ld.param.u%u rp1, [b_data];\n"
               "cvt.s%u.s%u rp2, b_p;\n"
               "add.s%u rp1, rp1, rp2;\n"
               "st.global.%s [rp1+%" SPREFIX "u], tmpb;\n",
               bits, bits, bits, bits, in_ld_t, a->ioff, rmod, out_t, in_t,
               bits, bits, bits, bits, out_ld_t, a->ooff);

  strb_appendf(&sb, ELEM_FOOTER_PTX, bits, bits, nEls);

  if (strb_error(&sb))
    goto fail;

  if (a->itype == GA_DOUBLE || a->otype == GA_DOUBLE ||
      a->itype == GA_CDOUBLE || a->otype == GA_CDOUBLE) {
    flags |= GA_USE_DOUBLE;
  }

  if (a->otype == GA_HALF || a->itype == GA_HALF) {
    flags |= GA_USE_HALF;
  }

  if (gpuarray_get_elsize(a->otype) < 4 || gpuarray_get_elsize(a->itype) < 4) {
    /* Should check for non-mod4 strides too */
    flags |= GA_USE_SMALL;
  }

  if (a->otype == GA_CFLOAT || a->itype == GA_CFLOAT ||
      a->otype == GA_CDOUBLE || a->itype == GA_CDOUBLE) {
    flags |= GA_USE_COMPLEX;
  }

  types[0] = types[1] = GA_BUFFER;
  res = GA_NO_ERROR;
  *v = cuda_newkernel(ctx, 1, (const char **)&sb.s, &sb.l, "extcpy", 2,
                      types, flags, &res, NULL);
fail:
  strb_clear(&sb);
  return res;
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1, const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1, const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0, const size_t padW = 0,
                           const size_t padD = 0) {
  if (PyGpuArray_NDIM(bottom) != 5) {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0], PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2], PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5) {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0], PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2], PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5) {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0], PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2], PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }

  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
  const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
  const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4]) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 "  bottom shape: %ld %ld %ld %ld %ld\n"
                 "  weight shape: %ld %ld %ld %ld %ld\n"
                 "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                 PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }

  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }

  // Get the max threads per block
  size_t max_threads_dim;
  err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE,
                            &max_threads_dim);
  if (err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "Could not fetch max_threads_dim.");
    return NULL;
  }

  // Create temporary columns
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(
      2, col_dim, bottom->ga.typecode, GA_C_ORDER, bottom->context, Py_None);
  if (NULL == col) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }

  // Define some useful variables
  const size_t bottom_stride =
      PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t top_stride =
      PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  const size_t K_ = col_dim[0];
  const size_t N_ = col_dim[1];
  const size_t M_ = nFilters;
  const DTYPE_INPUT_0 one = 1.0f;
  const DTYPE_INPUT_0 zero = 0.0f;

  PyGpuArrayObject *output;
  if (direction == 0) {
    // forward pass
    output = top;
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                          N_, M_, K_, one,
                          col->ga.data, 0, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          top->ga.data, n * top_stride, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  } else if (direction == 1) {
    // backprop wrt. weights
    output = weight;
    // valid convolution: im3col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                          K_, M_, N_, one,
                          col->ga.data, 0, N_,
                          top->ga.data, n * top_stride, N_,
                          (n == 0) ? zero : one,
                          weight->ga.data, 0, K_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  } else if (direction == 2) {
    // backprop wrt. inputs
    output = bottom;
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                          N_, K_, M_, one,
                          top->ga.data, n * top_stride, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          col->ga.data, 0, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
      // col2im3d back to the data
      err = col2im3d(max_threads_dim,
                     col->ga.data, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD,
                     bottom->ga.data, n * bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  // Free temporary columns
  Py_DECREF(col);
  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
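// Worked example of the gemm shapes above (illustrative): with nChannels = 3,
// kH = kW = kD = 2 and a 4x4x4 output volume, K_ = 3*2*2*2 = 24 and
// N_ = 4*4*4 = 64, with M_ = nFilters. The forward pass therefore multiplies
// the (M_ x K_) filter matrix by the (K_ x N_) column matrix to produce one
// (M_ x N_) output slab per batch item.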
int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode,
                   unsigned int nd, const size_t *dims, ga_order ord) {
  size_t size = gpuarray_get_elsize(typecode);
  unsigned int i;
  int res = GA_NO_ERROR;

  if (ord == GA_ANY_ORDER)
    ord = GA_C_ORDER;

  if (ord != GA_C_ORDER && ord != GA_F_ORDER)
    return GA_VALUE_ERROR;

  for (i = 0; i < nd; i++) {
    size_t d = dims[i];
    /* Check for overflow */
    if ((d >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) &&
        d > 0 && SIZE_MAX / d < size)
      return GA_VALUE_ERROR;
    size *= d;
  }

  a->data = gpudata_alloc(ctx, size, NULL, 0, &res);
  if (a->data == NULL)
    return res;
  a->nd = nd;
  a->offset = 0;
  a->typecode = typecode;
  a->dimensions = calloc(nd, sizeof(size_t));
  a->strides = calloc(nd, sizeof(ssize_t));
  /* F/C distinction comes later */
  a->flags = GA_BEHAVED;
  if (a->dimensions == NULL || a->strides == NULL) {
    GpuArray_clear(a);
    return GA_MEMORY_ERROR;
  }
  /* Mult will not overflow since calloc succeeded */
  memcpy(a->dimensions, dims, sizeof(size_t)*nd);

  size = gpuarray_get_elsize(typecode);
  /* mults will not overflow, checked on entry */
  switch (ord) {
  case GA_C_ORDER:
    for (i = nd; i > 0; i--) {
      a->strides[i-1] = size;
      size *= a->dimensions[i-1];
    }
    a->flags |= GA_C_CONTIGUOUS;
    break;
  case GA_F_ORDER:
    for (i = 0; i < nd; i++) {
      a->strides[i] = size;
      size *= a->dimensions[i];
    }
    a->flags |= GA_F_CONTIGUOUS;
    break;
  default:
    assert(0); /* cannot be reached */
  }

  if (a->nd <= 1)
    a->flags |= GA_F_CONTIGUOUS|GA_C_CONTIGUOUS;

  return GA_NO_ERROR;
}
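/* Quick illustration (assumes <assert.h> is included): for a (2, 3)
 * GA_FLOAT array, the stride loops above produce byte strides {12, 4}
 * in C order (and would give {4, 8} in F order). */
static int empty_strides_example(gpucontext *ctx) {
  GpuArray a;
  size_t dims[2] = {2, 3};
  int err = GpuArray_empty(&a, ctx, GA_FLOAT, 2, dims, GA_C_ORDER);
  if (err != GA_NO_ERROR) return err;
  assert(a.strides[0] == 12 && a.strides[1] == 4);
  GpuArray_clear(&a);
  return GA_NO_ERROR;
}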
int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
                             const size_t *newdims, ga_order ord) {
  ssize_t *newstrides;
  size_t *tmpdims;
  size_t np;
  size_t op;
  size_t newsize = 1;
  size_t oldsize = 1;
  unsigned int ni = 0;
  unsigned int oi = 0;
  unsigned int nj = 1;
  unsigned int oj = 1;
  unsigned int nk;
  unsigned int ok;
  unsigned int i;

  if (ord == GA_ANY_ORDER && GpuArray_ISFORTRAN(a) && a->nd > 1)
    ord = GA_F_ORDER;

  for (i = 0; i < a->nd; i++) {
    oldsize *= a->dimensions[i];
  }

  for (i = 0; i < nd; i++) {
    size_t d = newdims[i];
    /* Check for overflow */
    if ((d >= MUL_NO_OVERFLOW || newsize >= MUL_NO_OVERFLOW) &&
        d > 0 && SIZE_MAX / d < newsize)
      return GA_INVALID_ERROR;
    newsize *= d;
  }

  if (newsize != oldsize)
    return GA_INVALID_ERROR;

  /* If the source and desired layouts are the same, then just copy
     strides and dimensions */
  if ((ord == GA_C_ORDER && GpuArray_CHKFLAGS(a, GA_C_CONTIGUOUS)) ||
      (ord == GA_F_ORDER && GpuArray_CHKFLAGS(a, GA_F_CONTIGUOUS))) {
    goto do_final_copy;
  }

  newstrides = calloc(nd, sizeof(ssize_t));
  if (newstrides == NULL)
    return GA_MEMORY_ERROR;

  while (ni < nd && oi < a->nd) {
    np = newdims[ni];
    op = a->dimensions[oi];

    while (np != op) {
      if (np < op) {
        np *= newdims[nj++];
      } else {
        op *= a->dimensions[oj++];
      }
    }

    for (ok = oi; ok < oj - 1; ok++) {
      if (ord == GA_F_ORDER) {
        if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok])
          goto need_copy;
      } else {
        if (a->strides[ok] != (ssize_t)a->dimensions[ok+1]*a->strides[ok+1])
          goto need_copy;
      }
    }

    if (ord == GA_F_ORDER) {
      newstrides[ni] = a->strides[oi];
      for (nk = ni + 1; nk < nj; nk++) {
        newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
      }
    } else {
      newstrides[nj-1] = a->strides[oj-1];
      for (nk = nj-1; nk > ni; nk--) {
        newstrides[nk-1] = newstrides[nk]*newdims[nk];
      }
    }
    ni = nj++;
    oi = oj++;
  }

  /* Fixup trailing ones */
  if (ord == GA_F_ORDER) {
    for (i = nj-1; i < nd; i++) {
      newstrides[i] = newstrides[i-1] * newdims[i-1];
    }
  } else {
    for (i = nj-1; i < nd; i++) {
      newstrides[i] = gpuarray_get_elsize(a->typecode);
    }
  }

  /* We can reuse newstrides since it was allocated in this function.
     Can't do the same with newdims (which is a parameter). */
  tmpdims = calloc(nd, sizeof(size_t));
  if (tmpdims == NULL) {
    free(newstrides);
    return GA_MEMORY_ERROR;
  }
  memcpy(tmpdims, newdims, nd*sizeof(size_t));
  a->nd = nd;
  free(a->dimensions);
  free(a->strides);
  a->dimensions = tmpdims;
  a->strides = newstrides;

  goto fix_flags;

need_copy:
  free(newstrides);
  return GA_COPY_ERROR;

do_final_copy:
  tmpdims = calloc(nd, sizeof(size_t));
  newstrides = calloc(nd, sizeof(ssize_t));
  if (tmpdims == NULL || newstrides == NULL) {
    free(tmpdims);
    free(newstrides);
    return GA_MEMORY_ERROR;
  }
  memcpy(tmpdims, newdims, nd*sizeof(size_t));
  if (nd > 0) {
    if (ord == GA_F_ORDER) {
      newstrides[0] = gpuarray_get_elsize(a->typecode);
      for (i = 1; i < nd; i++) {
        newstrides[i] = newstrides[i-1] * tmpdims[i-1];
      }
    } else {
      newstrides[nd-1] = gpuarray_get_elsize(a->typecode);
      for (i = nd-1; i > 0; i--) {
        newstrides[i-1] = newstrides[i] * tmpdims[i];
      }
    }
  }
  free(a->dimensions);
  free(a->strides);
  a->nd = nd;
  a->dimensions = tmpdims;
  a->strides = newstrides;

fix_flags:
  GpuArray_fix_flags(a);
  return GA_NO_ERROR;
}
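/* Usage sketch (illustrative): flattening a C-contiguous 12-element array
 * into (3, 4) only recomputes strides; no device data moves. If the input
 * strides cannot be reconciled (e.g. a transposed view), the function
 * returns GA_COPY_ERROR and the caller must fall back to a real copy. */
static int reshape_example(GpuArray *a /* C-contiguous, 12 elements */) {
  size_t newdims[2] = {3, 4};
  return GpuArray_reshape_inplace(a, 2, newdims, GA_C_ORDER);
}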
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1, const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1, const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0, const size_t padW = 0,
                           const size_t padD = 0) {
  if (PyGpuArray_NDIM(bottom) != 5) {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0], PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2], PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5) {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0], PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2], PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5) {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0], PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2], PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }

  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  // These intermediate values might be negative, so they are computed as
  // signed quantities and floored with Python-like integer division to be
  // compatible with get_conv_output.
  const ssize_t topHeightNoDH = (ssize_t)(bottomHeight + 2*padH) - (ssize_t)dil_kH;
  const ssize_t topWidthNoDW = (ssize_t)(bottomWidth + 2*padW) - (ssize_t)dil_kW;
  const ssize_t topDepthNoDD = (ssize_t)(bottomDepth + 2*padD) - (ssize_t)dil_kD;
  // note: this macro implements Python's floor division (//) for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
  const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, (ssize_t)dH) + 1;
  const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, (ssize_t)dW) + 1;
  const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, (ssize_t)dD) + 1;
#undef _CONV_FLOORDIV_X
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4]) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 "  bottom shape: %ld %ld %ld %ld %ld\n"
                 "  weight shape: %ld %ld %ld %ld %ld\n"
                 "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                 PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }

  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }

  // Create temporary columns
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(
      2, col_dim, bottom->ga.typecode, GA_C_ORDER, bottom->context, Py_None);
  if (NULL == col) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }

  // Define some useful variables
  const size_t bottom_stride =
      PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t top_stride =
      PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  const size_t K_ = col_dim[0];
  const size_t N_ = col_dim[1];
  const size_t M_ = nFilters;

  PyGpuArrayObject *output;
  if (direction == 0) {
    // forward pass
    output = top;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      switch (col->ga.typecode) {
      case GA_FLOAT:
        err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                            N_, M_, K_, 1,
                            col->ga.data, 0, N_,
                            weight->ga.data, 0, K_,
                            0, top->ga.data, n * top_stride, N_);
        break;
      case GA_DOUBLE:
        err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
                            N_, M_, K_, 1,
                            col->ga.data, 0, N_,
                            weight->ga.data, 0, K_,
                            0, top->ga.data, n * top_stride, N_);
        break;
      case GA_HALF:
        err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
                            N_, M_, K_, 1,
                            col->ga.data, 0, N_,
                            weight->ga.data, 0, K_,
                            0, top->ga.data, n * top_stride, N_);
        break;
      default:
        err = GA_UNSUPPORTED_ERROR;
      }
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM forward encountered an error running gemm.");
        Py_DECREF(col);
        return NULL;
      }
    }
  } else if (direction == 1) {
    // backprop wrt. weights
    output = weight;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid convolution: im3col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      switch (col->ga.typecode) {
      case GA_FLOAT:
        err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                            K_, M_, N_, 1,
                            col->ga.data, 0, N_,
                            top->ga.data, n * top_stride, N_,
                            (n == 0) ? 0 : 1,
                            weight->ga.data, 0, K_);
        break;
      case GA_DOUBLE:
        err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
                            K_, M_, N_, 1,
                            col->ga.data, 0, N_,
                            top->ga.data, n * top_stride, N_,
                            (n == 0) ? 0 : 1,
                            weight->ga.data, 0, K_);
        break;
      case GA_HALF:
        err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
                            K_, M_, N_, 1,
                            col->ga.data, 0, N_,
                            top->ga.data, n * top_stride, N_,
                            (n == 0) ? 0 : 1,
                            weight->ga.data, 0, K_);
        break;
      default:
        err = GA_UNSUPPORTED_ERROR;
      }
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad weights encountered an error running gemm.");
        Py_DECREF(col);
        return NULL;
      }
    }
    if (batchSize == 0) {
      err = GpuArray_memset(&weight->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad weights could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
    }
  } else if (direction == 2) {
    // backprop wrt. inputs
    output = bottom;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      switch (top->ga.typecode) {
      case GA_FLOAT:
        err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                            N_, K_, M_, 1,
                            top->ga.data, n * top_stride, N_,
                            weight->ga.data, 0, K_,
                            0, col->ga.data, 0, N_);
        break;
      case GA_DOUBLE:
        err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
                            N_, K_, M_, 1,
                            top->ga.data, n * top_stride, N_,
                            weight->ga.data, 0, K_,
                            0, col->ga.data, 0, N_);
        break;
      case GA_HALF:
        err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
                            N_, K_, M_, 1,
                            top->ga.data, n * top_stride, N_,
                            weight->ga.data, 0, K_,
                            0, col->ga.data, 0, N_);
        break;
      default:
        err = GA_UNSUPPORTED_ERROR;
      }
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad inputs encountered an error running gemm.");
        Py_DECREF(col);
        return NULL;
      }
      // col2im3d back to the data
      err = col2im3d(col->ga.data, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD,
                     bottom->ga.data, n * bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  // Free temporary columns
  Py_DECREF(col);
  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
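// Worked example of the shape arithmetic above (illustrative): with
// bottomHeight = 7, padH = 1, kH = 3, dilH = 2, the dilated kernel is
// dil_kH = (3-1)*2 + 1 = 5, so topHeightNoDH = 7 + 2*1 - 5 = 4 and
// topHeight = 4/1 + 1 = 5. For a negative intermediate such as
// topHeightNoDH = -3 with dH = 2, _CONV_FLOORDIV_X yields
// -(3/2) - 1 = -2 (flooring, like Python's -3 // 2), where C truncation
// would give -1.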