static int c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc,
                        size_t groups) {
  cudnnDataType_t dt;
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }

  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }

  int dims[8];
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }

  /* Filters can't be less than 3d so we pad */
  for (unsigned int i = nd; i < 3; i++)
    dims[i] = 1;

  dims[0] = dims[0] / groups;

  if (nd < 3)
    nd = 3;

  err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s.",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
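// Worked example (standalone sketch, not part of the op): the descriptor
// dims are the filter shape with the leading (output-map) dimension divided
// by the group count, so it appears to describe the filters of a single
// group. The shapes below are illustrative assumptions, not values taken
// from the code above.
#include <assert.h>
#include <stddef.h>

static void example_grouped_filter_dims(void) {
  size_t groups = 4;
  /* hypothetical 2D conv filters of shape (64, 16, 3, 3):
     64 output maps, 16 input maps per group, 3x3 kernels */
  int dims[8] = {64, 16, 3, 3};
  unsigned int nd = 4;              /* already >= 3, so no padding to 3d */
  dims[0] = dims[0] / (int)groups;  /* per-group output maps */
  assert(dims[0] == 16 && dims[1] == 16 && dims[2] == 3 && dims[3] == 3);
  (void)nd;  /* these dims (with CUDNN_TENSOR_NCHW) go into the descriptor */
}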
static int c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  cudnnDataType_t dt;

  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }

  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
#if CUDNN_VERSION > 3000
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }

  int dims[5];
  unsigned int nd = PyGpuArray_NDIM(var);

  if (nd > 5) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
    return -1;
  }

  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }

  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s.",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
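// The two c_set_filter() variants above target different cuDNN generations:
// the first passes an explicit tensor-format argument, the second uses the
// older signature without it. A minimal sketch of how one code path could
// guard on the cuDNN version, assuming the format parameter became part of
// the default cudnnSetFilterNdDescriptor signature with the v5 API (adjust
// the version check to the headers actually in use; this is an assumption,
// not taken from the code above):
#include <cudnn.h>

static cudnnStatus_t set_filter_nd(cudnnFilterDescriptor_t desc,
                                   cudnnDataType_t dt, int nd, int *dims) {
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 5000
  /* v5+ signature: explicit memory layout for the filter */
  return cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
#else
  /* older signature: layout is implicitly NCHW */
  return cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
#endif
}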
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
  if (PyGpuArray_NDIM(bottom) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0],
                 PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2],
                 PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0],
                 PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2],
                 PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0],
                 PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2],
                 PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }

  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1])
  {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
  const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
  const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4])
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 " bottom shape: %ld %ld %ld %ld %ld\n"
                 " weight shape: %ld %ld %ld %ld %ld\n"
                 " top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                 PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }

  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }

  // Get the max threads per blocks
  size_t max_threads_dim;
  err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE,
                            &max_threads_dim);
  if (err != GA_NO_ERROR){
    PyErr_Format(PyExc_RuntimeError, "Could not fetch max_threads_dim.");
    return NULL;
  }

  // Create temporary columns
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                         bottom->ga.typecode,
                                                         GA_C_ORDER,
                                                         bottom->context,
                                                         Py_None);
  if (NULL == col)
  {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }

  // Define some useful variables
  const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  const size_t K_ = col_dim[0];
  const size_t N_ = col_dim[1];
  const size_t M_ = nFilters;
  const DTYPE_INPUT_0 one = 1.0f;
  const DTYPE_INPUT_0 zero = 0.0f;

  PyGpuArrayObject *output;
  if (direction == 0) {  // forward pass
    output = top;
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                          N_, M_, K_, one,
                          col->ga.data, 0, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          top->ga.data, n * top_stride, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 1) {  // backprop wrt. weights
    output = weight;
    // valid convolution: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                          K_, M_, N_, one,
                          col->ga.data, 0, N_,
                          top->ga.data, n * top_stride, N_,
                          (n == 0) ? zero : one,
                          weight->ga.data, 0, K_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 2) {  // backprop wrt. inputs
    output = bottom;
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                          N_, K_, M_, one,
                          top->ga.data, n * top_stride, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          col->ga.data, 0, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
      // col2im3d back to the data
      err = col2im3d(max_threads_dim,
                     col->ga.data, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD,
                     bottom->ga.data, n * bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  // Free temporary columns
  Py_DECREF(col);

  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
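// Shape bookkeeping behind the forward pass above, as a standalone sketch:
// for each batch element, im3d2col unrolls the input into a K_ x N_ column
// matrix (K_ = nChannels*kH*kW*kD, N_ = topHeight*topWidth*topDepth), and a
// single gemm with the M_ x K_ weight matrix (M_ = nFilters) produces the
// M_ x N_ output slab. The numbers below are illustrative only.
#include <stdio.h>
#include <stddef.h>

static void corr3d_forward_shapes_example(void) {
  size_t nChannels = 3, bottomH = 16, bottomW = 16, bottomD = 16;
  size_t nFilters = 8, kH = 3, kW = 3, kD = 3;
  size_t dH = 1, dW = 1, dD = 1, padH = 0, padW = 0, padD = 0;
  size_t dilH = 1, dilW = 1, dilD = 1;

  size_t dil_kH = (kH - 1) * dilH + 1;  /* effective (dilated) kernel size */
  size_t topH = (bottomH + 2 * padH - dil_kH) / dH + 1;               /* 14 */
  size_t topW = (bottomW + 2 * padW - ((kW - 1) * dilW + 1)) / dW + 1;
  size_t topD = (bottomD + 2 * padD - ((kD - 1) * dilD + 1)) / dD + 1;

  size_t K_ = nChannels * kH * kW * kD;  /* 81: rows of the column matrix  */
  size_t N_ = topH * topW * topD;        /* 2744: output positions         */
  size_t M_ = nFilters;                  /* 8: output feature maps         */
  printf("col: %zu x %zu, weight: %zu x %zu, top slab: %zu x %zu\n",
         K_, N_, M_, K_, M_, N_);
}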
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&gz->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuAveragePoolGrad: requires data to be C-contiguous");
    return 1;
  }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(gz) != ndims + 2) {
    PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error");
    return 1;
  }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuAveragePoolGrad: failed to allocate memory");
    return 1;
  }

  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    for (int i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);

    if (ndims == 2) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2],
                                         x_dims[3], z_dims[2], z_dims[3],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2],
                                         x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2],
                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
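// A CPU reference for what the 2d/3d gradient kernels above compute, reduced
// to one spatial dimension for intuition (a sketch only; the real kernels
// also cover the exclusive-padding averaging mode selected via INC_PAD and
// the sum mode selected via SUM_MODE). In include-pad averaging, every input
// position receives gz[k] / w from each window k that covers it; in sum mode
// the division is skipped.
#include <stddef.h>

static void ave_pool1d_grad_ref(const float *gz, size_t n_out,
                                float *gx, size_t n_in,
                                size_t w, size_t s, size_t p, int sum_mode) {
  for (size_t i = 0; i < n_in; i++)
    gx[i] = 0.0f;
  for (size_t k = 0; k < n_out; k++) {
    /* window k covers padded positions [k*s, k*s + w) */
    for (size_t j = 0; j < w; j++) {
      size_t pos = k * s + j;          /* position in the padded input */
      if (pos < p || pos >= n_in + p)
        continue;                      /* falls inside the zero padding */
      gx[pos - p] += sum_mode ? gz[k] : gz[k] / (float)w;
    }
  }
}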
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
                              PyGpuArrayObject **S,
                              PyGpuArrayObject **U,  // may be NULL
                              PyGpuArrayObject **VT, // may be NULL
                              PARAMS_TYPE* params) {
  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = std::min(M, N);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

  if (compute_uv) {
    if (params->full_matrices) {
      jobz = MagmaAllVec;
    } else {
      jobz = MagmaSomeVec;
    }
    M_U = (jobz == MagmaAllVec ? M : K);
    N_VT = (jobz == MagmaAllVec ? N : K);
    ldu = M;
    ldv = N_VT;

    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  } else {
    jobz = MagmaNoVec;
    ldu = M;
    ldv = N;
  }

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);

  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value",
        -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER,
                         params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (compute_uv) {
    u_dims[0] = N; u_dims[1] = N_VT;
    if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER,
                           params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and
    // vt_data -> U to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
               cudaMemcpyDeviceToDevice);

    vt_dims[0] = M_U; vt_dims[1] = M;
    if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER,
                           params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and
    // vt_data -> U to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
               cudaMemcpyDeviceToDevice);
  }
  res = 0;

fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
  cuda_exit(params->context->ctx);
  return res;
}
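// Why the U/VT buffers are exchanged above (worked identity, no extra code):
// the input A is C-contiguous (row-major), so when MAGMA reads the same
// buffer as a column-major M x N matrix it sees B = A^T, which is why the
// dimensions are reversed before the factorization. MAGMA computes
//
//     B = A^T = U_B * S * V_B^T
//
// and transposing both sides gives
//
//     A = V_B * S * U_B^T.
//
// So NumPy's U is MAGMA's V_B and NumPy's VT is MAGMA's U_B^T. Reading the
// column-major vt_data (V_B^T) back as a row-major (N, N_VT) array yields
// V_B, and reading the column-major u_data (U_B) back as a row-major
// (M_U, M) array yields U_B^T, which is why two plain cudaMemcpy calls with
// swapped destinations are enough and no explicit transpose kernel is needed.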
int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
                   int check_error) {
  size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0};
  size_t pl;
  gpudata *errbuf;
#if DEBUG
  char *errstr = NULL;
#endif
  GpuKernel k;
  unsigned int j;
  unsigned int argp;
  int err, kerr = 0;
  int addr32 = 0;

  if (!GpuArray_ISWRITEABLE(a))
    return GA_INVALID_ERROR;

  if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) ||
      !GpuArray_ISALIGNED(i))
    return GA_UNALIGNED_ERROR;

  /* a and i have to be C contiguous */
  if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i))
    return GA_INVALID_ERROR;

  /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */
  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd ||
      a->dimensions[0] != i->dimensions[0])
    return GA_INVALID_ERROR;

  n[0] = i->dimensions[0];
  n[1] = 1;

  for (j = 1; j < v->nd; j++) {
    if (a->dimensions[j] != v->dimensions[j])
      return GA_INVALID_ERROR;
    n[1] *= v->dimensions[j];
  }

  if (n[0] * n[1] < SADDR32_MAX) {
    addr32 = 1;
  }

  err = gpudata_property(v->data, GA_CTX_PROP_ERRBUF, &errbuf);
  if (err != GA_NO_ERROR)
    return err;

  err = gen_take1_kernel(&k, GpuArray_context(a),
#if DEBUG
                         &errstr,
#else
                         NULL,
#endif
                         a, v, i, addr32);
#if DEBUG
  if (errstr != NULL) {
    fprintf(stderr, "%s\n", errstr);
    free(errstr);
  }
#endif
  if (err != GA_NO_ERROR)
    return err;

  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
  if (err != GA_NO_ERROR)
    goto out;

  /* This may not be the best scheduling, but it's good enough */
  err = gpukernel_property(k.k, GA_KERNEL_PROP_PREFLSIZE, &pl);
  ls[0] = ls[1] / pl;
  ls[1] = pl;
  if (n[1] > n[0]) {
    pl = ls[0];
    ls[0] = ls[1];
    ls[1] = pl;
    gs[0] = 1;
  } else {
    gs[0] = gs[1];
    gs[1] = 1;
  }

  argp = 0;
  GpuKernel_setarg(&k, argp++, a->data);
  GpuKernel_setarg(&k, argp++, (void *)&a->offset);
  GpuKernel_setarg(&k, argp++, v->data);
  /* The cast is to avoid a warning about const */
  GpuKernel_setarg(&k, argp++, (void *)&v->offset);
  for (j = 0; j < v->nd; j++) {
    GpuKernel_setarg(&k, argp++, &v->strides[j]);
    GpuKernel_setarg(&k, argp++, &v->dimensions[j]);
  }
  GpuKernel_setarg(&k, argp++, i->data);
  GpuKernel_setarg(&k, argp++, (void *)&i->offset);
  GpuKernel_setarg(&k, argp++, &n[0]);
  GpuKernel_setarg(&k, argp++, &n[1]);
  GpuKernel_setarg(&k, argp++, errbuf);

  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
  if (check_error && err == GA_NO_ERROR) {
    err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
    if (err == GA_NO_ERROR && kerr != 0) {
      err = GA_VALUE_ERROR;
      kerr = 0;
      /* We suppose this will not fail */
      gpudata_write(errbuf, 0, &kerr, sizeof(int));
    }
  }
 out:
  GpuKernel_clear(&k);
  return err;
}
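/* Reference semantics of take1 as a host-side sketch (illustrative only; the
 * real work happens in the generated kernel, which operates on flattened row
 * blocks of n[1] elements and reports bad indices through the error buffer
 * read back above when check_error is set). The signed index type and the
 * treatment of out-of-range indices as errors are assumptions of this
 * sketch; the generated kernel's exact index handling may differ. */
#include <stddef.h>

static int take1_ref(float *out, const float *v, const ptrdiff_t *idx,
                     size_t n_idx, size_t n_rows, size_t row_len) {
  for (size_t k = 0; k < n_idx; k++) {
    ptrdiff_t j = idx[k];
    if (j < 0 || (size_t)j >= n_rows)
      return -1;                     /* mirrors the GA_VALUE_ERROR path */
    for (size_t c = 0; c < row_len; c++)
      out[k * row_len + c] = v[(size_t)j * row_len + c];  /* out[k] = v[idx[k]] */
  }
  return 0;
}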
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1], v_dims[2];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);

  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }

  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    return -1;
  }

  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);

  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  // query for workspace size
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N,
                   &lwork, -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an intermediate "
        "tridiagonal form did not converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value",
        -info);
    goto fail;
  }

  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER,
                         params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (params->compute_v) {
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;

fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
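// The eigendecomposition above uses the standard LAPACK/MAGMA two-call
// workspace-query idiom: calling the routine with lwork = -1 and liwork = -1
// performs no computation and instead writes the optimal sizes into the
// work/iwork arguments, which is why lwork is declared as a float above and
// only cast to an integer when the buffer is allocated. A query-only sketch
// of the same pattern follows (header name may differ between MAGMA
// versions, e.g. magma.h vs magma_v2.h; this is an assumption, not part of
// the op).
#include <magma.h>

static void eigh_workspace_query(magma_int_t N, magma_vec_t jobz,
                                 magma_uplo_t uplo,
                                 magma_int_t *lwork, magma_int_t *liwork) {
  float lwork_f;        /* receives the optimal float workspace size */
  magma_int_t info;
  /* query call: no GPU data is touched, only sizes are returned */
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N,
                   &lwork_f, -1, liwork, -1, &info);
  *lwork = (magma_int_t)lwork_f;
}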
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga)) {
    PyErr_Format(PyExc_ValueError,
                 "GpuMaxPoolRop: requires data to be C-contiguous");
    return 1;
  }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
    return 1;
  }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3];
  z_dims[0] = x_dims[0];
  z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!IGNORE_BORDER && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }

  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, ctx) != 0) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMaxPoolRop: failed to allocate memory");
    return 1;
  }

  {
    // scope for running kernel
    int err;

    if (ndims == 2) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2],
                                        z_dims[3], x_dims[2], x_dims[3],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2],
                                        z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
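// OUTPUT_DIMS above is defined elsewhere in the pooling support code; with
// ignore_border=True it is expected to reduce to the usual sliding-window
// count, sketched here with illustrative numbers (an assumption about that
// macro stated for clarity, not a redefinition of it).
#include <assert.h>
#include <stddef.h>

static size_t pool_out_dim_ignore_border(size_t in, size_t w, size_t s,
                                         size_t p) {
  /* number of full windows of width w, stride s, over the padded extent */
  return (in + 2 * p - w) / s + 1;
}

static void pool_out_dim_example(void) {
  /* pooling a 32-wide map with a 3-wide window, stride 2, padding 1 */
  assert(pool_out_dim_ignore_border(32, 3, 2, 1) == 16);
}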
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
  if (PyGpuArray_NDIM(bottom) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0],
                 PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2],
                 PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0],
                 PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2],
                 PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0],
                 PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2],
                 PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }

  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1])
  {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
  const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
  const size_t topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
  // the above values might be negative so we need to use Python-like
  // flooring integer division to be compatible with get_conv_output.
  // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
  const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
  const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
  const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV_X
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4])
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 " bottom shape: %ld %ld %ld %ld %ld\n"
                 " weight shape: %ld %ld %ld %ld %ld\n"
                 " top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                 PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }

  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }

  // Create temporary columns
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                         bottom->ga.typecode,
                                                         GA_C_ORDER,
                                                         bottom->context,
                                                         Py_None);
  if (NULL == col)
  {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }

  // Define some useful variables
  const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  const size_t K_ = col_dim[0];
  const size_t N_ = col_dim[1];
  const size_t M_ = nFilters;

  PyGpuArrayObject *output;
  if (direction == 0) {  // forward pass
    output = top;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      switch (col->ga.typecode) {
      case GA_FLOAT:
        err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                            N_, M_, K_, 1,
                            col->ga.data, 0, N_,
                            weight->ga.data, 0, K_,
                            0,
                            top->ga.data, n * top_stride, N_);
        break;
      case GA_DOUBLE:
        err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
                            N_, M_, K_, 1,
                            col->ga.data, 0, N_,
                            weight->ga.data, 0, K_,
                            0,
                            top->ga.data, n * top_stride, N_);
        break;
      case GA_HALF:
        err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
                            N_, M_, K_, 1,
                            col->ga.data, 0, N_,
                            weight->ga.data, 0, K_,
                            0,
                            top->ga.data, n * top_stride, N_);
        break;
      default:
        err = GA_UNSUPPORTED_ERROR;
      }
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM forward encountered an error running gemm.");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 1) {  // backprop wrt. weights
    output = weight;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid convolution: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(bottom->ga.data, n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      switch (col->ga.typecode) {
      case GA_FLOAT:
        err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                            K_, M_, N_, 1,
                            col->ga.data, 0, N_,
                            top->ga.data, n * top_stride, N_,
                            (n == 0) ? 0 : 1,
                            weight->ga.data, 0, K_);
        break;
      case GA_DOUBLE:
        err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
                            K_, M_, N_, 1,
                            col->ga.data, 0, N_,
                            top->ga.data, n * top_stride, N_,
                            (n == 0) ? 0 : 1,
                            weight->ga.data, 0, K_);
        break;
      case GA_HALF:
        err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
                            K_, M_, N_, 1,
                            col->ga.data, 0, N_,
                            top->ga.data, n * top_stride, N_,
                            (n == 0) ? 0 : 1,
                            weight->ga.data, 0, K_);
        break;
      default:
        err = GA_UNSUPPORTED_ERROR;
      }
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad weights encountered an error running gemm.");
        Py_DECREF(col);
        return NULL;
      }
    }
    if (batchSize == 0) {
      err = GpuArray_memset(&weight->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad weights could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 2) {  // backprop wrt. inputs
    output = bottom;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d",
                     err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      switch (top->ga.typecode) {
      case GA_FLOAT:
        err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                            N_, K_, M_, 1,
                            top->ga.data, n * top_stride, N_,
                            weight->ga.data, 0, K_,
                            0,
                            col->ga.data, 0, N_);
        break;
      case GA_DOUBLE:
        err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
                            N_, K_, M_, 1,
                            top->ga.data, n * top_stride, N_,
                            weight->ga.data, 0, K_,
                            0,
                            col->ga.data, 0, N_);
        break;
      case GA_HALF:
        err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
                            N_, K_, M_, 1,
                            top->ga.data, n * top_stride, N_,
                            weight->ga.data, 0, K_,
                            0,
                            col->ga.data, 0, N_);
        break;
      default:
        err = GA_UNSUPPORTED_ERROR;
      }
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad inputs encountered an error running gemm.");
        Py_DECREF(col);
        return NULL;
      }
      // col2im3d back to the data
      err = col2im3d(col->ga.data, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD,
                     bottom->ga.data, n * bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  // Free temporary columns
  Py_DECREF(col);

  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
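// Worked check of the flooring division used above (standalone, on plain
// ints): C's "/" truncates toward zero, while the macro reproduces Python's
// floor division for negative numerators, which is what the output-shape
// computation relies on when the padded input is smaller than the dilated
// kernel.
#include <assert.h>

#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))

static void floordiv_example(void) {
  assert(-3 / 2 == -1);                   /* C truncation toward zero   */
  assert(_CONV_FLOORDIV_X(-3, 2) == -2);  /* Python: -3 // 2 == -2      */
  assert(_CONV_FLOORDIV_X(-4, 2) == -2);  /* exact division unchanged   */
  assert(_CONV_FLOORDIV_X(5, 2) == 2);    /* non-negative x: same as C  */
}

#undef _CONV_FLOORDIV_X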