int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PARAMS_TYPE* params) { PyGpuContextObject *c = input->context; void *alpha_p; void *beta_p; float af = alpha, bf = beta; cudnnStatus_t err = CUDNN_STATUS_SUCCESS; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) { PyErr_SetString(PyExc_ValueError, "Number of filters must be divisible by number of groups"); return 1; } switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } if (params->inplace) { Py_XDECREF(*output); *output = om; Py_INCREF(*output); } else { if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; } if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { int err2 = GpuArray_memset(&(*output)->ga, 0); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv could not fill the output with zeros: %d", err2); return 1; } return 0; } if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) return 1; if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1) return 1; size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; cudnnConvolutionFwdAlgo_t algo = params->conv_algo; #ifdef DEBUG char algorithm_name[128]; #endif cuda_enter(c->ctx); if (params->choose_algo) { if (!params->choose_once) { reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } } if (!reuse_algo) { size_t free; int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU"); cuda_exit(c->ctx); return 1; } // Guess 4Mb if the info is not available if (free == 0) free = 4 * 1024 * 1024; if (params->choose_time) { int count; cudnnConvolutionFwdAlgoPerf_t choice; gpudata *tmpmem; tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); if (tmpmem == NULL) { PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); cuda_exit(c->ctx); return 1; } // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), 1, &count, &choice, *(void **)tmpmem, free); gpudata_release(tmpmem); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #ifdef DEBUG if (count == 0) { PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found"); return 1; } else if (choice.status != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting best-timed FWD algo: %s", cudnnGetErrorString(choice.status)); return 1; } // Else, count is necessarily 1 for current implementation. #endif } else { err = cudnnGetConvolutionForwardAlgorithm( params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } } prev_algo = algo; } else { algo = prev_algo; } #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; // NB: This is printed only when algorithm is chosen at runtime. if (reuse_algo) fprintf(stderr, "(reused %s)\n", algorithm_name); else fprintf(stderr, "(using %s)\n", algorithm_name); #endif if (params->choose_once) { reuse_algo = 1; } else { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } } } /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ if (PyGpuArray_NDIM(input) == 5 && !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) { #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1. // Issue should be resolved for cuDNN > V6.0. if (cudnnGetVersion() < 6100 && algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM && PyGpuArray_DIM(input, 0) > 65536) { #ifdef DEBUG fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM " "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n"); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. The tiled-FFT implementation // does not support strides. // If the chosen implementation is FFT or tiled-FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't. // The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters /* NB: TODO: These checks seem outdated for FFT algorithms with cuDNN >= 5.1. New conditions apply and may depend on number of dimensions (2D or 3D) e.g. for FFT_TILING. TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms? 
*/ if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int dilation[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } else { // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING if (stride[0] != 1 || stride[1] != 1) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } } { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err == CUDNN_STATUS_NOT_SUPPORTED) { // Fallback to none algo if not supported #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s error getting worksize: " "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); for ( int g = 0; g < params->num_groups; g++) { err = cudnnConvolutionForward( params->handle, alpha_p, APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g, APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); } if (worksize != 0) gpudata_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
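/*
 * Note on the grouped forward pass above: c_set_tensor_for_conv() and
 * c_set_filter() are assumed to build cuDNN descriptors that describe a
 * single group, and the loop then issues one cudnnConvolutionForward() call
 * per group.  The per-group byte offsets are derived from the existing
 * strides: the input/output pointers advance by one channel group (the batch
 * stride divided by num_groups) and the kernel pointer by one group's worth
 * of filters (total filter bytes divided by num_groups).
 */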
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, PyGpuArrayObject *gz, PyArrayObject *ws, PyArrayObject *stride, PyArrayObject *pad, PyGpuArrayObject **gx, PyGpuContextObject *ctx) { if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&gz->ga)) { PyErr_Format(PyExc_ValueError, "GpuAveragePoolGrad: requires data to be C-contiguous"); return 1; } size_t ndims = PyArray_DIM(ws, 0); if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(gz) != ndims + 2) { PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error"); return 1; } if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x), x->ga.typecode, GA_C_ORDER, ctx) != 0) { PyErr_SetString(PyExc_RuntimeError, "GpuAveragePoolGrad: failed to allocate memory"); return 1; } { // scope for running kernel size_t w[3]; size_t s[3]; size_t p[3]; for (int i = 0; i < ndims; i++) { w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i)); s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); } int err; const size_t* z_dims = PyGpuArray_DIMS(gz); const size_t* x_dims = PyGpuArray_DIMS(x); if (ndims == 2) { size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3]; err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, x_dims[0], x_dims[1], x_dims[2], x_dims[3], z_dims[2], z_dims[3], x->ga.data, gz->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], INC_PAD, SUM_MODE, (*gx)->ga.data); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.", GpuKernel_error(&k_ave_pool2d_grad_kernel, err)); return 1; } } else if (ndims == 3) { size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4]; err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], z_dims[2], z_dims[3], z_dims[4], x->ga.data, gz->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2], INC_PAD, SUM_MODE, (*gx)->ga.data); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.", GpuKernel_error(&k_ave_pool3d_grad_kernel, err)); return 1; } } } return 0; }
// Theano op code // Authors: Arjun Jain, Frederic Bastien, Jan Schluter // Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu // and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu // Adaptation for 3d PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, PyGpuArrayObject *const weight, PyGpuArrayObject *const top, const size_t direction, const size_t dH = 1, const size_t dW = 1, const size_t dD = 1, const size_t dilH = 1, const size_t dilW = 1, const size_t dilD = 1, const size_t padH = 0, const size_t padW = 0, const size_t padD = 0) { if (PyGpuArray_NDIM(bottom) != 5) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM requires bottom to be C-contiguous, " "but strides are: %ld %ld %ld %ld %ld\n", PyGpuArray_STRIDES(bottom)[0], PyGpuArray_STRIDES(bottom)[1], PyGpuArray_STRIDES(bottom)[2], PyGpuArray_STRIDES(bottom)[3], PyGpuArray_STRIDES(bottom)[4]); return NULL; } if (PyGpuArray_NDIM(weight) != 5) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM requires weight to be C-contiguous, " "but strides are: %ld %ld %ld %ld %ld\n", PyGpuArray_STRIDES(weight)[0], PyGpuArray_STRIDES(weight)[1], PyGpuArray_STRIDES(weight)[2], PyGpuArray_STRIDES(weight)[3], PyGpuArray_STRIDES(weight)[4]); return NULL; } if (PyGpuArray_NDIM(top) != 5) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&top->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM requires top to be C-contiguous, " "but strides are: %ld %ld %ld %ld %ld\n", PyGpuArray_STRIDES(top)[0], PyGpuArray_STRIDES(top)[1], PyGpuArray_STRIDES(top)[2], PyGpuArray_STRIDES(top)[3], PyGpuArray_STRIDES(top)[4]); return NULL; } // Extract some shape information for later and check shape consistency // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth) const size_t batchSize = PyGpuArray_DIMS(bottom)[0]; const size_t nChannels = PyGpuArray_DIMS(bottom)[1]; const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2]; const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3]; const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4]; // weights: (nFilters, nChannels, rows, columns, slices) const size_t nFilters = PyGpuArray_DIMS(weight)[0]; const size_t kH = PyGpuArray_DIMS(weight)[2]; const size_t kW = PyGpuArray_DIMS(weight)[3]; const size_t kD = PyGpuArray_DIMS(weight)[4]; if (nChannels != PyGpuArray_DIMS(weight)[1]) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM images and kernel must have the same stack size\n"); return NULL; } // implicit dilated filter const size_t dil_kH = (kH - 1) * dilH + 1; const size_t dil_kW = (kW - 1) * dilW + 1; const size_t dil_kD = (kD - 1) * dilD + 1; // top: (batchSize, nFilters, topHeight, topWidth, topDepth) const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1; const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1; const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1; if (batchSize != PyGpuArray_DIMS(top)[0] || nFilters != PyGpuArray_DIMS(top)[1] || topHeight != PyGpuArray_DIMS(top)[2] || topWidth != PyGpuArray_DIMS(top)[3] || topDepth != PyGpuArray_DIMS(top)[4]) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM shape inconsistency:\n" " bottom shape: %ld %ld %ld %ld %ld\n" " weight shape: %ld %ld %ld %ld 
%ld\n" " top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n", batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth, nFilters, nChannels, kH, kW, kD, PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4], batchSize, nFilters, topHeight, topWidth, topDepth); return NULL; } int err = gpublas_setup(bottom->context->ctx); if (err != GA_NO_ERROR) { PyErr_SetString(PyExc_RuntimeError, "Can't setup blas"); return NULL; } // Get the max threads per blocks size_t max_threads_dim; err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim); if (err != GA_NO_ERROR){ PyErr_Format(PyExc_RuntimeError, "Could not fetch max_threads_dim."); return NULL; } // Create temporary columns size_t col_dim[2]; col_dim[0] = nChannels * kW * kH * kD; col_dim[1] = topHeight * topWidth * topDepth; PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim, bottom->ga.typecode, GA_C_ORDER, bottom->context, Py_None); if (NULL == col) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n", col_dim[0], col_dim[1]); return NULL; } // Define some useful variables const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode); const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode); const size_t K_ = col_dim[0]; const size_t N_ = col_dim[1]; const size_t M_ = nFilters; const DTYPE_INPUT_0 one = 1.0f; const DTYPE_INPUT_0 zero = 0.0f; PyGpuArrayObject *output; if (direction == 0) { // forward pass output = top; // valid correlation: im3d2col, then gemm // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // First, im3d2col err = im3d2col(max_threads_dim, bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD, col->ga.data); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } // Second, gemm err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, one, col->ga.data, 0, N_, weight->ga.data, 0, K_, zero, top->ga.data, n * top_stride, N_); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM encountered an error running sgemm.\n"); Py_DECREF(col); return NULL; } } } else if (direction == 1) { // backprop wrt. weights output = weight; // valid convolution: im3col, then gemm // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // First, im3d2col err = im3d2col(max_threads_dim, bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD, col->ga.data); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } // Second, gemm // Note that we accumulate into weight. We do so by setting beta = 0 // for the first iteration and beta = 1 for subsequent ones. (This // is faster than setting weight to all zeros before the loop.) err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, one, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? zero : one, weight->ga.data, 0, K_); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM encountered an error running sgemm.\n"); Py_DECREF(col); return NULL; } } } else if (direction == 2) { // backprop wrt. 
inputs output = bottom; // full convolution: gemm, then col2im3d // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // gemm into columns err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, one, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, zero, col->ga.data, 0, N_); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM encountered an error running sgemm.\n"); Py_DECREF(col); return NULL; } // col2im3d back to the data err = col2im3d(max_threads_dim, col->ga.data, nChannels, bottomHeight, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD, bottom->ga.data, n * bottom_stride); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } } } // Free temporary columns Py_DECREF(col); // Note that we don't change the refcount of the output matrix here. Output // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper(); // in here output is just aliased to one of bottom, weights, or top. return output; }
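/*
 * Usage sketch for the corr3dMM() helper above (illustrative only, not
 * compiled): `direction` selects which of the three aliased buffers is
 * written.  The bottom/weight/top variables are assumed to be 5D,
 * C-contiguous PyGpuArrayObject pointers with consistent shapes, as checked
 * at the top of the function; the stride, dilation and padding arguments
 * keep their declared defaults (unit strides/dilation, zero padding) here.
 */
#if 0
PyGpuArrayObject *res;
res = corr3dMM(bottom, weight, top, 0);  /* forward pass: writes `top` */
res = corr3dMM(bottom, weight, top, 1);  /* gradient w.r.t. weights: writes `weight` */
res = corr3dMM(bottom, weight, top, 2);  /* gradient w.r.t. inputs: writes `bottom` */
if (res == NULL) { /* a Python exception has already been set */ }
#endif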
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*output); *output = om; Py_INCREF(*output); #else if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; #endif if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) return 1; cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO /* Static variables are only initialized once so this will not * reset the previous algo every time */ static int reuse_algo = 0; static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_kern_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionFwdAlgoPerf_t choice; err = cudnnFindConvolutionForwardAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionForwardAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } #endif #endif /* These two algos are not supported for 3d conv */ if (PyGpuArray_NDIM(input) == 5 && (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = 
cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } } #endif #if CUDNN_VERSION < 3000 /* cuDNN before v3 does not support kernels larger than input even * if appropriate padding is selected. */ for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) { if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) { PyErr_SetString(PyExc_RuntimeError, "the current version " "of CuDNN does not support kernels larger than the " "inputs in any spatial dimension, even if the inputs " "are padded such that the padded inputs are larger " "than the kernels. Update your installation of CuDNN " "to V3 or more recent to solve the issue."); cuda_exit(c->ctx); return 1; } } #endif { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } err = cudnnConvolutionForward( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); if (worksize != 0) c->ops->buffer_release(workspace); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
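/*
 * The conv_fwd variant above is the older, macro-configured code path: the
 * in-place and algorithm choices are baked in at compile time through
 * CONV_INPLACE, CONV_ALGO, CHOOSE_ALGO, CHOOSE_ONCE and CHOOSE_TIME, and the
 * cuDNN handle comes from APPLY_SPECIFIC(_handle).  The grouped conv_fwd
 * shown earlier makes the same decisions at run time through the PARAMS_TYPE
 * struct (params->inplace, params->conv_algo, params->choose_algo, ...) and
 * additionally supports grouped convolution via params->num_groups.
 */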
int APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, PyGpuArrayObject *km, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **kerns, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*kerns); *kerns = km; Py_INCREF(*kerns); #else if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), km->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*kerns, km)) return 1; #endif if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdFilterAlgoPerf_t choice; err = cudnnFindConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess) { cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. // If the chosen implementation is FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't. 
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride, upscale, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; } } size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardFilterWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardFilter_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x, PyGpuArrayObject *ex, PyArrayObject *ws, PyArrayObject *stride, PyArrayObject *pad, PyGpuArrayObject **z, PyGpuContextObject *ctx) { if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga)) { PyErr_Format(PyExc_ValueError, "GpuMaxPoolRop: requires data to be C-contiguous"); return 1; } size_t ndims = PyArray_DIM(ws, 0); if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2) { PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error"); return 1; } // prepare output const size_t* x_dims = PyGpuArray_DIMS(x); size_t z_dims[5]; // avoid warning if we use 2 + nd size_t w[3]; size_t s[3]; size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1]; int nonzero_padding = 0; for (int i = 0; i < ndims; i++) { w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i)); s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i)); p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i)); z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]); if (p[i] > 0) { nonzero_padding = 1; } } if (!IGNORE_BORDER && nonzero_padding) { PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: padding works only with ignore_border=True"); return 1; } if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims, ex->ga.typecode, GA_C_ORDER, ctx) != 0) { PyErr_SetString(PyExc_RuntimeError, "GpuMaxPoolRop: failed to allocate memory"); return 1; } { // scope for running kernel int err; if (ndims == 2) { size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3]; err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels, z_dims[0], z_dims[1], z_dims[2], z_dims[3], x_dims[2], x_dims[3], x->ga.data, ex->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], (*z)->ga.data); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuMaxPoolRop: max_pool2d_rop_kernel %s.", GpuKernel_error(&k_max_pool2d_rop_kernel, err)); return 1; } } else if (ndims == 3) { size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4]; err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels, z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], x_dims[2], x_dims[3], x_dims[4], x->ga.data, ex->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2], (*z)->ga.data); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuMaxPoolRop: max_pool3d_rop_kernel %s.", GpuKernel_error(&k_max_pool3d_rop_kernel, err)); return 1; } } } return 0; }
// Theano op code // Authors: Arjun Jain, Frederic Bastien, Jan Schluter // Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu // and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu // Adaptation for 3d PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, PyGpuArrayObject *const weight, PyGpuArrayObject *const top, const size_t direction, const size_t dH = 1, const size_t dW = 1, const size_t dD = 1, const size_t dilH = 1, const size_t dilW = 1, const size_t dilD = 1, const size_t padH = 0, const size_t padW = 0, const size_t padD = 0) { if (PyGpuArray_NDIM(bottom) != 5) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM requires bottom to be C-contiguous, " "but strides are: %ld %ld %ld %ld %ld\n", PyGpuArray_STRIDES(bottom)[0], PyGpuArray_STRIDES(bottom)[1], PyGpuArray_STRIDES(bottom)[2], PyGpuArray_STRIDES(bottom)[3], PyGpuArray_STRIDES(bottom)[4]); return NULL; } if (PyGpuArray_NDIM(weight) != 5) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM requires weight to be C-contiguous, " "but strides are: %ld %ld %ld %ld %ld\n", PyGpuArray_STRIDES(weight)[0], PyGpuArray_STRIDES(weight)[1], PyGpuArray_STRIDES(weight)[2], PyGpuArray_STRIDES(weight)[3], PyGpuArray_STRIDES(weight)[4]); return NULL; } if (PyGpuArray_NDIM(top) != 5) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&top->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM requires top to be C-contiguous, " "but strides are: %ld %ld %ld %ld %ld\n", PyGpuArray_STRIDES(top)[0], PyGpuArray_STRIDES(top)[1], PyGpuArray_STRIDES(top)[2], PyGpuArray_STRIDES(top)[3], PyGpuArray_STRIDES(top)[4]); return NULL; } // Extract some shape information for later and check shape consistency // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth) const size_t batchSize = PyGpuArray_DIMS(bottom)[0]; const size_t nChannels = PyGpuArray_DIMS(bottom)[1]; const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2]; const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3]; const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4]; // weights: (nFilters, nChannels, rows, columns, slices) const size_t nFilters = PyGpuArray_DIMS(weight)[0]; const size_t kH = PyGpuArray_DIMS(weight)[2]; const size_t kW = PyGpuArray_DIMS(weight)[3]; const size_t kD = PyGpuArray_DIMS(weight)[4]; if (nChannels != PyGpuArray_DIMS(weight)[1]) { PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM images and kernel must have the same stack size\n"); return NULL; } // implicit dilated filter const size_t dil_kH = (kH - 1) * dilH + 1; const size_t dil_kW = (kW - 1) * dilW + 1; const size_t dil_kD = (kD - 1) * dilD + 1; // top: (batchSize, nFilters, topHeight, topWidth, topDepth) const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH); const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW); const size_t topDepthNoDD = (bottomDepth + 2*padD - dil_kD); // the above values might be negative so we need to use Python-like // flooring integer division to be compatible with get_conv_output. // note: this macro implements Python's // for negative x only #define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 
0 : 1)) : (x / y)) const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1; const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1; const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1; #undef _CONV_FLOORDIV if (batchSize != PyGpuArray_DIMS(top)[0] || nFilters != PyGpuArray_DIMS(top)[1] || topHeight != PyGpuArray_DIMS(top)[2] || topWidth != PyGpuArray_DIMS(top)[3] || topDepth != PyGpuArray_DIMS(top)[4]) { PyErr_Format(PyExc_ValueError, "GpuCorr3dMM shape inconsistency:\n" " bottom shape: %ld %ld %ld %ld %ld\n" " weight shape: %ld %ld %ld %ld %ld\n" " top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n", batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth, nFilters, nChannels, kH, kW, kD, PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4], batchSize, nFilters, topHeight, topWidth, topDepth); return NULL; } int err = gpublas_setup(bottom->context->ctx); if (err != GA_NO_ERROR) { PyErr_SetString(PyExc_RuntimeError, "Can't setup blas"); return NULL; } // Create temporary columns size_t col_dim[2]; col_dim[0] = nChannels * kW * kH * kD; col_dim[1] = topHeight * topWidth * topDepth; PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim, bottom->ga.typecode, GA_C_ORDER, bottom->context, Py_None); if (NULL == col) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n", col_dim[0], col_dim[1]); return NULL; } // Define some useful variables const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode); const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode); const size_t K_ = col_dim[0]; const size_t N_ = col_dim[1]; const size_t M_ = nFilters; PyGpuArrayObject *output; if (direction == 0) { // forward pass output = top; if (batchSize == 0 || nChannels == 0 || nFilters == 0) { err = GpuArray_memset(&output->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } Py_DECREF(col); return output; } // valid correlation: im3d2col, then gemm // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // First, im3d2col err = im3d2col( bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD, col->ga.data); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } // Second, gemm switch (col->ga.typecode) { case GA_FLOAT: err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1, col->ga.data, 0, N_, weight->ga.data, 0, K_, 0, top->ga.data, n * top_stride, N_); break; case GA_DOUBLE: err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1, col->ga.data, 0, N_, weight->ga.data, 0, K_, 0, top->ga.data, n * top_stride, N_); break; case GA_HALF: err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1, col->ga.data, 0, N_, weight->ga.data, 0, K_, 0, top->ga.data, n * top_stride, N_); break; default: err = GA_UNSUPPORTED_ERROR; } if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM forward encountered an error running gemm."); Py_DECREF(col); return NULL; } } } else if (direction == 1) { // backprop wrt. weights output = weight; if (batchSize == 0 || nChannels == 0 || nFilters == 0) { err = GpuArray_memset(&output->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM grad wrt. 
weights could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } Py_DECREF(col); return output; } // valid convolution: im3col, then gemm // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // First, im3d2col err = im3d2col( bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD, col->ga.data); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } // Second, gemm // Note that we accumulate into weight. We do so by setting beta = 0 // for the first iteration and beta = 1 for subsequent ones. (This // is faster than setting weight to all zeros before the loop.) switch (col->ga.typecode) { case GA_FLOAT: err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? 0 : 1, weight->ga.data, 0, K_); break; case GA_DOUBLE: err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? 0 : 1, weight->ga.data, 0, K_); break; case GA_HALF: err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? 0 : 1, weight->ga.data, 0, K_); break; default: err = GA_UNSUPPORTED_ERROR; } if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM grad weights encountered an error running gemm."); Py_DECREF(col); return NULL; } } if (batchSize == 0) { err = GpuArray_memset(&weight->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM grad weights could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } } } else if (direction == 2) { // backprop wrt. inputs output = bottom; if (batchSize == 0 || nChannels == 0 || nFilters == 0) { err = GpuArray_memset(&output->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } Py_DECREF(col); return output; } // full convolution: gemm, then col2im3d // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // gemm into columns switch (top->ga.typecode) { case GA_FLOAT: err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, 0, col->ga.data, 0, N_); break; case GA_DOUBLE: err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, 0, col->ga.data, 0, N_); break; case GA_HALF: err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, 0, col->ga.data, 0, N_); break; default: err = GA_UNSUPPORTED_ERROR; } if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorr3dMM grad inputs encountered an error running gemm."); Py_DECREF(col); return NULL; } // col2im3d back to the data err = col2im3d(col->ga.data, nChannels, bottomHeight, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD, bottom->ga.data, n * bottom_stride); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } } } // Free temporary columns Py_DECREF(col); // Note that we don't change the refcount of the output matrix here. Output // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper(); // in here output is just aliased to one of bottom, weights, or top. return output; }
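/*
 * Unlike the earlier corr3dMM variant (which takes max_threads_dim and always
 * calls gpublas_sgemm), this version dispatches on ga.typecode to
 * gpublas_sgemm/dgemm/hgemm for float/double/half data, and it short-circuits
 * degenerate cases: when batchSize, nChannels or nFilters is zero it simply
 * zero-fills the array selected by `direction` and returns it.
 */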
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, PyArrayObject * in_labels, PyArrayObject * in_input_lengths, PyGpuArrayObject ** out_costs, PyGpuArrayObject ** out_gradients, PyGpuContextObject * gpu_context) { ctc_context_t ctc_object; ctc_context_t * context = &ctc_object; size_t gpu_workspace_size; int ctc_error = 0; const size_t num_activations = PyGpuArray_DIMS( in_activations )[0]; const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1]; const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2]; const size_t cost_size = minibatch_size; const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size }; float * costs = NULL, * activations = NULL, * gradients = NULL; cuda_enter( gpu_context->ctx ); ctc_context_init( context, gpu_context ); switch (in_activations->ga.typecode) { case GA_FLOAT: activations = (float *) PyGpuArray_DEV_DATA( in_activations ); break; default: ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_SetString( PyExc_TypeError, "GpuConnectionistTemporalClassification: Unsupported type for activations." ); return 1; } create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) ); if ( NULL == context->input_lengths ) { // Destroy previous CTC context before returning exception ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_Format( PyExc_MemoryError, "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." ); return 1; } // flatten labels to conform with library memory layout create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) ); if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) ) { // Destroy previous CTC context before returning exception ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_Format( PyExc_MemoryError, "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." ); return 1; } if ( theano_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode, GA_C_ORDER, gpu_context ) != 0 ) { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } GpuArray_memset( &((*out_costs)->ga), 0 ); costs = (float *) PyGpuArray_DEV_DATA( *out_costs ); if ( NULL != out_gradients ) // if gradient computation is not disabled { if ( theano_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode, GA_C_ORDER, gpu_context ) != 0 ) { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } GpuArray_memset( &((*out_gradients)->ga), 0 ); gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients ); } ctc_error = ctc_check_result( get_workspace_size( context->label_lengths, context->input_lengths, alphabet_size, minibatch_size, context->options, &gpu_workspace_size ), "Failed to obtain CTC workspace size." ); if ( ctc_error ) // Exception is set by ctc_check_result, return error here { // Destroy previous CTC context before returning exception ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL ); if ( NULL == context->workspace ) { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_Format( PyExc_MemoryError, "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." 
); return 1; } cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); if ( out_gradients != NULL ) cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients, context->flat_labels, context->label_lengths, context->input_lengths, alphabet_size, minibatch_size, costs, *(void **)context->workspace, context->options ), "Failed to compute CTC loss function." ); cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); if ( out_gradients != NULL ) cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); if ( ctc_error ) // Exception is set by ctc_check_result, return error here { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 0; }
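/*
 * Call sequence used by ctc_cost_gpu above: the label and length arrays are
 * first flattened on the host (create_contiguous_input_lengths,
 * create_flat_labels), get_workspace_size() then reports how much scratch
 * memory the CTC kernels need, the scratch buffer is allocated with
 * gpudata_alloc(), and compute_ctc_loss() fills the per-example costs and,
 * when out_gradients is non-NULL, the gradient array.  The
 * cuda_wait()/cuda_record() pairs order the computation against other work on
 * the involved buffers.  The get_workspace_size and compute_ctc_loss entry
 * points appear to come from the warp-ctc library.
 */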
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc, PyGpuArrayObject *w, PyGpuArrayObject *x, PyGpuArrayObject *hx, PyGpuArrayObject *cx, gpudata **reserve, PyGpuArrayObject **y, PyGpuArrayObject **hy, PyGpuArrayObject **cy, cudnnHandle_t _handle) { PyGpuContextObject *c = x->context; cudnnTensorDescriptor_t xdesc = NULL; cudnnTensorDescriptor_t hxdesc = NULL; cudnnTensorDescriptor_t cxdesc = NULL; cudnnTensorDescriptor_t ydesc = NULL; cudnnTensorDescriptor_t hydesc = NULL; cudnnTensorDescriptor_t cydesc = NULL; cudnnFilterDescriptor_t wdesc = NULL; cudnnTensorDescriptor_t *xl = NULL; cudnnTensorDescriptor_t *yl = NULL; gpudata *workspace = NULL; size_t worksize, ressize; size_t seqLength = PyGpuArray_DIM(x, 0); size_t miniBatch = PyGpuArray_DIM(x, 1); size_t inputSize = PyGpuArray_DIM(x, 2); size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2); size_t shape[3]; int strs[3], dims[3]; cudnnStatus_t err; cudnnDataType_t dt; int res = -1; switch (x->ga.typecode) { case GA_FLOAT: dt = CUDNN_DATA_FLOAT; break; case GA_DOUBLE: dt = CUDNN_DATA_DOUBLE; break; case GA_HALF: dt = CUDNN_DATA_HALF; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported data type for x"); return -1; } // This is early to match the exit() in the fail label. cuda_enter(c->ctx); err = cudnnCreateTensorDescriptor(&xdesc); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create xdesc: %s", cudnnGetErrorString(err)); goto fail; } dims[0] = PyGpuArray_DIM(x, 1); dims[1] = PyGpuArray_DIM(x, 2); dims[2] = 1; strs[0] = dims[1] * dims[2]; strs[1] = dims[2]; strs[2] = 1; err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %s", cudnnGetErrorString(err)); goto fail; } if (c_make_tensorNd(hx, &hxdesc) != 0) goto fail; if (cx != NULL) if (c_make_tensorNd(cx, &cxdesc) != 0) goto fail; if (c_make_filter(w, &wdesc) != 0) goto fail; shape[0] = seqLength; shape[1] = miniBatch; shape[2] = hiddenSizeDir; if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0) goto fail; err = cudnnCreateTensorDescriptor(&ydesc); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create ydesc: %s", cudnnGetErrorString(err)); goto fail; } dims[0] = shape[1]; dims[1] = shape[2]; dims[2] = 1; strs[0] = dims[2] * dims[1]; strs[1] = dims[2]; strs[2] = 1; err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not set ydesc: %s", cudnnGetErrorString(err)); goto fail; } if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx), hx->ga.typecode, GA_C_ORDER, c) != 0) goto fail; if (c_make_tensorNd(*hy, &hydesc) != 0) goto fail; if (cy != NULL) { if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx), cx->ga.typecode, GA_C_ORDER, c) != 0) goto fail; if (c_make_tensorNd(*cy, &cydesc) != 0) goto fail; } xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength); if (xl == NULL) { PyErr_NoMemory(); goto fail; } for (size_t i = 0; i < seqLength; i++) xl[i] = xdesc; yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength); if (yl == NULL) { PyErr_NoMemory(); goto fail; } for (size_t i = 0; i < seqLength; i++) yl[i] = ydesc; err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, xl, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not get worksize: %s", cudnnGetErrorString(err)); goto fail; } workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if 
(workspace == NULL) { PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace"); goto fail; } err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength, xl, &ressize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not get reserve size: %s", cudnnGetErrorString(err)); goto fail; } *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL); if (*reserve == NULL) { PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve"); goto fail; } err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength, xl, PyGpuArray_DEV_DATA(x), hxdesc, PyGpuArray_DEV_DATA(hx), cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL, wdesc, PyGpuArray_DEV_DATA(w), yl, PyGpuArray_DEV_DATA(*y), hydesc, PyGpuArray_DEV_DATA(*hy), cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL, *(void **)workspace, worksize, *(void **)(*reserve), ressize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not run RNN: %s", cudnnGetErrorString(err)); goto fail; } res = 0; fail: if (xdesc != NULL) cudnnDestroyTensorDescriptor(xdesc); if (hxdesc != NULL) cudnnDestroyTensorDescriptor(hxdesc); if (cxdesc != NULL) cudnnDestroyTensorDescriptor(cxdesc); if (wdesc != NULL) cudnnDestroyFilterDescriptor(wdesc); if (ydesc != NULL) cudnnDestroyTensorDescriptor(ydesc); if (hydesc != NULL) cudnnDestroyTensorDescriptor(hydesc); if (cydesc != NULL) cudnnDestroyTensorDescriptor(cydesc); free(xl); free(yl); if (workspace != NULL) gpudata_release(workspace); cuda_exit(c->ctx); return res; }
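/*
 * dnn_rnn_fwd builds a single per-timestep tensor descriptor for x and one
 * for y, then replicates the pointer seqLength times into the xl/yl arrays,
 * which is the layout cudnnRNNForwardTraining() expects.  The temporary
 * workspace is released in the shared cleanup path, while the reserve buffer
 * is handed back to the caller through *reserve, presumably so the matching
 * backward pass can reuse it.
 */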
int APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, PyGpuArrayObject *im, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **input, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " "stack size"); return 1; } if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; switch (im->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*input); *input = im; Py_INCREF(*input); #else if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), im->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*input, im)) return 1; #endif if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) return 1; cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_kern_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdDataAlgoPerf_t choice; err = cudnnFindConvolutionBackwardDataAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess) { cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardDataAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(*input, 
2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; } } #endif size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardDataWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(input), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardData_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }