static int maxandargmaxInvoke(maxandargmax_ctx* ctx){
  void* args[11];

  /**
   * Argument Marshalling. This is the grossest gross thing in here.
   */

  const int flags       = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT;
  ctx->srcStepsGD       = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
                                        ctx->src->strides,       flags, 0);
  ctx->srcSizeGD        = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
                                        ctx->src->dimensions,    flags, 0);
  ctx->chunkSizeGD      = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
                                        ctx->chunkSize,          flags, 0);
  ctx->dstMaxStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
                                        ctx->dstMax->strides,    flags, 0);
  ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
                                        ctx->dstArgmax->strides, flags, 0);

  args[ 0] = (void*) ctx->src->data;
  args[ 1] = (void*)&ctx->src->offset;
  args[ 2] = (void*) ctx->srcStepsGD;
  args[ 3] = (void*) ctx->srcSizeGD;
  args[ 4] = (void*) ctx->chunkSizeGD;
  args[ 5] = (void*) ctx->dstMax->data;
  args[ 6] = (void*)&ctx->dstMax->offset;
  args[ 7] = (void*) ctx->dstMaxStepsGD;
  args[ 8] = (void*) ctx->dstArgmax->data;
  args[ 9] = (void*)&ctx->dstArgmax->offset;
  args[10] = (void*) ctx->dstArgmaxStepsGD;

  if(ctx->srcStepsGD    && ctx->srcSizeGD     &&
     ctx->chunkSizeGD   && ctx->dstMaxStepsGD &&
     ctx->dstArgmaxStepsGD){
    ctx->ret = GpuKernel_call(&ctx->kernel,
                              ctx->ndh>0 ? ctx->ndh : 1,
                              ctx->gridSize,
                              ctx->blockSize,
                              0,
                              args);
  }else{
    ctx->ret = GA_MEMORY_ERROR;
  }

  gpudata_release(ctx->srcStepsGD);
  gpudata_release(ctx->srcSizeGD);
  gpudata_release(ctx->chunkSizeGD);
  gpudata_release(ctx->dstMaxStepsGD);
  gpudata_release(ctx->dstArgmaxStepsGD);

  return ctx->ret;
}
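/*
 * Hedged sketch (not part of the library): the marshalling above repeats one
 * pattern -- upload a host size_t array as a read-only, initialized device
 * buffer.  A hypothetical helper such as the one below could factor it out;
 * the name marshalSizeTArray is illustrative only.
 */
static gpudata* marshalSizeTArray(gpucontext* gpuCtx, const size_t* hostArr, size_t n){
  /* GA_BUFFER_INIT copies hostArr into the freshly allocated buffer;
   * GA_BUFFER_READ_ONLY documents that the kernel never writes it. */
  return gpudata_alloc(gpuCtx, n * sizeof(size_t), (void*)hostArr,
                       GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, 0);
}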
void GpuArray_clear(GpuArray *a) {
  if (a->data)
    gpudata_release(a->data);
  free(a->dimensions);
  free(a->strides);
  memset(a, 0, sizeof(*a));
}
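/*
 * Hedged usage note (hypothetical caller, not library code): because
 * GpuArray_clear() zeroes the whole struct after releasing its buffers,
 * clearing an already-cleared array is harmless.
 */
static void clear_twice_example(GpuArray *a) {
  GpuArray_clear(a);  /* releases a->data, frees dimensions/strides, zeroes *a */
  GpuArray_clear(a);  /* safe no-op: a->data is NULL and free(NULL) is defined */
}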
int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf,
                            int typecode, unsigned int nd, const size_t *dims,
                            const ssize_t *strides) {
  char *base = (char *)buf;
  size_t offset = 0;
  size_t size = gpuarray_get_elsize(typecode);
  gpudata *b;
  int err;
  unsigned int i;

  for (i = 0; i < nd; i++) {
    if (dims[i] == 0) {
      size = 0;
      base = (char *)buf;
      break;
    }
    if (strides[i] < 0)
      base += (dims[i]-1) * strides[i];
    else
      size += (dims[i]-1) * strides[i];
  }

  offset = (char *)buf - base;
  size += offset;

  b = gpudata_alloc(ctx, size, base, GA_BUFFER_INIT, &err);
  if (b == NULL) return err;

  err = GpuArray_fromdata(a, b, offset, typecode, nd, dims, strides, 1);
  gpudata_release(b);
  return err;
}
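/*
 * Hedged usage sketch (assumed caller, not from the library): copying a small
 * C-contiguous 2x3 float matrix from host memory.  For this layout the loop
 * above leaves base == buf and offset == 0, and the transferred size is
 *   elsize + (dims[0]-1)*strides[0] + (dims[1]-1)*strides[1]
 *     = 4 + 1*12 + 2*4 = 24 bytes.
 */
static int copy_matrix_example(GpuArray *a, gpucontext *ctx, float host[2][3]) {
  const size_t dims[2] = {2, 3};
  const ssize_t strides[2] = {3 * sizeof(float), sizeof(float)};
  return GpuArray_copy_from_host(a, ctx, host, GA_FLOAT, 2, dims, strides);
}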
void ctc_context_destroy(ctc_context_t * context)
{
  gpudata_release( context->workspace );

  free( context->input_lengths );
  free( context->flat_labels );
  free( context->label_lengths );
}
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                             PyGpuArrayObject *om,
                             cudnnConvolutionDescriptor_t desc,
                             double alpha, double beta,
                             PyGpuArrayObject **output,
                             PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
                    "images and kernel must have the same stack size");
    return 1;
  }

  if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "Number of filters must be divisible by number of groups");
    return 1;
  }

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

  if (params->inplace) {
    Py_XDECREF(*output);
    *output = om;
    Py_INCREF(*output);
  } else {
    if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                           om->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*output, om))
      return 1;
  }

  if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 ||
      PyGpuArray_DIMS(kerns)[1] == 0) {
    int err2 = GpuArray_memset(&(*output)->ga, 0);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv could not fill the output with zeros: %d", err2);
      return 1;
    }
    return 0;
  }

  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
    return 1;

  size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
  size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;

  cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
#ifdef DEBUG
  char algorithm_name[128];
#endif

  cuda_enter(c->ctx);

  if (params->choose_algo) {
    if (!params->choose_once) {
      reuse_algo = 1;
      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(input, i) == prev_img_dims[i]);
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      }
    }

    if (!reuse_algo) {
      size_t free;

      int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
      if (err2 != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                     "memory information on the GPU");
        cuda_exit(c->ctx);
        return 1;
      }

      // Guess 4Mb if the info is not available
      if (free == 0)
        free = 4 * 1024 * 1024;

      if (params->choose_time) {
        int count;
        cudnnConvolutionFwdAlgoPerf_t choice;
        gpudata *tmpmem;

        tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          return -1;
        }

        // We don't sync the buffer as we don't care about the values.
        err = cudnnFindConvolutionForwardAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
          1, &count, &choice, *(void **)tmpmem, free);
        gpudata_release(tmpmem);

        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
        algo = choice.algo;

#ifdef DEBUG
        if (count == 0) {
          PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
          return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error getting best-timed FWD algo: %s",
                       cudnnGetErrorString(choice.status));
          return 1;
        } // Else, count is necessarily 1 for current implementation.
#endif
      } else {
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
          desc, APPLY_SPECIFIC(output),
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
      }
      prev_algo = algo;
    } else {
      algo = prev_algo;
    }

#ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    // NB: This is printed only when algorithm is chosen at runtime.
    if (reuse_algo)
      fprintf(stderr, "(reused %s)\n", algorithm_name);
    else
      fprintf(stderr, "(using %s)\n", algorithm_name);
#endif

    if (params->choose_once) {
      reuse_algo = 1;
    } else {
      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
        prev_img_dims[i] = PyGpuArray_DIM(input, i);
        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      }
    }
  }

  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) {
#ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
            algorithm_name);
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  if (cudnnGetVersion() < 6100 &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536) {
#ifdef DEBUG
    fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
            "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific, but that is fine as FFT and tiled-FFT
  // are defined only for 2d filters.
  /* NB:
     TODO: These checks seem outdated for FFT algorithms with cuDNN >= 5.1.
           New conditions apply and may depend on the number of dimensions
           (2D or 3D), e.g. for FFT_TILING.
     TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with
           unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
      PyGpuArray_NDIM(input) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  {
    size_t worksize;
    gpudata *workspace;
    err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);
    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported
#ifdef DEBUG
      if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
      fprintf(stderr, "(%s error getting worksize: "
              "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    algo,
                                                    &worksize);
    }

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point). But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    for (int g = 0; g < params->num_groups; g++) {
      err = cudnnConvolutionForward(
        params->handle,
        alpha_p,
        APPLY_SPECIFIC(input),
        ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
        APPLY_SPECIFIC(kerns),
        ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
        desc, algo,
        worksize == 0 ? NULL : *(void **)workspace, worksize,
        beta_p,
        APPLY_SPECIFIC(output),
        ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
    }

    if (worksize != 0)
      gpudata_release(workspace);

    cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  }
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
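/*
 * Hedged sketch (illustrative only, hypothetical helper name): how the
 * per-group byte offsets used in the cudnnConvolutionForward() loop above are
 * derived.  With num_groups groups, the input/output channel block and the
 * filter count are split evenly, so group g starts at a fixed byte offset
 * from each array's base pointer.
 */
static void conv_group_offsets(size_t input_stride0, size_t kern_stride0,
                               size_t kern_dim0, size_t output_stride0,
                               int num_groups, int g,
                               size_t *in_off, size_t *kern_off, size_t *out_off) {
  *in_off   = (input_stride0 / num_groups) * g;             /* bytes into input   */
  *kern_off = (kern_stride0 * kern_dim0 / num_groups) * g;  /* bytes into kernels */
  *out_off  = (output_stride0 / num_groups) * g;            /* bytes into output  */
}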
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc, PyGpuArrayObject *w,
                PyGpuArrayObject *x, PyGpuArrayObject *hx,
                PyGpuArrayObject *cx, gpudata **reserve,
                PyGpuArrayObject **y, PyGpuArrayObject **hy,
                PyGpuArrayObject **cy, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;

  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  xl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

fail:
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
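/*
 * Hedged sketch (illustrative, not library code): cudnnRNNForwardTraining()
 * expects one tensor descriptor per timestep.  Since every timestep of x (and
 * of y) has the same shape, the code above simply repeats a single 3-D
 * descriptor seqLength times; a hypothetical helper doing just that:
 */
static cudnnTensorDescriptor_t *repeat_desc(cudnnTensorDescriptor_t d,
                                            size_t seqLength) {
  cudnnTensorDescriptor_t *l =
    (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (l == NULL)
    return NULL;  /* caller reports PyErr_NoMemory() */
  for (size_t i = 0; i < seqLength; i++)
    l[i] = d;  /* entries alias one descriptor: destroy it once, then free(l) */
  return l;
}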