Example #1
static int   maxandargmaxInvoke                 (maxandargmax_ctx*  ctx){
	void* args[11];

	/**
	 * Argument Marshalling. This is the grossest gross thing in here.
	 */

	const int flags       = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT;
	ctx->srcStepsGD       = gpudata_alloc(ctx->gpuCtx, ctx->nds    * sizeof(size_t),
	                                      ctx->src->strides,       flags, 0);
	ctx->srcSizeGD        = gpudata_alloc(ctx->gpuCtx, ctx->nds    * sizeof(size_t),
	                                      ctx->src->dimensions,    flags, 0);
	ctx->chunkSizeGD      = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
	                                      ctx->chunkSize,          flags, 0);
	ctx->dstMaxStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
	                                      ctx->dstMax->strides,    flags, 0);
	ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
	                                      ctx->dstArgmax->strides, flags, 0);
	args[ 0] = (void*) ctx->src->data;
	args[ 1] = (void*)&ctx->src->offset;
	args[ 2] = (void*) ctx->srcStepsGD;
	args[ 3] = (void*) ctx->srcSizeGD;
	args[ 4] = (void*) ctx->chunkSizeGD;
	args[ 5] = (void*) ctx->dstMax->data;
	args[ 6] = (void*)&ctx->dstMax->offset;
	args[ 7] = (void*) ctx->dstMaxStepsGD;
	args[ 8] = (void*) ctx->dstArgmax->data;
	args[ 9] = (void*)&ctx->dstArgmax->offset;
	args[10] = (void*) ctx->dstArgmaxStepsGD;

	if(ctx->srcStepsGD      &&
	   ctx->srcSizeGD       &&
	   ctx->chunkSizeGD     &&
	   ctx->dstMaxStepsGD   &&
	   ctx->dstArgmaxStepsGD){
		ctx->ret = GpuKernel_call(&ctx->kernel,
		                          ctx->ndh>0 ? ctx->ndh : 1,
		                          ctx->gridSize,
		                          ctx->blockSize,
		                          0,
		                          args);
	}else{
		ctx->ret = GA_MEMORY_ERROR;
	}

	gpudata_release(ctx->srcStepsGD);
	gpudata_release(ctx->srcSizeGD);
	gpudata_release(ctx->chunkSizeGD);
	gpudata_release(ctx->dstMaxStepsGD);
	gpudata_release(ctx->dstArgmaxStepsGD);

	return ctx->ret;
}
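
Each small host-side array above (strides, sizes, chunk sizes) is uploaded as its own read-only device buffer in a single gpudata_alloc call, passed to the kernel through args[], and released once the call has been issued. Below is a minimal sketch of that allocate-initialize-release step, not taken from the original project and assuming only the gpudata_alloc/gpudata_release signatures already used above:

static int   uploadSizeTArray                   (gpucontext* gpuCtx,
                                                 const size_t* hostArr,
                                                 size_t n, gpudata** out){
	int err = GA_NO_ERROR;

	/* One call both allocates the device buffer and copies the host data
	 * into it, because GA_BUFFER_INIT is set. */
	*out = gpudata_alloc(gpuCtx, n * sizeof(size_t), (void*)hostArr,
	                     GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &err);
	if(!*out){
		return err;   /* err holds the GA_* error code on failure */
	}

	/* The caller stores *out in its args[] array for GpuKernel_call() and
	 * calls gpudata_release(*out) afterwards, exactly as Example #1 does. */
	return GA_NO_ERROR;
}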
Example #2
void GpuArray_clear(GpuArray *a) {
  if (a->data)
    gpudata_release(a->data);
  free(a->dimensions);
  free(a->strides);
  memset(a, 0, sizeof(*a));
}
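
GpuArray_clear is the teardown half of the GpuArray lifecycle: it drops the reference on the device buffer, frees the host-side dimensions/strides arrays, and zeroes the struct so a second clear is harmless. A hedged usage sketch pairing it with an allocation follows; the GpuArray_empty signature (context, typecode, rank, dims, order) is an assumption here, not taken from this page:

static int make_and_clear(gpucontext *ctx) {
  GpuArray a;
  size_t dims[2] = {4, 8};
  /* Assumed signature: GpuArray_empty(&a, ctx, typecode, nd, dims, order). */
  int err = GpuArray_empty(&a, ctx, GA_FLOAT, 2, dims, GA_C_ORDER);
  if (err != GA_NO_ERROR)
    return err;
  /* ... use a.data, a.dimensions, a.strides ... */
  GpuArray_clear(&a);  /* releases a.data, frees dimensions/strides, zeroes a */
  return GA_NO_ERROR;
}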
Example #3
int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf,
                            int typecode, unsigned int nd, const size_t *dims,
                            const ssize_t *strides) {
  char *base = (char *)buf;
  size_t offset = 0;
  size_t size = gpuarray_get_elsize(typecode);
  gpudata *b;
  int err;
  unsigned int i;

  for (i = 0; i < nd; i++) {
    if (dims[i] == 0) {
      size = 0;
      base = (char *)buf;
      break;
    }

    if (strides[i] < 0)
      base += (dims[i]-1) * strides[i];
    else
      size += (dims[i]-1) * strides[i];
  }
  offset = (char *)buf - base;
  size += offset;

  b = gpudata_alloc(ctx, size, base, GA_BUFFER_INIT, &err);
  if (b == NULL) return err;

  err = GpuArray_fromdata(a, b, offset, typecode, nd, dims, strides, 1);
  gpudata_release(b);
  return err;
}
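
A hedged usage sketch of GpuArray_copy_from_host: uploading a small, C-contiguous host matrix. Strides are given in bytes, which is what the size computation above assumes; ctx is any valid gpucontext*.

static int upload_host_matrix(GpuArray *a, gpucontext *ctx) {
  static float host[2][3] = {{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}};
  size_t dims[2] = {2, 3};
  ssize_t strides[2] = {3 * sizeof(float), sizeof(float)};  /* byte strides, C order */
  /* Copies the buffer to a fresh device allocation and wraps it in *a. */
  return GpuArray_copy_from_host(a, ctx, host, GA_FLOAT, 2, dims, strides);
}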
Example #4
void ctc_context_destroy(ctc_context_t * context)
{
    gpudata_release( context->workspace );

    free( context->input_lengths );

    free( context->flat_labels );

    free( context->label_lengths );
}
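
The three free() calls above are safe on NULL pointers, but the workspace is released unconditionally, so the context is expected to hold a valid gpudata handle by the time it is destroyed. A hypothetical creation counterpart (not taken from the original source; the real ctc_context_t may have more members) that establishes that invariant through the same buffer API:

static int ctc_context_create(ctc_context_t * context, gpucontext * gpu_ctx,
                              size_t workspace_size)
{
    /* Hypothetical helper: field names are taken from ctc_context_destroy()
       above. Host-side arrays start out NULL so free() in destroy is a no-op
       until they are actually filled in. */
    context->flat_labels   = NULL;
    context->label_lengths = NULL;
    context->input_lengths = NULL;

    context->workspace = gpudata_alloc( gpu_ctx, workspace_size, NULL, 0, NULL );

    return context->workspace != NULL ? 0 : -1;
}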
Example #5
File: dnn_fwd.c Project: athiwatp/Theano
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         PyGpuArrayObject *om,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
                         PyGpuArrayObject **output,
                         PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
		    "images and kernel must have the same stack size");
    return 1;
  }
  if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
		    "Number of filters must be divisible by number of groups");
    return 1;
  }

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

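  /* cuDNN computes *output = alpha * conv(input, kerns) + beta * *output, so
   * when beta != 0 the previous output values (om) must already be present in
   * *output: either by aliasing om in-place or by copying it into the freshly
   * prepared buffer. */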
  if (params->inplace) {
    Py_XDECREF(*output);
    *output = om;
    Py_INCREF(*output);
  } else {
    if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                           om->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*output, om))
      return 1;
  }

  if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
    int err2 = GpuArray_memset(&(*output)->ga, 0);
    if (err2 != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConv could not fill the output with zeros: %d", err2);
        return 1;
    }
    return 0;
  }

  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
    return 1;
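  /* Byte offsets that advance the raw device pointers from one group to the
   * next in the grouped-convolution loop below: the batch stride of the
   * input/output spans all channels / feature maps of one sample, so one
   * group's share is 1/num_groups of it, while the kernel offset skips one
   * group's block of filters. */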
  size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
  size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;

  cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
  #ifdef DEBUG
  char algorithm_name[128];
  #endif

  cuda_enter(c->ctx);

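  /* Algorithm selection: when choose_algo is set, either reuse the algorithm
   * picked on a previous call (choose_once, or unchanged input/kernel shapes)
   * or ask cuDNN again, by exhaustive timing (choose_time) or by the
   * workspace-limited heuristic. */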
  if (params->choose_algo) {
    if (!params->choose_once) {
      reuse_algo = 1;
      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(input, i) == prev_img_dims[i]);
        reuse_algo = (reuse_algo &&
                      PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
      }
    }

    if (!reuse_algo) {
      size_t free;

      int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
      if (err2 != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                     "memory information on the GPU");
        cuda_exit(c->ctx);
        return 1;
      }

      // Guess 4Mb if the info is not available
      if (free == 0) free = 4 * 1024 * 1024;

      if (params->choose_time) {
        int count;
        cudnnConvolutionFwdAlgoPerf_t choice;
        gpudata *tmpmem;

        tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          cuda_exit(c->ctx);
          return 1;
        }
        // We don't sync the buffer as we don't care about the values.
        err = cudnnFindConvolutionForwardAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
          1, &count, &choice, *(void **)tmpmem,
          free);
        gpudata_release(tmpmem);

        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
        algo = choice.algo;

        #ifdef DEBUG
        if (count == 0) {
            PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
            return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "error getting best-timed FWD algo: %s",
                         cudnnGetErrorString(choice.status));
            return 1;
        } // Else, count is necessarily 1 for the current implementation.
        #endif

      } else {
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
          desc, APPLY_SPECIFIC(output),
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
      }
      prev_algo = algo;
    } else {
      algo = prev_algo;
    }

    #ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
    // NB: This is printed only when algorithm is chosen at runtime.
    if (reuse_algo)
        fprintf(stderr, "(reused %s)\n", algorithm_name);
    else
        fprintf(stderr, "(using %s)\n", algorithm_name);
    #endif

    if (params->choose_once) {
      reuse_algo = 1;
    } else {
      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
        prev_img_dims[i] = PyGpuArray_DIM(input, i);
        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
      }
    }
  }

  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
  {
    #ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
    #endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  if (cudnnGetVersion() < 6100 &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536)
  {
    #ifdef DEBUG
    fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
                    "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
    #endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
  TODO: These checks seem outdated for FFT algorithms with cuDNN >= 5.1.
  New conditions apply and may depend on number of dimensions (2D or 3D)
  e.g. for FFT_TILING.
  TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {

    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                             dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      }
    }
  }

  {
    size_t worksize;
    gpudata *workspace;
    err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);

    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported

      #ifdef DEBUG
      if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
        return 1;
      fprintf(stderr, "(%s error getting worksize: "
                      "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
      #endif

      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

      err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    algo,
                                                    &worksize);
    }

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    for (int g = 0; g < params->num_groups; g++) {
      err = cudnnConvolutionForward(
        params->handle,
        alpha_p,
        APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
        APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
        desc, algo,
        worksize == 0 ? NULL : *(void **)workspace, worksize,
        beta_p,
        APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
    }

    if (worksize != 0)
      gpudata_release(workspace);

    cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  }
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
		 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
Example #6
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;

  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;

  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

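  /* The RNN input descriptor describes a single time step as
   * (miniBatch, inputSize, 1) with fully packed strides; the time dimension
   * is carried by the per-step descriptor array xl built further down. */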
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

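  /* cudnnRNNForwardTraining takes one tensor descriptor per time step, so the
   * same per-step descriptor is replicated seqLength times for the inputs
   * (xl) and the outputs (yl). */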
  xl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}