void CuDnnRNNExecutor<ElemType>::ForwardCore( const GPUMatrix<ElemType>& weightsW, const GPUMatrix<ElemType>& inputX, GPUMatrix<ElemType>& outputY, const vector<size_t>& numSequencesForFrame, const RnnAttributes& rnnAttributes, GPUMatrix<ElemType>& reserve, GPUMatrix<ElemType>& workspace ) { // test that the RNN shape is correct if (!m_rnnT->IsCompatible(rnnAttributes)) LogicError("RNN Layout has changed during processing"); if (m_yDim != (m_rnnT->isBidirectional() ? 2 : 1) * m_rnnT->GetNumHidden()) InvalidArgument("CuDnn ForwardCore: Output leading dimension must be twice hidden size for bidirectional networks"); // set up the input and output descriptors SetDescriptors(m_xDim, numSequencesForFrame, xDesc); SetDescriptors(m_yDim, numSequencesForFrame, yDesc); // ensure workspace and reserve are large enough m_seqLength = numSequencesForFrame.size(); size_t workSize; size_t reserveSize; // Need for every pass CUDNN_CALL(cudnnGetRNNWorkspaceSize(*m_cudnn, *m_rnnT, (int)m_seqLength, xDesc.data(), &workSize)); // Only needed in training, can't be touched between passes. CUDNN_CALL(cudnnGetRNNTrainingReserveSize(*m_cudnn, *m_rnnT, (int)m_seqLength, xDesc.data(), &reserveSize)); // convert from bytes to ElemType workSize = (workSize + sizeof(ElemType) - 1) / (sizeof(ElemType)); reserveSize = (reserveSize + sizeof(ElemType) - 1) / sizeof(ElemType); reserve.Resize(reserveSize, 1); workspace.Resize(workSize, 1); wDesc = make_unique<CuDnnFilter<ElemType>>(*m_rnnT, xDesc[0]); if (wDesc->GetSize() != weightsW.GetNumElements()) InvalidArgument("RNN needs %ld parameters, but %ld were allocated", wDesc->GetSize(), weightsW.GetNumElements()); CUDNN_CALL(cudnnRNNForwardTraining( *m_cudnn, *m_rnnT, (int)m_seqLength, xDesc.data(), inputX.Data(), 0, 0, 0, 0, *wDesc, weightsW.Data(), yDesc.data(), outputY.Data(), 0, 0, 0, 0, workspace.Data(), workspace.GetNumElements()*sizeof(ElemType), reserve.Data(), reserve.GetNumElements()*sizeof(ElemType))); m_BackwardDataCalledYet = false; }
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc, PyGpuArrayObject *w, PyGpuArrayObject *x, PyGpuArrayObject *hx, PyGpuArrayObject *cx, gpudata **reserve, PyGpuArrayObject **y, PyGpuArrayObject **hy, PyGpuArrayObject **cy, cudnnHandle_t _handle) { PyGpuContextObject *c = x->context; cudnnTensorDescriptor_t xdesc = NULL; cudnnTensorDescriptor_t hxdesc = NULL; cudnnTensorDescriptor_t cxdesc = NULL; cudnnTensorDescriptor_t ydesc = NULL; cudnnTensorDescriptor_t hydesc = NULL; cudnnTensorDescriptor_t cydesc = NULL; cudnnFilterDescriptor_t wdesc = NULL; cudnnTensorDescriptor_t *xl = NULL; cudnnTensorDescriptor_t *yl = NULL; gpudata *workspace = NULL; size_t worksize, ressize; size_t seqLength = PyGpuArray_DIM(x, 0); size_t miniBatch = PyGpuArray_DIM(x, 1); size_t inputSize = PyGpuArray_DIM(x, 2); size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2); size_t shape[3]; int strs[3], dims[3]; cudnnStatus_t err; cudnnDataType_t dt; int res = -1; switch (x->ga.typecode) { case GA_FLOAT: dt = CUDNN_DATA_FLOAT; break; case GA_DOUBLE: dt = CUDNN_DATA_DOUBLE; break; case GA_HALF: dt = CUDNN_DATA_HALF; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported data type for x"); return -1; } // This is early to match the exit() in the fail label. cuda_enter(c->ctx); err = cudnnCreateTensorDescriptor(&xdesc); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create xdesc: %s", cudnnGetErrorString(err)); goto fail; } dims[0] = PyGpuArray_DIM(x, 1); dims[1] = PyGpuArray_DIM(x, 2); dims[2] = 1; strs[0] = dims[1] * dims[2]; strs[1] = dims[2]; strs[2] = 1; err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %s", cudnnGetErrorString(err)); goto fail; } if (c_make_tensorNd(hx, &hxdesc) != 0) goto fail; if (cx != NULL) if (c_make_tensorNd(cx, &cxdesc) != 0) goto fail; if (c_make_filter(w, &wdesc) != 0) goto fail; shape[0] = seqLength; shape[1] = miniBatch; shape[2] = hiddenSizeDir; if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0) goto fail; err = cudnnCreateTensorDescriptor(&ydesc); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create ydesc: %s", cudnnGetErrorString(err)); goto fail; } dims[0] = shape[1]; dims[1] = shape[2]; dims[2] = 1; strs[0] = dims[2] * dims[1]; strs[1] = dims[2]; strs[2] = 1; err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not set ydesc: %s", cudnnGetErrorString(err)); goto fail; } if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx), hx->ga.typecode, GA_C_ORDER, c) != 0) goto fail; if (c_make_tensorNd(*hy, &hydesc) != 0) goto fail; if (cy != NULL) { if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx), cx->ga.typecode, GA_C_ORDER, c) != 0) goto fail; if (c_make_tensorNd(*cy, &cydesc) != 0) goto fail; } xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength); if (xl == NULL) { PyErr_NoMemory(); goto fail; } for (size_t i = 0; i < seqLength; i++) xl[i] = xdesc; yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength); if (yl == NULL) { PyErr_NoMemory(); goto fail; } for (size_t i = 0; i < seqLength; i++) yl[i] = ydesc; err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, xl, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not get worksize: %s", cudnnGetErrorString(err)); goto fail; } workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace"); goto fail; } err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength, xl, &ressize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not get reserve size: %s", cudnnGetErrorString(err)); goto fail; } *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL); if (*reserve == NULL) { PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve"); goto fail; } err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength, xl, PyGpuArray_DEV_DATA(x), hxdesc, PyGpuArray_DEV_DATA(hx), cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL, wdesc, PyGpuArray_DEV_DATA(w), yl, PyGpuArray_DEV_DATA(*y), hydesc, PyGpuArray_DEV_DATA(*hy), cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL, *(void **)workspace, worksize, *(void **)(*reserve), ressize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could run RNN: %s", cudnnGetErrorString(err)); goto fail; } res = 0; fail: if (xdesc != NULL) cudnnDestroyTensorDescriptor(xdesc); if (hxdesc != NULL) cudnnDestroyTensorDescriptor(hxdesc); if (cxdesc != NULL) cudnnDestroyTensorDescriptor(cxdesc); if (wdesc != NULL) cudnnDestroyFilterDescriptor(wdesc); if (ydesc != NULL) cudnnDestroyTensorDescriptor(ydesc); if (hydesc != NULL) cudnnDestroyTensorDescriptor(hydesc); if (cydesc != NULL) cudnnDestroyTensorDescriptor(cydesc); free(xl); free(yl); if (workspace != NULL) gpudata_release(workspace); cuda_exit(c->ctx); return res; }