int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject **outp,
                     PyGpuArrayObject **x_mean, PyGpuArrayObject **x_invstd,
                     PyGpuContextObject *c) {
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;

  if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions,
                         inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;

  {
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
      APPLY_SPECIFIC(_handle),
      MODE,
      alpha,
      beta,
      bn_input, PyGpuArray_DEV_DATA(inp),
      bn_output, PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      0,
      NULL,  // running mean, deliberately unused
      NULL,  // running var, deliberately unused
      EPSILON,
      PyGpuArray_DEV_DATA(*x_mean),
      PyGpuArray_DEV_DATA(*x_invstd)
      );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  if (c_set_tensorNd(var, *desc) != 0) {
    cudnnDestroyTensorDescriptor(*desc);
    return -1;
  }
  return 0;
}
int APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
                            CudaNdarray *im,
                            cudnnConvolutionDescriptor_t desc,
                            float alpha, float beta, CudaNdarray **input) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }

  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  int nb_dim = CudaNdarray_NDIM(output);

#ifdef CONV_INPLACE
  Py_XDECREF(*input);
  *input = im;
  Py_INCREF(*input);
#else
  if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
    return 1;
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
    return 1;
#endif

  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;

#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdDataAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, in one of the two following cases:
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the kernels and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected.
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs.
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the kernel and output
        // shapes, either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choosing the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one.
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdDataAlgoPerf_t chosen_algo_perf;
          err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
                                                          APPLY_SPECIFIC(kerns),
                                                          APPLY_SPECIFIC(output),
                                                          desc,
                                                          APPLY_SPECIFIC(input),
                                                          requestedCount,
                                                          &count,
                                                          &chosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = chosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on
          // the shapes of the inputs and the amount of memory available.
          // Get the amount of available memory.
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess) {
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation.
          err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
                                                         APPLY_SPECIFIC(kerns),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(input),
                                                         CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                                                         free,
                                                         &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the kernels and output as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_kerns_shape)[i] = CudaNdarray_HOST_DIMS(kerns)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] = CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation.
        chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }

    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used on
    // the current data and fall back to a safe implementation if it can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor.
      int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
      cudnnConvolutionMode_t mode;
      err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
                                            &stride_v, &stride_h,
                                            &upscale_x, &upscale_y, &mode);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradI: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial size of the filters (NCHW layout).
      int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];

      // Extract the spatial size of the input (NCHW layout).
      int input_h = CudaNdarray_HOST_DIMS(*input)[2];
      int input_w = CudaNdarray_HOST_DIMS(*input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
          input_w > 1024 || (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }

    // Infer the required workspace size from the chosen implementation.
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
                                                       APPLY_SPECIFIC(kerns),
                                                       APPLY_SPECIFIC(output),
                                                       desc,
                                                       APPLY_SPECIFIC(input),
                                                       chosen_algo,
                                                       &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradI: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Allocate workspace for the convolution.
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    // Perform the convolution.
    err = cudnnConvolutionBackwardData_v3(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
  }
#else
  err = cudnnConvolutionBackwardData(
    _handle,
    (void *)&alpha,
    APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
    APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
    desc,
    (void *)&beta,
    APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnConvGradI: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
int APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
                            CudaNdarray *im,
                            cudnnConvolutionDescriptor_t desc,
                            float alpha, float beta, CudaNdarray **input) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }

  int nb_dim = CudaNdarray_NDIM(output);

#ifdef CONV_INPLACE
  Py_XDECREF(*input);
  *input = im;
  Py_INCREF(*input);
#else
  if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
    return 1;
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
    return 1;
#endif

  if (CudaNdarray_DIMS(im)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 ||
      CudaNdarray_DIMS(kerns)[1] == 0) {
    cudaError_t err2 = cudaMemset((*input)->devdata, 0,
                                  CudaNdarray_SIZE(*input) * sizeof(real));
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. inputs could not fill the output "
                   "with zeros: %s", cudaGetErrorString(err2));
      return 1;
    }
    return 0;
  }

  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;

  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input),
                                              APPLY_SPECIFIC(kerns),
                                              nb_dim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  if (nb_dim == 4) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError,
                   "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0],
                   (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2],
                   (long int)expected_output_dims[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[0],
                   (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2],
                   (long int)CudaNdarray_HOST_DIMS(output)[3]);
      return 1;
    }
  } else if (nb_dim == 5) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
        (CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError,
                   "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0],
                   (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2],
                   (long int)expected_output_dims[3],
                   (long int)expected_output_dims[4],
                   (long int)CudaNdarray_HOST_DIMS(output)[0],
                   (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2],
                   (long int)CudaNdarray_HOST_DIMS(output)[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[4]);
      return 1;
    }
  }

  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdDataAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, in one of the two following cases:
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the kernels and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected.
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs.
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the kernel and output
        // shapes, either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choosing the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one.
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdDataAlgoPerf_t chosen_algo_perf;
          err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
                                                          APPLY_SPECIFIC(kerns),
                                                          APPLY_SPECIFIC(output),
                                                          desc,
                                                          APPLY_SPECIFIC(input),
                                                          requestedCount,
                                                          &count,
                                                          &chosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = chosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on
          // the shapes of the inputs and the amount of memory available.

          // Get the amount of available memory.
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess) {
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation.
          err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
                                                         APPLY_SPECIFIC(kerns),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(input),
                                                         CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                                                         free,
                                                         &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the kernels and output as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_kerns_shape)[i] = CudaNdarray_HOST_DIMS(kerns)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] = CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation.
        chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }

    if (0) {
      // Debugging aid: print the name of the chosen algorithm.
      char *a;
      switch (chosen_algo) {
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        a = "implicit gemm (0)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
        a = "precomp gemm (1)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        a = "fft (2)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        a = "fft tiling (3)";
        break;
#if CUDNN_VERSION > 5000
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        a = "winograd (4)";
        break;
#endif
      }
      printf("GpuDNNConvGI: algo %s\n", a);
    }

    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // The tiled-FFT implementation (only in v4 and onward) does not support
    // strides.
    // If the chosen implementation is FFT or tiled-FFT, validate that it can
    // be used on the current data and fall back to a safe implementation if
    // it can't.
    // The following code is 2d-specific, but that is fine as FFT and tiled-FFT
    // are only defined for 2d filters.
    if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
         chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor.
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradI: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial size of the filters.
      int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];

      // Extract the spatial size of the input.
      int input_h = CudaNdarray_HOST_DIMS(*input)[2];
      int input_w = CudaNdarray_HOST_DIMS(*input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
      {
        if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
            input_w > 1024 || (filter_h == 1 && filter_w == 1))
        {
          chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
      }
      else
      {
        // chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
        if (stride[0] != 1 || stride[1] != 1)
        {
          chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
      }
    }

    // Infer the required workspace size from the chosen implementation.
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
                                                       APPLY_SPECIFIC(kerns),
                                                       APPLY_SPECIFIC(output),
                                                       desc,
                                                       APPLY_SPECIFIC(input),
                                                       chosen_algo,
                                                       &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradI: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Allocate workspace for the convolution.
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    // Perform the convolution.
    err = cudnnConvolutionBackwardData(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
  }

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnConvGradI: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
int APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
                             CudaNdarray *om,
                             cudnnConvolutionDescriptor_t desc,
                             float alpha, float beta, CudaNdarray **output) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  int nb_dim = CudaNdarray_NDIM(input);

#ifdef CONV_INPLACE
  Py_XDECREF(*output);
  *output = om;
  Py_INCREF(*output);
#else
  if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
    return 1;
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
    return 1;
#endif

  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionFwdAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, in one of the two following cases:
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the inputs and the kernels
        // have the same shapes as they had when the previous implementation
        // was selected.
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                          APPLY_SPECIFIC(previous_input_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs.
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the input and kernel
        // shapes, either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choosing the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one.
          int requestedCount = 1;
          int count;
          cudnnConvolutionFwdAlgoPerf_t chosen_algo_perf;
          err = cudnnFindConvolutionForwardAlgorithm(_handle,
                                                     APPLY_SPECIFIC(input),
                                                     APPLY_SPECIFIC(kerns),
                                                     desc,
                                                     APPLY_SPECIFIC(output),
                                                     requestedCount,
                                                     &count,
                                                     &chosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConv: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = chosen_algo_perf.algo;
        }
        else
        {
          // The implementation should be chosen using heuristics based on the
          // input shapes and the amount of memory available.
          // Get the amount of available memory.
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess) {
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation.
          err = cudnnGetConvolutionForwardAlgorithm(_handle,
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
                                                    free,
                                                    &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConv: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the inputs and kernels as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_input_shape)[i] = CudaNdarray_HOST_DIMS(input)[i];
          APPLY_SPECIFIC(previous_kerns_shape)[i] = CudaNdarray_HOST_DIMS(kerns)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation.
        chosen_algo = APPLY_SPECIFIC(previous_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }

    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // The tiled-FFT implementation (only in v4 and onward) does not support
    // strides.
    // If the chosen implementation is FFT or tiled-FFT, validate that it can
    // be used on the current data and fall back to a safe implementation if
    // it can't.
    // The following code is 2d-specific, but that is fine as FFT and tiled-FFT
    // are only defined for 2d filters.
    if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
         chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor.
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConv: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial size of the filters.
      int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];

      // Extract the spatial size of the input.
      int input_h = CudaNdarray_HOST_DIMS(input)[2];
      int input_w = CudaNdarray_HOST_DIMS(input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
      {
        if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
            input_w > 1024 || (filter_h == 1 && filter_w == 1))
        {
          chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        }
      }
      else
      {
        // chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
        if (stride[0] != 1 || stride[1] != 1)
        {
          chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        }
      }
    }

    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  chosen_algo,
                                                  &worksize);
    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fall back to the implicit GEMM algo if the chosen one is not supported.
      // TODO: Print a warning
      chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

      err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
                                                    APPLY_SPECIFIC(output),
                                                    chosen_algo,
                                                    &worksize);
    }

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    err = cudnnConvolutionForward(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
  }

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnConv: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
int APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                            PyGpuArrayObject *km,
                            cudnnConvolutionDescriptor_t desc,
                            double alpha, double beta,
                            PyGpuArrayObject **kerns,
                            PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
#endif

  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;

    err = cudnnFindConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess) {
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError,
                   "Error when trying to find the memory information "
                   "on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif
#endif

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024.
  // If the chosen implementation is FFT, validate that it can be used on the
  // current data and default to a safe implementation if it can't.
  // The following code is 2d-specific, but that is fine as FFT and tiled-FFT
  // are only defined for 2d filters.
  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
      PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
  }

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
    desc, APPLY_SPECIFIC(kerns), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardFilter_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo,
    worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                             PyGpuArrayObject *om,
                             cudnnConvolutionDescriptor_t desc,
                             double alpha, double beta,
                             PyGpuArrayObject **output,
                             PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*output);
  *output = om;
  Py_INCREF(*output);
#else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
#endif

  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  /* Static variables are only initialized once so this will not
   * reset the previous algo every time */
  static int reuse_algo = 0;
  static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
  static size_t prev_img_dims[5] = {0};
  static size_t prev_kern_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;

    err = cudnnFindConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError,
                   "Error when trying to find the memory information "
                   "on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionForwardAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    prev_img_dims[i] = PyGpuArray_DIM(input, i);
    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
  }
#endif
#endif

  /* These two algos are not supported for 3d conv */
  if (PyGpuArray_NDIM(input) == 5 &&
      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;

#if CUDNN_VERSION > 3000
  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;

    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          upscale, &mode);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    }
  }
#endif

#if CUDNN_VERSION < 3000
  /* cuDNN before v3 does not support kernels larger than the inputs even
   * if appropriate padding is selected. */
  for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
    if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "the current version of cuDNN does not support kernels "
                      "larger than the inputs in any spatial dimension, even "
                      "if the inputs are padded such that the padded inputs "
                      "are larger than the kernels. Update your installation "
                      "of cuDNN to v3 or more recent to solve the issue.");
      cuda_exit(c->ctx);
      return 1;
    }
  }
#endif

  {
    size_t worksize;
    gpudata *workspace;

    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point), but we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
        cuda_exit(c->ctx);
        return 1;
      }
    }

    err = cudnnConvolutionForward(
      APPLY_SPECIFIC(_handle),
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

    if (worksize != 0)
      c->ops->buffer_release(workspace);
  }

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
int APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                            PyGpuArrayObject *im,
                            cudnnConvolutionDescriptor_t desc,
                            double alpha, double beta,
                            PyGpuArrayObject **input,
                            PyGpuContextObject *c) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;

  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "images and kernel must have the same stack size");
    return 1;
  }

  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

  switch (im->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }

#ifdef CONV_INPLACE
  Py_XDECREF(*input);
  *input = im;
  Py_INCREF(*input);
#else
  if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
                         im->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*input, im))
    return 1;
#endif

  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;

  cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;

  cuda_enter(c->ctx);

#ifdef CHOOSE_ALGO
  static int reuse_algo = 0;
  static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
  static size_t prev_kern_dims[5] = {0};
  static size_t prev_top_dims[5] = {0};

  reuse_algo = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
    reuse_algo = (reuse_algo &&
                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
  }
#endif

  if (!reuse_algo) {
#ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdDataAlgoPerf_t choice;

    // The backward-data algorithm functions expect the descriptors in the
    // order (filter, gradient of output, convolution, gradient of input).
    err = cudnnFindConvolutionBackwardDataAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(input), 1, &count, &choice);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
    algo = choice.algo;
#else
    size_t free = 0, total = 0;
    cudaError_t err2 = cudaMemGetInfo(&free, &total);
    if (err2 != cudaSuccess) {
      cudaGetLastError();
      PyErr_Format(PyExc_RuntimeError,
                   "Error when trying to find the memory information "
                   "on the GPU: %s\n", cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }

    err = cudnnGetConvolutionBackwardDataAlgorithm(
      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(input),
      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
#endif
    prev_algo = algo;
  } else {
    algo = prev_algo;
  }

#ifdef CHOOSE_ONCE
  reuse_algo = 1;
#else
  for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
    prev_top_dims[i] = PyGpuArray_DIM(output, i);
  }
#endif
#endif

#if CUDNN_VERSION > 3000
  if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;

    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                          upscale, &mode);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }

    if (stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
      algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
    }
  }
#endif

  size_t worksize;
  gpudata *workspace;

  err = cudnnGetConvolutionBackwardDataWorkspaceSize(
    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
    desc, APPLY_SPECIFIC(input), algo, &worksize);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return 1;
  }

  if (worksize != 0) {
    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }

  cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  err = cudnnConvolutionBackwardData_v3(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
    desc, algo,
    worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));

  if (worksize != 0)
    c->ops->buffer_release(workspace);

  cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}