ConvBC01CuDNN<T>::ConvBC01CuDNN(int pad_y, int pad_x, int stride_y, int stride_x) : pad_y(pad_y), pad_x(pad_x), stride_y(stride_y), stride_x(stride_x), n_imgs(0), n_channels(0), n_filters(0), img_h(0), img_w(0), filter_h(0), filter_w(0), workspace_size(0) { CUDNN_CHECK(cudnnCreateTensorDescriptor(&imgs_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&convout_desc)); CUDNN_CHECK(cudnnCreateFilterDescriptor(&filters_desc)); CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc)); }
convolution_layer_updater_cuda::convolution_layer_updater_cuda() : input_data_desc(0) , output_data_desc(0) , weights_desc(0) , convolution_desc(0) , bias_desc(0) { cudnn_safe_call(cudnnCreateTensorDescriptor(&input_data_desc)); cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc)); cudnn_safe_call(cudnnCreateFilterDescriptor(&weights_desc)); cudnn_safe_call(cudnnCreateConvolutionDescriptor(&convolution_desc)); cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc)); }
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, cudnnConvolutionDescriptor_t *desc) { cudnnStatus_t err; int pad[3] = {PAD_0, PAD_1, PAD_2}; int strides[3] = {SUB_0, SUB_1, SUB_2}; int upscale[3] = {1, 1, 1}; #if BORDER_MODE == 0 pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1; pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1; #if NB_DIMS > 2 pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1; #endif #elif BORDER_MODE == 2 pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) / 2; pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) / 2; #if NB_DIMS > 2 pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) / 2; #endif #endif if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) { PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: " "expected %d, got %lld.", NB_DIMS, (long long)PyArray_DIM(filt_shp, 0)); return -1; } err = cudnnCreateConvolutionDescriptor(desc); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_MemoryError, "could not allocate convolution " "descriptor: %s", cudnnGetErrorString(err)); return -1; } err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale, CONV_MODE, PRECISION); return 0; }
inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv, int pad_h, int pad_w, int stride_h, int stride_w) { CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); }
inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); }
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor) { int i; convolutional_layer l = {0}; l.type = CONVOLUTIONAL; l.h = h; l.w = w; l.c = c; l.n = n; l.binary = binary; l.xnor = xnor; l.batch = batch; l.stride = stride; l.size = size; l.pad = pad; l.batch_normalize = batch_normalize; l.filters = calloc(c*n*size*size, sizeof(float)); l.filter_updates = calloc(c*n*size*size, sizeof(float)); l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c)); for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1); int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); l.out_h = out_h; l.out_w = out_w; l.out_c = n; l.outputs = l.out_h * l.out_w * l.out_c; l.inputs = l.w * l.h * l.c; l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float)); if(binary){ l.binary_filters = calloc(c*n*size*size, sizeof(float)); l.cfilters = calloc(c*n*size*size, sizeof(char)); l.scales = calloc(n, sizeof(float)); } if(xnor){ l.binary_filters = calloc(c*n*size*size, sizeof(float)); l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); } if(batch_normalize){ l.scales = calloc(n, sizeof(float)); l.scale_updates = calloc(n, sizeof(float)); for(i = 0; i < n; ++i){ l.scales[i] = 1; } l.mean = calloc(n, sizeof(float)); l.variance = calloc(n, sizeof(float)); l.rolling_mean = calloc(n, sizeof(float)); l.rolling_variance = calloc(n, sizeof(float)); } #ifdef GPU l.filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size); l.biases_gpu = cuda_make_array(l.biases, n); l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); l.scales_gpu = cuda_make_array(l.scales, n); l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n); l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); if(binary){ l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); } if(xnor){ l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, n); l.variance_gpu = cuda_make_array(l.variance, n); l.rolling_mean_gpu = cuda_make_array(l.mean, n); l.rolling_variance_gpu = cuda_make_array(l.variance, n); l.mean_delta_gpu = cuda_make_array(l.mean, n); l.variance_delta_gpu = cuda_make_array(l.variance, n); l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.filterDesc); cudnnCreateTensorDescriptor(&l.dsrcTensorDesc); cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dfilterDesc); cudnnCreateConvolutionDescriptor(&l.convDesc); cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); int padding = l.pad ? l.size/2 : 0; cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l.srcTensorDesc, l.filterDesc, l.convDesc, l.dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l.fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l.filterDesc, l.ddstTensorDesc, l.convDesc, l.dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l.bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l.srcTensorDesc, l.ddstTensorDesc, l.convDesc, l.dfilterDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l.bf_algo); #endif #endif l.workspace_size = get_workspace_size(l); l.activation = activation; fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n); return l; }
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam) { int i; convolutional_layer l = {0}; l.type = CONVOLUTIONAL; l.groups = groups; l.h = h; l.w = w; l.c = c; l.n = n; l.binary = binary; l.xnor = xnor; l.batch = batch; l.stride = stride; l.size = size; l.pad = padding; l.batch_normalize = batch_normalize; l.weights = calloc(c/groups*n*size*size, sizeof(float)); l.weight_updates = calloc(c/groups*n*size*size, sizeof(float)); l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); l.nweights = c/groups*n*size*size; l.nbiases = n; // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c/l.groups)); //scale = .02; //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal(); int out_w = convolutional_out_width(l); int out_h = convolutional_out_height(l); l.out_h = out_h; l.out_w = out_w; l.out_c = n; l.outputs = l.out_h * l.out_w * l.out_c; l.inputs = l.w * l.h * l.c; l.output = calloc(l.batch*l.outputs, sizeof(float)); l.delta = calloc(l.batch*l.outputs, sizeof(float)); l.forward = forward_convolutional_layer; l.backward = backward_convolutional_layer; l.update = update_convolutional_layer; if(binary){ l.binary_weights = calloc(l.nweights, sizeof(float)); l.cweights = calloc(l.nweights, sizeof(char)); l.scales = calloc(n, sizeof(float)); } if(xnor){ l.binary_weights = calloc(l.nweights, sizeof(float)); l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); } if(batch_normalize){ l.scales = calloc(n, sizeof(float)); l.scale_updates = calloc(n, sizeof(float)); for(i = 0; i < n; ++i){ l.scales[i] = 1; } l.mean = calloc(n, sizeof(float)); l.variance = calloc(n, sizeof(float)); l.mean_delta = calloc(n, sizeof(float)); l.variance_delta = calloc(n, sizeof(float)); l.rolling_mean = calloc(n, sizeof(float)); l.rolling_variance = calloc(n, sizeof(float)); l.x = calloc(l.batch*l.outputs, sizeof(float)); l.x_norm = calloc(l.batch*l.outputs, sizeof(float)); } if(adam){ l.m = calloc(l.nweights, sizeof(float)); l.v = calloc(l.nweights, sizeof(float)); l.bias_m = calloc(n, sizeof(float)); l.scale_m = calloc(n, sizeof(float)); l.bias_v = calloc(n, sizeof(float)); l.scale_v = calloc(n, sizeof(float)); } #ifdef GPU l.forward_gpu = forward_convolutional_layer_gpu; l.backward_gpu = backward_convolutional_layer_gpu; l.update_gpu = update_convolutional_layer_gpu; if(gpu_index >= 0){ if (adam) { l.m_gpu = cuda_make_array(l.m, l.nweights); l.v_gpu = cuda_make_array(l.v, l.nweights); l.bias_m_gpu = cuda_make_array(l.bias_m, n); l.bias_v_gpu = cuda_make_array(l.bias_v, n); l.scale_m_gpu = cuda_make_array(l.scale_m, n); l.scale_v_gpu = cuda_make_array(l.scale_v, n); } l.weights_gpu = cuda_make_array(l.weights, l.nweights); l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); l.biases_gpu = cuda_make_array(l.biases, n); l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n); l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); if(binary){ l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); } if(xnor){ l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, n); l.variance_gpu = cuda_make_array(l.variance, n); l.rolling_mean_gpu = cuda_make_array(l.mean, n); l.rolling_variance_gpu = cuda_make_array(l.variance, n); l.mean_delta_gpu = cuda_make_array(l.mean, n); l.variance_delta_gpu = cuda_make_array(l.variance, n); l.scales_gpu = cuda_make_array(l.scales, n); l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN cudnnCreateTensorDescriptor(&l.normTensorDesc); cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.weightDesc); cudnnCreateTensorDescriptor(&l.dsrcTensorDesc); cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dweightDesc); cudnnCreateConvolutionDescriptor(&l.convDesc); cudnn_convolutional_setup(&l); #endif } #endif l.workspace_size = get_workspace_size(l); l.activation = activation; //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); return l; }
THFloatTensor *cudnn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input) { int kW = module->SpatialConvolution.kW; int kH = module->SpatialConvolution.kH; int dW = module->SpatialConvolution.dW; int dH = module->SpatialConvolution.dH; int padW = module->SpatialConvolution.padW; int padH = module->SpatialConvolution.padH; int nInputPlane = module->SpatialConvolution.nInputPlane; int nOutputPlane = module->SpatialConvolution.nOutputPlane; THFloatTensor *weight = module->SpatialConvolution.weight; THFloatTensor *bias = module->SpatialConvolution.bias; THFloatTensor *output = module->output; int sizes[4]; int pad[2], filterStride[2], upscale[2]; cudnnTensorDescriptor_t dinput, dbias, doutput; cudnnConvolutionDescriptor_t dconv; cudnnFilterDescriptor_t dweight; float one = 1, zero = 0; size_t reqwssize; static void *ws; static size_t wssize; static const int alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; pad[0] = padH; pad[1] = padW; filterStride[0] = dH; filterStride[1] = dW; upscale[0] = 1; upscale[1] = 1; if(input->nDimension <= 2) { // Here we use the SpatialConvolution module to perform a linear transformation errcheck(cudnnCreateTensorDescriptor(&dinput)); if(input->nDimension == 1) errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, 1, input->size[0], 1, 1)); else errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, input->size[0], input->size[1], 1, 1)); } else errcheck(THcudnn_TensorDescriptor(&dinput, input)); errcheck(cudnnCreateFilterDescriptor(&dweight)); errcheck(cudnnSetFilter4dDescriptor(dweight, floattype, nOutputPlane, nInputPlane, kH, kW)); errcheck(cudnnCreateTensorDescriptor(&dbias)); errcheck(cudnnSetTensor4dDescriptor(dbias, CUDNN_TENSOR_NCHW, floattype, 1, bias->size[0], 1, 1)); errcheck(cudnnCreateConvolutionDescriptor(&dconv)); errcheck(cudnnSetConvolutionNdDescriptor(dconv, 2, pad, filterStride, upscale, CUDNN_CROSS_CORRELATION, floattype)); errcheck(cudnnGetConvolutionNdForwardOutputDim(dconv, dinput, dweight, 4, sizes)); THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]); errcheck(THcudnn_TensorDescriptor(&doutput, output)); if(alg == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { errcheck(cudnnGetConvolutionForwardWorkspaceSize(THcudnn_getHandle(), dinput, dweight, dconv, doutput, alg, &reqwssize)); if(reqwssize > wssize) { wssize = reqwssize; errcheck(cudaMalloc(&ws, reqwssize)); } } errcheck(cudnnConvolutionForward(THcudnn_getHandle(), &one, dinput, THFloatTensor_data(input), dweight, THFloatTensor_data(weight), dconv, alg, ws, wssize, &zero, doutput, THFloatTensor_data(output))); errcheck(cudnnAddTensor_v3(THcudnn_getHandle(), &one, dbias, THFloatTensor_data(bias), &one, doutput, THFloatTensor_data(output))); cudnnDestroyTensorDescriptor(dinput); cudnnDestroyFilterDescriptor(dweight); cudnnDestroyTensorDescriptor(dbias); cudnnDestroyTensorDescriptor(doutput); cudnnDestroyConvolutionDescriptor(dconv); return output; }