void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs, int n_channels, int n_filters, int img_h, int img_w, int filter_h, int filter_w, T *convout) { bool set_conv_desc = false; if (n_imgs != this->n_imgs || n_channels != this->n_channels || img_h != this->img_h || img_w != this->img_w) { CUDNN_CHECK(cudnnSetTensor4dDescriptor( imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n_imgs, n_channels, img_h, img_w )); this->n_imgs = n_imgs; this->n_channels = n_channels; this->img_h = img_h; this->img_w = img_w; set_conv_desc = true; } if (n_filters != this->n_filters || n_channels != this->n_channels || filter_h != this->filter_h || filter_w != this->filter_w) { CUDNN_CHECK(cudnnSetFilter4dDescriptor( filters_desc, CUDNN_DATA_FLOAT, n_filters, n_channels, filter_h, filter_w )); this->n_filters = n_filters; this->n_channels = n_channels; this->filter_h = filter_h; this->filter_w = filter_w; set_conv_desc = true; } if (set_conv_desc) { CUDNN_CHECK(cudnnSetConvolution2dDescriptor( conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION )); int n, c, h, w; CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim( conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w )); CUDNN_CHECK(cudnnSetTensor4dDescriptor( convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w )); CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, WORKSPACE_LIMIT, &fwd_algo )); CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc, fwd_algo, &workspace_size )); } void *workspace = NULL; if (workspace_size > 0) { workspace = CUDA::buffer(workspace_size); } CUDNN_CHECK(cudnnConvolutionForward( CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters, conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero, convout_desc, convout )); }
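The call sequence that the cached fprop above wraps can be sketched as a standalone program. This is a minimal sketch, assuming a cuDNN 6/7-era API (where cudnnSetFilter4dDescriptor and cudnnSetConvolution2dDescriptor take an explicit tensor format and compute type, unlike the older signatures used above); the CHECK_CUDNN macro, the fixed example sizes, and the uninitialized device buffers are illustrative only.

#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Minimal error-check helper (illustrative only).
#define CHECK_CUDNN(call) do { \
    cudnnStatus_t s_ = (call); \
    if (s_ != CUDNN_STATUS_SUCCESS) { \
        fprintf(stderr, "cuDNN error %s at %s:%d\n", cudnnGetErrorString(s_), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

int main() {
    // Example sizes: one 3-channel 32x32 image, 16 filters of 3x3, pad 1, stride 1.
    const int N = 1, C = 3, H = 32, W = 32, K = 16, R = 3, S = 3;

    cudnnHandle_t handle;
    CHECK_CUDNN(cudnnCreate(&handle));

    cudnnTensorDescriptor_t in_desc, out_desc;
    cudnnFilterDescriptor_t filt_desc;
    cudnnConvolutionDescriptor_t conv_desc;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&in_desc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&out_desc));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&filt_desc));
    CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));

    CHECK_CUDNN(cudnnSetTensor4dDescriptor(in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S));
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(conv_desc, 1, 1, 1, 1, 1, 1,
                                                CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

    // Let cuDNN derive the output shape from the input, filter and convolution descriptors.
    int n, c, h, w;
    CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(conv_desc, in_desc, filt_desc, &n, &c, &h, &w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));

    // Device buffers for input, filters and output (left uninitialized in this sketch).
    float *d_in, *d_filt, *d_out;
    cudaMalloc((void **)&d_in, sizeof(float) * N * C * H * W);
    cudaMalloc((void **)&d_filt, sizeof(float) * K * C * R * S);
    cudaMalloc((void **)&d_out, sizeof(float) * n * c * h * w);

    // Fix an algorithm and size its workspace.
    cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    size_t ws_size = 0;
    CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));
    void *ws = NULL;
    if (ws_size > 0) cudaMalloc(&ws, ws_size);

    const float one = 1.0f, zero = 0.0f;
    CHECK_CUDNN(cudnnConvolutionForward(handle, &one, in_desc, d_in, filt_desc, d_filt,
                                        conv_desc, algo, ws, ws_size, &zero, out_desc, d_out));

    // Cleanup.
    if (ws) cudaFree(ws);
    cudaFree(d_in); cudaFree(d_filt); cudaFree(d_out);
    cudnnDestroyConvolutionDescriptor(conv_desc);
    cudnnDestroyFilterDescriptor(filt_desc);
    cudnnDestroyTensorDescriptor(out_desc);
    cudnnDestroyTensorDescriptor(in_desc);
    cudnnDestroy(handle);
    return 0;
}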
virtual void Forward(bool is_train, const std::vector<Node<gpu>*> &nodes_in, const std::vector<Node<gpu>*> &nodes_out, ConnectState<gpu> *p_cstate) { mshadow::Tensor<gpu,4> &tmp = p_cstate->states[0]; if (!init_cudnn_) { init_cudnn_ = true; CUDA_CHECK(cudnnSetStream(handle_, nodes_out[0]->data.stream_->stream_)); mshadow::Tensor<gpu, 4, float> &in = nodes_in[0]->data; mshadow::Tensor<gpu, 4, float> &out = nodes_out[0]->data; CUDA_CHECK(cudnnSetTensor4dDescriptor(in_desc_, CUDNN_TENSOR_NCHW, dtype_, in.shape_[0], in.shape_[1], in.shape_[2], in.shape_[3])); CUDA_CHECK(cudnnSetTensor4dDescriptor(out_desc_, CUDNN_TENSOR_NCHW, dtype_, out.shape_[0], out.shape_[1], out.shape_[2], out.shape_[3])); } float alpha = 1.0f; float beta = 0.0f; utils::Assert(nodes_in[0]->data.CheckContiguous(), "contiguous in pooling"); utils::Assert(nodes_out[0]->data.CheckContiguous(), "contiguous in pooling"); utils::Assert(tmp.CheckContiguous(), "contiguous in pooling"); CUDA_CHECK(cudnnPoolingForward(handle_, pooling_desc_, &alpha, in_desc_, nodes_in[0]->data.dptr_, &beta, out_desc_, tmp.dptr_)); mshadow::Copy(nodes_out[0]->data, tmp, nodes_out[0]->data.stream_); }
void convolution_layer_updater_cuda::enqueue_backprop( cudaStream_t stream_id, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data, const std::vector<cuda_linear_buffer_device_smart_ptr>& data, const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom, const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_errors_buffer, cuda_linear_buffer_device_smart_ptr input_errors_buffer, const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers, std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects, unsigned int entry_count, bool force_deterministic) { if (!backprop_required) throw neural_network_exception("convolution_layer_updater_cuda is not configured to do backprop but requested to"); cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_safe_call(cudnnSetTensor4dDescriptor( input_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, input_configuration_specific.feature_map_count, (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1, input_configuration_specific.dimension_sizes[0])); cudnn_safe_call(cudnnSetTensor4dDescriptor( output_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, output_configuration_specific.feature_map_count, (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1, output_configuration_specific.dimension_sizes[0])); { float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnConvolutionBackwardData( cuda_config->get_cudnn_handle(), &alpha, weights_desc, *data[0], output_data_desc, *output_errors_buffer, convolution_desc, &beta, input_data_desc, *input_errors_buffer)); } }
void cudnn_convolutional_setup(layer *l) { cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); #if CUDNN_MAJOR >= 6 cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); #else cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); #endif #if CUDNN_MAJOR >= 7 cudnnSetConvolutionGroupCount(l->convDesc, l->groups); #else if(l->groups > 1){ error("CUDNN < 7 doesn't support groups, please upgrade!"); } #endif cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); }
/*cudnn set tensor dim*/ void setTensorDesc(cudnnTensorDescriptor_t& tensorDesc, cudnnTensorFormat_t& tensorFormat, cudnnDataType_t& dataType, int n, int c, int h, int w){ #if SIMPLE_TENSOR_DESCRIPTOR /*cudnn set 4d tensor*/ checkCUDNN(cudnnSetTensor4dDescriptor(tensorDesc, tensorFormat, dataType, n, c, h, w)); #elif defined(ND_TENSOR_DESCRIPTOR) const int nDim = 4; int dimA[nDim] = {n,c,h,w}; int strideA[nDim] = {c*h*w, h*w, w, 1}; checkCUDNN(cudnnSetTensorNdDescriptor(tensorDesc, dataType, 4, dimA, strideA)); #else checkCUDNN(cudnnSetTensor4dDescriptorEx(tensorDesc, dataType, n, c, h, w, c*h*w, h*w, w, 1)); #endif }
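A quick usage sketch for the helper above, spelling out how the packed NCHW strides in the ND and Ex branches are derived (w varies fastest with stride 1, then h with stride w, c with stride h*w, and n with stride c*h*w). It assumes the setTensorDesc helper, the checkCUDNN macro and <cudnn.h> from the same sample are in scope; the shape values are just example inputs.

/* Illustrative only: exercises the setTensorDesc helper defined above. */
void setTensorDescExample() {
    cudnnTensorDescriptor_t desc;
    checkCUDNN(cudnnCreateTensorDescriptor(&desc));
    cudnnTensorFormat_t fmt = CUDNN_TENSOR_NCHW;
    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    /* For a packed NCHW tensor of shape (n, c, h, w) the element strides are
       (c*h*w, h*w, w, 1), which is exactly what the ND and Ex branches pass explicitly. */
    setTensorDesc(desc, fmt, dtype, 8, 3, 224, 224);
    checkCUDNN(cudnnDestroyTensorDescriptor(desc));
}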
void softmax_layer_tester_cuda::enqueue_test( cudaStream_t stream_id, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom, cuda_linear_buffer_device_smart_ptr input_buffer, const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers, unsigned int entry_count) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_safe_call(cudnnSetTensor4dDescriptor( input_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, input_configuration_specific.feature_map_count, (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1, input_configuration_specific.dimension_sizes[0])); float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnSoftmaxForward( cuda_config->get_cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, input_data_desc, *input_buffer, &beta, input_data_desc, *additional_buffers[0])); }
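The tester above only needs the forward pass. For completeness, the matching gradient call in cuDNN is cudnnSoftmaxBackward; the sketch below reuses the descriptor set up above, but the output_errors_buffer and input_errors_buffer names are assumptions for illustration and do not appear in the tester code.

/* Illustrative only: backward counterpart to the cudnnSoftmaxForward call above.
   The error-buffer names are hypothetical; a tester never runs this path. */
{
    float alpha = 1.0F;
    float beta = 0.0F;
    cudnn_safe_call(cudnnSoftmaxBackward(
        cuda_config->get_cudnn_handle(),
        CUDNN_SOFTMAX_ACCURATE,
        CUDNN_SOFTMAX_MODE_CHANNEL,
        &alpha,
        input_data_desc, /* yDesc: the softmax output computed above */
        *additional_buffers[0],
        input_data_desc, /* dyDesc: gradient w.r.t. the softmax output */
        *output_errors_buffer,
        &beta,
        input_data_desc, /* dxDesc: gradient w.r.t. the softmax input */
        *input_errors_buffer));
}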
void set_batch_network(network *net, int b) { net->batch = b; int i; for(i = 0; i < net->n; ++i){ net->layers[i].batch = b; #ifdef CUDNN if(net->layers[i].type == CONVOLUTIONAL){ cudnn_convolutional_setup(net->layers + i); } if(net->layers[i].type == DECONVOLUTIONAL){ layer *l = net->layers + i; cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, l->out_h, l->out_w); cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); } #endif } }
void convolution_1x1_layer_tester_cuda::tester_configured() { nnforge_shared_ptr<const convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const convolution_layer>(layer_schema); cudnn_safe_call(cudnnSetTensor4dDescriptor( bias_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_configuration_specific.feature_map_count, 1, 1)); }
void resize_deconvolutional_layer(layer *l, int h, int w) { l->h = h; l->w = w; l->out_h = (l->h) * l->stride + l->size/2 - l->pad; l->out_w = (l->w) * l->stride + l->size/2 - l->pad; l->outputs = l->out_h * l->out_w * l->out_c; l->inputs = l->w * l->h * l->c; l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float)); if(l->batch_normalize){ l->x = realloc(l->x, l->batch*l->outputs*sizeof(float)); l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float)); } #ifdef GPU cuda_free(l->delta_gpu); cuda_free(l->output_gpu); l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); if(l->batch_normalize){ cuda_free(l->x_gpu); cuda_free(l->x_norm_gpu); l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs); l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs); } #ifdef CUDNN cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); #endif #endif l->workspace_size = get_workspace_size(*l); }
void cudnn_convolutional_setup(layer *l) { cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); //cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l->weightDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dweightDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); }
void convolution_layer_updater_cuda::updater_configured() { nnforge_shared_ptr<const convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const convolution_layer>(layer_schema); window_sizes = layer_derived->window_sizes; zero_padding = layer_derived->left_zero_padding; for(int i = 0; i < window_sizes.size(); ++i) { if (zero_padding[i] != layer_derived->right_zero_padding[i]) throw neural_network_exception("cuDNN is not able to run convolution when left and right padding sizes don't match"); } cudnn_safe_call(cudnnSetFilter4dDescriptor( weights_desc, CUDNN_DATA_FLOAT, output_configuration_specific.feature_map_count, input_configuration_specific.feature_map_count, (window_sizes.size() > 1) ? window_sizes[1] : 1, window_sizes[0])); cudnn_safe_call(cudnnSetTensor4dDescriptor( bias_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_configuration_specific.feature_map_count, 1, 1)); cudnn_safe_call(cudnnSetConvolution2dDescriptor( convolution_desc, (zero_padding.size() > 1) ? zero_padding[1] : 0, zero_padding[0], 1, 1, 1, 1, CUDNN_CROSS_CORRELATION)); }
void convolution_layer_updater_cuda::enqueue_test( unsigned int offset_input_entry_id, cudaStream_t stream_id, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data, const std::vector<cuda_linear_buffer_device_smart_ptr>& data, const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom, const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers, std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects, unsigned int entry_count, bool force_deterministic) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_safe_call(cudnnSetTensor4dDescriptor( input_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, input_configuration_specific.feature_map_count, (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1, input_configuration_specific.dimension_sizes[0])); cudnn_safe_call(cudnnSetTensor4dDescriptor( output_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, output_configuration_specific.feature_map_count, (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1, output_configuration_specific.dimension_sizes[0])); { cudnnConvolutionFwdAlgo_t algo; cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm( cuda_config->get_cudnn_handle(), input_data_desc, weights_desc, convolution_desc, output_data_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, additional_buffers[0]->get_size(), &algo)); float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnConvolutionForward( cuda_config->get_cudnn_handle(), &alpha, input_data_desc, (const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id, weights_desc, *data[0], convolution_desc, algo, *additional_buffers[0], additional_buffers[0]->get_size(), &beta, output_data_desc, *output_neurons_buffer)); } { float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), CUDNN_ADD_SAME_C, &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_neurons_buffer)); } }
void convolution_1x1_layer_tester_cuda::enqueue_test( cudaStream_t stream_id, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom, cuda_linear_buffer_device_smart_ptr input_buffer, const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers, unsigned int entry_count) { { cuda_util::transpose( *cuda_config, *input_buffer, *additional_buffers[1], input_elem_count_per_feature_map, input_configuration_specific.feature_map_count, entry_count, stream_id); cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cublas_safe_call(cublasSgemm( cuda_config->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, output_configuration_specific.feature_map_count, entry_count * input_elem_count_per_feature_map, input_configuration_specific.feature_map_count, &alpha, *data[0], input_configuration_specific.feature_map_count, *additional_buffers[1], input_configuration_specific.feature_map_count, &beta, *additional_buffers[2], output_configuration_specific.feature_map_count)); cuda_util::transpose( *cuda_config, *additional_buffers[2], *additional_buffers[0], output_configuration_specific.feature_map_count, output_elem_count_per_feature_map, entry_count, stream_id); } // Add bias { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_safe_call(cudnnSetTensor4dDescriptor( output_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, output_configuration_specific.feature_map_count, 1, output_elem_count_per_feature_map)); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), CUDNN_ADD_SAME_C, &alpha, bias_desc, *data[1], &beta, output_data_desc, *additional_buffers[0])); } }
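The CUDNN_ADD_SAME_C mode used above comes from the cuDNN v1/v2 cudnnAddTensor signature. From cuDNN 4 onward the mode argument is gone and broadcasting is inferred from the bias descriptor shape (1, C, 1, 1). A hedged sketch of the newer call, reusing the bias_desc/output_data_desc names and buffers from the snippet above:

/* Illustrative only: bias addition with the cuDNN 4+ cudnnAddTensor signature.
   Assumes bias_desc describes a (1, C, 1, 1) tensor and output_data_desc matches
   the convolution output, as configured above. */
{
    float alpha = 1.0F;
    float beta = 1.0F; /* accumulate into the existing convolution output */
    cudnn_safe_call(cudnnAddTensor(
        cuda_config->get_cudnn_handle(),
        &alpha,
        bias_desc,
        *data[1],
        &beta,
        output_data_desc,
        *additional_buffers[0]));
}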
void convolution_layer_updater_cuda::enqueue_update_weights( unsigned int offset_input_entry_id, cudaStream_t stream_id, const std::vector<cuda_linear_buffer_device_smart_ptr>& gradient, const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom, const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data, cuda_linear_buffer_device_smart_ptr output_errors_buffer, const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers, std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects, unsigned int entry_count, bool force_deterministic) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_safe_call(cudnnSetTensor4dDescriptor( input_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, input_configuration_specific.feature_map_count, (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1, input_configuration_specific.dimension_sizes[0])); cudnn_safe_call(cudnnSetTensor4dDescriptor( output_data_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, entry_count, output_configuration_specific.feature_map_count, (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1, output_configuration_specific.dimension_sizes[0])); { float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnConvolutionBackwardFilter( cuda_config->get_cudnn_handle(), &alpha, input_data_desc, (const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id, output_data_desc, *output_errors_buffer, convolution_desc, &beta, weights_desc, *gradient[0])); } { float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnConvolutionBackwardBias( cuda_config->get_cudnn_handle(), &alpha, output_data_desc, *output_errors_buffer, &beta, bias_desc, *gradient[1])); } }
layer make_batchnorm_layer(int batch, int w, int h, int c) { fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c); layer l = {}; l.type = BATCHNORM; l.batch = batch; l.h = l.out_h = h; l.w = l.out_w = w; l.c = l.out_c = c; l.output = (float*)calloc(h * w * c * batch, sizeof(float)); l.delta = (float*)calloc(h * w * c * batch, sizeof(float)); l.inputs = w*h*c; l.outputs = l.inputs; l.scales = (float*)calloc(c, sizeof(float)); l.scale_updates = (float*)calloc(c, sizeof(float)); l.biases = (float*)calloc(c, sizeof(float)); l.bias_updates = (float*)calloc(c, sizeof(float)); int i; for(i = 0; i < c; ++i){ l.scales[i] = 1; } l.mean = (float*)calloc(c, sizeof(float)); l.variance = (float*)calloc(c, sizeof(float)); l.rolling_mean = (float*)calloc(c, sizeof(float)); l.rolling_variance = (float*)calloc(c, sizeof(float)); l.forward = forward_batchnorm_layer; l.backward = backward_batchnorm_layer; #ifdef GPU l.forward_gpu = forward_batchnorm_layer_gpu; l.backward_gpu = backward_batchnorm_layer_gpu; l.output_gpu = cuda_make_array(l.output, h * w * c * batch); l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch); l.biases_gpu = cuda_make_array(l.biases, c); l.bias_updates_gpu = cuda_make_array(l.bias_updates, c); l.scales_gpu = cuda_make_array(l.scales, c); l.scale_updates_gpu = cuda_make_array(l.scale_updates, c); l.mean_gpu = cuda_make_array(l.mean, c); l.variance_gpu = cuda_make_array(l.variance, c); l.rolling_mean_gpu = cuda_make_array(l.mean, c); l.rolling_variance_gpu = cuda_make_array(l.variance, c); l.mean_delta_gpu = cuda_make_array(l.mean, c); l.variance_delta_gpu = cuda_make_array(l.variance, c); l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs); l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs); #ifdef CUDNN cudnnCreateTensorDescriptor(&l.normTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); #endif #endif return l; }
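The two descriptors created at the end of make_batchnorm_layer pair up in the cuDNN batch-norm call: with CUDNN_BATCHNORM_SPATIAL, statistics are per channel, so scale, bias, mean and variance all use the (1, out_c, 1, 1) normTensorDesc, while input and output use the full (batch, out_c, out_h, out_w) dstTensorDesc. A hedged sketch of that consuming call follows; the actual call lives in forward_batchnorm_layer_gpu, which is not shown here, and the momentum/epsilon constants are illustrative.

/* Hedged sketch of how the descriptors above are consumed during training.
   The 0.01 running-average factor and 1e-5 epsilon are illustrative values. */
{
    float one = 1.0f, zero = 0.0f;
    cudnnBatchNormalizationForwardTraining(cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        &one, &zero,
        l.dstTensorDesc, l.x_gpu,        /* input activations */
        l.dstTensorDesc, l.output_gpu,   /* normalized output */
        l.normTensorDesc,
        l.scales_gpu, l.biases_gpu,
        0.01, l.rolling_mean_gpu, l.rolling_variance_gpu,
        0.00001, l.mean_gpu, l.variance_gpu);
}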
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor) { int i; convolutional_layer l = {0}; l.type = CONVOLUTIONAL; l.h = h; l.w = w; l.c = c; l.n = n; l.binary = binary; l.xnor = xnor; l.batch = batch; l.stride = stride; l.size = size; l.pad = pad; l.batch_normalize = batch_normalize; l.filters = calloc(c*n*size*size, sizeof(float)); l.filter_updates = calloc(c*n*size*size, sizeof(float)); l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c)); for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1); int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); l.out_h = out_h; l.out_w = out_w; l.out_c = n; l.outputs = l.out_h * l.out_w * l.out_c; l.inputs = l.w * l.h * l.c; l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float)); if(binary){ l.binary_filters = calloc(c*n*size*size, sizeof(float)); l.cfilters = calloc(c*n*size*size, sizeof(char)); l.scales = calloc(n, sizeof(float)); } if(xnor){ l.binary_filters = calloc(c*n*size*size, sizeof(float)); l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); } if(batch_normalize){ l.scales = calloc(n, sizeof(float)); l.scale_updates = calloc(n, sizeof(float)); for(i = 0; i < n; ++i){ l.scales[i] = 1; } l.mean = calloc(n, sizeof(float)); l.variance = calloc(n, sizeof(float)); l.rolling_mean = calloc(n, sizeof(float)); l.rolling_variance = calloc(n, sizeof(float)); } #ifdef GPU l.filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size); l.biases_gpu = cuda_make_array(l.biases, n); l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); l.scales_gpu = cuda_make_array(l.scales, n); l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n); l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); if(binary){ l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); } if(xnor){ l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, n); l.variance_gpu = cuda_make_array(l.variance, n); l.rolling_mean_gpu = cuda_make_array(l.mean, n); l.rolling_variance_gpu = cuda_make_array(l.variance, n); l.mean_delta_gpu = cuda_make_array(l.mean, n); l.variance_delta_gpu = cuda_make_array(l.variance, n); l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.filterDesc); cudnnCreateTensorDescriptor(&l.dsrcTensorDesc); cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dfilterDesc); cudnnCreateConvolutionDescriptor(&l.convDesc); cudnnSetTensor4dDescriptor(l.dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); cudnnSetTensor4dDescriptor(l.ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetFilter4dDescriptor(l.dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w); cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetFilter4dDescriptor(l.filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l.n, l.c, l.size, l.size); int padding = l.pad ? l.size/2 : 0; cudnnSetConvolution2dDescriptor(l.convDesc, padding, padding, l.stride, l.stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l.srcTensorDesc, l.filterDesc, l.convDesc, l.dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l.fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l.filterDesc, l.ddstTensorDesc, l.convDesc, l.dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l.bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l.srcTensorDesc, l.ddstTensorDesc, l.convDesc, l.dfilterDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l.bf_algo); #endif #endif l.workspace_size = get_workspace_size(l); l.activation = activation; fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n); return l; }
void resize_convolutional_layer(convolutional_layer *l, int w, int h) { l->w = w; l->h = h; int out_w = convolutional_out_width(*l); int out_h = convolutional_out_height(*l); l->out_w = out_w; l->out_h = out_h; l->outputs = l->out_h * l->out_w * l->out_c; l->inputs = l->w * l->h * l->c; l->output = realloc(l->output, l->batch*out_h * out_w * l->n*sizeof(float)); l->delta = realloc(l->delta, l->batch*out_h * out_w * l->n*sizeof(float)); #ifdef GPU cuda_free(l->delta_gpu); cuda_free(l->output_gpu); l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n); l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n); #ifdef CUDNN cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); int padding = l->pad ? l->size/2 : 0; cudnnSetConvolution2dDescriptor(l->convDesc, padding, padding, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->filterDesc, l->convDesc, l->dstTensorDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), l->filterDesc, l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), l->srcTensorDesc, l->ddstTensorDesc, l->convDesc, l->dfilterDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); #endif #endif l->workspace_size = get_workspace_size(*l); }
void ConvBC01CuDNN<T>::fprop(const T *imgs, const T *filters, int n_imgs, int n_channels, int n_filters, int img_h, int img_w, int filter_h, int filter_w, T *convout) { bool set_conv_desc = false; if (n_imgs != this->n_imgs || n_channels != this->n_channels || img_h != this->img_h || img_w != this->img_w) { CUDNN_CHECK(cudnnSetTensor4dDescriptor( imgs_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n_imgs, n_channels, img_h, img_w )); this->n_imgs = n_imgs; this->n_channels = n_channels; this->img_h = img_h; this->img_w = img_w; set_conv_desc = true; } if (n_filters != this->n_filters || n_channels != this->n_channels || filter_h != this->filter_h || filter_w != this->filter_w) { CUDNN_CHECK(cudnnSetFilter4dDescriptor( filters_desc, CUDNN_DATA_FLOAT, n_filters, n_channels, filter_h, filter_w )); this->n_filters = n_filters; this->n_channels = n_channels; this->filter_h = filter_h; this->filter_w = filter_w; set_conv_desc = true; } if (set_conv_desc) { CUDNN_CHECK(cudnnSetConvolution2dDescriptor( conv_desc, pad_y, pad_x, stride_y, stride_x, 1, 1, CUDNN_CONVOLUTION )); int n, c, h, w; CUDNN_CHECK(cudnnGetConvolution2dForwardOutputDim( conv_desc, imgs_desc, filters_desc, &n, &c, &h, &w )); CUDNN_CHECK(cudnnSetTensor4dDescriptor( convout_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w )); const int n_requestedAlgo = 10; int n_returnedAlgo; cudnnConvolutionFwdAlgoPerf_t fwd_algo_perf[n_requestedAlgo]; CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc, n_requestedAlgo, &n_returnedAlgo, fwd_algo_perf )); if (n_returnedAlgo == 0) { throw std::runtime_error("No cudnnConvolutionFwdAlgoPerf_t found"); } fwd_algo = fwd_algo_perf[0].algo; cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_perf[n_requestedAlgo]; CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc, n_requestedAlgo, &n_returnedAlgo, bwd_data_algo_perf )); if (n_returnedAlgo == 0) { throw std::runtime_error("No cudnnConvolutionBwdDataAlgoPerf_t found"); } bwd_imgs_algo = bwd_data_algo_perf[0].algo; cudnnConvolutionBwdFilterAlgoPerf_t bwd_filters_algo_perf[n_requestedAlgo]; CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc, n_requestedAlgo, &n_returnedAlgo, bwd_filters_algo_perf )); if (n_returnedAlgo == 0) { throw std::runtime_error("No cudnnConvolutionBwdFilterAlgoPerf_t found"); } bwd_filters_algo = bwd_filters_algo_perf[0].algo; size_t fwd_workspace_size; size_t bwd_imgs_workspace_size; size_t bwd_filters_workspace_size; CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( CUDNN::handle(), imgs_desc, filters_desc, conv_desc, convout_desc, fwd_algo, &fwd_workspace_size )); CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( CUDNN::handle(), filters_desc, convout_desc, conv_desc, imgs_desc, bwd_imgs_algo, &bwd_imgs_workspace_size )); CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( CUDNN::handle(), imgs_desc, convout_desc, conv_desc, filters_desc, bwd_filters_algo, &bwd_filters_workspace_size )); workspace_size = std::max(fwd_workspace_size, bwd_imgs_workspace_size); workspace_size = std::max(workspace_size, bwd_filters_workspace_size); } void *workspace = NULL; if (workspace_size > 0) { workspace = CUDA::buffer(workspace_size); } CUDNN_CHECK(cudnnConvolutionForward( CUDNN::handle(), &CUDNN::one, imgs_desc, imgs, filters_desc, filters, conv_desc, fwd_algo, workspace, workspace_size, &CUDNN::zero, convout_desc, convout 
)); }
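This second fprop variant replaces the heuristic cudnnGetConvolutionForwardAlgorithm query used earlier with cudnnFindConvolution*Algorithm, which benchmarks candidate algorithms and returns them sorted by measured time, then takes perf[0] unconditionally. A small hedged variant of that selection step is sketched below: it picks the fastest algorithm whose measured workspace fits a caller-supplied byte limit. It reuses the CUDNN_CHECK macro and exception style from the snippet above and assumes <stdexcept> is included; the function name and parameters are illustrative.

/* Illustrative helper: fastest forward algorithm whose workspace fits a limit. */
cudnnConvolutionFwdAlgo_t pick_fwd_algo_within_limit(cudnnHandle_t handle,
        cudnnTensorDescriptor_t x_desc, cudnnFilterDescriptor_t w_desc,
        cudnnConvolutionDescriptor_t conv_desc, cudnnTensorDescriptor_t y_desc,
        size_t workspace_limit) {
    const int n_requested = 10;
    cudnnConvolutionFwdAlgoPerf_t perf[10];
    int n_returned = 0;
    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
        handle, x_desc, w_desc, conv_desc, y_desc, n_requested, &n_returned, perf));
    for (int i = 0; i < n_returned; ++i) {
        /* Entries come back sorted by measured time; take the first successful one that fits. */
        if (perf[i].status == CUDNN_STATUS_SUCCESS && perf[i].memory <= workspace_limit)
            return perf[i].algo;
    }
    throw std::runtime_error("No forward algorithm fits the workspace limit");
}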
THFloatTensor *cudnn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input) { int kW = module->SpatialConvolution.kW; int kH = module->SpatialConvolution.kH; int dW = module->SpatialConvolution.dW; int dH = module->SpatialConvolution.dH; int padW = module->SpatialConvolution.padW; int padH = module->SpatialConvolution.padH; int nInputPlane = module->SpatialConvolution.nInputPlane; int nOutputPlane = module->SpatialConvolution.nOutputPlane; THFloatTensor *weight = module->SpatialConvolution.weight; THFloatTensor *bias = module->SpatialConvolution.bias; THFloatTensor *output = module->output; int sizes[4]; int pad[2], filterStride[2], upscale[2]; cudnnTensorDescriptor_t dinput, dbias, doutput; cudnnConvolutionDescriptor_t dconv; cudnnFilterDescriptor_t dweight; float one = 1, zero = 0; size_t reqwssize; static void *ws; static size_t wssize; static const int alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; pad[0] = padH; pad[1] = padW; filterStride[0] = dH; filterStride[1] = dW; upscale[0] = 1; upscale[1] = 1; if(input->nDimension <= 2) { // Here we use the SpatialConvolution module to perform a linear transformation errcheck(cudnnCreateTensorDescriptor(&dinput)); if(input->nDimension == 1) errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, 1, input->size[0], 1, 1)); else errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, input->size[0], input->size[1], 1, 1)); } else errcheck(THcudnn_TensorDescriptor(&dinput, input)); errcheck(cudnnCreateFilterDescriptor(&dweight)); errcheck(cudnnSetFilter4dDescriptor(dweight, floattype, nOutputPlane, nInputPlane, kH, kW)); errcheck(cudnnCreateTensorDescriptor(&dbias)); errcheck(cudnnSetTensor4dDescriptor(dbias, CUDNN_TENSOR_NCHW, floattype, 1, bias->size[0], 1, 1)); errcheck(cudnnCreateConvolutionDescriptor(&dconv)); errcheck(cudnnSetConvolutionNdDescriptor(dconv, 2, pad, filterStride, upscale, CUDNN_CROSS_CORRELATION, floattype)); errcheck(cudnnGetConvolutionNdForwardOutputDim(dconv, dinput, dweight, 4, sizes)); THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]); errcheck(THcudnn_TensorDescriptor(&doutput, output)); if(alg == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { errcheck(cudnnGetConvolutionForwardWorkspaceSize(THcudnn_getHandle(), dinput, dweight, dconv, doutput, alg, &reqwssize)); if(reqwssize > wssize) { wssize = reqwssize; errcheck(cudaMalloc(&ws, reqwssize)); } } errcheck(cudnnConvolutionForward(THcudnn_getHandle(), &one, dinput, THFloatTensor_data(input), dweight, THFloatTensor_data(weight), dconv, alg, ws, wssize, &zero, doutput, THFloatTensor_data(output))); errcheck(cudnnAddTensor_v3(THcudnn_getHandle(), &one, dbias, THFloatTensor_data(bias), &one, doutput, THFloatTensor_data(output))); cudnnDestroyTensorDescriptor(dinput); cudnnDestroyFilterDescriptor(dweight); cudnnDestroyTensorDescriptor(dbias); cudnnDestroyTensorDescriptor(doutput); cudnnDestroyConvolutionDescriptor(dconv); return output; }
layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam) { fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs); batch = batch / steps; layer l = { 0 }; l.batch = batch; l.type = LSTM; l.steps = steps; l.inputs = inputs; l.uf = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.uf) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uf->batch = batch; l.ui = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.ui) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ui->batch = batch; l.ug = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.ug) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ug->batch = batch; l.uo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.uo) = make_connected_layer(batch * steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uo->batch = batch; l.wf = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.wf) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wf->batch = batch; l.wi = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.wi) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wi->batch = batch; l.wg = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.wg) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wg->batch = batch; l.wo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.wo) = make_connected_layer(batch * steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wo->batch = batch; l.batch_normalize = batch_normalize; l.outputs = outputs; l.output = calloc(outputs * batch * steps, sizeof(real_t)); l.state = calloc(outputs * batch, sizeof(real_t)); l.forward = forward_lstm_layer; l.update = update_lstm_layer; l.prev_state_cpu = calloc(batch * outputs, sizeof(real_t)); l.prev_cell_cpu = calloc(batch * outputs, sizeof(real_t)); l.cell_cpu = calloc(batch * outputs * steps, sizeof(real_t)); l.f_cpu = calloc(batch * outputs, sizeof(real_t)); l.i_cpu = calloc(batch * outputs, sizeof(real_t)); l.g_cpu = calloc(batch * outputs, sizeof(real_t)); l.o_cpu = calloc(batch * outputs, sizeof(real_t)); l.c_cpu = calloc(batch * outputs, sizeof(real_t)); l.h_cpu = calloc(batch * outputs, sizeof(real_t)); l.temp_cpu = calloc(batch * outputs, sizeof(real_t)); l.temp2_cpu = calloc(batch * outputs, sizeof(real_t)); l.temp3_cpu = calloc(batch * outputs, sizeof(real_t)); l.dc_cpu = calloc(batch * outputs, sizeof(real_t)); l.dh_cpu = calloc(batch * outputs, sizeof(real_t)); #ifdef GPU l.forward_gpu = forward_lstm_layer_gpu; l.backward_gpu = backward_lstm_layer_gpu; l.update_gpu = update_lstm_layer_gpu; l.output_gpu = cuda_make_array(0, batch * outputs * steps); l.delta_gpu = cuda_make_array(0, batch * l.outputs * steps); l.prev_state_gpu = cuda_make_array(0, batch * outputs); l.prev_cell_gpu = cuda_make_array(0, batch * outputs); l.cell_gpu = cuda_make_array(0, batch * outputs * steps); l.f_gpu = cuda_make_array(0, batch * outputs); l.i_gpu = cuda_make_array(0, batch * outputs); l.g_gpu = cuda_make_array(0, batch * outputs); l.o_gpu = cuda_make_array(0, batch * outputs); l.c_gpu = cuda_make_array(0, batch * outputs); l.h_gpu = cuda_make_array(0, batch * outputs); l.temp_gpu = cuda_make_array(0, batch * outputs); l.temp2_gpu = cuda_make_array(0, batch * outputs); l.temp3_gpu = cuda_make_array(0, batch * outputs); l.dc_gpu = cuda_make_array(0, batch * outputs); l.dh_gpu = 
cuda_make_array(0, batch * outputs); #ifdef CUDNN cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w); cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w); cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w); cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w); cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w); cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w); cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w); cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w); #endif #endif return l; }
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize) { int i; layer l = {0}; l.type = DECONVOLUTIONAL; l.h = h; l.w = w; l.c = c; l.n = n; l.batch = batch; l.stride = stride; l.size = size; l.weights = calloc(c*n*size*size, sizeof(float)); l.weight_updates = calloc(c*n*size*size, sizeof(float)); l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); float scale = 1./sqrt(size*size*c); for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal(); for(i = 0; i < n; ++i){ l.biases[i] = scale; } l.pad = l.size/2; l.out_h = (l.h) * l.stride + l.size/2 - l.pad; l.out_w = (l.w) * l.stride + l.size/2 - l.pad; l.out_c = n; l.outputs = l.out_w * l.out_h * l.out_c; l.inputs = l.w * l.h * l.c; l.output = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float)); l.delta = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float)); l.forward = forward_deconvolutional_layer; l.backward = backward_deconvolutional_layer; l.update = update_deconvolutional_layer; l.batch_normalize = batch_normalize; if(batch_normalize){ l.scales = calloc(n, sizeof(float)); l.scale_updates = calloc(n, sizeof(float)); for(i = 0; i < n; ++i){ l.scales[i] = 1; } l.mean = calloc(n, sizeof(float)); l.variance = calloc(n, sizeof(float)); l.mean_delta = calloc(n, sizeof(float)); l.variance_delta = calloc(n, sizeof(float)); l.rolling_mean = calloc(n, sizeof(float)); l.rolling_variance = calloc(n, sizeof(float)); l.x = calloc(l.batch*l.outputs, sizeof(float)); l.x_norm = calloc(l.batch*l.outputs, sizeof(float)); } #ifdef GPU l.forward_gpu = forward_deconvolutional_layer_gpu; l.backward_gpu = backward_deconvolutional_layer_gpu; l.update_gpu = update_deconvolutional_layer_gpu; if(gpu_index >= 0){ l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size); l.biases_gpu = cuda_make_array(l.biases, n); l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n); l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n); if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, n); l.variance_gpu = cuda_make_array(l.variance, n); l.rolling_mean_gpu = cuda_make_array(l.mean, n); l.rolling_variance_gpu = cuda_make_array(l.variance, n); l.mean_delta_gpu = cuda_make_array(l.mean, n); l.variance_delta_gpu = cuda_make_array(l.variance, n); l.scales_gpu = cuda_make_array(l.scales, n); l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); l.x_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n); l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n); } } #ifdef CUDNN cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateTensorDescriptor(&l.normTensorDesc); cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); #endif #endif l.activation = activation; l.workspace_size = get_workspace_size(l); fprintf(stderr, "deconv%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); return l; }