// Allocates the cuDNN tensor descriptors used for the bias-add step of the
// forward pass; the descriptors are shaped per call once entry_count is known.
sparse_fully_connected_1x1_layer_tester_cuda::sparse_fully_connected_1x1_layer_tester_cuda()
	: output_data_desc(0)
	, bias_desc(0)
{
	cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc));
}
// Forward pass: channel-wise softmax of input_buffers[0] into output_buffer.
void softmax_layer_tester_cuda::enqueue_forward_propagation(
	cudaStream_t stream_id,
	cuda_linear_buffer_device::ptr output_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	unsigned int entry_count)
{
	// Bind cuDNN work to the caller-supplied stream.
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

	// Softmax preserves shape, so one descriptor serves both input and output.
	cudnn_util::set_tensor_descriptor(
		input_data_desc,
		output_configuration_specific,
		entry_count);

	float alpha = 1.0F;
	float beta = 0.0F; // overwrite the output buffer, no accumulation
	// MODE_CHANNEL: normalization runs across feature maps at each position.
	cudnn_safe_call(cudnnSoftmaxForward(
		cuda_config->get_cudnn_handle(),
		CUDNN_SOFTMAX_ACCURATE,
		CUDNN_SOFTMAX_MODE_CHANNEL,
		&alpha,
		input_data_desc,
		*input_buffers[0],
		&beta,
		input_data_desc,
		*output_buffer));
}
// Creates the two cuDNN tensor descriptors (output and bias) consumed by the
// bias-add stage; their shapes are filled in later, per invocation.
convolution_1x1_layer_tester_cuda::convolution_1x1_layer_tester_cuda()
	: output_data_desc(0)
	, bias_desc(0)
{
	cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc));
}
// Legacy tester entry point: channel-wise softmax of input_buffer written
// into additional_buffers[0], using an explicit 4d NCHW descriptor.
void softmax_layer_tester_cuda::enqueue_test(
	cudaStream_t stream_id,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
	cuda_linear_buffer_device_smart_ptr input_buffer,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
	unsigned int entry_count)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
	// N = entry_count, C = feature maps, H = 2nd dimension (1 for 1-D data),
	// W = 1st dimension.
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		input_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		input_configuration_specific.feature_map_count,
		(input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
		input_configuration_specific.dimension_sizes[0]));

	float alpha = 1.0F;
	float beta = 0.0F; // overwrite destination
	cudnn_safe_call(cudnnSoftmaxForward(
		cuda_config->get_cudnn_handle(),
		CUDNN_SOFTMAX_ACCURATE,
		CUDNN_SOFTMAX_MODE_CHANNEL,
		&alpha,
		input_data_desc,
		*input_buffer,
		&beta,
		input_data_desc,
		*additional_buffers[0]));
}
// af: the cuDNN activation mode this updater applies (passed straight to
// cudnnSetActivationDescriptor; NaNs are not propagated, coef is 0).
activation_layer_cudnn_updater_cuda::activation_layer_cudnn_updater_cuda(cudnnActivationMode_t af)
	: input_data_desc(0)
	, activation_desc(0)
{
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_data_desc));
	cudnn_safe_call(cudnnCreateActivationDescriptor(&activation_desc));
	// Fix: this call's status was previously ignored; wrap it in
	// cudnn_safe_call for consistency with every other cuDNN call here.
	cudnn_safe_call(cudnnSetActivationDescriptor(activation_desc, af, CUDNN_NOT_PROPAGATE_NAN, 0.0F));
}
void sparse_fully_connected_1x1_layer_tester_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, unsigned int entry_count) { { cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cusparseMatDescr_t mat_descr; cusparse_safe_call(cusparseCreateMatDescr(&mat_descr)); cusparse_safe_call(cusparseScsrmm( cuda_config->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, output_elem_count_per_entry, entry_count, input_elem_count_per_entry_list[0], feature_map_connection_count, &alpha, mat_descr, *data[0], *data_custom[1], *data_custom[0], *input_buffers[0], input_elem_count_per_entry_list[0], &beta, *output_buffer, output_elem_count_per_entry)); } // Add bias { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }
// Accumulates weight and (optionally) bias gradients for a fully-connected
// layer: gradient[0] += input * output_errors^T (summed over entries via one
// SGEMM), gradient[1] += per-feature-map sum of output errors.
void fully_connected_layer_updater_cuda::enqueue_backward_weights_propagation(
	cudaStream_t stream_id,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::ptr>& gradient,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
	cuda_linear_buffer_device::const_ptr output_errors_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
	cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
	unsigned int entry_count)
{
	// Update weights
	{
		cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate into the existing gradient buffer
		// Column-major GEMM: (input_elem x entry) * (output_elem x entry)^T
		// -> input_elem x output_elem, matching the weight layout.
		cublas_safe_call(cublasSgemm(
			cuda_config->get_cublas_handle(),
			CUBLAS_OP_N,
			CUBLAS_OP_T,
			input_elem_count_per_entry_list[0],
			output_elem_count_per_entry,
			entry_count,
			&alpha,
			*input_neurons_buffers[0],
			input_elem_count_per_entry_list[0],
			*output_errors_buffer,
			output_elem_count_per_entry,
			&beta,
			*gradient[0],
			input_elem_count_per_entry_list[0]));
	}

	// Update biases
	if (bias)
	{
		cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
		cudnn_util::set_tensor_descriptor(
			output_data_desc,
			output_configuration_specific,
			entry_count);
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate bias gradient as well
		cudnn_safe_call(cudnnConvolutionBackwardBias(
			cuda_config->get_cudnn_handle(),
			&alpha,
			output_data_desc,
			*output_errors_buffer,
			&beta,
			bias_desc,
			*gradient[1]));
	}
}
// Creates the five cuDNN tensor descriptors used by the sparse 1x1 tester.
// Fix: the original initializer list zero-initialized only output_data_desc
// and bias_desc; the three input descriptors were left uninitialized, so a
// throwing cudnn_safe_call mid-construction could leave garbage handles for
// the destructor to free. All five are now zeroed first.
// NOTE(review): initializer-list order should match the member declaration
// order in the header — confirm against the class definition.
sparse_1x1_layer_tester_cuda::sparse_1x1_layer_tester_cuda()
	: input_strided_data_desc(0)
	, input_converted_NHWC_data_desc(0)
	, input_converted_CNHW_data_desc(0)
	, output_data_desc(0)
	, bias_desc(0)
{
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_strided_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_converted_NHWC_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_converted_CNHW_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc));
}
// Forward pass of a fully-connected layer: one SGEMM over the whole batch
// (output = W^T * input) followed by an optional broadcast bias add.
void fully_connected_layer_updater_cuda::enqueue_forward_propagation(
	cudaStream_t stream_id,
	cuda_linear_buffer_device::ptr output_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	cuda_linear_buffer_device::ptr temporary_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_per_entry_buffer,
	unsigned int entry_count)
{
	{
		cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
		float alpha = 1.0F;
		float beta = 0.0F; // overwrite output before the bias add
		// Column-major GEMM: weights (input_elem x output_elem) transposed,
		// times input (input_elem x entry) -> output (output_elem x entry).
		cublas_safe_call(cublasSgemm(
			cuda_config->get_cublas_handle(),
			CUBLAS_OP_T,
			CUBLAS_OP_N,
			output_elem_count_per_entry,
			entry_count,
			input_elem_count_per_entry_list[0],
			&alpha,
			*data[0],
			input_elem_count_per_entry_list[0],
			*input_buffers[0],
			input_elem_count_per_entry_list[0],
			&beta,
			*output_buffer,
			output_elem_count_per_entry));
	}

	if (bias)
	{
		cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
		cudnn_util::set_tensor_descriptor(
			output_data_desc,
			output_configuration_specific,
			entry_count);
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate bias into the GEMM result
		cudnn_safe_call(cudnnAddTensor(
			cuda_config->get_cudnn_handle(),
			&alpha,
			bias_desc,
			*data[1],
			&beta,
			output_data_desc,
			*output_buffer));
	}
}
// Legacy updater entry point: back-propagates output errors through the
// convolution into input_errors_buffer. Uses the pre-v3 cuDNN
// cudnnConvolutionBackwardData signature (no explicit algo/workspace).
void convolution_layer_updater_cuda::enqueue_backprop(
	cudaStream_t stream_id,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& data,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
	const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
	const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
	cuda_linear_buffer_device_smart_ptr output_errors_buffer,
	cuda_linear_buffer_device_smart_ptr input_errors_buffer,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
	std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
	unsigned int entry_count,
	bool force_deterministic)
{
	// Guard: this layer instance may have been configured forward-only.
	if (!backprop_required)
		throw neural_network_exception("convolution_layer_updater_cuda is not configured to do backprop but requested to");

	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

	// 4d NCHW descriptors; H falls back to 1 for 1-D data.
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		input_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		input_configuration_specific.feature_map_count,
		(input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
		input_configuration_specific.dimension_sizes[0]));
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		output_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		output_configuration_specific.feature_map_count,
		(output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
		output_configuration_specific.dimension_sizes[0]));

	{
		float alpha = 1.0F;
		float beta = 0.0F; // overwrite input errors
		cudnn_safe_call(cudnnConvolutionBackwardData(
			cuda_config->get_cudnn_handle(),
			&alpha,
			weights_desc,
			*data[0],
			output_data_desc,
			*output_errors_buffer,
			convolution_desc,
			&beta,
			input_data_desc,
			*input_errors_buffer));
	}
}
// Zero-initializes and then creates all cuDNN descriptors used by the
// convolution updater: input/output/bias tensors, the filter, and the
// convolution itself. Shapes are configured later in updater_configured()
// and per-invocation.
convolution_layer_updater_cuda::convolution_layer_updater_cuda()
	: input_data_desc(0)
	, output_data_desc(0)
	, weights_desc(0)
	, convolution_desc(0)
	, bias_desc(0)
{
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&output_data_desc));
	cudnn_safe_call(cudnnCreateTensorDescriptor(&bias_desc));
	cudnn_safe_call(cudnnCreateFilterDescriptor(&weights_desc));
	cudnn_safe_call(cudnnCreateConvolutionDescriptor(&convolution_desc));
}
// Back-propagates errors through the element-wise activation. All four
// tensors (output neurons, output errors, input neurons, input errors)
// share one descriptor since the activation preserves shape.
void activation_layer_cudnn_updater_cuda::enqueue_backward_data_propagation(
	cudaStream_t stream_id,
	unsigned int input_index,
	cuda_linear_buffer_device::ptr input_errors_buffer,
	cuda_linear_buffer_device::const_ptr output_errors_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
	cuda_linear_buffer_device::const_ptr output_neurons_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
	cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
	bool add_update_to_destination,
	unsigned int entry_count)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
	cudnn_util::set_tensor_descriptor(
		input_data_desc,
		output_configuration_specific,
		entry_count);

	float alpha = 1.0F;
	// beta = 1 accumulates into input_errors_buffer, 0 overwrites it.
	float beta = add_update_to_destination ? 1.0F : 0.0F;
	cudnn_safe_call(cudnnActivationBackward_v4(
		cuda_config->get_cudnn_handle(),
		activation_desc,
		&alpha,
		input_data_desc,
		*output_neurons_buffer,
		input_data_desc,
		*output_errors_buffer,
		input_data_desc,
		*input_neurons_buffers[0],
		&beta,
		input_data_desc,
		*input_errors_buffer));
}
void convolution_layer_updater_cuda::updater_configured() { nnforge_shared_ptr<const convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const convolution_layer>(layer_schema); window_sizes = layer_derived->window_sizes; zero_padding = layer_derived->left_zero_padding; for(int i = 0; i < window_sizes.size(); ++i) { if (zero_padding[i] != layer_derived->right_zero_padding[i]) throw neural_network_exception("cuDNN is not able to run convolution when left and right padding sizes don't match"); } cudnn_safe_call(cudnnSetFilter4dDescriptor( weights_desc, CUDNN_DATA_FLOAT, output_configuration_specific.feature_map_count, input_configuration_specific.feature_map_count, (window_sizes.size() > 1) ? window_sizes[1] : 1, window_sizes[0])); cudnn_safe_call(cudnnSetTensor4dDescriptor( bias_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_configuration_specific.feature_map_count, 1, 1)); cudnn_safe_call(cudnnSetConvolution2dDescriptor( convolution_desc, (zero_padding.size() > 1) ? zero_padding[1] : 1, zero_padding[0], 1, 1, 1, 1, CUDNN_CROSS_CORRELATION)); }
// Configures the bias tensor descriptor (1 x feature_map_count x 1 x 1) so
// the bias is broadcast over every spatial position and entry.
// Fix: removed the layer_derived local — it was computed via
// nnforge_dynamic_pointer_cast and never used.
void convolution_1x1_layer_tester_cuda::tester_configured()
{
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		bias_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		1,
		output_configuration_specific.feature_map_count,
		1,
		1));
}
// Configures an N-d pooling descriptor. cuDNN expects spatial dimensions in
// reverse (slowest-varying first) order, hence the reversed copy. Padding is
// all zeros, and the stride array is the window array itself — i.e. the
// pooling windows tile the input without overlap.
void cudnn_util::set_pooling_descriptor(
	cudnnPoolingDescriptor_t pooling_desc,
	cudnnPoolingMode_t pooling_mode,
	const std::vector<unsigned int>& subsampling_sizes)
{
	std::vector<int> window_dims(subsampling_sizes.rbegin(), subsampling_sizes.rend());
	std::vector<int> no_padding(subsampling_sizes.size(), 0);
	const int dim_count = static_cast<int>(subsampling_sizes.size());

	cudnn_safe_call(cudnnSetPoolingNdDescriptor(
		pooling_desc,
		pooling_mode,
		dim_count,
		&window_dims[0],
		&no_padding[0],
		&window_dims[0]));
}
// Configures an N-d cross-correlation descriptor with unit stride and unit
// upscale; only the zero padding varies, supplied in reverse (slowest-varying
// first) order as cuDNN expects.
// NOTE(review): zero_padding is taken by value; a const reference would avoid
// a copy — confirm against the declaration in the header before changing.
void cudnn_util::set_convolution_descriptor(
	cudnnConvolutionDescriptor_t convolution_desc,
	const std::vector<unsigned int> zero_padding)
{
	const int dim_count = static_cast<int>(zero_padding.size());
	std::vector<int> reversed_padding(zero_padding.rbegin(), zero_padding.rend());
	std::vector<int> unit_stride(zero_padding.size(), 1);
	std::vector<int> unit_upscale(zero_padding.size(), 1);

	cudnn_safe_call(cudnnSetConvolutionNdDescriptor_v3(
		convolution_desc,
		dim_count,
		&reversed_padding[0],
		&unit_stride[0],
		&unit_upscale[0],
		CUDNN_CROSS_CORRELATION,
		CUDNN_DATA_FLOAT));
}
// Configures an N-d filter descriptor. cuDNN's layout is
// [output maps, input maps, spatial dims slowest-varying first], so the
// window sizes are copied in reverse order after the two map counts.
void cudnn_util::set_filter_descriptor(
	cudnnFilterDescriptor_t filter_desc,
	unsigned int output_feature_map_count,
	unsigned int input_feature_map_count,
	const std::vector<unsigned int>& windows_sizes)
{
	const unsigned int spatial_dim_count = static_cast<unsigned int>(windows_sizes.size());
	std::vector<int> dims(spatial_dim_count + 2);
	dims[0] = output_feature_map_count;
	dims[1] = input_feature_map_count;
	for(unsigned int i = 0; i < spatial_dim_count; ++i)
		dims[i + 2] = windows_sizes[spatial_dim_count - 1 - i];

	cudnn_safe_call(cudnnSetFilterNdDescriptor(
		filter_desc,
		CUDNN_DATA_FLOAT,
		static_cast<int>(dims.size()),
		&dims[0]));
}
// Configures a fully-packed N-d tensor descriptor with layout
// [entry, feature map, spatial dims slowest-varying first]: strides are
// computed so the innermost dimension is contiguous and each outer stride
// is the product of all inner dimension sizes.
void cudnn_util::set_tensor_descriptor(
	cudnnTensorDescriptor_t tensor_desc,
	const layer_configuration_specific& config,
	unsigned int entry_count)
{
	const unsigned int spatial_dim_count = static_cast<unsigned int>(config.dimension_sizes.size());
	std::vector<int> dims(spatial_dim_count + 2);
	dims[0] = entry_count;
	dims[1] = config.feature_map_count;
	for(unsigned int i = 0; i < spatial_dim_count; ++i)
		dims[i + 2] = config.dimension_sizes[spatial_dim_count - 1 - i];

	std::vector<int> strides(dims.size());
	strides[strides.size() - 1] = 1;
	for(int i = static_cast<int>(strides.size()) - 2; i >= 0; --i)
		strides[i] = strides[i + 1] * dims[i + 1];

	cudnn_safe_call(cudnnSetTensorNdDescriptor(
		tensor_desc,
		CUDNN_DATA_FLOAT,
		static_cast<int>(dims.size()),
		&dims[0],
		&strides[0]));
}
// Back-propagates output errors through the convolution into
// input_errors_buffer, picking the backward-data algorithm via the
// cuda_config helper (which may benchmark candidates) under the available
// workspace budget.
void convolution_layer_updater_cuda::enqueue_backward_data_propagation(
	cudaStream_t stream_id,
	unsigned int input_index,
	cuda_linear_buffer_device::ptr input_errors_buffer,
	cuda_linear_buffer_device::const_ptr output_errors_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
	cuda_linear_buffer_device::const_ptr output_neurons_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
	cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
	bool add_update_to_destination,
	unsigned int entry_count)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
	cudnn_util::set_tensor_descriptor(
		input_data_desc,
		input_configuration_specific_list[0],
		entry_count);
	cudnn_util::set_tensor_descriptor(
		output_data_desc,
		output_configuration_specific,
		entry_count);

	{
		// Workspace is optional — absent buffer means zero-size workspace.
		void * workspace = 0;
		size_t workspace_size = 0;
		if (temporary_working_fixed_buffer)
		{
			workspace = *temporary_working_fixed_buffer;
			workspace_size = temporary_working_fixed_buffer->get_size();
		}

		cudnnConvolutionBwdDataAlgo_t algo = cuda_config->cudnn_find_convolution_backward_data_algo(
			input_data_desc,
			weights_desc,
			convolution_desc,
			output_data_desc,
			*output_errors_buffer,
			*data[0],
			*temporary_working_per_entry_buffer,
			workspace,
			workspace_size);

		float alpha = 1.0F;
		// beta = 1 accumulates into input_errors_buffer, 0 overwrites it.
		float beta = (add_update_to_destination ? 1.0F : 0.0F);
		cudnn_safe_call(cudnnConvolutionBackwardData(
			cuda_config->get_cudnn_handle(),
			&alpha,
			weights_desc,
			*data[0],
			output_data_desc,
			*output_errors_buffer,
			convolution_desc,
			algo,
			workspace,
			workspace_size,
			&beta,
			input_data_desc,
			*input_errors_buffer));
	}
}
// Allocates the single tensor descriptor the softmax tester needs; softmax
// preserves shape, so the same descriptor is reused for input and output.
softmax_layer_tester_cuda::softmax_layer_tester_cuda()
	: input_data_desc(0)
{
	// Shape is configured per call; only the handle is created here.
	cudnn_safe_call(cudnnCreateTensorDescriptor(&input_data_desc));
}
// Legacy updater forward pass: convolution (algorithm picked by cuDNN under
// the workspace limit of additional_buffers[0]) followed by a bias add using
// the old cudnnAddTensor signature that takes an explicit add mode.
void convolution_layer_updater_cuda::enqueue_test(
	unsigned int offset_input_entry_id,
	cudaStream_t stream_id,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& data,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
	const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
	cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
	std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
	unsigned int entry_count,
	bool force_deterministic)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

	// 4d NCHW descriptors; H falls back to 1 for 1-D data.
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		input_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		input_configuration_specific.feature_map_count,
		(input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
		input_configuration_specific.dimension_sizes[0]));
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		output_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		output_configuration_specific.feature_map_count,
		(output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
		output_configuration_specific.dimension_sizes[0]));

	{
		// Let cuDNN pick the fastest forward algorithm that fits in the
		// workspace buffer (additional_buffers[0]).
		cudnnConvolutionFwdAlgo_t algo;
		cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm(
			cuda_config->get_cudnn_handle(),
			input_data_desc,
			weights_desc,
			convolution_desc,
			output_data_desc,
			CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
			additional_buffers[0]->get_size(),
			&algo));

		float alpha = 1.0F;
		float beta = 0.0F; // overwrite output before bias add
		// Input pointer is offset by whole entries to support chunked input.
		cudnn_safe_call(cudnnConvolutionForward(
			cuda_config->get_cudnn_handle(),
			&alpha,
			input_data_desc,
			(const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id,
			weights_desc,
			*data[0],
			convolution_desc,
			algo,
			*additional_buffers[0],
			additional_buffers[0]->get_size(),
			&beta,
			output_data_desc,
			*output_neurons_buffer));
	}

	{
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate bias into the convolution result
		// CUDNN_ADD_SAME_C broadcasts the per-feature-map bias (old cuDNN API).
		cudnn_safe_call(cudnnAddTensor(
			cuda_config->get_cudnn_handle(),
			CUDNN_ADD_SAME_C,
			&alpha,
			bias_desc,
			*data[1],
			&beta,
			output_data_desc,
			*output_neurons_buffer));
	}
}
// Forward pass of the convolution tester: descriptors are shaped for this
// entry count, the forward algorithm is chosen by the cuda_config helper
// within the optional workspace budget, then convolution and an optional
// broadcast bias add are enqueued.
void convolution_layer_tester_cuda::enqueue_forward_propagation(
	cudaStream_t stream_id,
	cuda_linear_buffer_device::ptr output_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	unsigned int entry_count)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
	cudnn_util::set_tensor_descriptor(
		input_data_desc,
		input_configuration_specific_list[0],
		entry_count);
	cudnn_util::set_tensor_descriptor(
		output_data_desc,
		output_configuration_specific,
		entry_count);

	{
		// Workspace is optional — absent buffer means zero-size workspace.
		void * workspace = 0;
		size_t workspace_size = 0;
		if (temporary_working_fixed_buffer)
		{
			workspace = *temporary_working_fixed_buffer;
			workspace_size = temporary_working_fixed_buffer->get_size();
		}

		cudnnConvolutionFwdAlgo_t algo = cuda_config->cudnn_find_convolution_forward_algo(
			input_data_desc,
			weights_desc,
			convolution_desc,
			output_data_desc,
			*input_buffers[0],
			*data[0],
			*output_buffer,
			workspace,
			workspace_size);

		float alpha = 1.0F;
		float beta = 0.0F; // overwrite output before bias add
		cudnn_safe_call(cudnnConvolutionForward(
			cuda_config->get_cudnn_handle(),
			&alpha,
			input_data_desc,
			*input_buffers[0],
			weights_desc,
			*data[0],
			convolution_desc,
			algo,
			workspace,
			workspace_size,
			&beta,
			output_data_desc,
			*output_buffer));
	}

	if (bias)
	{
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate bias into the convolution result
		cudnn_safe_call(cudnnAddTensor(
			cuda_config->get_cudnn_handle(),
			&alpha,
			bias_desc,
			*data[1],
			&beta,
			output_data_desc,
			*output_buffer));
	}
}
void sparse_1x1_layer_tester_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, unsigned int entry_count) { // Convert input data strided NCHW to packed CNHW format if (unit_stride) { cuda_util::transpose23( *cuda_config, *input_buffers[0], *temporary_working_per_entry_buffer, input_elem_count_per_feature_map_list[0], input_configuration_specific_list[0].feature_map_count, entry_count, stream_id); } else { std::vector<unsigned int> input_converted_CNHW_strides = input_converted_CNHW_strides_base; input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 2] = input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 1] * entry_count; cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( input_strided_data_desc, input_strided_config, entry_count, input_strides); cudnn_util::set_tensor_descriptor( input_converted_CNHW_data_desc, input_strided_config, entry_count, input_converted_CNHW_strides); float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, input_strided_data_desc, *input_buffers[0], &beta, input_converted_CNHW_data_desc, *temporary_working_per_entry_buffer)); } { cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cusparseMatDescr_t mat_descr; cusparse_safe_call(cusparseCreateMatDescr(&mat_descr)); 
cusparse_safe_call(cusparseScsrmm2( cuda_config->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, output_configuration_specific.feature_map_count, entry_count * output_elem_count_per_feature_map, input_strided_config.feature_map_count, feature_map_connection_count, &alpha, mat_descr, *data[0], *data_custom[1], *data_custom[0], *temporary_working_per_entry_buffer, entry_count * output_elem_count_per_feature_map, &beta, ((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count, output_configuration_specific.feature_map_count)); } // Convert output from NHWC to NCHW { cuda_util::transpose( *cuda_config, ((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count, *output_buffer, output_configuration_specific.feature_map_count, output_elem_count_per_feature_map, entry_count, stream_id); } // Add bias if (bias) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }
// Legacy updater entry point: accumulates weight and bias gradients
// (beta = 1 in both calls) using the pre-v3 cuDNN BackwardFilter signature
// (no explicit algo/workspace).
void convolution_layer_updater_cuda::enqueue_update_weights(
	unsigned int offset_input_entry_id,
	cudaStream_t stream_id,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& gradient,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
	cuda_linear_buffer_device_smart_ptr output_errors_buffer,
	const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
	std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
	unsigned int entry_count,
	bool force_deterministic)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

	// 4d NCHW descriptors; H falls back to 1 for 1-D data.
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		input_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		input_configuration_specific.feature_map_count,
		(input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
		input_configuration_specific.dimension_sizes[0]));
	cudnn_safe_call(cudnnSetTensor4dDescriptor(
		output_data_desc,
		CUDNN_TENSOR_NCHW,
		CUDNN_DATA_FLOAT,
		entry_count,
		output_configuration_specific.feature_map_count,
		(output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
		output_configuration_specific.dimension_sizes[0]));

	{
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate into the existing weight gradient
		// Input pointer is offset by whole entries to support chunked input.
		cudnn_safe_call(cudnnConvolutionBackwardFilter(
			cuda_config->get_cudnn_handle(),
			&alpha,
			input_data_desc,
			(const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id,
			output_data_desc,
			*output_errors_buffer,
			convolution_desc,
			&beta,
			weights_desc,
			*gradient[0]));
	}

	{
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate into the existing bias gradient
		cudnn_safe_call(cudnnConvolutionBackwardBias(
			cuda_config->get_cudnn_handle(),
			&alpha,
			output_data_desc,
			*output_errors_buffer,
			&beta,
			bias_desc,
			*gradient[1]));
	}
}
// 1x1 convolution implemented as a dense GEMM: transpose the input so the
// feature-map axis is innermost, multiply by the weight matrix with cuBLAS,
// transpose back, then add the bias with cuDNN.
void convolution_1x1_layer_tester_cuda::enqueue_test(
	cudaStream_t stream_id,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
	const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
	cuda_linear_buffer_device_smart_ptr input_buffer,
	const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
	unsigned int entry_count)
{
	{
		// Reorder input so each spatial position's feature-map vector is
		// contiguous (into additional_buffers[1]).
		cuda_util::transpose(
			*cuda_config,
			*input_buffer,
			*additional_buffers[1],
			input_elem_count_per_feature_map,
			input_configuration_specific.feature_map_count,
			entry_count,
			stream_id);

		cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
		float alpha = 1.0F;
		float beta = 0.0F; // overwrite the GEMM destination
		// Column-major GEMM: W^T (out_fm x in_fm) times the transposed input
		// (in_fm x entries*positions) -> out_fm x (entries*positions) into
		// additional_buffers[2].
		cublas_safe_call(cublasSgemm(
			cuda_config->get_cublas_handle(),
			CUBLAS_OP_T,
			CUBLAS_OP_N,
			output_configuration_specific.feature_map_count,
			entry_count * input_elem_count_per_feature_map,
			input_configuration_specific.feature_map_count,
			&alpha,
			*data[0],
			input_configuration_specific.feature_map_count,
			*additional_buffers[1],
			input_configuration_specific.feature_map_count,
			&beta,
			*additional_buffers[2],
			output_configuration_specific.feature_map_count));

		// Transpose back to NCHW layout into additional_buffers[0].
		cuda_util::transpose(
			*cuda_config,
			*additional_buffers[2],
			*additional_buffers[0],
			output_configuration_specific.feature_map_count,
			output_elem_count_per_feature_map,
			entry_count,
			stream_id);
	}

	// Add bias
	{
		cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
		cudnn_safe_call(cudnnSetTensor4dDescriptor(
			output_data_desc,
			CUDNN_TENSOR_NCHW,
			CUDNN_DATA_FLOAT,
			entry_count,
			output_configuration_specific.feature_map_count,
			1,
			output_elem_count_per_feature_map));
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate bias into the GEMM result
		// Old cuDNN AddTensor signature; SAME_C broadcasts per feature map.
		cudnn_safe_call(cudnnAddTensor(
			cuda_config->get_cudnn_handle(),
			CUDNN_ADD_SAME_C,
			&alpha,
			bias_desc,
			*data[1],
			&beta,
			output_data_desc,
			*additional_buffers[0]));
	}
}
// Accumulates weight (and optionally bias) gradients for the convolution.
// The backward-filter algorithm is selected by the cuda_config helper; the
// fixed working buffer is split: the first
// update_weights_find_algo_working_buffer_size bytes are scratch for the
// algorithm search, the remainder is the cuDNN workspace.
void convolution_layer_updater_cuda::enqueue_backward_weights_propagation(
	cudaStream_t stream_id,
	const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
	const std::vector<cuda_linear_buffer_device::ptr>& gradient,
	const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
	const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
	cuda_linear_buffer_device::const_ptr output_errors_buffer,
	const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
	cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
	cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
	cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
	cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
	unsigned int entry_count)
{
	cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
	cudnn_util::set_tensor_descriptor(
		input_data_desc,
		input_configuration_specific_list[0],
		entry_count);
	cudnn_util::set_tensor_descriptor(
		output_data_desc,
		output_configuration_specific,
		entry_count);

	{
		// Workspace is optional — absent buffer means zero-size workspace.
		void * workspace = 0;
		size_t workspace_size = 0;
		if (temporary_working_fixed_buffer)
		{
			workspace = *temporary_working_fixed_buffer;
			workspace_size = temporary_working_fixed_buffer->get_size();
		}

		cudnnConvolutionBwdFilterAlgo_t algo = cuda_config->cudnn_find_convolution_backward_weights_algo(
			input_data_desc,
			weights_desc,
			convolution_desc,
			output_data_desc,
			*input_neurons_buffers[0],
			*output_errors_buffer,
			(unsigned char *)workspace,
			(unsigned char *)workspace + update_weights_find_algo_working_buffer_size,
			workspace_size - update_weights_find_algo_working_buffer_size);

		float alpha = 1.0F;
		float beta = 1.0F; // accumulate into the existing weight gradient
		cudnn_safe_call(cudnnConvolutionBackwardFilter(
			cuda_config->get_cudnn_handle(),
			&alpha,
			input_data_desc,
			*input_neurons_buffers[0],
			output_data_desc,
			*output_errors_buffer,
			convolution_desc,
			algo,
			workspace,
			workspace_size,
			&beta,
			weights_desc,
			*gradient[0]));
	}

	if (bias)
	{
		float alpha = 1.0F;
		float beta = 1.0F; // accumulate into the existing bias gradient
		cudnn_safe_call(cudnnConvolutionBackwardBias(
			cuda_config->get_cudnn_handle(),
			&alpha,
			output_data_desc,
			*output_errors_buffer,
			&beta,
			bias_desc,
			*gradient[1]));
	}
}