void sparse_fully_connected_1x1_layer_tester_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, unsigned int entry_count) { { cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cusparseMatDescr_t mat_descr; cusparse_safe_call(cusparseCreateMatDescr(&mat_descr)); cusparse_safe_call(cusparseScsrmm( cuda_config->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, output_elem_count_per_entry, entry_count, input_elem_count_per_entry_list[0], feature_map_connection_count, &alpha, mat_descr, *data[0], *data_custom[1], *data_custom[0], *input_buffers[0], input_elem_count_per_entry_list[0], &beta, *output_buffer, output_elem_count_per_entry)); } // Add bias { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }
void fully_connected_layer_updater_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, cuda_linear_buffer_device::ptr temporary_fixed_buffer, cuda_linear_buffer_device::ptr temporary_per_entry_buffer, unsigned int entry_count) { { cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cublas_safe_call(cublasSgemm( cuda_config->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, output_elem_count_per_entry, entry_count, input_elem_count_per_entry_list[0], &alpha, *data[0], input_elem_count_per_entry_list[0], *input_buffers[0], input_elem_count_per_entry_list[0], &beta, *output_buffer, output_elem_count_per_entry)); } if (bias) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }
// Test-time forward pass for the convolution layer:
// sets up NCHW tensor descriptors sized for this batch, picks a cuDNN forward
// algorithm that fits the preallocated workspace, runs the convolution, then
// adds the bias tensor in place.
// NOTE(review): uses the legacy 8-argument cudnnAddTensor(handle, mode, ...)
// form with CUDNN_ADD_SAME_C — this is the cuDNN v1/v2 API; confirm the
// linked cuDNN version before touching these calls.
void convolution_layer_updater_cuda::enqueue_test(
    unsigned int offset_input_entry_id,
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
    const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
    cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
    unsigned int entry_count,
    bool force_deterministic)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    // Input descriptor: N = entry_count, C = feature maps; for 1-D layers
    // (a single spatial dimension) height collapses to 1.
    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        input_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        input_configuration_specific.feature_map_count,
        (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
        input_configuration_specific.dimension_sizes[0]));

    // Output descriptor, same layout convention as the input.
    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        output_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        output_configuration_specific.feature_map_count,
        (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
        output_configuration_specific.dimension_sizes[0]));

    {
        // Let cuDNN choose the fastest algorithm whose workspace fits into
        // additional_buffers[0]; the same buffer is then passed as workspace.
        cudnnConvolutionFwdAlgo_t algo;
        cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm(
            cuda_config->get_cudnn_handle(),
            input_data_desc,
            weights_desc,
            convolution_desc,
            output_data_desc,
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            additional_buffers[0]->get_size(),
            &algo));

        float alpha = 1.0F;
        float beta = 0.0F;
        // Input pointer is offset by whole entries so a sub-range of the
        // batch starting at offset_input_entry_id can be processed.
        cudnn_safe_call(cudnnConvolutionForward(
            cuda_config->get_cudnn_handle(),
            &alpha,
            input_data_desc,
            (const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id,
            weights_desc,
            *data[0],
            convolution_desc,
            algo,
            *additional_buffers[0],
            additional_buffers[0]->get_size(),
            &beta,
            output_data_desc,
            *output_neurons_buffer));
    }

    {
        // Bias (data[1]) broadcast per channel; beta == 1 accumulates into
        // the convolution result.
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            CUDNN_ADD_SAME_C,
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_neurons_buffer));
    }
}
// Fused conv (+ optional bias, optional activation) + pooling dispatch.
// The convolution/activation result is staged in _inner_tensor; every path
// finishes with the same pooling step into the output tensor.
SaberStatus VenderConv2DActPooling<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>::dispatch(
        const std::vector<DataTensor_in*>& inputs,
        std::vector<DataTensor_out*>& outputs,
        ConvActivePoolingParam<OpTensor>& param) {

    const InDataType* src = (const InDataType*)inputs[0]->data();
    InDataType* staging = (InDataType*)_inner_tensor.mutable_data();
    InDataType* dst = (InDataType*)outputs[0]->mutable_data();
    const float* weights = (const float*)param.conv_param.weight()->data();
    const bool with_bias = param.conv_param.bias()->size() > 0;

    if (!param.has_activation) {
        // No activation: plain convolution into the staging tensor,
        // followed by a separate bias add when a bias is present.
        CUDNN_CHECK(cudnnConvolutionForward(_handle,
                cudnn::cudnnTypeWrapper<float>::kOne(),
                _input_descs, src,
                _filter_desc, weights,
                _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes,
                cudnn::cudnnTypeWrapper<float>::kZero(),
                _inner_descs, staging));
        if (with_bias) {
            const float* bias = (const float*)param.conv_param.bias()->data();
            CUDNN_CHECK(cudnnAddTensor(_handle,
                    cudnn::cudnnTypeWrapper<float>::kOne(),
                    _bias_desc, bias,
                    cudnn::cudnnTypeWrapper<float>::kOne(),
                    _inner_descs, staging));
        }
    } else if (with_bias) {
        // Bias present: single fused convolution + bias + activation call.
        const float* bias = (const float*)param.conv_param.bias()->data();
        CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle,
                cudnn::cudnnTypeWrapper<float>::kOne(),
                _input_descs, src,
                _filter_desc, weights,
                _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes,
                cudnn::cudnnTypeWrapper<float>::kZero(),
                _inner_descs, staging,
                _bias_desc, bias,
                _active_descs,
                _inner_descs, staging));
    } else {
        // No bias: convolution, then an in-place activation on the staging tensor.
        CUDNN_CHECK(cudnnConvolutionForward(_handle,
                cudnn::cudnnTypeWrapper<float>::kOne(),
                _input_descs, src,
                _filter_desc, weights,
                _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes,
                cudnn::cudnnTypeWrapper<float>::kZero(),
                _inner_descs, staging));
        CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs,
                cudnn::cudnnTypeWrapper<InDataType>::kOne(),
                _inner_descs, staging,
                cudnn::cudnnTypeWrapper<InDataType>::kZero(),
                _inner_descs, staging));
    }

    // Shared tail: pool the staged result into the output tensor.
    CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
            cudnn::cudnnTypeWrapper<InDataType>::kOne(),
            _inner_descs, staging,
            cudnn::cudnnTypeWrapper<InDataType>::kZero(),
            _output_descs, dst));
    return SaberSuccess;
}
// Test-time forward pass for a 1x1 convolution, implemented as a GEMM:
// transpose input NCHW -> NHWC-like layout, multiply by the weight matrix,
// transpose back, then add bias. Buffers: additional_buffers[1] holds the
// transposed input, additional_buffers[2] the GEMM output,
// additional_buffers[0] the final (re-transposed) result.
// NOTE(review): uses the legacy 8-argument cudnnAddTensor(handle, mode, ...)
// form with CUDNN_ADD_SAME_C (cuDNN v1/v2 API) — confirm the linked cuDNN
// version before modernizing.
void convolution_1x1_layer_tester_cuda::enqueue_test(
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
    cuda_linear_buffer_device_smart_ptr input_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    unsigned int entry_count)
{
    {
        // Bring feature maps into the innermost dimension so the 1x1
        // convolution becomes a plain matrix product over feature maps.
        cuda_util::transpose(
            *cuda_config,
            *input_buffer,
            *additional_buffers[1],
            input_elem_count_per_feature_map,
            input_configuration_specific.feature_map_count,
            entry_count,
            stream_id);

        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        // out_fm x (entries * spatial) = weights^T x in_fm-major input.
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_T,
            CUBLAS_OP_N,
            output_configuration_specific.feature_map_count,
            entry_count * input_elem_count_per_feature_map,
            input_configuration_specific.feature_map_count,
            &alpha,
            *data[0],
            input_configuration_specific.feature_map_count,
            *additional_buffers[1],
            input_configuration_specific.feature_map_count,
            &beta,
            *additional_buffers[2],
            output_configuration_specific.feature_map_count));

        // Transpose the GEMM result back to the layer's NCHW layout.
        cuda_util::transpose(
            *cuda_config,
            *additional_buffers[2],
            *additional_buffers[0],
            output_configuration_specific.feature_map_count,
            output_elem_count_per_feature_map,
            entry_count,
            stream_id);
    }

    // Add bias
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        // 1x1 layer: spatial extent is flattened into the width dimension (H = 1).
        cudnn_safe_call(cudnnSetTensor4dDescriptor(
            output_data_desc,
            CUDNN_TENSOR_NCHW,
            CUDNN_DATA_FLOAT,
            entry_count,
            output_configuration_specific.feature_map_count,
            1,
            output_elem_count_per_feature_map));
        float alpha = 1.0F;
        float beta = 1.0F;
        // Bias accumulates (beta == 1) into the transposed result buffer.
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            CUDNN_ADD_SAME_C,
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *additional_buffers[0]));
    }
}
void sparse_1x1_layer_tester_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, unsigned int entry_count) { // Convert input data strided NCHW to packed CNHW format if (unit_stride) { cuda_util::transpose23( *cuda_config, *input_buffers[0], *temporary_working_per_entry_buffer, input_elem_count_per_feature_map_list[0], input_configuration_specific_list[0].feature_map_count, entry_count, stream_id); } else { std::vector<unsigned int> input_converted_CNHW_strides = input_converted_CNHW_strides_base; input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 2] = input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 1] * entry_count; cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( input_strided_data_desc, input_strided_config, entry_count, input_strides); cudnn_util::set_tensor_descriptor( input_converted_CNHW_data_desc, input_strided_config, entry_count, input_converted_CNHW_strides); float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, input_strided_data_desc, *input_buffers[0], &beta, input_converted_CNHW_data_desc, *temporary_working_per_entry_buffer)); } { cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cusparseMatDescr_t mat_descr; cusparse_safe_call(cusparseCreateMatDescr(&mat_descr)); 
cusparse_safe_call(cusparseScsrmm2( cuda_config->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, output_configuration_specific.feature_map_count, entry_count * output_elem_count_per_feature_map, input_strided_config.feature_map_count, feature_map_connection_count, &alpha, mat_descr, *data[0], *data_custom[1], *data_custom[0], *temporary_working_per_entry_buffer, entry_count * output_elem_count_per_feature_map, &beta, ((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count, output_configuration_specific.feature_map_count)); } // Convert output from NHWC to NCHW { cuda_util::transpose( *cuda_config, ((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count, *output_buffer, output_configuration_specific.feature_map_count, output_elem_count_per_feature_map, entry_count, stream_id); } // Add bias if (bias) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }
void convolution_layer_tester_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, unsigned int entry_count) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( input_data_desc, input_configuration_specific_list[0], entry_count); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); { void * workspace = 0; size_t workspace_size = 0; if (temporary_working_fixed_buffer) { workspace = *temporary_working_fixed_buffer; workspace_size = temporary_working_fixed_buffer->get_size(); } cudnnConvolutionFwdAlgo_t algo = cuda_config->cudnn_find_convolution_forward_algo( input_data_desc, weights_desc, convolution_desc, output_data_desc, *input_buffers[0], *data[0], *output_buffer, workspace, workspace_size); float alpha = 1.0F; float beta = 0.0F; cudnn_safe_call(cudnnConvolutionForward( cuda_config->get_cudnn_handle(), &alpha, input_data_desc, *input_buffers[0], weights_desc, *data[0], convolution_desc, algo, workspace, workspace_size, &beta, output_data_desc, *output_buffer)); } if (bias) { float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }