void sparse_fully_connected_1x1_layer_tester_cuda::enqueue_forward_propagation(
			cudaStream_t stream_id,
			cuda_linear_buffer_device::ptr output_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			unsigned int entry_count)
		{
			{
				cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
				cusparseMatDescr_t mat_descr;
				cusparse_safe_call(cusparseCreateMatDescr(&mat_descr));
				cusparse_safe_call(cusparseScsrmm(
					cuda_config->get_cusparse_handle(),
					CUSPARSE_OPERATION_NON_TRANSPOSE,
					output_elem_count_per_entry,
					entry_count,
					input_elem_count_per_entry_list[0],
					feature_map_connection_count,
					&alpha,
					mat_descr,
					*data[0],
					*data_custom[1],
					*data_custom[0],
					*input_buffers[0],
					input_elem_count_per_entry_list[0],
					&beta,
					*output_buffer,
					output_elem_count_per_entry));
			}

			// Add bias
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_buffer));
			}
		}
		void fully_connected_layer_updater_cuda::enqueue_forward_propagation(
			cudaStream_t stream_id,
			cuda_linear_buffer_device::ptr output_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_per_entry_buffer,
			unsigned int entry_count)
		{
			{
				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_T,
					CUBLAS_OP_N,
					output_elem_count_per_entry,
					entry_count,
					input_elem_count_per_entry_list[0],
					&alpha,
					*data[0],
					input_elem_count_per_entry_list[0],
					*input_buffers[0],
					input_elem_count_per_entry_list[0],
					&beta,
					*output_buffer,
					output_elem_count_per_entry));
			}

			if (bias)
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_buffer));
			}
		}
		void convolution_layer_updater_cuda::enqueue_test(
			unsigned int offset_input_entry_id,
			cudaStream_t stream_id,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device_smart_ptr>& data,
			const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
			cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
			const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
			std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
			unsigned int entry_count,
			bool force_deterministic)
		{
			cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

			cudnn_safe_call(cudnnSetTensor4dDescriptor(
				input_data_desc,
				CUDNN_TENSOR_NCHW,
				CUDNN_DATA_FLOAT,
				entry_count,
				input_configuration_specific.feature_map_count,
				(input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
				input_configuration_specific.dimension_sizes[0]));
			cudnn_safe_call(cudnnSetTensor4dDescriptor(
				output_data_desc,
				CUDNN_TENSOR_NCHW,
				CUDNN_DATA_FLOAT,
				entry_count,
				output_configuration_specific.feature_map_count,
				(output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
				output_configuration_specific.dimension_sizes[0]));

			{
				cudnnConvolutionFwdAlgo_t algo;
				cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm(
					cuda_config->get_cudnn_handle(),
					input_data_desc,
					weights_desc,
					convolution_desc,
					output_data_desc,
					CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
					additional_buffers[0]->get_size(),
					&algo));

				float alpha = 1.0F;
				float beta = 0.0F;
				cudnn_safe_call(cudnnConvolutionForward(
					cuda_config->get_cudnn_handle(),
					&alpha,
					input_data_desc,
					(const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id,
					weights_desc,
					*data[0],
					convolution_desc,
					algo,
					*additional_buffers[0],
					additional_buffers[0]->get_size(),
					&beta,
					output_data_desc,
					*output_neurons_buffer));
			}

			{
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					CUDNN_ADD_SAME_C,
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_neurons_buffer));
			}
		}
Esempio n. 4
0
SaberStatus VenderConv2DActPooling<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>::\
    dispatch(const std::vector<DataTensor_in*>& inputs,
                std::vector<DataTensor_out*>& outputs,
                ConvActivePoolingParam<OpTensor>& param) {

    const InDataType *in_data = (const InDataType*)inputs[0]->data();
    InDataType *inner_data = (InDataType*)_inner_tensor.mutable_data();
    InDataType *out_data = (InDataType*)outputs[0]->mutable_data();

    const float *weight_data = (const float *) param.conv_param.weight()->data();
    if (param.has_activation == false) {
        CUDNN_CHECK(cudnnConvolutionForward(_handle,
                                            cudnn::cudnnTypeWrapper<float>::kOne(),
                                            _input_descs, in_data,
                                            _filter_desc, weight_data,
                                            _conv_descs,  _fwd_algo, _workspace, _workspace_fwd_sizes,
                                            cudnn::cudnnTypeWrapper<float>::kZero(),
                                            _inner_descs, inner_data
        ));
        if (param.conv_param.bias()->size() > 0) {
            // add up bias.
            const float * bias_data = (const float*)param.conv_param.bias()->data();
            CUDNN_CHECK(cudnnAddTensor(_handle,
                                       cudnn::cudnnTypeWrapper<float>::kOne(),
                                       _bias_desc, bias_data,
                                       cudnn::cudnnTypeWrapper<float>::kOne(),
                                       _inner_descs, inner_data));
        }
        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
                                        _inner_descs, inner_data,
                                        cudnn::cudnnTypeWrapper<InDataType>::kZero(),
                                        _output_descs, out_data
        ));
        return SaberSuccess;
    }

    if (param.conv_param.bias()->size() > 0) {
        const float * bias_data = (const float*)param.conv_param.bias()->data();
        CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle,
                                                          cudnn::cudnnTypeWrapper<float>::kOne(),
                                                          _input_descs, in_data,
                                                          _filter_desc, weight_data,
                                                          _conv_descs, _fwd_algo,
                                                          _workspace, _workspace_fwd_sizes,
                                                          cudnn::cudnnTypeWrapper<float>::kZero(),
                                                          _inner_descs, inner_data,
                                                          _bias_desc,  bias_data,
                                                          _active_descs, _inner_descs, inner_data));

        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
                                        _inner_descs, inner_data,
                                        cudnn::cudnnTypeWrapper<InDataType>::kZero(),
                                        _output_descs, out_data
        ));

    } else {

        CUDNN_CHECK(cudnnConvolutionForward(_handle,
                                            cudnn::cudnnTypeWrapper<float>::kOne(),
                                            _input_descs, in_data,
                                            _filter_desc, weight_data,
                                            _conv_descs, _fwd_algo,
                                            _workspace, _workspace_fwd_sizes,
                                            cudnn::cudnnTypeWrapper<float>::kZero(),
                                            _inner_descs, inner_data
        ));

        CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs,
                                           cudnn::cudnnTypeWrapper<InDataType>::kOne(),
                                           _inner_descs, inner_data,
                                           cudnn::cudnnTypeWrapper<InDataType>::kZero(),
                                           _inner_descs, inner_data
        ));
        CUDNN_CHECK(cudnnPoolingForward(_handle, _pooling_descs,
                                        cudnn::cudnnTypeWrapper<InDataType>::kOne(),
                                        _inner_descs, inner_data,
                                        cudnn::cudnnTypeWrapper<InDataType>::kZero(),
                                        _output_descs, out_data
        ));
    }
    return SaberSuccess;
}
		void convolution_1x1_layer_tester_cuda::enqueue_test(
			cudaStream_t stream_id,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
			cuda_linear_buffer_device_smart_ptr input_buffer,
			const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
			unsigned int entry_count)
		{
			{
				cuda_util::transpose(
					*cuda_config,
					*input_buffer,
					*additional_buffers[1],
					input_elem_count_per_feature_map,
					input_configuration_specific.feature_map_count,
					entry_count,
					stream_id);

				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_T,
					CUBLAS_OP_N,
					output_configuration_specific.feature_map_count,
					entry_count * input_elem_count_per_feature_map,
					input_configuration_specific.feature_map_count,
					&alpha,
					*data[0],
					input_configuration_specific.feature_map_count,
					*additional_buffers[1],
					input_configuration_specific.feature_map_count,
					&beta,
					*additional_buffers[2],
					output_configuration_specific.feature_map_count));

				cuda_util::transpose(
					*cuda_config,
					*additional_buffers[2],
					*additional_buffers[0],
					output_configuration_specific.feature_map_count,
					output_elem_count_per_feature_map,
					entry_count,
					stream_id);
			}

			// Add bias
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_safe_call(cudnnSetTensor4dDescriptor(
					output_data_desc,
					CUDNN_TENSOR_NCHW,
					CUDNN_DATA_FLOAT,
					entry_count,
					output_configuration_specific.feature_map_count,
					1,
					output_elem_count_per_feature_map));

				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					CUDNN_ADD_SAME_C,
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*additional_buffers[0]));
			}
		}
		void sparse_1x1_layer_tester_cuda::enqueue_forward_propagation(
			cudaStream_t stream_id,
			cuda_linear_buffer_device::ptr output_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			unsigned int entry_count)
		{
			// Convert input data strided NCHW to packed CNHW format
			if (unit_stride)
			{
				cuda_util::transpose23(
					*cuda_config,
					*input_buffers[0],
					*temporary_working_per_entry_buffer,
					input_elem_count_per_feature_map_list[0],
					input_configuration_specific_list[0].feature_map_count,
					entry_count,
					stream_id);
			}
			else
			{
				std::vector<unsigned int> input_converted_CNHW_strides = input_converted_CNHW_strides_base;
				input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 2] = input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 1] * entry_count;
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					input_strided_data_desc,
					input_strided_config,
					entry_count,
					input_strides);
				cudnn_util::set_tensor_descriptor(
					input_converted_CNHW_data_desc,
					input_strided_config,
					entry_count,
					input_converted_CNHW_strides);
				float alpha = 1.0F;
				float beta = 0.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					input_strided_data_desc,
					*input_buffers[0],
					&beta,
					input_converted_CNHW_data_desc,
					*temporary_working_per_entry_buffer));
			}

			{
				cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
				cusparseMatDescr_t mat_descr;
				cusparse_safe_call(cusparseCreateMatDescr(&mat_descr));
				cusparse_safe_call(cusparseScsrmm2(
					cuda_config->get_cusparse_handle(),
					CUSPARSE_OPERATION_NON_TRANSPOSE,
					CUSPARSE_OPERATION_TRANSPOSE,
					output_configuration_specific.feature_map_count,
					entry_count * output_elem_count_per_feature_map,
					input_strided_config.feature_map_count,
					feature_map_connection_count,
					&alpha,
					mat_descr,
					*data[0],
					*data_custom[1],
					*data_custom[0],
					*temporary_working_per_entry_buffer,
					entry_count * output_elem_count_per_feature_map,
					&beta,
					((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count,
					output_configuration_specific.feature_map_count));
			}

			// Convert output from NHWC to NCHW
			{
				cuda_util::transpose(
					*cuda_config,
					((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count,
					*output_buffer,
					output_configuration_specific.feature_map_count,
					output_elem_count_per_feature_map,
					entry_count,
					stream_id);
			}

			// Add bias
			if (bias)
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_buffer));
			}
		}
		void convolution_layer_tester_cuda::enqueue_forward_propagation(
			cudaStream_t stream_id,
			cuda_linear_buffer_device::ptr output_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			unsigned int entry_count)
		{
			cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

			cudnn_util::set_tensor_descriptor(
				input_data_desc,
				input_configuration_specific_list[0],
				entry_count);
			cudnn_util::set_tensor_descriptor(
				output_data_desc,
				output_configuration_specific,
				entry_count);

			{
				void * workspace = 0;
				size_t workspace_size = 0;
				if (temporary_working_fixed_buffer)
				{
					workspace = *temporary_working_fixed_buffer;
					workspace_size = temporary_working_fixed_buffer->get_size();
				}

				cudnnConvolutionFwdAlgo_t algo = cuda_config->cudnn_find_convolution_forward_algo(
					input_data_desc,
					weights_desc,
					convolution_desc,
					output_data_desc,
					*input_buffers[0],
					*data[0],
					*output_buffer,
					workspace,
					workspace_size);

				float alpha = 1.0F;
				float beta = 0.0F;
				cudnn_safe_call(cudnnConvolutionForward(
					cuda_config->get_cudnn_handle(),
					&alpha,
					input_data_desc,
					*input_buffers[0],
					weights_desc,
					*data[0],
					convolution_desc,
					algo,
					workspace,
					workspace_size,
					&beta,
					output_data_desc,
					*output_buffer));
			}

			if (bias)
			{
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_buffer));
			}
		}