void softmax_layer_tester_cuda::enqueue_test(
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
    cuda_linear_buffer_device_smart_ptr input_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    unsigned int entry_count)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        input_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        input_configuration_specific.feature_map_count,
        (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
        input_configuration_specific.dimension_sizes[0]));

    float alpha = 1.0F;
    float beta = 0.0F;
    cudnn_safe_call(cudnnSoftmaxForward(
        cuda_config->get_cudnn_handle(),
        CUDNN_SOFTMAX_ACCURATE,
        CUDNN_SOFTMAX_MODE_CHANNEL,
        &alpha,
        input_data_desc,
        *input_buffer,
        &beta,
        input_data_desc,
        *additional_buffers[0]));
}
virtual void Forward(bool is_train,
                     const std::vector<Node<gpu>*> &nodes_in,
                     const std::vector<Node<gpu>*> &nodes_out,
                     ConnectState<gpu> *p_cstate) {
  mshadow::Tensor<gpu, 4> &tmp = p_cstate->states[0];
  if (!init_cudnn_) {
    init_cudnn_ = true;
    CUDA_CHECK(cudnnSetStream(handle_, nodes_out[0]->data.stream_->stream_));
    mshadow::Tensor<gpu, 4, float> &in = nodes_in[0]->data;
    mshadow::Tensor<gpu, 4, float> &out = nodes_out[0]->data;
    CUDA_CHECK(cudnnSetTensor4dDescriptor(in_desc_, CUDNN_TENSOR_NCHW, dtype_,
                                          in.shape_[0], in.shape_[1],
                                          in.shape_[2], in.shape_[3]));
    CUDA_CHECK(cudnnSetTensor4dDescriptor(out_desc_, CUDNN_TENSOR_NCHW, dtype_,
                                          out.shape_[0], out.shape_[1],
                                          out.shape_[2], out.shape_[3]));
  }
  float alpha = 1.0f;
  float beta = 0.0f;
  utils::Assert(nodes_in[0]->data.CheckContiguous(), "contiguous in pool");
  utils::Assert(nodes_out[0]->data.CheckContiguous(), "contiguous in pool");
  utils::Assert(tmp.CheckContiguous(), "contiguous in pool");
  CUDA_CHECK(cudnnPoolingForward(handle_, pooling_desc_, &alpha, in_desc_,
                                 nodes_in[0]->data.dptr_, &beta, out_desc_,
                                 tmp.dptr_));
  mshadow::Copy(nodes_out[0]->data, tmp, nodes_out[0]->data.stream_);
}
void softmax_layer_tester_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    unsigned int entry_count)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_util::set_tensor_descriptor(
        input_data_desc,
        output_configuration_specific,
        entry_count);

    float alpha = 1.0F;
    float beta = 0.0F;
    cudnn_safe_call(cudnnSoftmaxForward(
        cuda_config->get_cudnn_handle(),
        CUDNN_SOFTMAX_ACCURATE,
        CUDNN_SOFTMAX_MODE_CHANNEL,
        &alpha,
        input_data_desc,
        *input_buffers[0],
        &beta,
        input_data_desc,
        *output_buffer));
}
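// For reference, a minimal self-contained sketch of the cudnnSoftmaxForward
// pattern the two softmax snippets above wrap, with explicit descriptor
// lifetime management. The handle, the device pointers d_input/d_output, and
// the dimensions n/c/h/w are placeholders assumed for illustration, not taken
// from the original sources.
void softmax_forward_sketch(
    cudnnHandle_t handle,
    const float * d_input,
    float * d_output,
    int n, int c, int h, int w)
{
    cudnnTensorDescriptor_t desc;
    cudnn_safe_call(cudnnCreateTensorDescriptor(&desc));
    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));

    float alpha = 1.0F;
    float beta = 0.0F;
    // CUDNN_SOFTMAX_MODE_CHANNEL normalizes across feature maps,
    // independently at each spatial position.
    cudnn_safe_call(cudnnSoftmaxForward(
        handle,
        CUDNN_SOFTMAX_ACCURATE,
        CUDNN_SOFTMAX_MODE_CHANNEL,
        &alpha,
        desc, d_input,
        &beta,
        desc, d_output));

    cudnn_safe_call(cudnnDestroyTensorDescriptor(desc));
}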
void sparse_fully_connected_1x1_layer_tester_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    unsigned int entry_count)
{
    {
        cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        cusparseMatDescr_t mat_descr;
        cusparse_safe_call(cusparseCreateMatDescr(&mat_descr));
        cusparse_safe_call(cusparseScsrmm(
            cuda_config->get_cusparse_handle(),
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            output_elem_count_per_entry,
            entry_count,
            input_elem_count_per_entry_list[0],
            feature_map_connection_count,
            &alpha,
            mat_descr,
            *data[0],
            *data_custom[1],
            *data_custom[0],
            *input_buffers[0],
            input_elem_count_per_entry_list[0],
            &beta,
            *output_buffer,
            output_elem_count_per_entry));
    }

    // Add bias
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_buffer));
    }
}
void fully_connected_layer_updater_cuda::enqueue_backward_weights_propagation(
    cudaStream_t stream_id,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::ptr>& gradient,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    unsigned int entry_count)
{
    // Update weights
    {
        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 1.0F;
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_N,
            CUBLAS_OP_T,
            input_elem_count_per_entry_list[0],
            output_elem_count_per_entry,
            entry_count,
            &alpha,
            *input_neurons_buffers[0],
            input_elem_count_per_entry_list[0],
            *output_errors_buffer,
            output_elem_count_per_entry,
            &beta,
            *gradient[0],
            input_elem_count_per_entry_list[0]));
    }

    // Update biases
    if (bias)
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnConvolutionBackwardBias(
            cuda_config->get_cudnn_handle(),
            &alpha,
            output_data_desc,
            *output_errors_buffer,
            &beta,
            bias_desc,
            *gradient[1]));
    }
}
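// Note on the alpha/beta arguments that recur throughout these snippets:
// cuDNN (and cuBLAS) blend the result into the destination as
//     dst = alpha * op(src) + beta * dst
// so beta = 0.0F overwrites the destination while beta = 1.0F accumulates
// into it. That is why the gradient calls above pass beta = 1.0F (new
// gradients are added to the existing gradient buffer), while plain forward
// passes use beta = 0.0F.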
GpuDevice::Impl::Impl(int d) : device(d) {
    ActivateDevice();
    for (size_t i = 0; i < kParallelism; ++i) {
        CUDA_CALL(cudaStreamCreate(&stream[i]));
        CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
        CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
        CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
        CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
    }
}
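// The constructor above creates one stream plus one cuBLAS and one cuDNN
// handle per parallel lane; the matching teardown is not shown in the
// original. A minimal destructor sketch, assuming the same member names:
GpuDevice::Impl::~Impl() {
    ActivateDevice();
    for (size_t i = 0; i < kParallelism; ++i) {
        // Release in reverse order of creation: handles first, then the
        // stream they were bound to.
        CUDNN_CALL(cudnnDestroy(cudnn_handle[i]));
        CUBLAS_CALL(cublasDestroy(cublas_handle[i]));
        CUDA_CALL(cudaStreamDestroy(stream[i]));
    }
}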
void fully_connected_layer_updater_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_per_entry_buffer,
    unsigned int entry_count)
{
    {
        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_T,
            CUBLAS_OP_N,
            output_elem_count_per_entry,
            entry_count,
            input_elem_count_per_entry_list[0],
            &alpha,
            *data[0],
            input_elem_count_per_entry_list[0],
            *input_buffers[0],
            input_elem_count_per_entry_list[0],
            &beta,
            *output_buffer,
            output_elem_count_per_entry));
    }

    if (bias)
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_buffer));
    }
}
void convolution_layer_updater_cuda::enqueue_backprop(
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
    const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
    const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
    cuda_linear_buffer_device_smart_ptr output_errors_buffer,
    cuda_linear_buffer_device_smart_ptr input_errors_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
    unsigned int entry_count,
    bool force_deterministic)
{
    if (!backprop_required)
        throw neural_network_exception("convolution_layer_updater_cuda is not configured to do backprop but requested to");

    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        input_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        input_configuration_specific.feature_map_count,
        (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
        input_configuration_specific.dimension_sizes[0]));
    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        output_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        output_configuration_specific.feature_map_count,
        (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
        output_configuration_specific.dimension_sizes[0]));

    {
        float alpha = 1.0F;
        float beta = 0.0F;
        cudnn_safe_call(cudnnConvolutionBackwardData(
            cuda_config->get_cudnn_handle(),
            &alpha,
            weights_desc,
            *data[0],
            output_data_desc,
            *output_errors_buffer,
            convolution_desc,
            &beta,
            input_data_desc,
            *input_errors_buffer));
    }
}
void cudnn_affine_grid_generator_backward(
    THCState* state, cudnnHandle_t handle, cudnnDataType_t dataType,
    THVoidTensor* grad_theta, THVoidTensor* grad_grid,
    int N, int C, int H, int W)
{
    CHECK(cudnnSetStream(handle, THCState_getCurrentStream(state)));
    assertSameGPU(dataType, grad_theta, grad_grid);
    checkIOSize(grad_theta, grad_grid, N, H, W);
    SpatialTransformerDescriptor desc;
    setSamplerDescriptor(desc, dataType, N, C, H, W);
    CHECK(cudnnSpatialTfGridGeneratorBackward(handle, desc.desc,
                                              tensorPointer(dataType, grad_grid),
                                              tensorPointer(dataType, grad_theta)));
}
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace = NULL;

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
void activation_layer_cudnn_updater_cuda::enqueue_backward_data_propagation(
    cudaStream_t stream_id,
    unsigned int input_index,
    cuda_linear_buffer_device::ptr input_errors_buffer,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_neurons_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    bool add_update_to_destination,
    unsigned int entry_count)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_util::set_tensor_descriptor(
        input_data_desc,
        output_configuration_specific,
        entry_count);

    float alpha = 1.0F;
    float beta = add_update_to_destination ? 1.0F : 0.0F;
    cudnn_safe_call(cudnnActivationBackward_v4(
        cuda_config->get_cudnn_handle(),
        activation_desc,
        &alpha,
        input_data_desc,
        *output_neurons_buffer,
        input_data_desc,
        *output_errors_buffer,
        input_data_desc,
        *input_neurons_buffers[0],
        &beta,
        input_data_desc,
        *input_errors_buffer));
}
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id)
    : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize the CUDA context on this device
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
void convolution_layer_updater_cuda::enqueue_test(
    unsigned int offset_input_entry_id,
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
    const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
    cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
    unsigned int entry_count,
    bool force_deterministic)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        input_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        input_configuration_specific.feature_map_count,
        (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
        input_configuration_specific.dimension_sizes[0]));
    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        output_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        output_configuration_specific.feature_map_count,
        (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
        output_configuration_specific.dimension_sizes[0]));

    {
        cudnnConvolutionFwdAlgo_t algo;
        cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm(
            cuda_config->get_cudnn_handle(),
            input_data_desc,
            weights_desc,
            convolution_desc,
            output_data_desc,
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            additional_buffers[0]->get_size(),
            &algo));

        float alpha = 1.0F;
        float beta = 0.0F;
        cudnn_safe_call(cudnnConvolutionForward(
            cuda_config->get_cudnn_handle(),
            &alpha,
            input_data_desc,
            (const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id,
            weights_desc,
            *data[0],
            convolution_desc,
            algo,
            *additional_buffers[0],
            additional_buffers[0]->get_size(),
            &beta,
            output_data_desc,
            *output_neurons_buffer));
    }

    {
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            CUDNN_ADD_SAME_C,
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_neurons_buffer));
    }
}
void convolution_layer_updater_cuda::enqueue_update_weights(
    unsigned int offset_input_entry_id,
    cudaStream_t stream_id,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& gradient,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& data_custom,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    cuda_linear_buffer_device_smart_ptr output_errors_buffer,
    const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    std::vector<cuda_memobject_smart_ptr>& dynamic_memobjects,
    unsigned int entry_count,
    bool force_deterministic)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        input_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        input_configuration_specific.feature_map_count,
        (input_configuration_specific.dimension_sizes.size() > 1) ? input_configuration_specific.dimension_sizes[1] : 1,
        input_configuration_specific.dimension_sizes[0]));
    cudnn_safe_call(cudnnSetTensor4dDescriptor(
        output_data_desc,
        CUDNN_TENSOR_NCHW,
        CUDNN_DATA_FLOAT,
        entry_count,
        output_configuration_specific.feature_map_count,
        (output_configuration_specific.dimension_sizes.size() > 1) ? output_configuration_specific.dimension_sizes[1] : 1,
        output_configuration_specific.dimension_sizes[0]));

    {
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnConvolutionBackwardFilter(
            cuda_config->get_cudnn_handle(),
            &alpha,
            input_data_desc,
            (const float *)(*input_neurons_buffer) + input_elem_count_per_entry * offset_input_entry_id,
            output_data_desc,
            *output_errors_buffer,
            convolution_desc,
            &beta,
            weights_desc,
            *gradient[0]));
    }

    {
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnConvolutionBackwardBias(
            cuda_config->get_cudnn_handle(),
            &alpha,
            output_data_desc,
            *output_errors_buffer,
            &beta,
            bias_desc,
            *gradient[1]));
    }
}
void convolution_layer_updater_cuda::enqueue_backward_weights_propagation(
    cudaStream_t stream_id,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::ptr>& gradient,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    unsigned int entry_count)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_util::set_tensor_descriptor(
        input_data_desc,
        input_configuration_specific_list[0],
        entry_count);
    cudnn_util::set_tensor_descriptor(
        output_data_desc,
        output_configuration_specific,
        entry_count);

    {
        void * workspace = 0;
        size_t workspace_size = 0;
        if (temporary_working_fixed_buffer)
        {
            workspace = *temporary_working_fixed_buffer;
            workspace_size = temporary_working_fixed_buffer->get_size();
        }

        cudnnConvolutionBwdFilterAlgo_t algo = cuda_config->cudnn_find_convolution_backward_weights_algo(
            input_data_desc,
            weights_desc,
            convolution_desc,
            output_data_desc,
            *input_neurons_buffers[0],
            *output_errors_buffer,
            (unsigned char *)workspace,
            (unsigned char *)workspace + update_weights_find_algo_working_buffer_size,
            workspace_size - update_weights_find_algo_working_buffer_size);

        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnConvolutionBackwardFilter(
            cuda_config->get_cudnn_handle(),
            &alpha,
            input_data_desc,
            *input_neurons_buffers[0],
            output_data_desc,
            *output_errors_buffer,
            convolution_desc,
            algo,
            workspace,
            workspace_size,
            &beta,
            weights_desc,
            *gradient[0]));
    }

    if (bias)
    {
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnConvolutionBackwardBias(
            cuda_config->get_cudnn_handle(),
            &alpha,
            output_data_desc,
            *output_errors_buffer,
            &beta,
            bias_desc,
            *gradient[1]));
    }
}
void convolution_layer_tester_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    unsigned int entry_count)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_util::set_tensor_descriptor(
        input_data_desc,
        input_configuration_specific_list[0],
        entry_count);
    cudnn_util::set_tensor_descriptor(
        output_data_desc,
        output_configuration_specific,
        entry_count);

    {
        void * workspace = 0;
        size_t workspace_size = 0;
        if (temporary_working_fixed_buffer)
        {
            workspace = *temporary_working_fixed_buffer;
            workspace_size = temporary_working_fixed_buffer->get_size();
        }

        cudnnConvolutionFwdAlgo_t algo = cuda_config->cudnn_find_convolution_forward_algo(
            input_data_desc,
            weights_desc,
            convolution_desc,
            output_data_desc,
            *input_buffers[0],
            *data[0],
            *output_buffer,
            workspace,
            workspace_size);

        float alpha = 1.0F;
        float beta = 0.0F;
        cudnn_safe_call(cudnnConvolutionForward(
            cuda_config->get_cudnn_handle(),
            &alpha,
            input_data_desc,
            *input_buffers[0],
            weights_desc,
            *data[0],
            convolution_desc,
            algo,
            workspace,
            workspace_size,
            &beta,
            output_data_desc,
            *output_buffer));
    }

    if (bias)
    {
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_buffer));
    }
}
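// Several snippets above delegate algorithm selection to helpers such as
// cudnn_find_convolution_forward_algo, whose internals are not shown. A
// plain-cuDNN sketch of the equivalent selection under a workspace limit,
// using the same pre-v8 calls that appear elsewhere in these examples
// (descriptor and parameter names here are placeholders, not from the
// original sources):
size_t pick_forward_algo_sketch(
    cudnnHandle_t handle,
    cudnnTensorDescriptor_t input_desc,
    cudnnFilterDescriptor_t filter_desc,
    cudnnConvolutionDescriptor_t conv_desc,
    cudnnTensorDescriptor_t output_desc,
    size_t workspace_limit_bytes,
    cudnnConvolutionFwdAlgo_t * algo)
{
    // Ask cuDNN for the fastest algorithm whose workspace fits the limit.
    cudnn_safe_call(cudnnGetConvolutionForwardAlgorithm(
        handle,
        input_desc, filter_desc, conv_desc, output_desc,
        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes,
        algo));

    // Query how many workspace bytes that algorithm actually needs; the
    // caller allocates this (e.g. with cudaMalloc) and passes both the
    // buffer and *algo to cudnnConvolutionForward.
    size_t workspace_size = 0;
    cudnn_safe_call(cudnnGetConvolutionForwardWorkspaceSize(
        handle,
        input_desc, filter_desc, conv_desc, output_desc,
        *algo,
        &workspace_size));
    return workspace_size;
}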
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  // Initialize algorithm arrays.
  fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()];
  bwd_filter_algo_ = new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];

  // Initialize size arrays.
  workspace_fwd_sizes_ = new uint_tp[bottom.size()];
  workspace_bwd_filter_sizes_ = new uint_tp[bottom.size()];
  workspace_bwd_data_sizes_ = new uint_tp[bottom.size()];

  // Workspace data.
  workspaceSizeInBytes = 0;
  workspaceData = NULL;
  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (uint_tp i = 0; i < bottom.size(); ++i) {
    // Initialize all to default algorithms.
    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
    // Default algorithms don't require a workspace.
    workspace_fwd_sizes_[i] = 0;
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }

  for (int_tp g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
    workspace[g] = NULL;
  }

  // Set the indexing parameters.
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  const int_tp* kernel_shape_data = this->kernel_shape_.cpu_data();
  const int_tp kernel_h = kernel_shape_data[0];
  const int_tp kernel_w = kernel_shape_data[1];
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      kernel_h, kernel_w);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int_tp i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
SaberStatus VenderConv2DActPooling<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>::create(
    const std::vector<DataTensor_in *>& inputs,
    std::vector<DataTensor_out *>& outputs,
    ConvActivePoolingParam<OpTensor>& param,
    Context<NV>& ctx) {

  if (!(ctx == this->_ctx)) {
    if (_handle != NULL) {
      CUDNN_CHECK(cudnnDestroy(_handle));
    }
    this->_ctx = ctx;
    cudaStream_t cuda_stream;
    cuda_stream = ctx.get_compute_stream();
    CUDNN_CHECK(cudnnCreate(&_handle));
    CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
  }

  int input_num = inputs[0]->num();
  int input_channel = inputs[0]->channel();
  int input_height = inputs[0]->height();
  int input_width = inputs[0]->width();
  int output_channel = outputs[0]->channel();
  int output_height = outputs[0]->height();
  int output_width = outputs[0]->width();

  {
    _inner_shape = inputs[0]->shape();
    _inner_shape[0] = input_num;
    _inner_shape[1] = param.conv_param.weight()->num();
    int kernel_exten = param.conv_param.dilation_h
        * (param.conv_param.weight()->height() - 1) + 1;
    int output_dim = (input_height + 2 * param.conv_param.pad_h - kernel_exten)
        / param.conv_param.stride_h + 1;
    _inner_shape[2] = output_dim;
    kernel_exten = param.conv_param.dilation_w
        * (param.conv_param.weight()->width() - 1) + 1;
    output_dim = (input_width + 2 * param.conv_param.pad_w - kernel_exten)
        / param.conv_param.stride_w + 1;
    _inner_shape[3] = output_dim;
    _inner_tensor.re_alloc(_inner_shape);
  }

  int kernel_h = param.conv_param.weight()->height();
  int kernel_w = param.conv_param.weight()->width();
  int filter_dim_a[] = {output_channel, input_channel / param.conv_param.group,
                        kernel_h, kernel_w};
  cudnn::setNDFilterDesc<OpDataType>(&_filter_desc,
      param.conv_param.weight()->dims(), filter_dim_a, CUDNN_TENSOR_NCHW);

  Shape in_stride = inputs[0]->get_stride();
  Shape inner_stride = _inner_tensor.get_stride();
  Shape out_stride = outputs[0]->get_stride();

  int dim_a[] = {input_num, input_channel, input_height, input_width};
  int dim_inner[] = {_inner_shape[0], _inner_shape[1], _inner_shape[2], _inner_shape[3]};
  int dim_b[] = {input_num, output_channel, output_height, output_width};

  cudnn::setTensorNdDesc<InDataType>(&_input_descs, inputs[0]->dims(), dim_a, &in_stride[0]);
  cudnn::setTensorNdDesc<InDataType>(&_inner_descs, 4, dim_inner, &inner_stride[0]);
  cudnn::setTensorNdDesc<InDataType>(&_output_descs, outputs[0]->dims(), dim_b, &out_stride[0]);

  int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w};
  int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w};
  int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w};
  cudnn::setConvolutionNdDesc<OpDataType>(&_conv_descs, inputs[0]->dims() - 2,
      pad_a, filter_stride_a, dilation_a);

  // Set activation descriptor.
  if (param.has_activation) {
    cudnn::set_activation_des<OpDataType>(&_active_descs, param.activation_param.active);
  }

  if (param.has_pooling) {
    int windowHeight[] = {param.pooling_param.window_h, param.pooling_param.window_w};
    int padding[] = {param.pooling_param.pad_h, param.pooling_param.pad_w};
    int stride[] = {param.pooling_param.stride_h, param.pooling_param.stride_w};
    cudnn::set_nd_pooling_des<OpDataType>(&_pooling_descs,
        param.pooling_param.pooling_type, _inner_tensor.dims() - 2,
        windowHeight, padding, stride);
  }

  // true: use tensor cores; false: disable tensor cores.
  cudnn::set_math_type<OpDataType>(&_conv_descs, _use_tensor_core);
  cudnn::set_group_count<OpDataType>(&_conv_descs, param.conv_param.group);

  // Pick the fastest cuDNN implementation: set up algo and workspace size.
  // Depthwise case (group == channels): force implicit GEMM.
  if (param.conv_param.group == inputs[0]->channel()
      && inputs[0]->channel() == outputs[0]->channel()) {
    _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  } else {
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle,
        _input_descs, _filter_desc, _conv_descs, _inner_descs,
        _preference, _workspace_limit_bytes, &_fwd_algo));
  }

  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle,
      _input_descs, _filter_desc, _conv_descs, _inner_descs,
      _fwd_algo, &_workspace_fwd_sizes));

  if (_workspace_fwd_sizes > _workspaceSizeInBytes) {
    _workspaceSizeInBytes = _workspace_fwd_sizes;
    if (_workspaceData != NULL) {
      cudaFree(_workspaceData);
    }
    cudaMalloc(&_workspaceData, _workspaceSizeInBytes);
    _workspace = reinterpret_cast<char*>(_workspaceData);
  }

  if (param.conv_param.bias()->size() > 0) {
    int dim_bias[] = {1, output_channel, 1, 1};
    int stride_bias[] = {output_channel, 1, 1, 1};
    cudnn::setTensorNdDesc<OpDataType>(&_bias_desc, 4, dim_bias, stride_bias);
  }

  return SaberSuccess;
}
void convolution_1x1_layer_tester_cuda::enqueue_test(
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
    cuda_linear_buffer_device_smart_ptr input_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    unsigned int entry_count)
{
    {
        cuda_util::transpose(
            *cuda_config,
            *input_buffer,
            *additional_buffers[1],
            input_elem_count_per_feature_map,
            input_configuration_specific.feature_map_count,
            entry_count,
            stream_id);

        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_T,
            CUBLAS_OP_N,
            output_configuration_specific.feature_map_count,
            entry_count * input_elem_count_per_feature_map,
            input_configuration_specific.feature_map_count,
            &alpha,
            *data[0],
            input_configuration_specific.feature_map_count,
            *additional_buffers[1],
            input_configuration_specific.feature_map_count,
            &beta,
            *additional_buffers[2],
            output_configuration_specific.feature_map_count));

        cuda_util::transpose(
            *cuda_config,
            *additional_buffers[2],
            *additional_buffers[0],
            output_configuration_specific.feature_map_count,
            output_elem_count_per_feature_map,
            entry_count,
            stream_id);
    }

    // Add bias
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_safe_call(cudnnSetTensor4dDescriptor(
            output_data_desc,
            CUDNN_TENSOR_NCHW,
            CUDNN_DATA_FLOAT,
            entry_count,
            output_configuration_specific.feature_map_count,
            1,
            output_elem_count_per_feature_map));
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            CUDNN_ADD_SAME_C,
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *additional_buffers[0]));
    }
}
void sparse_1x1_layer_tester_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    unsigned int entry_count)
{
    // Convert input data from strided NCHW to packed CNHW format
    if (unit_stride)
    {
        cuda_util::transpose23(
            *cuda_config,
            *input_buffers[0],
            *temporary_working_per_entry_buffer,
            input_elem_count_per_feature_map_list[0],
            input_configuration_specific_list[0].feature_map_count,
            entry_count,
            stream_id);
    }
    else
    {
        std::vector<unsigned int> input_converted_CNHW_strides = input_converted_CNHW_strides_base;
        input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 2] =
            input_converted_CNHW_strides[input_converted_CNHW_strides.size() - 1] * entry_count;
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            input_strided_data_desc,
            input_strided_config,
            entry_count,
            input_strides);
        cudnn_util::set_tensor_descriptor(
            input_converted_CNHW_data_desc,
            input_strided_config,
            entry_count,
            input_converted_CNHW_strides);
        float alpha = 1.0F;
        float beta = 0.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            input_strided_data_desc,
            *input_buffers[0],
            &beta,
            input_converted_CNHW_data_desc,
            *temporary_working_per_entry_buffer));
    }

    {
        cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        cusparseMatDescr_t mat_descr;
        cusparse_safe_call(cusparseCreateMatDescr(&mat_descr));
        cusparse_safe_call(cusparseScsrmm2(
            cuda_config->get_cusparse_handle(),
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            CUSPARSE_OPERATION_TRANSPOSE,
            output_configuration_specific.feature_map_count,
            entry_count * output_elem_count_per_feature_map,
            input_strided_config.feature_map_count,
            feature_map_connection_count,
            &alpha,
            mat_descr,
            *data[0],
            *data_custom[1],
            *data_custom[0],
            *temporary_working_per_entry_buffer,
            entry_count * output_elem_count_per_feature_map,
            &beta,
            ((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count,
            output_configuration_specific.feature_map_count));
    }

    // Convert output from NHWC to NCHW
    {
        cuda_util::transpose(
            *cuda_config,
            ((float *)*temporary_working_per_entry_buffer) + input_converted_elem_count_per_entry_aligned * entry_count,
            *output_buffer,
            output_configuration_specific.feature_map_count,
            output_elem_count_per_feature_map,
            entry_count,
            stream_id);
    }

    // Add bias
    if (bias)
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_buffer));
    }
}
void convolution_layer_updater_cuda::enqueue_backward_data_propagation(
    cudaStream_t stream_id,
    unsigned int input_index,
    cuda_linear_buffer_device::ptr input_errors_buffer,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_neurons_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    bool add_update_to_destination,
    unsigned int entry_count)
{
    cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));

    cudnn_util::set_tensor_descriptor(
        input_data_desc,
        input_configuration_specific_list[0],
        entry_count);
    cudnn_util::set_tensor_descriptor(
        output_data_desc,
        output_configuration_specific,
        entry_count);

    {
        void * workspace = 0;
        size_t workspace_size = 0;
        if (temporary_working_fixed_buffer)
        {
            workspace = *temporary_working_fixed_buffer;
            workspace_size = temporary_working_fixed_buffer->get_size();
        }

        cudnnConvolutionBwdDataAlgo_t algo = cuda_config->cudnn_find_convolution_backward_data_algo(
            input_data_desc,
            weights_desc,
            convolution_desc,
            output_data_desc,
            *output_errors_buffer,
            *data[0],
            *temporary_working_per_entry_buffer,
            workspace,
            workspace_size);

        float alpha = 1.0F;
        float beta = (add_update_to_destination ? 1.0F : 0.0F);
        cudnn_safe_call(cudnnConvolutionBackwardData(
            cuda_config->get_cudnn_handle(),
            &alpha,
            weights_desc,
            *data[0],
            output_data_desc,
            *output_errors_buffer,
            convolution_desc,
            algo,
            workspace,
            workspace_size,
            &beta,
            input_data_desc,
            *input_errors_buffer));
    }
}
static PyObject *conv_dfilter_buffers(PyObject *self, PyObject *args) {
    cudaError_t err;
    cudnnStatus_t status;
    int PAD, gpu_ind, filters_ind, imgs_ind, conv_out_ind, out_ind, stream_ind;

    if (!PyArg_ParseTuple(args, "iiiiiii", &filters_ind, &imgs_ind, &conv_out_ind,
            &out_ind, &PAD, &stream_ind, &gpu_ind))
        return NULL;

    if (filters_ind >= N_BUFFERS || filters_ind < 0 ||
            imgs_ind >= N_BUFFERS || imgs_ind < 0 ||
            conv_out_ind >= N_BUFFERS || conv_out_ind < 0 ||
            out_ind >= N_BUFFERS || out_ind < 0) {
        printf("invalid buffer index\n");
        return NULL;
    }

    if (gpu_ind < 0 || gpu_ind >= N_GPUS) {
        printf("invalid gpu index %i\n", gpu_ind);
        return NULL;
    }

    if (stream_ind < 0 || stream_ind >= N_ALT_STREAMS) {
        printf("invalid stream index %i\n", stream_ind);
        return NULL;
    }

    if (data_buffers[gpu_ind][filters_ind] == NULL || data_buffers[gpu_ind][imgs_ind] == NULL ||
            data_buffers[gpu_ind][conv_out_ind] == NULL) {
        printf("one or more buffers not initialized on this gpu\n");
        return NULL;
    }

    if (filter_flags[gpu_ind][filters_ind] == 0 || filter_flags[gpu_ind][imgs_ind] == 1 ||
            filter_flags[gpu_ind][conv_out_ind] == 1) {
        printf("one or more buffers initialized as the wrong kind (filter where a tensor was expected, or vice versa)\n");
        return NULL;
    }

    cudaSetDevice(gpu_ind); CHECK_CUDA_ERR
    cudaStreamSynchronize(streams[gpu_ind]);  // make sure the inputs are in the buffers first
    cudnnSetStream(handle, alt_streams[gpu_ind][stream_ind]);

    int n_filters = data_dims[0][gpu_ind][filters_ind];
    int n_channels = data_dims[1][gpu_ind][filters_ind];
    int filter_sz = data_dims[2][gpu_ind][filters_ind];

    if (data_buffers[gpu_ind][out_ind] == NULL) {
        // allocate output
        status = cudnnCreateFilterDescriptor(&desc_filters[gpu_ind][out_ind]); ERR_CHECK
        status = cudnnSetFilterDescriptor(desc_filters[gpu_ind][out_ind], dataType,
            n_filters, n_channels, filter_sz, filter_sz); ERR_CHECK
        err = cudaMalloc((void**) &data_buffers[gpu_ind][out_ind],
            n_filters * n_channels * filter_sz * filter_sz * DATA_TYPE_SZ); MALLOC_ERR_CHECK

        data_dims[0][gpu_ind][out_ind] = n_filters;
        data_dims[1][gpu_ind][out_ind] = n_channels;
        data_dims[2][gpu_ind][out_ind] = filter_sz;
        data_dims[3][gpu_ind][out_ind] = filter_sz;
        filter_flags[gpu_ind][out_ind] = 1;
    } else if (filter_flags[gpu_ind][out_ind] == 0 ||
            data_dims[0][gpu_ind][out_ind] != n_filters ||
            data_dims[1][gpu_ind][out_ind] != n_channels ||
            data_dims[2][gpu_ind][out_ind] != filter_sz ||
            data_dims[3][gpu_ind][out_ind] != filter_sz) {
        // make sure output buffer is of correct size
        printf("output buffer size does not match the output of this function and/or was initialized as a tensor, %s %i\n", __FILE__, __LINE__);
        return NULL;
    }

    //---------------------------------------
    // Set descriptors
    //---------------------------------------
    status = cudnnSetConvolutionDescriptor(convDesc, desc_buffers[gpu_ind][imgs_ind],
        desc_filters[gpu_ind][out_ind], PAD, PAD, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION); ERR_CHECK

    //---------------------------------------
    // Query output layout
    //---------------------------------------
    int n_imgs_out, n_filters_out, conv_out_sz_x, conv_out_sz_y;
    status = cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
        &n_imgs_out, &n_filters_out, &conv_out_sz_x, &conv_out_sz_y); ERR_CHECK

    //--------------------------------------
    // set filter and image values
    //--------------------------------------
    if (n_imgs_out * n_filters_out * conv_out_sz_x * conv_out_sz_y !=
            data_dims[0][gpu_ind][conv_out_ind] * data_dims[1][gpu_ind][conv_out_ind] *
            data_dims[2][gpu_ind][conv_out_ind] * data_dims[3][gpu_ind][conv_out_ind]) {
        printf("predicted conv output not matching given input %s %i\n", __FILE__, __LINE__);
        printf("%i %i\n", n_imgs_out * n_filters_out * conv_out_sz_x * conv_out_sz_y,
            data_dims[0][gpu_ind][conv_out_ind] * data_dims[1][gpu_ind][conv_out_ind] *
            data_dims[2][gpu_ind][conv_out_ind] * data_dims[3][gpu_ind][conv_out_ind]);
        printf("%i %i\n", n_imgs_out, data_dims[0][gpu_ind][conv_out_ind]);
        printf("%i %i\n", n_filters_out, data_dims[1][gpu_ind][conv_out_ind]);
        printf("%i %i\n", conv_out_sz_x, data_dims[2][gpu_ind][conv_out_ind]);
        printf("%i %i\n", conv_out_sz_y, data_dims[3][gpu_ind][conv_out_ind]);
        //return NULL;
    }

    //--------------------------------------
    // Convolution
    //--------------------------------------
    status = cudnnConvolutionBackwardFilter(handle, desc_buffers[gpu_ind][imgs_ind],
        data_buffers[gpu_ind][imgs_ind], desc_buffers[gpu_ind][conv_out_ind],
        data_buffers[gpu_ind][conv_out_ind], convDesc, desc_filters[gpu_ind][out_ind],
        data_buffers[gpu_ind][out_ind], CUDNN_RESULT_NO_ACCUMULATE); ERR_CHECK

    cudnnSetStream(handle, NULL);
    cudaSetDevice(0); CHECK_CUDA_ERR

    Py_INCREF(Py_None);
    return Py_None;
}
CuDNNHandle::CuDNNHandle(cudaStream_t stream) {
    if (cudnnCreate(&handle_) != CUDNN_STATUS_SUCCESS) {
        LOG(ERROR) << "Cannot create cuDNN handle. cuDNN won't be available.";
    }
    CUDNN_CHECK(cudnnSetStream(handle_, stream));
}
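// The matching release is not shown in the original; a minimal destructor
// sketch (assumed, not from the source) completing the RAII wrapper:
CuDNNHandle::~CuDNNHandle() {
    CUDNN_CHECK(cudnnDestroy(handle_));
}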
void CudnnNdConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
  // Configure the kernel size, padding, stride, and inputs.
  CHECK(conv_param.has_kernel_shape()) << "Kernel shape is required.";
  if (conv_param.has_pad_shape()) {
    CHECK_EQ(conv_param.kernel_shape().dim_size(),
             conv_param.pad_shape().dim_size())
        << "Kernel and pad shapes don't match!";
  }
  if (conv_param.has_stride_shape()) {
    CHECK_EQ(conv_param.kernel_shape().dim_size(),
             conv_param.stride_shape().dim_size())
        << "Kernel and stride shapes don't match!";
  }
  for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
    kernel_shape_.push_back(conv_param.kernel_shape().dim(i));
    CHECK_GT(kernel_shape_[i], 0) << "Filter dimensions cannot be zero.";
  }
  if (conv_param.has_pad_shape()) {
    for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
      pad_shape_.push_back(conv_param.pad_shape().dim(i));
    }
  } else {
    pad_shape_ = std::vector<int>(kernel_shape_.size(), 0);
  }
  if (conv_param.has_stride_shape()) {
    for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
      stride_shape_.push_back(conv_param.stride_shape().dim(i));
    }
  } else {
    stride_shape_ = std::vector<int>(kernel_shape_.size(), 1);
  }
  // Configure output channels and groups.
  channels_ = bottom[0]->shape(1);
  num_output_ = this->layer_param_.convolution_param().num_output();
  CHECK_GT(num_output_, 0);
  group_ = this->layer_param_.convolution_param().group();
  CHECK_EQ(channels_ % group_, 0);
  CHECK_EQ(num_output_ % group_, 0)
      << "Number of outputs should be a multiple of group.";

  // Handle the parameters: weights and biases.
  // - blobs_[0] holds the filter weights
  // - blobs_[1] holds the biases (optional)
  bias_term_ = this->layer_param_.convolution_param().bias_term();

  vector<int> weight_shape(kernel_shape_);
  weight_shape.insert(weight_shape.begin(), channels_ / group_);
  weight_shape.insert(weight_shape.begin(), num_output_);

  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    if (bias_term_) {
      this->blobs_.resize(2);
    } else {
      this->blobs_.resize(1);
    }
    // Initialize and fill the weights:
    // output channels x input channels per-group x kernel height x kernel width
    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
        this->layer_param_.convolution_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());
    // If necessary, initialize and fill the biases.
    if (bias_term_) {
      vector<int> bias_shape(1, num_output_);
      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
          this->layer_param_.convolution_param().bias_filler()));
      bias_filler->Fill(this->blobs_[1].get());
    }
  }
  // Propagate gradients to the parameters (as directed by backward pass).
  this->param_propagate_down_.resize(this->blobs_.size(), true);

  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace_data_ = NULL;

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_shape[0] /= group_;
  weight_offset_ = 1;
  for (int i = 0; i < weight_shape.size(); ++i) {
    weight_offset_ *= weight_shape[i];
  }
  bias_offset_ = weight_shape[0];

  // Create filter descriptor.
  cudnn::createNdFilterDesc<Dtype>(&filter_desc_, weight_shape);

  bwd_filter_algo_ = new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
  workspace_bwd_data_sizes_ = new size_t[bottom.size()];
  workspace_ = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensorDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensorDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensorDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}