void Context::Init(int device_id) {
  device_id_ = device_id;
  SwitchDevice();
#if defined(USE_CUDA)
  if (blas_handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate((cublasHandle_t*)&blas_handle_));
    CHECK_NOTNULL(blas_handle_);
  }
#endif
#if defined(USE_CUDNN)
  if (cudnn_handle_ == nullptr) {
    CUDNN_CHECK(cudnnCreate((cudnnHandle_t*)&cudnn_handle_));
    CHECK_NOTNULL(cudnn_handle_);
  }
#endif
#if defined(USE_NNPACK)
  if (nnpack_handle_ == nullptr) {
    CHECK_EQ(nnp_initialize(), nnp_status_success);
    nnpack_handle_ = pthreadpool_create(0);
    CHECK_NOTNULL(nnpack_handle_);
  }
#endif
}
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
  // kernel shape
  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
  // stride
  const int* stride_data = this->stride_.cpu_data();
  // padding
  const int* pad_data = this->pad_.cpu_data();
  // Note: variable-length arrays are a compiler extension, not standard C++.
  int kernel_shape[this->num_spatial_axes_];
  int stride[this->num_spatial_axes_];
  int pad[this->num_spatial_axes_];
  for (int i = 0; i < this->num_spatial_axes_; i++) {
    kernel_shape[i] = kernel_shape_data[i];
    stride[i] = stride_data[i];
    pad[i] = pad_data[i];
  }
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createPoolingNdDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->num_spatial_axes_, kernel_shape, pad, stride);
  handles_setup_ = true;
}
void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDNN.
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
}
void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  ReLULayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorNdDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorNdDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
}
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createActivationDescriptor<Dtype>(&activ_desc_, CUDNN_ACTIVATION_TANH);
  handles_setup_ = true;
}
cudnnHandle_t cudnn_handle() {
  static int init = 0;
  static cudnnHandle_t handle;
  if (!init) {
    cudnnCreate(&handle);
    init = 1;
  }
  return handle;
}
GpuDevice::Impl::Impl(int d) : device(d) {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
  }
}
void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  ReLULayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
  cudnnCreateActivationDescriptor(&activation_desc_);
  cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_RELU,
      CUDNN_PROPAGATE_NAN, 0.0);
}
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createNdPoolingDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->kernel_shape_, this->pad_, this->stride_);
  handles_setup_ = true;
}
cudnnHandle_t cudnn_handle() {
  static int init[16] = {0};
  static cudnnHandle_t handle[16];
  int i = cuda_get_device();
  if (!init[i]) {
    cudnnCreate(&handle[i]);
    init[i] = 1;
  }
  return handle[i];
}
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  const int N = bottom[0]->num();
  const int K = bottom[0]->channels();
  const int H = bottom[0]->height();
  const int W = bottom[0]->width();
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
}
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
#if CUDNN_VERSION_MIN(5, 0, 0)
  cudnnCreateActivationDescriptor(&activation_desc_);
  cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_TANH,
      CUDNN_PROPAGATE_NAN, 0);
#endif
  handles_setup_ = true;
}
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
  // Sanity check: CUDNN currently only supports pad == 0.
  CHECK_EQ(this->pad_h_, 0);
  CHECK_EQ(this->pad_w_, 0);
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_);
  handles_setup_ = true;
}
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace = NULL;
  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }
  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_) *
      (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);
  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);
  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }
  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
  handles_setup_ = true;
}
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top, const bool init_ps, int* num_tables,
    map<string, vector<int> >* layer_name_to_blob_global_idx) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top, init_ps, num_tables,
      layer_name_to_blob_global_idx);
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_,
      this->stride_h_, this->stride_w_);
  handles_setup_ = true;
}
void CuDNNLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  LRNLayer<Dtype>::LayerSetUp(bottom, top);
  CUDNN_CHECK(cudnnCreate(&handle_));
  // create an LRN descriptor
  CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
  size_ = this->layer_param().lrn_param().local_size();
  alpha_ = this->layer_param().lrn_param().alpha();
  beta_ = this->layer_param().lrn_param().beta();
  k_ = this->layer_param().lrn_param().k();
}
// Constructor
CDACN::CDACN(ActionDescriptor descriptor, float discount, int temporal_stride,
    int capacity, int burn_in, int batch_size, float base_learning_rate,
    float actor_learning_rate, float critic_learning_rate, float d,
    float advantage, unsigned int cycles)
    : Agent(descriptor, temporal_stride, 84, 84, capacity, burn_in, batch_size),
      discount_(discount),
      exploration_rate_(1.0),
      uniform_real(0.0, 1.0),
      base_learning_rate_(base_learning_rate),
      actor_learning_rate_(actor_learning_rate),
      critic_learning_rate_(critic_learning_rate),
      d_(d),
      advantage_(advantage),
      cycles_(cycles) {
  cudnnCreate(&handle);
  actor_critic = ActorCritic(handle, {1, batch_size}, temporal_stride,
      descriptor_.action_dim(), base_learning_rate_, actor_learning_rate_,
      critic_learning_rate_);
  actor_critic_n = ActorCritic(handle, {1, batch_size}, temporal_stride,
      descriptor_.action_dim(), base_learning_rate_, actor_learning_rate_,
      critic_learning_rate_);
  // Sync parameters
  actor_critic.transfer(actor_critic_n);
}
inline void InitCuDNN() {
  init_cudnn_ = false;
  dtype_ = CUDNN_DATA_FLOAT;
  switch (mode) {
    case kMaxPooling: mode_ = CUDNN_POOLING_MAX; break;
    // case kAvgPooling: mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; break;
    default: utils::Error("This should not happen -,-"); break;
  }
  CUDA_CHECK(cudnnCreate(&handle_));
  CUDA_CHECK(cudnnCreateTensorDescriptor(&in_desc_));
  CUDA_CHECK(cudnnCreateTensorDescriptor(&out_desc_));
  CUDA_CHECK(cudnnCreatePoolingDescriptor(&pooling_desc_));
  CUDA_CHECK(cudnnSetPooling2dDescriptor(pooling_desc_, mode_,
      Parent::param_.kernel_height, Parent::param_.kernel_width,
      0, 0, Parent::param_.stride, Parent::param_.stride));
}
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id)
    : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize the CUDA context on this device
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
CuDNNHandle::CuDNNHandle(cudaStream_t stream) {
  if (cudnnCreate(&handle_) != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create cuDNN handle. cuDNN won't be available.";
  }
  CUDNN_CHECK(cudnnSetStream(handle_, stream));
}
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  // Initialize algorithm arrays
  fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()];
  bwd_filter_algo_ = new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
  // initialize size arrays
  workspace_fwd_sizes_ = new uint_tp[bottom.size()];
  workspace_bwd_filter_sizes_ = new uint_tp[bottom.size()];
  workspace_bwd_data_sizes_ = new uint_tp[bottom.size()];
  // workspace data
  workspaceSizeInBytes = 0;
  workspaceData = NULL;
  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
  for (uint_tp i = 0; i < bottom.size(); ++i) {
    // initialize all to default algorithms
    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
    // default algorithms don't require workspace
    workspace_fwd_sizes_[i] = 0;
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }
  for (int_tp g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
    workspace[g] = NULL;
  }
  // Set the indexing parameters.
  bias_offset_ = (this->num_output_ / this->group_);
  // Create filter descriptor.
  const int_tp* kernel_shape_data = this->kernel_shape_.cpu_data();
  const int_tp kernel_h = kernel_shape_data[0];
  const int_tp kernel_w = kernel_shape_data[1];
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      kernel_h, kernel_w);
  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int_tp i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }
  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
  handles_setup_ = true;
}
void CudnnNdConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
  // Configure the kernel size, padding, stride, and inputs.
  CHECK(conv_param.has_kernel_shape()) << "Kernel shape is required.";
  if (conv_param.has_pad_shape()) {
    CHECK_EQ(conv_param.kernel_shape().dim_size(),
             conv_param.pad_shape().dim_size())
        << "Kernel and Pad shape don't match !";
  }
  if (conv_param.has_stride_shape()) {
    CHECK_EQ(conv_param.kernel_shape().dim_size(),
             conv_param.stride_shape().dim_size())
        << "Kernel and Stride shape don't match !";
  }
  for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
    kernel_shape_.push_back(conv_param.kernel_shape().dim(i));
    CHECK_GT(kernel_shape_[i], 0) << "Filter dimensions cannot be zero.";
  }
  if (conv_param.has_pad_shape()) {
    for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
      pad_shape_.push_back(conv_param.pad_shape().dim(i));
    }
  } else {
    pad_shape_ = std::vector<int>(kernel_shape_.size(), 0);
  }
  if (conv_param.has_stride_shape()) {
    for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
      stride_shape_.push_back(conv_param.stride_shape().dim(i));
    }
  } else {
    stride_shape_ = std::vector<int>(kernel_shape_.size(), 1);
  }
  // Configure output channels and groups.
  channels_ = bottom[0]->shape(1);
  num_output_ = this->layer_param_.convolution_param().num_output();
  CHECK_GT(num_output_, 0);
  group_ = this->layer_param_.convolution_param().group();
  CHECK_EQ(channels_ % group_, 0);
  CHECK_EQ(num_output_ % group_, 0)
      << "Number of output should be multiples of group.";
  // Handle the parameters: weights and biases.
  // - blobs_[0] holds the filter weights
  // - blobs_[1] holds the biases (optional)
  bias_term_ = this->layer_param_.convolution_param().bias_term();
  vector<int> weight_shape(kernel_shape_);
  weight_shape.insert(weight_shape.begin(), channels_ / group_);
  weight_shape.insert(weight_shape.begin(), num_output_);
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    if (bias_term_) {
      this->blobs_.resize(2);
    } else {
      this->blobs_.resize(1);
    }
    // Initialize and fill the weights:
    // output channels x input channels per-group x kernel height x kernel width
    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
        this->layer_param_.convolution_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());
    // If necessary, initialize and fill the biases.
    if (bias_term_) {
      vector<int> bias_shape(1, num_output_);
      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
          this->layer_param_.convolution_param().bias_filler()));
      bias_filler->Fill(this->blobs_[1].get());
    }
  }
  // Propagate gradients to the parameters (as directed by backward pass).
  this->param_propagate_down_.resize(this->blobs_.size(), true);
  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace_data_ = NULL;
  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }
  // Set the indexing parameters.
  weight_shape[0] /= group_;
  weight_offset_ = 1;
  for (int i = 0; i < weight_shape.size(); ++i) {
    weight_offset_ *= weight_shape[i];
  }
  bias_offset_ = weight_shape[0];
  // Create filter descriptor.
  cudnn::createNdFilterDesc<Dtype>(&filter_desc_, weight_shape);
  bwd_filter_algo_ = new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
  workspace_bwd_data_sizes_ = new size_t[bottom.size()];
  workspace_ = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensorDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensorDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }
  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensorDesc<Dtype>(&bias_desc_);
  }
  handles_setup_ = true;
}
SaberStatus VenderConv2DActPooling<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>::
    create(const std::vector<DataTensor_in *>& inputs,
           std::vector<DataTensor_out *>& outputs,
           ConvActivePoolingParam<OpTensor>& param, Context<NV>& ctx) {
  if (!(ctx == this->_ctx)) {
    if (_handle != NULL) {
      CUDNN_CHECK(cudnnDestroy(_handle));
    }
    this->_ctx = ctx;
    cudaStream_t cuda_stream;
    cuda_stream = ctx.get_compute_stream();
    CUDNN_CHECK(cudnnCreate(&_handle));
    CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
  }
  int input_num = inputs[0]->num();
  int input_channel = inputs[0]->channel();
  int input_height = inputs[0]->height();
  int input_width = inputs[0]->width();
  int output_channel = outputs[0]->channel();
  int output_height = outputs[0]->height();
  int output_width = outputs[0]->width();
  {
    _inner_shape = inputs[0]->shape();
    _inner_shape[0] = input_num;
    _inner_shape[1] = param.conv_param.weight()->num();
    int kernel_exten = param.conv_param.dilation_h
        * (param.conv_param.weight()->height() - 1) + 1;
    int output_dim = (input_height + 2 * param.conv_param.pad_h - kernel_exten)
        / param.conv_param.stride_h + 1;
    _inner_shape[2] = output_dim;
    kernel_exten = param.conv_param.dilation_w
        * (param.conv_param.weight()->width() - 1) + 1;
    output_dim = (input_width + 2 * param.conv_param.pad_w - kernel_exten)
        / param.conv_param.stride_w + 1;
    _inner_shape[3] = output_dim;
    _inner_tensor.re_alloc(_inner_shape);
  }
  int kernel_h = param.conv_param.weight()->height();
  int kernel_w = param.conv_param.weight()->width();
  int filter_dim_a[] = {output_channel, input_channel / param.conv_param.group,
      kernel_h, kernel_w};
  cudnn::setNDFilterDesc<OpDataType>(&_filter_desc,
      param.conv_param.weight()->dims(), filter_dim_a, CUDNN_TENSOR_NCHW);
  Shape in_stride = inputs[0]->get_stride();
  Shape inner_stride = _inner_tensor.get_stride();
  Shape out_stride = outputs[0]->get_stride();
  int dim_a[] = {input_num, input_channel, input_height, input_width};
  int dim_inner[] = {_inner_shape[0], _inner_shape[1], _inner_shape[2], _inner_shape[3]};
  int dim_b[] = {input_num, output_channel, output_height, output_width};
  cudnn::setTensorNdDesc<InDataType>(&_input_descs,
      inputs[0]->dims(), dim_a, &in_stride[0]);
  cudnn::setTensorNdDesc<InDataType>(&_inner_descs, 4, dim_inner, &inner_stride[0]);
  cudnn::setTensorNdDesc<InDataType>(&_output_descs,
      outputs[0]->dims(), dim_b, &out_stride[0]);
  int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w};
  int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w};
  int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w};
  cudnn::setConvolutionNdDesc<OpDataType>(&_conv_descs,
      inputs[0]->dims() - 2, pad_a, filter_stride_a, dilation_a);
  // set activation descriptor
  if (param.has_activation) {
    cudnn::set_activation_des<OpDataType>(&_active_descs, param.activation_param.active);
  }
  if (param.has_pooling) {
    int windowHeight[] = {param.pooling_param.window_h, param.pooling_param.window_w};
    int padding[] = {param.pooling_param.pad_h, param.pooling_param.pad_w};
    int stride[] = {param.pooling_param.stride_h, param.pooling_param.stride_w};
    cudnn::set_nd_pooling_des<OpDataType>(&_pooling_descs,
        param.pooling_param.pooling_type, _inner_tensor.dims() - 2,
        windowHeight, padding, stride);
  }
  // true: use tensor core
  // false: disable tensor core
  cudnn::set_math_type<OpDataType>(&_conv_descs, _use_tensor_core);
  cudnn::set_group_count<OpDataType>(&_conv_descs, param.conv_param.group);
  // Get fastest implement of cudnn
  // set up algo and workspace size
  if (param.conv_param.group == inputs[0]->channel() &&
      inputs[0]->channel() == outputs[0]->channel()) {
    _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  } else {
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle,
        _input_descs, _filter_desc, _conv_descs, _inner_descs,
        _preference, _workspace_limit_bytes, &_fwd_algo));
  }
  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle,
      _input_descs, _filter_desc, _conv_descs, _inner_descs,
      _fwd_algo, &_workspace_fwd_sizes));
  if (_workspace_fwd_sizes > _workspaceSizeInBytes) {
    _workspaceSizeInBytes = _workspace_fwd_sizes;
    if (_workspaceData != NULL) {
      cudaFree(_workspaceData);
    }
    cudaMalloc(&_workspaceData, _workspaceSizeInBytes);
    _workspace = reinterpret_cast<char*>(_workspaceData);
  }
  if (param.conv_param.bias()->size() > 0) {
    int dim_bias[] = {1, output_channel, 1, 1};
    int stride_bias[] = {output_channel, 1, 1, 1};
    cudnn::setTensorNdDesc<OpDataType>(&_bias_desc, 4, dim_bias, stride_bias);
  }
  return SaberSuccess;
}
CUDNN() { CUDNN_CHECK(cudnnCreate(&handle_)); }
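// The one-line constructor above only creates the handle and never releases it.
// Below is a minimal RAII sketch (a hypothetical CudnnHandleGuard, not taken from
// any of the snippets above) that pairs cudnnCreate with cudnnDestroy and forbids
// copying; it assumes only the standard cuDNN API plus a CUDNN_CHECK macro like
// the one used in the examples above.
#include <cudnn.h>

class CudnnHandleGuard {
 public:
  CudnnHandleGuard() { CUDNN_CHECK(cudnnCreate(&handle_)); }
  ~CudnnHandleGuard() {
    // Best-effort cleanup in a destructor; the returned status is ignored here.
    cudnnDestroy(handle_);
  }
  // A cudnnHandle_t owns library state, so copying the guard is disallowed.
  CudnnHandleGuard(const CudnnHandleGuard&) = delete;
  CudnnHandleGuard& operator=(const CudnnHandleGuard&) = delete;
  cudnnHandle_t get() const { return handle_; }

 private:
  cudnnHandle_t handle_;
};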