Example #1
void Context::Init(int device_id) {
  device_id_ = device_id;

  SwitchDevice();

#if defined(USE_CUDA)
  if (blas_handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate((cublasHandle_t*)&blas_handle_));
    CHECK_NOTNULL(blas_handle_);
  }
#endif

#if defined(USE_CUDNN)
  if (cudnn_handle_ == nullptr) {
    CUDNN_CHECK(cudnnCreate((cudnnHandle_t*)&cudnn_handle_));
    CHECK_NOTNULL(cudnn_handle_);
  }
#endif

#if defined(USE_NNPACK)
  if (nnpack_handle_ == nullptr) {
    CHECK_EQ(nnp_initialize(), nnp_status_success);
    nnpack_handle_ = pthreadpool_create(0);
    CHECK_NOTNULL(nnpack_handle_);
  }
#endif
}
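A matching teardown is not shown on this page; below is a minimal sketch, assuming a hypothetical Context::Release() that mirrors Init() and that the handles are owned exclusively by the Context:

// Hypothetical counterpart to Init(): destroy handles in reverse order of
// creation and reset the members so Init() can be called again safely.
void Context::Release() {
#if defined(USE_NNPACK)
  if (nnpack_handle_ != nullptr) {
    pthreadpool_destroy((pthreadpool_t)nnpack_handle_);
    nnpack_handle_ = nullptr;
    CHECK_EQ(nnp_deinitialize(), nnp_status_success);
  }
#endif

#if defined(USE_CUDNN)
  if (cudnn_handle_ != nullptr) {
    CUDNN_CHECK(cudnnDestroy((cudnnHandle_t)cudnn_handle_));
    cudnn_handle_ = nullptr;
  }
#endif

#if defined(USE_CUDA)
  if (blas_handle_ != nullptr) {
    CUBLAS_CHECK(cublasDestroy((cublasHandle_t)blas_handle_));
    blas_handle_ = nullptr;
  }
#endif
}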
Example #2
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);

  // kernel shape
  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
  // stride
  const int* stride_data = this->stride_.cpu_data();
  // padding
  const int* pad_data = this->pad_.cpu_data();

  int kernel_shape[this->num_spatial_axes_];
  int stride[this->num_spatial_axes_];
  int pad[this->num_spatial_axes_];
  for (int i = 0; i < this->num_spatial_axes_; i++) {
    kernel_shape[i] = kernel_shape_data[i];
    stride[i] = stride_data[i];
    pad[i] = pad_data[i];
  }

  CUDNN_CHECK(cudnnCreate(&handle_));

  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createPoolingNdDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->num_spatial_axes_, kernel_shape,
      pad, stride);
  handles_setup_ = true;
}
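Note that int kernel_shape[this->num_spatial_axes_] above is a variable-length array, a compiler extension (e.g. in GCC) rather than standard C++. A portable sketch of the same copies using std::vector (assumes <vector> is included):

// Standard-C++ alternative to the VLA-based buffers above.
std::vector<int> kernel_shape(kernel_shape_data,
                              kernel_shape_data + this->num_spatial_axes_);
std::vector<int> stride(stride_data, stride_data + this->num_spatial_axes_);
std::vector<int> pad(pad_data, pad_data + this->num_spatial_axes_);
// Pass kernel_shape.data(), pad.data(), stride.data() to createPoolingNdDesc.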
Example #3
void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDNN.
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
}
Example #4
void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  ReLULayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorNdDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorNdDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
}
Example #5
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createActivationDescriptor<Dtype>(&activ_desc_, CUDNN_ACTIVATION_TANH);
  handles_setup_ = true;
}
Example #6
cudnnHandle_t cudnn_handle()
{
    static int init = 0;
    static cudnnHandle_t handle;
    if(!init) {
        cudnnCreate(&handle);
        init = 1;
    }
    return handle;
}
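This lazy initialization is not thread-safe: two threads can both observe init == 0 and each create a handle. A minimal thread-safe sketch, assuming C++11 or later (the original darknet code is C, where one would instead guard with pthread_once or a mutex); function-local statics are initialized exactly once, so the explicit init flag can be dropped:

// C++11 "magic statics" make the one-time cudnnCreate call race-free.
cudnnHandle_t cudnn_handle()
{
    static cudnnHandle_t handle = [] {
        cudnnHandle_t h;
        cudnnCreate(&h);  // error handling omitted to match the original
        return h;
    }();
    return handle;
}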
Example #7
GpuDevice::Impl::Impl(int d) : device(d) {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
  }
}
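The snippet shows only construction; a plausible destructor (a sketch, not taken from the source) would release the handles in reverse creation order before destroying the streams:

// Hypothetical teardown mirroring Impl::Impl above.
GpuDevice::Impl::~Impl() {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDNN_CALL(cudnnDestroy(cudnn_handle[i]));
    CUBLAS_CALL(cublasDestroy(cublas_handle[i]));
    CUDA_CALL(cudaStreamDestroy(stream[i]));
  }
}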
Example #8
void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  ReLULayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  handles_setup_ = true;
  cudnnCreateActivationDescriptor(&activation_desc_);
  cudnnSetActivationDescriptor(activation_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0);
}
Example #9
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensorDesc<Dtype>(&bottom_desc_);
  cudnn::createTensorDesc<Dtype>(&top_desc_);
  cudnn::createNdPoolingDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->kernel_shape_, this->pad_, this->stride_);
  handles_setup_ = true;
}
Example #10
cudnnHandle_t cudnn_handle()
{
    static int init[16] = {0};
    static cudnnHandle_t handle[16];
    int i = cuda_get_device();
    if(!init[i]) {
        cudnnCreate(&handle[i]);
        init[i] = 1;
    }
    return handle[i];
}
Example #11
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  const int N = bottom[0]->num();
  const int K = bottom[0]->channels();
  const int H = bottom[0]->height();
  const int W = bottom[0]->width();
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
}
Example #12
void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  TanHLayer<Dtype>::LayerSetUp(bottom, top);
  // initialize cuDNN
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
#if CUDNN_VERSION_MIN(5, 0, 0)
  cudnnCreateActivationDescriptor(&activation_desc_);
  cudnnSetActivationDescriptor(activation_desc_,
      CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN, 0);
#endif
  handles_setup_ = true;
}
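The CUDNN_VERSION_MIN(5, 0, 0) guard exists because cuDNN introduced explicit activation descriptors in version 5; on older releases the activation mode (here CUDNN_ACTIVATION_TANH) is passed directly to the activation forward/backward calls instead of through a descriptor.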
Example #13
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
  // Sanity check: CUDNN currently only supports pad == 0.
  CHECK_EQ(this->pad_h_, 0);
  CHECK_EQ(this->pad_w_, 0);
  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->kernel_h_, this->kernel_w_, this->stride_h_, this->stride_w_);
  handles_setup_ = true;
}
Example #14
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace = NULL;

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
Example #15
void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top, const bool init_ps, int* num_tables,
    map<string, vector<int> >* layer_name_to_blob_global_idx) {
  PoolingLayer<Dtype>::LayerSetUp(bottom, top, init_ps, num_tables,
      layer_name_to_blob_global_idx);

  CUDNN_CHECK(cudnnCreate(&handle_));
  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
      this->layer_param_.pooling_param().pool(), &mode_,
      this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_,
      this->stride_h_, this->stride_w_);
  handles_setup_ = true;
}
Example #16
void CuDNNLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
                                      const vector<Blob<Dtype>*>& top) {
    LRNLayer<Dtype>::LayerSetUp(bottom, top);

    CUDNN_CHECK(cudnnCreate(&handle_));
    CUDNN_CHECK(cudnnCreateLRNDescriptor(&norm_desc_));
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
    cudnn::createTensor4dDesc<Dtype>(&top_desc_);

    // create a LRN handle
    handles_setup_ = true;

    size_ = this->layer_param().lrn_param().local_size();
    alpha_ = this->layer_param().lrn_param().alpha();
    beta_ = this->layer_param().lrn_param().beta();
    k_ = this->layer_param().lrn_param().k();
}
Example #17
// Constructor
CDACN::CDACN(ActionDescriptor descriptor, float discount, int temporal_stride,
		int capacity, int burn_in, int batch_size, float base_learning_rate,
		float actor_learning_rate, float critic_learning_rate, float d,
		float advantage, unsigned int cycles) :
			Agent(descriptor,temporal_stride,84,84,capacity,burn_in,batch_size),
			discount_(discount),
			exploration_rate_(1.0),
			uniform_real(0.0,1.0),
			base_learning_rate_(base_learning_rate),
			actor_learning_rate_(actor_learning_rate),
			critic_learning_rate_(critic_learning_rate),
			d_(d),
			advantage_(advantage),
			cycles_(cycles)
{
	cudnnCreate(&handle);
	actor_critic   = ActorCritic(handle,{1,batch_size},temporal_stride,descriptor_.action_dim(),base_learning_rate_,actor_learning_rate_,critic_learning_rate_);
	actor_critic_n = ActorCritic(handle,{1,batch_size},temporal_stride,descriptor_.action_dim(),base_learning_rate_,actor_learning_rate_,critic_learning_rate_);
	//Sync parameters
	actor_critic.transfer(actor_critic_n);
}
Example #18
    inline void InitCuDNN() {
      init_cudnn_ = false;
      dtype_ = CUDNN_DATA_FLOAT;
      switch(mode) {
       case kMaxPooling: mode_ = CUDNN_POOLING_MAX; break;
       // case kAvgPooling: mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; break;
       default: utils::Error("This should not happen -,-"); break;
      }
      CUDA_CHECK(cudnnCreate(&handle_));
      CUDA_CHECK(cudnnCreateTensorDescriptor(&in_desc_));
      CUDA_CHECK(cudnnCreateTensorDescriptor(&out_desc_));
      CUDA_CHECK(cudnnCreatePoolingDescriptor(&pooling_desc_));
      CUDA_CHECK(cudnnSetPooling2dDescriptor(pooling_desc_, mode_,
                                             Parent::param_.kernel_height,
                                             Parent::param_.kernel_width,
                                             0, 0,
                                             Parent::param_.stride,
                                             Parent::param_.stride));

    }
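The seven-argument cudnnSetPooling2dDescriptor call above matches cuDNN before version 5; from v5 on the function takes an additional cudnnNanPropagation_t argument after the mode. A sketch of the same call in the v5+ form, keeping the snippet's CUDA_CHECK convention:

// cuDNN v5+ signature adds a NaN-propagation flag after the pooling mode.
CUDA_CHECK(cudnnSetPooling2dDescriptor(pooling_desc_, mode_,
                                       CUDNN_NOT_PROPAGATE_NAN,
                                       Parent::param_.kernel_height,
                                       Parent::param_.kernel_width,
                                       0, 0,
                                       Parent::param_.stride,
                                       Parent::param_.stride));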
Example #19
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id) : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
Example #20
CuDNNHandle::CuDNNHandle(cudaStream_t stream) {
  if (cudnnCreate(&handle_) != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create cuDNN handle. cuDNN won't be available.";
  }
  CUDNN_CHECK(cudnnSetStream(handle_, stream));
}
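Note that cudnnSetStream is still invoked when cudnnCreate fails, so the CUDNN_CHECK there will trip on an invalid handle. The matching destructor is not shown in the snippet; a minimal sketch, assuming the wrapper owns handle_ exclusively:

// Best-effort teardown; handle_ may be invalid if creation failed above.
CuDNNHandle::~CuDNNHandle() {
  cudnnDestroy(handle_);
}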
Example #21
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  // Initialize algorithm arrays
  fwd_algo_       = new cudnnConvolutionFwdAlgo_t[bottom.size()];
  bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_  = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];

  // initialize size arrays
  workspace_fwd_sizes_ = new uint_tp[bottom.size()];
  workspace_bwd_filter_sizes_ = new uint_tp[bottom.size()];
  workspace_bwd_data_sizes_ = new uint_tp[bottom.size()];

  // workspace data
  workspaceSizeInBytes = 0;
  workspaceData = NULL;
  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (uint_tp i = 0; i < bottom.size(); ++i) {
    // initialize all to default algorithms
    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
    // default algorithms don't require workspace
    workspace_fwd_sizes_[i] = 0;
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }

  for (int_tp g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
    workspace[g] = NULL;
  }

  // Set the indexing parameters.
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  const int_tp* kernel_shape_data = this->kernel_shape_.cpu_data();
  const int_tp kernel_h = kernel_shape_data[0];
  const int_tp kernel_w = kernel_shape_data[1];
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      kernel_h, kernel_w);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int_tp i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
Example #22
void CudnnNdConvolutionLayer<Dtype>::LayerSetUp(
  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionParameter conv_param =
    this->layer_param_.convolution_param();
  // Configure the kernel size, padding, stride, and inputs.
  CHECK(conv_param.has_kernel_shape())
      << "Kernel shape is required.";
  if (conv_param.has_pad_shape()) {
    CHECK_EQ(conv_param.kernel_shape().dim_size(),
             conv_param.pad_shape().dim_size())
        << "Kernel and Pad shape don't match !";
  }
  if (conv_param.has_stride_shape()) {
    CHECK_EQ(conv_param.kernel_shape().dim_size(),
             conv_param.stride_shape().dim_size())
        << "Kernel and Stride shape don't match !";
  }
  for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
    kernel_shape_.push_back(conv_param.kernel_shape().dim(i));
    CHECK_GT(kernel_shape_[i], 0) << "Filter dimensions cannot be zero.";
  }
  if (conv_param.has_pad_shape()) {
    for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
      pad_shape_.push_back(conv_param.pad_shape().dim(i));
    }
  } else {
    pad_shape_ = std::vector<int>(kernel_shape_.size(), 0);
  }
  if (conv_param.has_stride_shape()) {
    for (int i = 0; i < conv_param.kernel_shape().dim_size(); ++i) {
      stride_shape_.push_back(conv_param.stride_shape().dim(i));
    }
  } else {
    stride_shape_ = std::vector<int>(kernel_shape_.size(), 1);
  }
  // Configure output channels and groups.
  channels_ = bottom[0]->shape(1);
  num_output_ = this->layer_param_.convolution_param().num_output();
  CHECK_GT(num_output_, 0);
  group_ = this->layer_param_.convolution_param().group();
  CHECK_EQ(channels_ % group_, 0);
  CHECK_EQ(num_output_ % group_, 0)
      << "Number of output should be multiples of group.";

  // Handle the parameters: weights and biases.
  // - blobs_[0] holds the filter weights
  // - blobs_[1] holds the biases (optional)
  bias_term_ = this->layer_param_.convolution_param().bias_term();

  vector<int> weight_shape(kernel_shape_);
  weight_shape.insert(weight_shape.begin(), channels_ / group_);
  weight_shape.insert(weight_shape.begin(), num_output_);

  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    if (bias_term_) {
      this->blobs_.resize(2);
    } else {
      this->blobs_.resize(1);
    }
    // Initialize and fill the weights:
    // output channels x input channels per-group x kernel height x kernel width
    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
          this->layer_param_.convolution_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());
    // If necessary, initialize and fill the biases.
    if (bias_term_) {
      vector<int> bias_shape(1, num_output_);
      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
            this->layer_param_.convolution_param().bias_filler()));
      bias_filler->Fill(this->blobs_[1].get());
    }
  }

  // Propagate gradients to the parameters (as directed by backward pass).
  this->param_propagate_down_.resize(this->blobs_.size(), true);

  // Initialize CUDA streams and cuDNN.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  workspaceSizeInBytes = 0;
  workspace_data_ = NULL;

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_shape[0] /= group_;
  weight_offset_ = 1;
  for (int i = 0; i < weight_shape.size(); ++i) {
    weight_offset_ *= weight_shape[i];
  }
  bias_offset_ = weight_shape[0];

  // Create filter descriptor.
  cudnn::createNdFilterDesc<Dtype>(&filter_desc_, weight_shape);

  bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_  = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
  workspace_bwd_data_sizes_ = new size_t[bottom.size()];
  workspace_ = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensorDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensorDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensorDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}
Example #23
SaberStatus VenderConv2DActPooling<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW, NCHW, NCHW>::\
    create(const std::vector<DataTensor_in *>& inputs,
            std::vector<DataTensor_out *>& outputs,
            ConvActivePoolingParam<OpTensor>& param, Context<NV> &ctx) {

    if (!(ctx == this->_ctx)) {
        if (_handle != NULL) {
            CUDNN_CHECK(cudnnDestroy(_handle));
        }
        this->_ctx = ctx;

        cudaStream_t cuda_stream;
        cuda_stream = ctx.get_compute_stream();

        CUDNN_CHECK(cudnnCreate(&_handle));
        CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream));
    }

    int input_num = inputs[0]->num();
    int input_channel = inputs[0]->channel();
    int input_height = inputs[0]->height();
    int input_width = inputs[0]->width();
    int output_channel = outputs[0]->channel();
    int output_height = outputs[0]->height();
    int output_width = outputs[0]->width();
    {
        _inner_shape = inputs[0]->shape();
        _inner_shape[0] = input_num;
        _inner_shape[1] = param.conv_param.weight()->num();

        int kernel_exten = param.conv_param.dilation_h *
                           (param.conv_param.weight()->height() - 1) + 1;
        int output_dim = (input_height + 2 * param.conv_param.pad_h - kernel_exten)
                         / param.conv_param.stride_h + 1;
        _inner_shape[2] = output_dim;
        kernel_exten = param.conv_param.dilation_w *
                           (param.conv_param.weight()->width() - 1) + 1;
        output_dim = (input_width + 2 * param.conv_param.pad_w - kernel_exten)
                         / param.conv_param.stride_w + 1;
        _inner_shape[3] = output_dim;
        _inner_tensor.re_alloc(_inner_shape);
    }

    int kernel_h = param.conv_param.weight()->height();
    int kernel_w = param.conv_param.weight()->width();

    int filter_dim_a[] = {output_channel,
                          input_channel / param.conv_param.group,
                          kernel_h, kernel_w};

    cudnn::setNDFilterDesc<OpDataType>(&_filter_desc,
                                    param.conv_param.weight()->dims(),
                                    filter_dim_a, CUDNN_TENSOR_NCHW);

    Shape in_stride = inputs[0]->get_stride();
    Shape inner_stride = _inner_tensor.get_stride();
    Shape out_stride = outputs[0]->get_stride();

    int dim_a[] = {input_num, input_channel,
                   input_height, input_width};

    int dim_inner[] = {_inner_shape[0], _inner_shape[1],
                        _inner_shape[2], _inner_shape[3]};

    int dim_b[] = {input_num, output_channel,
                   output_height, output_width};
    cudnn::setTensorNdDesc<InDataType >(&_input_descs,
                                       inputs[0]->dims(), dim_a, &in_stride[0]);
    cudnn::setTensorNdDesc<InDataType >(&_inner_descs,
                                       4, dim_inner,
                                       &inner_stride[0]);
    cudnn::setTensorNdDesc<InDataType>(&_output_descs,
                                      outputs[0]->dims(), dim_b, &out_stride[0]);
    int pad_a[] = {param.conv_param.pad_h, param.conv_param.pad_w};
    int filter_stride_a[] = {param.conv_param.stride_h, param.conv_param.stride_w};
    int dilation_a[] = {param.conv_param.dilation_h, param.conv_param.dilation_w};

    cudnn::setConvolutionNdDesc<OpDataType >(&_conv_descs,
                                          inputs[0]->dims() - 2, pad_a,
                                          filter_stride_a, dilation_a);
    // set activation descriptor
    if (param.has_activation) {
        cudnn::set_activation_des<OpDataType>(&_active_descs, param.activation_param.active);
    }
    if (param.has_pooling) {
        int windowHeight[] = {param.pooling_param.window_h,
                              param.pooling_param.window_w};
        int padding[] = {param.pooling_param.pad_h,
                         param.pooling_param.pad_w};
        int stride[] = {param.pooling_param.stride_h,
                        param.pooling_param.stride_w};

        cudnn::set_nd_pooling_des<OpDataType >(&_pooling_descs,
                                            param.pooling_param.pooling_type,
                                            _inner_tensor.dims() - 2,
                                            windowHeight,
                                            padding,stride);
    }
    // true: use tensor core
    // false: disable tensor core
    cudnn::set_math_type<OpDataType>(&_conv_descs, _use_tensor_core);
    cudnn::set_group_count<OpDataType>(&_conv_descs, param.conv_param.group);

    // Get fastest implement of cudnn
    // set up algo and workspace size
    if (param.conv_param.group == inputs[0]->channel() && \
        inputs[0]->channel() == outputs[0]->channel()) {
        _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
    } else {
        CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(_handle, \
            _input_descs, _filter_desc, _conv_descs, _inner_descs, \
            _preference, _workspace_limit_bytes, &_fwd_algo));
    }

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                        _input_descs, _filter_desc,
                                                        _conv_descs, _inner_descs,
                                                        _fwd_algo, &_workspace_fwd_sizes));

    if (_workspace_fwd_sizes > _workspaceSizeInBytes) {
        _workspaceSizeInBytes = _workspace_fwd_sizes;
        if (_workspaceData != NULL) {
            cudaFree(_workspaceData);
        }
        cudaMalloc(&_workspaceData, _workspaceSizeInBytes);
        _workspace = reinterpret_cast<char*>(_workspaceData);
    }

    if (param.conv_param.bias()->size() > 0) {
        int dim_bias[] = {1, output_channel, 1, 1};
        int stride_bias[] = {output_channel, 1, 1, 1};

        cudnn::setTensorNdDesc<OpDataType >(&_bias_desc,
                                         4, dim_bias, stride_bias);
    }
    return SaberSuccess;
}
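The cudnnGetConvolutionForwardAlgorithm call above, which picks an algorithm from a cudnnConvolutionFwdPreference_t heuristic, was removed in cuDNN 8. A sketch of the closest replacement query (the _v7 form, available since cuDNN 7), reusing the descriptors from this function:

// Ask for the single best algorithm according to cuDNN's heuristics.
int returned_count = 0;
cudnnConvolutionFwdAlgoPerf_t perf;
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(_handle,
    _input_descs, _filter_desc, _conv_descs, _inner_descs,
    /*requestedAlgoCount=*/1, &returned_count, &perf));
_fwd_algo = perf.algo;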
Example #24
CUDNN() {
  CUDNN_CHECK(cudnnCreate(&handle_));
}
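This one-line constructor pairs naturally with an RAII destructor; a minimal sketch, assuming the class owns handle_ exclusively (the destructor is not part of the original snippet):

// Release the handle when the wrapper goes out of scope.
~CUDNN() {
  CUDNN_CHECK(cudnnDestroy(handle_));
}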