// Adds one GEMM product per convolution group into `weights`.
//
// For each group g the call below has the shape (assuming the usual
// caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C) argument
// order -- confirm against math_functions):
//   weights_g = 1 * output_g * columns_g^T + 1 * weights_g
// so the result is accumulated onto whatever `weights` already holds.
//
// input   - passed to conv_im2col_gpu to fill col_buffer_, or used directly
//           as the column operand when the layer is 1x1.
// output  - left GEMM operand; group g starts at output_offset_ * g.
// weights - GEMM destination; group g starts at weight_offset_ * g.
void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
    const Dtype* output, Dtype* weights) {
  const Dtype* columns;
  if (is_1x1_) {
    // 1x1 layers skip im2col and multiply against the input as-is.
    columns = input;
  } else {
    conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
    columns = col_buffer_.gpu_data();
  }

  for (int grp = 0; grp < group_; ++grp) {
    const Dtype* out_slice = output + output_offset_ * grp;
    const Dtype* col_slice = columns + col_offset_ * grp;
    Dtype* weight_slice = weights + weight_offset_ * grp;
    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
        conv_out_channels_ / group_, kernel_dim_, conv_out_spatial_dim_,
        static_cast<Dtype>(1.), out_slice, col_slice,
        static_cast<Dtype>(1.), weight_slice);
  }
}
// Runs the forward convolution as one GEMM per group, on whichever backend
// this layer's device reports.
//
// input       - source buffer; read starting at element offset input_off.
// input_off   - element offset into `input`.
// weights     - filter buffer; group g starts at weight_offset_ * g.
// output      - destination buffer; group g is written at
//               output_off + output_offset_ * g. beta is 0, so the GEMM
//               result replaces the previous contents of that region.
// output_off  - element offset into `output`.
// skip_im2col - when true (and the layer is not 1x1) the im2col step is
//               skipped and the current contents of col_buffer() are used.
//
// Each backend body is compiled only when its macro (USE_CUDA /
// USE_GREENTEA) is defined; otherwise that branch is empty.
void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
    const int_tp input_off, const Dtype* weights, Dtype* output,
    const int_tp output_off, bool skip_im2col) {
  const Dtype* columns = input;
  if (this->device_->backend() != BACKEND_CUDA) {
#ifdef USE_GREENTEA
    if (!is_1x1_) {
      if (!skip_im2col) {
        greentea_conv_im2col_gpu(input, input_off,
                                 col_buffer()->mutable_gpu_data(), 0);
      }
      columns = col_buffer()->gpu_data();
    }
    // In the 1x1 case `columns` is still `input`, so the caller's offset has
    // to be applied here; the column buffer itself is read from its start.
    const int_tp column_base = is_1x1_ ? input_off : 0;
    for (int_tp grp = 0; grp < group_; ++grp) {
      // This path hands over buffers as cl_mem handles with separate
      // element offsets instead of pre-offset pointers.
      greentea_gpu_gemm<Dtype>(this->device_->id(),
                               CblasNoTrans, CblasNoTrans,
                               conv_out_channels_ / group_,
                               conv_out_spatial_dim_, kernel_dim_,
                               (Dtype) 1.,
                               (cl_mem) weights, weight_offset_ * grp,
                               (cl_mem) columns,
                               column_base + col_offset_ * grp,
                               (Dtype) 0.,
                               (cl_mem) output,
                               output_off + output_offset_ * grp);
    }
#endif  // USE_GREENTEA
  } else {
#ifdef USE_CUDA
    if (!is_1x1_) {
      if (!skip_im2col) {
        conv_im2col_gpu(input + input_off, col_buffer()->mutable_gpu_data());
      }
      columns = col_buffer()->gpu_data();
    }
    // In the 1x1 case `columns` is still `input`, so the caller's offset has
    // to be applied here; the column buffer itself is read from its start.
    const int_tp column_base = is_1x1_ ? input_off : 0;
    for (int_tp grp = 0; grp < group_; ++grp) {
      const Dtype* weight_slice = weights + weight_offset_ * grp;
      const Dtype* col_slice = columns + column_base + col_offset_ * grp;
      Dtype* out_slice = output + output_off + output_offset_ * grp;
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
                            conv_out_channels_ / group_,
                            conv_out_spatial_dim_, kernel_dim_,
                            (Dtype) 1., weight_slice, col_slice,
                            (Dtype) 0., out_slice);
    }
#endif  // USE_CUDA
  }
}
// Adds one GEMM product per convolution group into `weights`, CUDA backend
// only.
//
// For each group g the call below has the shape (assuming the usual
// caffe_gpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C) argument
// order -- confirm against math_functions):
//   weights_g = 1 * output_g * columns_g^T + 1 * weights_g
// so the result is accumulated onto whatever `weights` already holds.
//
// input   - passed to conv_im2col_gpu to fill col_buffer_, or used directly
//           as the column operand when the layer is 1x1.
// output  - left GEMM operand; group g starts at output_offset_ * g.
// weights - GEMM destination; group g starts at weight_offset_ * g.
//
// NOTE(review): when the device backend is not BACKEND_CUDA, or USE_CUDA is
// not defined, this function returns without touching `weights`.
void BaseConvolutionNDLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
    const Dtype* output, Dtype* weights) {
  if (this->device_context_->backend() != BACKEND_CUDA) {
    return;
  }
#ifdef USE_CUDA
  const Dtype* columns;
  if (is_1x1_) {
    // 1x1 layers skip im2col and multiply against the input as-is.
    columns = input;
  } else {
    conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
    columns = col_buffer_.gpu_data();
  }

  for (int grp = 0; grp < group_; ++grp) {
    const Dtype* out_slice = output + output_offset_ * grp;
    const Dtype* col_slice = columns + col_offset_ * grp;
    Dtype* weight_slice = weights + weight_offset_ * grp;
    // The second dimension is kernel_dim_ / group_ here; presumably
    // kernel_dim_ spans all groups in this class -- verify against where
    // kernel_dim_ is computed.
    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
        conv_out_channels_ / group_, kernel_dim_ / group_,
        conv_out_spatial_dim_,
        static_cast<Dtype>(1.), out_slice, col_slice,
        static_cast<Dtype>(1.), weight_slice);
  }
#endif  // USE_CUDA
}