template <typename Dtype>
void PaddingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff());
  if (pad_pos_) {
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int h = 0; h < height_in_; ++h) {
          // Copy the width part.
          caffe_gpu_axpy(width_in_, (Dtype)1.,
              top[0]->gpu_diff(n, c, h + pad_beg_, pad_beg_),
              bottom[0]->mutable_gpu_diff(n, c, h));
        }
      }
    }
  } else {
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int h = 0; h < height_out_; ++h) {
          // Copy the width part.
          caffe_gpu_axpy(width_out_, (Dtype)1.,
              top[0]->gpu_diff(n, c, h),
              bottom[0]->mutable_gpu_diff(n, c, h - pad_beg_, -pad_beg_));
        }
      }
    }
  }
}
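// Index-mapping note (illustrative): with positive padding, bottom row
// (n, c, h) of width_in_ values corresponds to the slice of top row
// (n, c, h + pad_beg_) starting at column pad_beg_, so each axpy above
// accumulates exactly the interior gradient and discards the padded border.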
template <typename Dtype>
void PaddingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // top[n, c, h, w] = bottom[n, c, h - pad_beg, w - pad_beg] if in range.
  if (pad_pos_) {
    caffe_gpu_set(top[0]->count(), Dtype(0), top[0]->mutable_gpu_data());
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        CUDA_CHECK(cudaMemcpy2D(
            top[0]->mutable_gpu_data(n, c, pad_beg_, pad_beg_),
            sizeof(Dtype) * width_out_,
            bottom[0]->gpu_data(n, c), sizeof(Dtype) * width_in_,
            sizeof(Dtype) * width_in_, height_in_,
            cudaMemcpyDeviceToDevice));
      }
    }
  } else {
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        CUDA_CHECK(cudaMemcpy2D(
            top[0]->mutable_gpu_data(n, c), sizeof(Dtype) * width_out_,
            bottom[0]->gpu_data(n, c, -pad_beg_, -pad_beg_),
            sizeof(Dtype) * width_in_,
            sizeof(Dtype) * width_out_, height_out_,
            cudaMemcpyDeviceToDevice));
      }
    }
  }
}
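// A minimal standalone sketch (illustrative only, not part of the layer) of
// how the pitched cudaMemcpy2D calls above embed a width_in x height_in image
// into a zeroed width_out x height_out canvas at offset (pad, pad). All names
// below are hypothetical.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int w_in = 3, h_in = 2, pad = 1;
  const int w_out = w_in + 2 * pad, h_out = h_in + 2 * pad;
  const float h_src[h_in * w_in] = {1, 2, 3, 4, 5, 6};
  float h_dst[h_out * w_out];
  float *d_src, *d_dst;
  cudaMalloc(&d_src, sizeof(h_src));
  cudaMalloc(&d_dst, sizeof(h_dst));
  cudaMemset(d_dst, 0, sizeof(h_dst));
  cudaMemcpy(d_src, h_src, sizeof(h_src), cudaMemcpyHostToDevice);
  // Same call shape as the pad_pos_ branch: h_in rows of w_in floats each,
  // destination rows w_out floats apart, starting at element (pad, pad).
  cudaMemcpy2D(d_dst + pad * w_out + pad, sizeof(float) * w_out,
               d_src, sizeof(float) * w_in,
               sizeof(float) * w_in, h_in, cudaMemcpyDeviceToDevice);
  cudaMemcpy(h_dst, d_dst, sizeof(h_dst), cudaMemcpyDeviceToHost);
  for (int h = 0; h < h_out; ++h) {       // prints the 1..6 block framed
    for (int w = 0; w < w_out; ++w) {     // by a one-pixel zero border
      printf("%4.0f", h_dst[h * w_out + w]);
    }
    printf("\n");
  }
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}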
template <typename Dtype>
void InterpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }
  caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff());
  caffe_gpu_interp2_backward<Dtype, false>(num_ * channels_,
      bottom[0]->mutable_gpu_diff(), -pad_beg_, -pad_beg_,
      height_in_eff_, width_in_eff_, height_in_, width_in_,
      top[0]->gpu_diff(), 0, 0,
      height_out_, width_out_, height_out_, width_out_);
}
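// Argument note (as used here; inferred from the call, not a verbatim
// signature): the gradient of the full height_out_ x width_out_ top is
// spread back over the effective height_in_eff_ x width_in_eff_ window of
// the bottom diff, which starts at offset (-pad_beg_, -pad_beg_) inside the
// full height_in_ x width_in_ bottom plane.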
template <typename Dtype>
void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* weight = this->blobs_[0]->gpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
  if (this->param_propagate_down_[0]) {
    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
  }
  if (this->bias_term_ && this->param_propagate_down_[1]) {
    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
        this->blobs_[1]->mutable_gpu_diff());
  }
  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->gpu_diff();
    const Dtype* bottom_data = bottom[i]->gpu_data();
    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
    // Bias gradient, if necessary.
    if (this->bias_term_ && this->param_propagate_down_[1]) {
      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
      for (int n = 0; n < this->num_; ++n) {
        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
      }
    }
    if (this->param_propagate_down_[0] || propagate_down[i]) {
      for (int n = 0; n < this->num_; ++n) {
        // Gradient w.r.t. weight. Note that we will accumulate diffs.
        if (this->param_propagate_down_[0]) {
          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
              bottom_data + bottom[i]->offset(n), weight_diff);
        }
        // Gradient w.r.t. bottom data, if necessary.
        if (propagate_down[i]) {
          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
              bottom_diff + bottom[i]->offset(n));
        }
      }
    }
  }
}
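// Design note: deconvolution is convolution with its forward and backward
// passes swapped. That is why the bottom gradient above is computed with
// forward_gpu_gemm (im2col + gemm, i.e. a convolution applied to top_diff),
// and why weight_gpu_gemm receives top_diff and bottom_data in the reverse
// of the order used by ConvolutionLayer's backward pass.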
template <typename Dtype>
void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const int count = bottom[0]->count();
    const Dtype* top_diff = top[0]->gpu_diff();
    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
      caffe_gpu_set(count, diff_scale_, bottom_diff);
    } else {
      const Dtype* bottom_data = bottom[0]->gpu_data();
      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
      //               = diff_scale * y / (shift + scale * x)
      if (power_ == Dtype(2)) {
        // Special case for y = (shift + scale * x)^2
        //   -> dy/dx = 2 * scale * (shift + scale * x)
        //            = diff_scale * shift + diff_scale * scale * x
        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0),
            bottom_diff);
        if (shift_ != Dtype(0)) {
          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
        }
      } else if (shift_ == Dtype(0)) {
        // Special case for y = (scale * x)^power
        //   -> dy/dx = scale * power * (scale * x)^(power - 1)
        //            = scale * power * (scale * x)^power * (scale * x)^(-1)
        //            = power * y / x
        const Dtype* top_data = top[0]->gpu_data();
        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
        caffe_gpu_scal(count, power_, bottom_diff);
      } else {
        caffe_copy(count, bottom_data, bottom_diff);
        if (scale_ != Dtype(1)) {
          caffe_gpu_scal(count, scale_, bottom_diff);
        }
        if (shift_ != Dtype(0)) {
          caffe_gpu_add_scalar(count, shift_, bottom_diff);
        }
        const Dtype* top_data = top[0]->gpu_data();
        caffe_gpu_div<Dtype>(count, top_data, bottom_diff, bottom_diff);
        if (diff_scale_ != Dtype(1)) {
          caffe_gpu_scal(count, diff_scale_, bottom_diff);
        }
      }
    }
    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
  }
}
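// Worked check of the gradient identity above (illustrative numbers):
// with y = (shift + scale * x)^power, scale = 2, shift = 1, power = 3 and
// x = 1: y = 3^3 = 27 and diff_scale_ = power * scale = 6, so
//   dy/dx = diff_scale_ * y / (shift + scale * x) = 6 * 27 / 3 = 54,
// which matches the direct form power * scale * (shift + scale * x)^(power-1)
// = 3 * 2 * 3^2 = 54.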
template <typename Dtype>
void Tensor<Dtype>::SetValues(const Dtype value) {
  switch (mode()) {
  case Caffe::CPU:
    caffe_set(this->count(), value, this->mutable_cpu_mem());
    break;
  case Caffe::GPU:
#ifndef CPU_ONLY
    caffe_gpu_set(this->count(), value, this->mutable_gpu_mem());
#else
    NO_GPU;
#endif
    break;
  default:
    ASSERT(false, "Unknown caffe mode.");
  }
}
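// Hypothetical usage (Tensor API assumed from this fork, illustrative only):
//   Tensor<float> t;
//   ...  // shape t as needed
//   t.SetValues(0.0f);  // zero-fill on CPU or GPU, depending on mode()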
template <typename Dtype>
GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
    : Params<Dtype>(root_solver) {
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));

  // Allocate device buffers.
  CUDA_CHECK(cudaSetDevice(device));
  CUDA_CHECK(cudaMalloc(&data_, size_ * sizeof(Dtype)));

  // Copy blob values.
  const vector<Blob<Dtype>*>& net = root_solver->net()->learnable_params();
  apply_buffers(net, data_, size_, copy);

  CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype)));
  caffe_gpu_set(size_, Dtype(0), diff_);

  CUDA_CHECK(cudaSetDevice(initial_device));
}
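// A minimal matching teardown sketch (assumption: this class owns the raw
// device buffers it allocates above and releases them in its destructor;
// the body below mirrors the constructor rather than quoting the source):
template <typename Dtype>
GPUParams<Dtype>::~GPUParams() {
  CUDA_CHECK(cudaFree(data_));  // hypothetical: frees the cudaMalloc'd data
  CUDA_CHECK(cudaFree(diff_));  // and diff buffers from the constructor
}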
template <typename Dtype>
void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = bottom[0]->count();
  // Special case where we can ignore the input: scale or power is 0.
  if (diff_scale_ == Dtype(0)) {
    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
    caffe_gpu_set(count, value, top_data);
    return;
  }
  const Dtype* bottom_data = bottom[0]->gpu_data();
  caffe_copy(count, bottom_data, top_data);
  if (scale_ != Dtype(1)) {
    caffe_gpu_scal(count, scale_, top_data);
  }
  if (shift_ != Dtype(0)) {
    caffe_gpu_add_scalar(count, shift_, top_data);
  }
  if (power_ != Dtype(1)) {
    caffe_gpu_powx(count, top_data, power_, top_data);
  }
}
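// Worked example (illustrative numbers): for scale = 2, shift = 1, power = 2
// and an input element x = 3, the chain above computes caffe_gpu_scal -> 6,
// caffe_gpu_add_scalar -> 7, caffe_gpu_powx -> 49, i.e.
// top = (shift + scale * x)^power = 49.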
template <typename Dtype>
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  init_cpu = false;
  init_gpu = false;
  const caffe::MultiStageMeanfieldParameter meanfield_param =
      this->layer_param_.multi_stage_meanfield_param();

  num_iterations_ = meanfield_param.num_iterations();
  CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";

  theta_alpha_ = meanfield_param.theta_alpha();
  theta_beta_ = meanfield_param.theta_beta();
  theta_gamma_ = meanfield_param.theta_gamma();

  count_ = bottom[0]->count();
  num_ = bottom[0]->num();
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  num_pixels_ = height_ * width_;

  LOG(INFO) << "This implementation has not been tested with batch size > 1.";

  top[0]->Reshape(num_, channels_, height_, width_);

  // Initialize the parameters that will be updated by backpropagation.
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";
  } else {
    // blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel
    // weights, blobs_[2] - compatibility matrix.
    this->blobs_.resize(3);

    // Allocate space for the kernel weights.
    this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    caffe_set(channels_ * channels_, Dtype(0.),
        this->blobs_[0]->mutable_cpu_data());
    caffe_set(channels_ * channels_, Dtype(0.),
        this->blobs_[1]->mutable_cpu_data());

    // Initialize the kernel weights. The two files spatial.par and
    // bilateral.par must be available. Read each value into a double
    // (fscanf's "%lf" requires a double*; writing directly into a Dtype
    // would be wrong when Dtype is float), then cast.
    FILE* pFile = fopen("spatial.par", "r");
    CHECK(pFile) << "The file 'spatial.par' was not found. Please create it "
                 << "with initial spatial kernel weights.";
    for (int i = 0; i < channels_; i++) {
      double weight;
      CHECK_EQ(fscanf(pFile, "%lf", &weight), 1);
      this->blobs_[0]->mutable_cpu_data()[i * channels_ + i] = Dtype(weight);
    }
    fclose(pFile);

    pFile = fopen("bilateral.par", "r");
    CHECK(pFile) << "The file 'bilateral.par' was not found. Please create it "
                 << "with initial bilateral kernel weights.";
    for (int i = 0; i < channels_; i++) {
      double weight;
      CHECK_EQ(fscanf(pFile, "%lf", &weight), 1);
      this->blobs_[1]->mutable_cpu_data()[i * channels_ + i] = Dtype(weight);
    }
    fclose(pFile);

    // Initialize the compatibility matrix to the Potts model.
    this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    caffe_set(channels_ * channels_, Dtype(0.),
        this->blobs_[2]->mutable_cpu_data());
    for (int c = 0; c < channels_; ++c) {
      this->blobs_[2]->mutable_cpu_data()[c * channels_ + c] = Dtype(-1.);
    }
  }

  // Use std::vector rather than a variable-length array, which is not
  // standard C++.
  std::vector<float> spatial_kernel(2 * num_pixels_);
  float* spatial_kernel_gpu_;
  compute_spatial_kernel(spatial_kernel.data());
  spatial_lattice_.reset(new ModifiedPermutohedral());
  spatial_norm_.Reshape(1, 1, height_, width_);
  Dtype* norm_data_gpu;
  Dtype* norm_data;
  // Initialize the spatial lattice. This does not need to be recomputed for
  // every image because the input size is fixed.
  switch (Caffe::mode()) {
  case Caffe::CPU:
    norm_data = spatial_norm_.mutable_cpu_data();
    spatial_lattice_->init(spatial_kernel.data(), 2, width_, height_);
    // Calculate the spatial filter normalization factors.
    norm_feed_ = new Dtype[num_pixels_];
    caffe_set(num_pixels_, Dtype(1.0), norm_feed_);
    spatial_lattice_->compute(norm_data, norm_feed_, 1);
    bilateral_kernel_buffer_ = new float[5 * num_pixels_];
    init_cpu = true;
    break;
#ifndef CPU_ONLY
  case Caffe::GPU:
    CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_,
        2 * num_pixels_ * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel.data(),
        2 * num_pixels_ * sizeof(float), cudaMemcpyHostToDevice));
    spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);
    CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype)));
    caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);
    norm_data_gpu = spatial_norm_.mutable_gpu_data();
    spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1);
    norm_data = spatial_norm_.mutable_cpu_data();
    CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_,
        5 * num_pixels_ * sizeof(float)));
    CUDA_CHECK(cudaFree(spatial_kernel_gpu_));
    init_gpu = true;
    break;
#endif
  default:
    LOG(FATAL) << "Unknown caffe mode.";
  }

  for (int i = 0; i < num_pixels_; ++i) {
    norm_data[i] = 1.0f / (norm_data[i] + 1e-20f);
  }

  bilateral_norms_.Reshape(num_, 1, height_, width_);

  // Configure the split layer that is used to make copies of the unary term,
  // one copy for each iteration. It may be possible to optimize this later.
  split_layer_bottom_vec_.clear();
  split_layer_bottom_vec_.push_back(bottom[0]);
  split_layer_top_vec_.clear();
  split_layer_out_blobs_.resize(num_iterations_);
  for (int i = 0; i < num_iterations_; i++) {
    split_layer_out_blobs_[i].reset(new Blob<Dtype>());
    split_layer_top_vec_.push_back(split_layer_out_blobs_[i].get());
  }
  LayerParameter split_layer_param;
  split_layer_.reset(new SplitLayer<Dtype>(split_layer_param));
  split_layer_->SetUp(split_layer_bottom_vec_, split_layer_top_vec_);

  // Make blobs to store the outputs of each meanfield iteration. The output
  // of the last iteration is stored in top[0], so only (num_iterations_ - 1)
  // extra blobs are needed.
  iteration_output_blobs_.resize(num_iterations_ - 1);
  for (int i = 0; i < num_iterations_ - 1; ++i) {
    iteration_output_blobs_[i].reset(
        new Blob<Dtype>(num_, channels_, height_, width_));
  }

  // Make instances of MeanfieldIteration and initialize them.
  meanfield_iterations_.resize(num_iterations_);
  for (int i = 0; i < num_iterations_; ++i) {
    meanfield_iterations_[i].reset(new MeanfieldIteration<Dtype>());
    meanfield_iterations_[i]->OneTimeSetUp(
        split_layer_out_blobs_[i].get(),  // unary terms
        (i == 0) ? bottom[1] : iteration_output_blobs_[i - 1].get(),  // softmax input
        (i == num_iterations_ - 1) ? top[0] : iteration_output_blobs_[i].get(),  // output blob
        spatial_lattice_,  // spatial lattice
        &spatial_norm_);   // spatial normalization factors
  }

  this->param_propagate_down_.resize(this->blobs_.size(), true);
  LOG(INFO) << "MultiStageMeanfieldLayer initialized.";
}
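// compute_spatial_kernel is called above but not shown here. The sketch
// below is consistent with its use (a 2-float (x, y) position feature per
// pixel, scaled by theta_gamma_); treat the body as an assumption rather
// than the verbatim implementation:
template <typename Dtype>
void MultiStageMeanfieldLayer<Dtype>::compute_spatial_kernel(
    float* const output_kernel) {
  for (int p = 0; p < num_pixels_; ++p) {
    output_kernel[2 * p] = static_cast<float>(p % width_) / theta_gamma_;
    output_kernel[2 * p + 1] = static_cast<float>(p / width_) / theta_gamma_;
  }
}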
template <typename Dtype>
void TiedConvolutionLayer<Dtype>::Backward_gpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    vector<Blob<Dtype>*>* bottom) {
  const Dtype* weight = NULL;
  Dtype* weight_diff = NULL;
  if (this->param_propagate_down_[0]) {
    weight = this->blobs_[0]->gpu_data();
    weight_diff = this->blobs_[0]->mutable_gpu_diff();
    // Initialize the weight diffs to all zeros.
    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
  }
  Dtype* bias_diff = NULL;
  if (bias_term_ && this->param_propagate_down_[1]) {
    bias_diff = this->blobs_[1]->mutable_gpu_diff();
    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
  }
  const int weight_offset = M_ * K_;
  for (int i = 0; i < num_in_; ++i) {
    // Same concept as Backward_cpu of ConvolutionLayer.
    const Dtype* top_diff = NULL;
    // Bias gradient, if necessary.
    if (bias_term_ && this->param_propagate_down_[1]) {
      top_diff = top[i]->gpu_diff();
      for (int n = 0; n < num_; ++n) {
        caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_[i], 1.,
            top_diff + top[i]->offset(n),
            reinterpret_cast<const Dtype*>(bias_multipliers_[i]->gpu_data()),
            1., bias_diff);
      }
    }
    if (this->param_propagate_down_[0] || propagate_down[i]) {
      if (!top_diff) {
        top_diff = top[i]->gpu_diff();
      }
      Dtype* col_data = this->col_buffers_[i]->mutable_gpu_data();
      const Dtype* bottom_data = (*bottom)[i]->gpu_data();
      Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff();
      const int col_offset = K_ * N_[i];
      const int top_offset = M_ * N_[i];
      for (int n = 0; n < num_; ++n) {
        // Since we saved memory in the forward pass by not storing all of
        // the col data, we need to recompute it here.
        im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_,
            height_[i], width_[i], kernel_h_, kernel_w_, pad_h_, pad_w_,
            stride_h_, stride_w_, col_data);
        // Gradient w.r.t. weight. Note that we will accumulate diffs.
        if (this->param_propagate_down_[0]) {
          for (int g = 0; g < group_; ++g) {
            caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_[i],
                (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
                col_data + col_offset * g, (Dtype)1.,
                weight_diff + weight_offset * g);
          }
        }
        // Gradient w.r.t. bottom data, if necessary.
        if (propagate_down[i]) {
          if (weight == NULL) {
            weight = this->blobs_[0]->gpu_data();
          }
          for (int g = 0; g < group_; ++g) {
            caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_[i], M_,
                (Dtype)1., weight + weight_offset * g,
                top_diff + top[i]->offset(n) + top_offset * g, (Dtype)0.,
                col_data + col_offset * g);
          }
          // col2im back to the data.
          col2im_gpu(col_data, channels_, height_[i], width_[i], kernel_h_,
              kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
              bottom_diff + (*bottom)[i]->offset(n));
        }
      }
    }
  }  // end for each input
}
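// Shape bookkeeping for the gemm calls above (per group g): top_diff is
// M_ x N_[i] and col_data is K_ x N_[i], so
//   weight_diff += top_diff * col_data^T   is M_ x K_ (the weight shape), and
//   col_data     = weight^T * top_diff     is K_ x N_[i],
// which col2im_gpu then scatters back into the bottom gradient.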