void DeConvolutionLayer::col2im(Mat &dstMat)
{
    if (is1x1())
        return;

    if (dstMat.type() == CV_32F)
        col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW,
                   padH, padW, strideH, strideW, (float*)dstMat.ptr());
    if (dstMat.type() == CV_64F)
        col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW,
                   padH, padW, strideH, strideW, (double*)dstMat.ptr());
}
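// Every snippet in this section calls some framework's col2im_cpu. For
// reference, here is a minimal sketch of the conventional Caffe-style 2-D
// kernel (the "_ref" name is hypothetical; real signatures vary per
// framework, as the calls above and below show).
template <typename Dtype>
void col2im_cpu_ref(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    Dtype* data_im) {
  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  const int width_col  = (width  + 2 * pad_w - kernel_w) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  // Zero the image buffer first: col2im accumulates overlapping patches.
  for (int i = 0; i < channels * height * width; ++i)
    data_im[i] = Dtype(0);
  for (int c = 0; c < channels_col; ++c) {
    const int w_offset = c % kernel_w;
    const int h_offset = (c / kernel_w) % kernel_h;
    const int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        const int h_pad = h * stride_h - pad_h + h_offset;
        const int w_pad = w * stride_w - pad_w + w_offset;
        // Scatter-add each column entry back to its source pixel,
        // skipping positions that fell in the padding.
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_im[(c_im * height + h_pad) * width + w_pad] +=
              data_col[(c * height_col + h) * width_col + w];
      }
    }
  }
}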
void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
  // Note: this variant only handles the 2-D fast path; unlike the upstream
  // helper further below, it silently does nothing for N-D inputs.
  if (!force_nd_im2col && num_spatial_axes == 2) {
    col2im_cpu(col_buff, conv_in_channels,
        conv_input_shape.cpu_data()[1], conv_input_shape.cpu_data()[2],
        kernel_shape.cpu_data()[0], kernel_shape.cpu_data()[1],
        pad.cpu_data()[0], pad.cpu_data()[1],
        stride.cpu_data()[0], stride.cpu_data()[1], data);
  }
}
void forward_deconvolutional_layer(const layer l, network net)
{
    int i;
    int m = l.size*l.size*l.n;   /* rows of the column buffer: one per weight */
    int n = l.h*l.w;             /* columns: one per input location */
    int k = l.c;                 /* reduction dimension: input channels */

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for (i = 0; i < l.batch; ++i) {
        real_t *a = l.weights;
        real_t *b = net.input + i*l.c*l.h*l.w;
        real_t *c = net.workspace;

        /* workspace = weights^T * input, then scatter-add into the output */
        gemm_cpu(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);
        col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride,
                   l.pad, l.output + i*l.outputs);
    }
    if (l.batch_normalize) {
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_w*l.out_h);
    }
    activate_array(l.output, l.batch*l.n*l.out_w*l.out_h, l.activation);
}
void forward_deconvolutional_layer(const layer l, network_state state)
{
    int i;
    int out_h = l.out_h;
    int out_w = l.out_w;
    int size = out_h*out_w;

    int m = l.size*l.size*l.n;
    int n = l.h*l.w;
    int k = l.c;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for(i = 0; i < l.batch; ++i){
        float *a = l.weights;
        float *b = state.input + i*l.c*l.h*l.w;
        float *c = state.workspace;

        gemm(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);
        /* older variant: deconvolution had no padding, hence the literal 0 */
        col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0,
                   l.output + i*l.n*size);
    }
    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }
    activate_array(l.output, l.batch*l.n*size, l.activation);
}
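/* Shape walkthrough for the two darknet deconvolution forwards above (a
 * hedged sketch, not darknet code): with C input channels over an H x W grid
 * and N filters of size K, the GEMM computes
 *   A^T (K*K*N x C, weights) * B (C x H*W, input) = workspace (K*K*N x H*W),
 * and col2im then scatter-adds that column buffer into the N x out_h x out_w
 * output. The helper below (hypothetical name) wires the two steps together
 * using the reference col2im sketched earlier; the triple loop stands in for
 * darknet's gemm(1,0,...). */
static void deconv_forward_one_image_ref(
    const float* weights,   /* C x (K*K*N), GEMM reads it transposed */
    const float* input,     /* C x (H*W) */
    float* workspace,       /* (K*K*N) x (H*W) column buffer */
    float* output,          /* N x out_h x out_w, zeroed inside col2im_cpu_ref */
    int C, int H, int W, int N, int K, int stride, int pad,
    int out_h, int out_w) {
  const int m = K * K * N, n = H * W, k = C;
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float s = 0.f;
      for (int p = 0; p < k; ++p)
        s += weights[p * m + i] * input[p * n + j];   /* A^T * B */
      workspace[i * n + j] = s;
    }
  }
  col2im_cpu_ref(workspace, N, out_h, out_w, K, K, pad, pad, stride, stride,
                 output);
}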
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int i;
    int m = l.n;
    int n = l.size*l.size*l.c;
    int k = convolutional_out_height(l) * convolutional_out_width(l);

    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);

    for(i = 0; i < l.batch; ++i){
        float *a = l.delta + i*m*k;
        float *b = l.col_image;
        float *c = l.filter_updates;

        /* recompute the column buffer, then accumulate dW = dY * X_col^T */
        float *im = state.input + i*l.c*l.h*l.w;
        im2col_cpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
        gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

        if(state.delta){
            /* dX_col = W^T * dY, then col2im scatters it back into dX */
            a = l.filters;
            b = l.delta + i*m*k;
            c = l.col_image;
            gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
            col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad,
                       state.delta + i*l.c*l.h*l.w);
        }
    }
}
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
  Dtype* bias_diff = NULL;
  if (bias_term_) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
  }
  const int weight_offset = M_ * K_;
  const int col_offset = K_ * N_;
  const int top_offset = M_ * N_;
  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->cpu_diff();
    const Dtype* bottom_data = (*bottom)[i]->cpu_data();
    Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff();
    Dtype* col_data = col_buffer_.mutable_cpu_data();
    Dtype* col_diff = col_buffer_.mutable_cpu_diff();
    // Bias gradient, if necessary.
    if (bias_term_) {
      for (int n = 0; n < num_; ++n) {
        caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, N_, 1.,
            top_diff + top[i]->offset(n),
            static_cast<const Dtype*>(bias_multiplier_->cpu_data()), 1.,
            bias_diff);
      }
    }
    for (int n = 0; n < num_; ++n) {
      // Since we saved memory in the forward pass by not storing all col data,
      // we will need to recompute them.
      im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
          width_, depth_, kernel_h_, kernel_w_, kernel_d_,
          pad_h_, pad_w_, pad_d_, stride_h_, stride_w_, stride_d_, col_data);
      // gradient w.r.t. weight. Note that we will accumulate diffs.
      for (int g = 0; g < group_; ++g) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
            (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
            col_data + col_offset * g, (Dtype)1.,
            weight_diff + weight_offset * g);
      }
      // gradient w.r.t. bottom data, if necessary
      if (propagate_down[i]) {
        for (int g = 0; g < group_; ++g) {
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
              (Dtype)1., weight + weight_offset * g,
              top_diff + top[i]->offset(n) + top_offset * g,
              (Dtype)0., col_diff + col_offset * g);
        }
        // col2im back to the data
        col2im_cpu(col_diff, channels_, height_, width_, depth_,
            kernel_h_, kernel_w_, kernel_d_, pad_h_, pad_w_, pad_d_,
            stride_h_, stride_w_, stride_d_,
            bottom_diff + (*bottom)[i]->offset(n));
      }
    }
  }
}
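// The three BLAS calls in the backward pass above implement, per sample n and
// group g, the standard im2col-based gradients (shapes in Caffe's notation,
// where M_ is output channels per group, K_ is channels times kernel volume
// per group, and N_ is the number of output locations):
//
//   bias_diff   += dY * 1           (gemv, summing dY over the N_ locations)
//   weight_diff += dY * X_col^T     (M_ x N_) * (N_ x K_) -> (M_ x K_)
//   dX_col       = W^T * dY         (K_ x M_) * (M_ x N_) -> (K_ x N_)
//
// col2im_cpu then scatter-adds dX_col back into image layout, which is the
// adjoint of the im2col gather used in the forward pass.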
void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  // Variant with hole (dilation) parameters; it hands the whole batch to
  // col2im_cpu in one call, passing the batch size through as the leading
  // num argument.
  col2im_cpu(top[0]->cpu_diff(), top[0]->num(), channels_, height_, width_,
      kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
      hole_h_, hole_w_, bottom[0]->mutable_cpu_diff());
}
void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
  for (int n = 0; n < top[0]->num(); ++n) {
    col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
        kernel_size_, pad_, stride_, bottom_diff + (*bottom)[0]->offset(n));
  }
}
Dtype Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
  for (int n = 0; n < top[0]->num(); ++n) {
    col2im_cpu(top_diff + top[0]->offset(n), CHANNELS_, HEIGHT_, WIDTH_,
        KSIZE_, STRIDE_, bottom_diff + (*bottom)[0]->offset(n));
  }
  return Dtype(0.);
}
void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  for (int n = 0; n < num_; ++n) {
    col2im_cpu(top_diff + n * top_dim_, num_spatial_axes_,
        bottom[0]->shape().data() + channel_axis_,
        top[0]->shape().data() + channel_axis_,
        kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
        bottom_diff + n * bottom_dim_);
  }
}
inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
  if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
    col2im_cpu(col_buff, conv_in_channels_,
        conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1], data);
  } else {
    col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
        col_buffer_shape_.data(), kernel_shape_.cpu_data(), pad_.cpu_data(),
        stride_.cpu_data(), data);
  }
}
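// For context: in Caffe's BaseConvolutionLayer this helper is invoked from
// backward_cpu_gemm after the weight-transpose GEMM. A from-memory paraphrase
// of that caller, written as it would sit inside the class (not a verbatim
// copy; the real code also short-circuits 1x1 convolutions by aliasing the
// column buffer to the input):
void backward_cpu_gemm_sketch(const Dtype* output, const Dtype* weights,
    Dtype* input) {
  Dtype* col_buff = col_buffer_.mutable_cpu_data();
  for (int g = 0; g < group_; ++g) {
    // col_buff = W^T * dY for this group
    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
        conv_out_spatial_dim_, conv_out_channels_ / group_,
        (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
        (Dtype)0., col_buff + col_offset_ * g);
  }
  conv_col2im_cpu(col_buff, input);  // scatter-add columns back into dX
}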
void backward_local_layer(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }

    for(i = 0; i < l.batch; ++i){
        float *input = state.input + i*l.w*l.h*l.c;
        im2col_cpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);

        /* locally connected: each output location j has its own filter bank */
        for(j = 0; j < locations; ++j){
            float *a = l.delta + i*l.outputs + j;
            float *b = l.col_image + j;
            float *c = l.filter_updates + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;
            gemm(0, 1, m, n, k, 1, a, locations, b, locations, 1, c, n);
        }

        if(state.delta){
            for(j = 0; j < locations; ++j){
                float *a = l.filters + j*l.size*l.size*l.c*l.n;
                float *b = l.delta + i*l.outputs + j;
                float *c = l.col_image + j;
                int m = l.size*l.size*l.c;
                int n = 1;
                int k = l.n;
                gemm(1, 0, m, n, k, 1, a, m, b, locations, 0, c, locations);
            }
            col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad,
                       state.delta + i*l.c*l.h*l.w);
        }
    }
}
Dtype DeConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* top_data = (*top)[0]->mutable_cpu_data();
  Dtype* col_data = col_buffer_.mutable_cpu_data();
  int weight_offset = M_ * K_;
  int col_offset = K_ * N_;
  int bottom_offset = M_ * N_;
  for (int n = 0; n < num_; ++n) {
    // Deconvolution forward is convolution backward: col = W^T * x ...
    for (int g = 0; g < group_; ++g) {
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
          (Dtype)1., weight + weight_offset * g,
          bottom_data + bottom[0]->offset(n) + bottom_offset * g,
          (Dtype)0., col_data + col_offset * g);
    }
    // ... then col2im scatters the columns forward into top_data.
    col2im_cpu(col_data, channels_, height_out_, width_out_, kernel_size_,
        pad_, stride_, top_data + (*top)[0]->offset(n));
    // add bias
    if (bias_term_) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_, N_, 1,
          (Dtype)1., this->blobs_[1]->cpu_data(),
          reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()),
          (Dtype)1., top_data + (*top)[0]->offset(n));
    }
  }
  return Dtype(0.);
}
void backward_convolutional_layer(convolutional_layer l, network net)
{
    int i, j;
    int m = l.n/l.groups;
    int n = l.size*l.size*l.c/l.groups;
    int k = l.out_w*l.out_h;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if(l.batch_normalize){
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.delta + (i*l.groups + j)*m*k;
            float *b = net.workspace;
            float *c = l.weight_updates + j*l.nweights/l.groups;

            float *im = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;

            im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

            if(net.delta){
                a = l.weights + j*l.nweights/l.groups;
                b = l.delta + (i*l.groups + j)*m*k;
                c = net.workspace;

                gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
                col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size,
                           l.stride, l.pad,
                           net.delta + (i*l.groups + j)*l.c/l.groups*l.h*l.w);
            }
        }
    }
}
void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  for (int n = 0; n < num_; ++n) {
    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
      col2im_cpu(top_diff + n * top_dim_, channels_,
          bottom[0]->shape(channel_axis_ + 1),
          bottom[0]->shape(channel_axis_ + 2),
          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
          pad_.cpu_data()[0], pad_.cpu_data()[1],
          stride_.cpu_data()[0], stride_.cpu_data()[1],
          dilation_.cpu_data()[0], dilation_.cpu_data()[1],
          bottom_diff + n * bottom_dim_);
    } else {
      col2im_nd_cpu(top_diff + n * top_dim_, num_spatial_axes_,
          bottom[0]->shape().data() + channel_axis_,
          top[0]->shape().data() + channel_axis_,
          kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
          dilation_.cpu_data(), bottom_diff + n * bottom_dim_);
    }
  }
}
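// The dilated overload above changes only the patch geometry: the effective
// kernel extent grows to dilation * (kernel - 1) + 1, and each in-patch
// offset is scaled by the dilation before col2im scatter-adds a value back.
// A small self-contained helper (hypothetical name) showing the output-size
// arithmetic shared by im2col, col2im, and the layer's shape inference:
inline int dilated_out_extent(int in_size, int kernel, int pad, int stride,
                              int dilation) {
  const int kernel_ext = dilation * (kernel - 1) + 1;  // effective footprint
  return (in_size + 2 * pad - kernel_ext) / stride + 1;
}
// e.g. dilated_out_extent(7, 3, /*pad=*/0, /*stride=*/1, /*dilation=*/2) == 3,
// since a 3-tap kernel with dilation 2 covers a 5-pixel footprint.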
void TiedConvolutionLayer<Dtype>::Backward_cpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    vector<Blob<Dtype>*>* bottom) {
  // Same concept as Backward_cpu of ConvolutionLayer, but run once per
  // bottom-top pair, accumulating dW across all pairs.
  const Dtype* weight = NULL;
  Dtype* weight_diff = NULL;
  if (this->param_propagate_down_[0]) {
    weight = this->blobs_[0]->cpu_data();
    weight_diff = this->blobs_[0]->mutable_cpu_diff();
    // Init weight diff to all 0s.
    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
  }
  // Bias gradient, if necessary.
  Dtype* bias_diff = NULL;
  if (bias_term_ && this->param_propagate_down_[1]) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
  }
  const int weight_offset = M_ * K_;
  for (int i = 0; i < num_in_; ++i) {
    const Dtype* top_diff = NULL;
    // Bias gradient, if necessary.
    if (bias_term_ && this->param_propagate_down_[1]) {
      top_diff = top[i]->cpu_diff();
      for (int n = 0; n < num_; ++n) {
        caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, N_[i], 1.,
            top_diff + top[i]->offset(n),
            reinterpret_cast<const Dtype*>(bias_multipliers_[i]->cpu_data()),
            1., bias_diff);
      }
    }
    if (this->param_propagate_down_[0] || propagate_down[i]) {
      if (!top_diff) {
        top_diff = top[i]->cpu_diff();
      }
      Dtype* col_data = this->col_buffers_[i]->mutable_cpu_data();
      const Dtype* bottom_data = (*bottom)[i]->cpu_data();
      Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff();
      const int col_offset = K_ * N_[i];
      const int top_offset = M_ * N_[i];
      for (int n = 0; n < num_; ++n) {
        // Since we saved memory in the forward pass by not storing all col
        // data, we will need to recompute them.
        im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_,
            height_[i], width_[i], kernel_h_, kernel_w_, pad_h_, pad_w_,
            stride_h_, stride_w_, col_data);
        // Gradient w.r.t. weight; note that we accumulate diffs:
        // Delta W_ij = error from above * this_activation^T.
        if (this->param_propagate_down_[0]) {
          for (int g = 0; g < group_; ++g) {
            caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_[i],
                (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
                col_data + col_offset * g, (Dtype)1.,
                weight_diff + weight_offset * g);
          }
        }
        // Gradient w.r.t. bottom data, if necessary:
        // error here = W * error from above.
        if (propagate_down[i]) {
          if (weight == NULL) {
            weight = this->blobs_[0]->cpu_data();
          }
          for (int g = 0; g < group_; ++g) {
            caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_[i], M_,
                (Dtype)1., weight + weight_offset * g,
                top_diff + top[i]->offset(n) + top_offset * g,
                (Dtype)0., col_data + col_offset * g);
          }
          // col2im back to the data.
          col2im_cpu(col_data, channels_, height_[i], width_[i],
              kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
              bottom_diff + (*bottom)[i]->offset(n));
        }
      }
    }
  }
}
void NonLocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  vector<bool> propagate_down_sub;
  propagate_down_sub.push_back(propagate_down[0]);
  propagate_down_sub.push_back(propagate_down[0]);

  if (propagate_down[0]) {
    // Clear the diffs of every internal blob before accumulating into them.
    for (int i = 0; i < eltwise_bottom_vec.size(); i++)
      caffe_set(eltwise_bottom_vec[i]->count(), (Dtype)0,
                eltwise_bottom_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < smooth_bottom_vec.size(); i++)
      caffe_set(smooth_bottom_vec[i]->count(), (Dtype)0,
                smooth_bottom_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < euclidean_bottom_vec.size(); i++)
      caffe_set(euclidean_bottom_vec[i]->count(), (Dtype)0,
                euclidean_bottom_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < split_1_bottom_vec.size(); i++)
      caffe_set(split_1_bottom_vec[i]->count(), (Dtype)0,
                split_1_bottom_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < smooth_top_vec.size(); i++)
      caffe_set(smooth_top_vec[i]->count(), (Dtype)0,
                smooth_top_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < split_0_top_vec.size(); i++)
      caffe_set(split_0_top_vec[i]->count(), (Dtype)0,
                split_0_top_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < split_3_top_vec.size(); i++)
      caffe_set(split_3_top_vec[i]->count(), (Dtype)0,
                split_3_top_vec[i]->mutable_cpu_diff());
    for (int i = 0; i < normalize_top_vec.size(); i++)
      caffe_set(normalize_top_vec[i]->count(), (Dtype)0,
                normalize_top_vec[i]->mutable_cpu_diff());

    if (top.size() == 3)
      eltwise_layer->Backward(eltwise_top_vec, propagate_down_sub,
                              eltwise_bottom_vec);
    split_layer_2->Backward(split_2_top_vec, propagate_down_sub,
                            split_2_bottom_vec);

    // Accumulate split_2's per-channel bottom diffs into split_3's top diff.
    const int tmp_offset = split_3_top_vec[0]->offset(1);
    const Dtype* split_2_bottom_diff = split_2_bottom_vec[0]->cpu_diff();
    Dtype* split_3_top_diff = split_3_top_vec[0]->mutable_cpu_diff();
    for (int n = 0; n < split_2_bottom_vec[0]->num(); ++n) {
      for (int ch = 0; ch < channels_; ++ch) {
        caffe_add(tmp_offset, split_3_top_diff, split_2_bottom_diff,
                  split_3_top_diff);
        split_2_bottom_diff += tmp_offset;
      }
      split_3_top_diff += tmp_offset;
    }

    // Likewise accumulate top[1]'s per-channel diff into the normalize top.
    const int norm_offset = normalize_top_vec[0]->offset(1);
    Dtype* normalize_diff = normalize_top_vec[0]->mutable_cpu_diff();
    const Dtype* top_1_diff = top[1]->cpu_diff();
    for (int n = 0; n < normalize_top_vec[0]->num(); ++n) {
      for (int ch = 0; ch < channels_; ++ch) {
        caffe_add(norm_offset, normalize_diff, top_1_diff, normalize_diff);
        top_1_diff += norm_offset;
      }
      normalize_diff += norm_offset;
    }

    normalize_layer->Backward(normalize_top_vec, propagate_down_sub,
                              normalize_bottom_vec);
    split_3_top_vec[1]->ShareDiff(*normalize_bottom_vec[0]);
    split_layer_3->Backward(split_3_top_vec, propagate_down_sub,
                            split_3_bottom_vec);
    smooth_threshold_layer->Backward(smooth_top_vec, propagate_down_sub,
                                     smooth_bottom_vec);
    caffe_scal(euclidean_top_vec[0]->count(),
               (Dtype)(1.0 / bottom[0]->channels()),
               euclidean_top_vec[0]->mutable_cpu_diff());
    euclidean_layer->Backward(euclidean_top_vec, propagate_down_sub,
                              euclidean_bottom_vec);
    split_1_top_vec[1]->ShareDiff(*euclidean_bottom_vec[0]);
    split_layer_1->Backward(split_1_top_vec, propagate_down_sub,
                            split_1_bottom_vec);

    // Undo the two im2col expansions from the forward pass.
    for (int n = 0; n < num_; ++n) {
      col2im_center_cpu(img2col_1_top.cpu_diff() + img2col_1_top.offset(n),
          channels_, height_, width_, kernel_h_, kernel_w_,
          pad_h_, pad_w_, stride_h_, stride_w_,
          split_0_top_vec[1]->mutable_cpu_diff()
              + split_0_top_vec[1]->offset(n));
      col2im_cpu(img2col_0_top.cpu_diff() + img2col_0_top.offset(n),
          channels_, height_, width_, kernel_h_, kernel_w_,
          pad_h_, pad_w_, stride_h_, stride_w_, 1, 1,
          split_0_top_vec[0]->mutable_cpu_diff()
              + split_0_top_vec[0]->offset(n));
    }
    split_layer_0->Backward(split_0_top_vec, propagate_down_sub, bottom);
  }
}
inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
  col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
      kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
}
void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  Dtype* x_data = col_buffer_.mutable_cpu_data();
  Dtype* x_diff = col_buffer_.mutable_cpu_diff();
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  Dtype* bias_diff = NULL;

  Blob<Dtype> intermediate;
  intermediate.Reshape(1, 1, 1, N_);
  Blob<Dtype> xt;
  xt.Reshape(1, 1, K_, N_);
  Dtype* xt_data = xt.mutable_cpu_data();

  if (bias_term_) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count());
    for (int n = 0; n < num_; ++n) {
      caffe_add(M_ * N_, bias_diff, top_diff + top[0]->offset(n), bias_diff);
    }
  }

  memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count());
  for (int n = 0; n < num_; n++) {
    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, width_,
        kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, x_data);

    // gradient wrt weight
    for (int m = 0; m < num_output_; m++) {
      Dtype* filter_weight_diff = weight_diff + this->blobs_[0]->offset(m);
      for (int k = 0; k < K_; k++) {
        caffe_mul(N_, top_diff + top[0]->offset(n, m),
            x_data + col_buffer_.offset(0, k), xt_data + xt.offset(0, 0, k));
      }
      caffe_cpu_axpby(K_ * N_, Dtype(1.0), xt_data, Dtype(1.0),
                      filter_weight_diff);
    }

    // gradient wrt bottom data
    if (propagate_down[0]) {
      memset(x_diff, 0, col_buffer_.count() * sizeof(Dtype));
      for (int m = 0; m < num_output_; m++) {
        for (int k = 0; k < K_; k++) {
          caffe_mul(N_, top_diff + top[0]->offset(n, m),
              weight + this->blobs_[0]->offset(m, 0, k),
              intermediate.mutable_cpu_data());
          caffe_cpu_axpby(N_, Dtype(1.0), intermediate.cpu_data(), Dtype(1.0),
                          x_diff + col_buffer_.offset(0, k));
        }
      }
      // col2im back to the data
      col2im_cpu(x_diff, channels_, height_, width_, kernel_size_,
          kernel_size_, pad_, pad_, stride_, stride_,
          bottom_diff + bottom[0]->offset(n));
    }
  }
}
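// Sanity check applicable to any im2col/col2im pair (a sketch built on the
// hypothetical *_ref helpers introduced earlier in this section): the two
// routines are adjoint linear maps, so <u, im2col(x)> == <col2im(u), x> for
// arbitrary u and x. Every backward pass above relies on exactly this
// identity when it scatters column gradients back into image space.
#include <cmath>
#include <cstdlib>
#include <vector>

void im2col_cpu_ref(const float* data_im, int channels, int height, int width,
    int kernel_h, int kernel_w, int pad_h, int pad_w,
    int stride_h, int stride_w, float* data_col) {
  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  const int width_col  = (width  + 2 * pad_w - kernel_w) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    const int w_offset = c % kernel_w;
    const int h_offset = (c / kernel_w) % kernel_h;
    const int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        const int h_pad = h * stride_h - pad_h + h_offset;
        const int w_pad = w * stride_w - pad_w + w_offset;
        // Gather: padding positions read as zero.
        data_col[(c * height_col + h) * width_col + w] =
            (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
                ? data_im[(c_im * height + h_pad) * width + w_pad]
                : 0.f;
      }
    }
  }
}

bool check_col2im_adjoint(int C, int H, int W, int K, int pad, int stride) {
  const int Hc = (H + 2 * pad - K) / stride + 1;
  const int Wc = (W + 2 * pad - K) / stride + 1;
  std::vector<float> x(C * H * W), u(C * K * K * Hc * Wc);
  std::vector<float> col(u.size()), im(x.size());
  for (size_t i = 0; i < x.size(); ++i) x[i] = rand() / (float)RAND_MAX;
  for (size_t i = 0; i < u.size(); ++i) u[i] = rand() / (float)RAND_MAX;
  im2col_cpu_ref(x.data(), C, H, W, K, K, pad, pad, stride, stride,
                 col.data());
  col2im_cpu_ref(u.data(), C, H, W, K, K, pad, pad, stride, stride,
                 im.data());
  double lhs = 0.0, rhs = 0.0;  // <u, im2col(x)> and <col2im(u), x>
  for (size_t i = 0; i < col.size(); ++i) lhs += (double)u[i] * col[i];
  for (size_t i = 0; i < im.size(); ++i) rhs += (double)im[i] * x[i];
  return std::fabs(lhs - rhs) <= 1e-4 * std::fabs(lhs);
}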