Ejemplo n.º 1
0
    // Prepares `colMat` for image `imNum` / channel group `cnGroup` of `inpBlob`.
    //  - 1x1 kernel: `colMat` is constructed directly from the source pointer,
    //    no unrolling is performed.
    //  - OpenCL build with OpenCL enabled and CV_32F data: im2col_ocl is run on a
    //    UMat view of the selected image/group and the result is copied to `colMat`.
    //  - otherwise: im2col_cpu is run for CV_32F or CV_64F data; any other
    //    element type leaves `colMat` untouched.
    void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup)
    {
        uchar *groupData = inpBlob.ptr(imNum, cnGroup*inpGroupCn);

        if (is1x1())
        {
            // Build the matrix straight from the blob pointer.
            colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), groupData);
            return;
        }

#ifdef HAVE_OPENCL
        if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
        {
            // Select a single image and the channels of a single group.
            std::vector<Range> selection(4, Range::all());
            selection[0] = Range(imNum, imNum+1);
            selection[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);

            UMat deviceSrc = inpBlob.matRef()(&selection[0]).getUMat(ACCESS_READ);
            UMat deviceCols(colMat.size(), colMat.type());
            im2col_ocl(deviceSrc, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, deviceCols);
            deviceCols.copyTo(colMat);
            return;
        }
#endif // HAVE_OPENCL

        if (inpBlob.type() == CV_32F)
        {
            float *src32 = (float *)groupData;
            float *dst32 = (float *)colMat.ptr();
            im2col_cpu(src32, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst32);
        }
        if (inpBlob.type() == CV_64F)
        {
            double *src64 = (double*)groupData;
            double *dst64 = (double*)colMat.ptr();
            im2col_cpu(src64, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst64);
        }
    }
Ejemplo n.º 2
0
/*
 * CPU forward pass of a convolutional layer.
 *
 * The output buffer is zeroed, then every batch image is unrolled with
 * im2col_cpu into state.workspace and multiplied by the filters:
 *   - l.binary set:   gemm_bin on l.cfilters, then scale_bias + add_bias;
 *   - otherwise:      gemm on l.filters, optional forward_batchnorm_layer,
 *                     then add_bias.
 * The activation is applied last on both paths.
 *
 * `l` and `state` are received by value; state.input is advanced on the local
 * copy while walking the batch, and that advanced copy is what
 * forward_batchnorm_layer receives.
 */
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int img;

    /* GEMM dimensions shared by both code paths. */
    int num_filters = l.n;
    int patch_len = l.size*l.size*l.c;
    int out_area = out_h*out_w;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    /*
       if(l.binary){
       binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
       binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
       swap_binary(&l);
       }
     */

    if(l.binary){
        char  *bin_filters = l.cfilters;
        float *cols = state.workspace;
        float *out = l.output;

        for(img = 0; img < l.batch; ++img){
            im2col_cpu(state.input, l.c, l.h, l.w,
                    l.size, l.stride, l.pad, cols);
            gemm_bin(num_filters,out_area,patch_len,1,bin_filters,patch_len,cols,out_area,out,out_area);
            out += out_area*num_filters;
            state.input += l.c*l.h*l.w;
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
        activate_array(l.output, num_filters*out_area*l.batch, l.activation);
        return;
    }

    float *filters = l.filters;
    float *cols = state.workspace;
    float *out = l.output;

    for(img = 0; img < l.batch; ++img){
        im2col_cpu(state.input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, cols);
        /* beta = 1: accumulate onto the zeroed output. */
        gemm(0,0,num_filters,out_area,patch_len,1,filters,patch_len,cols,out_area,1,out,out_area);
        out += out_area*num_filters;
        state.input += l.c*l.h*l.w;
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);

    activate_array(l.output, num_filters*out_area*l.batch, l.activation);
}
Ejemplo n.º 3
0
/**
 * Builds the im2col index mask for the current layer geometry.
 *
 * Reshapes index_mask_, im2col_mask_ and col2im_mask_, fills index_mask_ with
 * the linear pixel index (0 .. height_*width_-1) and runs im2col_cpu over it
 * so that im2col_mask_ holds the unrolled indices. col2im_mask_ is only
 * reshaped here, not filled.
 *
 * @return false (after logging a warning) when any of the input, kernel or
 *         output dimensions is not positive; true otherwise.
 */
bool BaseConvolutionLayer<Dtype>::setupMaskIM2COL() {
  // Test every dimension on its own: multiplying seven ints together overflows
  // for ordinary layer sizes, which is undefined behaviour for signed ints.
  if (height_ <= 0 || width_ <= 0 || channels_ <= 0 || kernel_h_ <= 0 ||
      kernel_w_ <= 0 || height_out_ <= 0 || width_out_ <= 0) {
    LOG(WARNING)<< "skipping because at least one value is zero";
    return false;
  }

  DLOG(INFO) << "num_        = " << num_;
  DLOG(INFO) << "height_     = " << height_;
  DLOG(INFO) << "width_      = " << width_;
  DLOG(INFO) << "channels_   = " << channels_;
  DLOG(INFO) << "kernel_h_   = " << kernel_h_;
  DLOG(INFO) << "kernel_w_   = " << kernel_w_;
  // Log the strides themselves (the kernel sizes were printed here before).
  DLOG(INFO) << "stride_h_   = " << stride_h_;
  DLOG(INFO) << "stride_w_   = " << stride_w_;
  DLOG(INFO) << "height_out_ = " << height_out_;
  DLOG(INFO) << "width_out_  = " << width_out_;

  index_mask_.Reshape(1, 1, height_, width_);
  im2col_mask_.Reshape(1, channels_*kernel_h_*kernel_w_,
      height_out_, width_out_);
  col2im_mask_.Reshape(1, 1, height_, width_);

  // Fetch the writable pointer once instead of once per pixel.
  Dtype* index_data = index_mask_.mutable_cpu_data();
  const int pixel_count = height_*width_;
  for ( int pixel = 0; pixel < pixel_count; pixel++ ) {
    index_data[pixel] = pixel;
  }

  // iSNAPSHOT("index mask", index_mask_.cpu_data(), height_*width_);
  DLOG(INFO) << "call im2col_cpu()";
  // NOTE(review): index_mask_ holds a single channel (height_*width_ values)
  // but channels_ is passed as the channel count -- confirm im2col_cpu does
  // not read past the mask when channels_ > 1.
  im2col_cpu(index_mask_.cpu_data(), channels_, height_,
      width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
      stride_h_, stride_w_, im2col_mask_.mutable_cpu_data());

  return true;
}
Ejemplo n.º 4
0
// Runs im2col over each of the num_ items of bottom[0] and writes the result
// into top[0]. im2col_cpu is used when there are exactly two spatial axes and
// force_nd_im2col_ is not set; im2col_nd_cpu handles every other case.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* const input_base = bottom[0]->cpu_data();
  Dtype* const output_base = top[0]->mutable_cpu_data();
  int item = 0;
  while (item < num_) {
    // Consistency checks between blob ranks and the per-axis parameter blobs.
    DCHECK_EQ(bottom[0]->shape().size() - channel_axis_, num_spatial_axes_ + 1);
    DCHECK_EQ(top[0]->shape().size() - channel_axis_, num_spatial_axes_ + 1);
    DCHECK_EQ(kernel_shape_.count(), num_spatial_axes_);
    DCHECK_EQ(pad_.count(), num_spatial_axes_);
    DCHECK_EQ(stride_.count(), num_spatial_axes_);

    const Dtype* item_input = input_base + item * bottom_dim_;
    Dtype* item_output = output_base + item * top_dim_;

    if (force_nd_im2col_ || num_spatial_axes_ != 2) {
      // Generic n-dimensional path.
      im2col_nd_cpu(item_input, num_spatial_axes_,
          bottom[0]->shape().data() + channel_axis_,
          top[0]->shape().data() + channel_axis_,
          kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
          item_output);
    } else {
      // Two spatial axes: height and width follow the channel axis.
      im2col_cpu(item_input, channels_,
          bottom[0]->shape(channel_axis_ + 1),
          bottom[0]->shape(channel_axis_ + 2),
          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
          pad_.cpu_data()[0], pad_.cpu_data()[1],
          stride_.cpu_data()[0], stride_.cpu_data()[1],
          item_output);
    }
    ++item;
  }
}
/*
 * CPU backward pass of a convolutional layer.
 *
 * Applies the activation gradient to l.delta, accumulates the bias updates,
 * then for every batch image:
 *   1. unrolls the input image into l.col_image and accumulates
 *      delta * col_image^T into l.filter_updates;
 *   2. if state.delta is set, computes filters^T * delta into l.col_image and
 *      folds it back onto state.delta with col2im_cpu.
 */
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int num_filters = l.n;
    int patch_len = l.size*l.size*l.c;
    int out_area = convolutional_out_height(l)*
        convolutional_out_width(l);
    int img;

    gradient_array(l.output, num_filters*out_area*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, out_area);

    for(img = 0; img < l.batch; ++img){
        float *img_delta = l.delta + img*num_filters*out_area;
        float *img_input = state.input+img*l.c*l.h*l.w;

        /* Weight gradient: beta = 1 accumulates across the batch. */
        im2col_cpu(img_input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);
        gemm(0,1,num_filters,patch_len,out_area,1,img_delta,out_area,l.col_image,out_area,1,l.filter_updates,patch_len);

        if(!state.delta) continue;

        /* Input gradient: beta = 0 overwrites the column buffer. */
        gemm(1,0,patch_len,out_area,num_filters,1,l.filters,patch_len,img_delta,out_area,0,l.col_image,out_area);

        col2im_cpu(l.col_image, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+img*l.c*l.h*l.w);
    }
}
Ejemplo n.º 6
0
/*
 * CPU forward pass of a locally connected layer.
 *
 * Each batch item's output is first initialised with a copy of l.biases.
 * The input image is then unrolled into l.col_image and, for every output
 * location, that location's own weight block is multiplied with the matching
 * column (stride `locations`) and accumulated onto the output. The activation
 * is applied to the whole batch at the end.
 */
void forward_local_layer(const local_layer l, network_state state) {
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int img, loc;
    int locations = out_h * out_w;
    /* Length of one unrolled patch; inner GEMM dimension. */
    int patch_len = l.size * l.size * l.c;

    for (img = 0; img < l.batch; ++img) {
        copy_cpu(l.outputs, l.biases, 1, l.output + img * l.outputs, 1);
    }

    for (img = 0; img < l.batch; ++img) {
        float *img_input = state.input + img * l.w * l.h * l.c;
        float *img_output = l.output + img * l.outputs;

        im2col_cpu(img_input, l.c, l.h, l.w, l.size, l.stride, l.pad, l.col_image);

        for (loc = 0; loc < locations; ++loc) {
            float *loc_weights = l.weights + loc * l.size * l.size * l.c * l.n;
            float *loc_column = l.col_image + loc;
            float *loc_output = img_output + loc;

            /* (l.n x patch_len) * (patch_len x 1), accumulated (beta = 1). */
            gemm(0, 0, l.n, 1, patch_len, 1, loc_weights, patch_len, loc_column, locations, 1, loc_output, locations);
        }
    }
    activate_array(l.output, l.outputs * l.batch, l.activation);
}
Ejemplo n.º 7
0
/*
 * CPU forward pass of a convolutional layer (no batch-norm / binary paths).
 *
 * bias_output initialises l.output from l.biases; every batch image is then
 * unrolled into l.col_image and filters * col_image is accumulated onto the
 * output. The activation is applied to the whole batch at the end.
 */
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int remaining;

    bias_output(l.output, l.biases, l.batch, l.n, out_h*out_w);

    int num_filters = l.n;
    int patch_len = l.size*l.size*l.c;
    int out_area = out_h*out_w;

    /* Cursors over the batch; `state` is not used again after the loop. */
    float *img_input = state.input;
    float *img_output = l.output;

    for(remaining = l.batch; remaining > 0; --remaining){
        im2col_cpu(img_input, l.c, l.h, l.w,
            l.size, l.stride, l.pad, l.col_image);
        gemm(0,0,num_filters,out_area,patch_len,1,l.filters,patch_len,l.col_image,out_area,1,img_output,out_area);
        img_output += out_area*num_filters;
        img_input += l.c*l.h*l.w;
    }
    activate_array(l.output, num_filters*out_area*l.batch, l.activation);
}
Ejemplo n.º 8
0
/**
 * CPU backward pass of the (height x width x depth) convolution layer.
 *
 * The weight diff, and the bias diff when bias_term_ is set, are zeroed and
 * then accumulated over every top blob and every item of the batch. For each
 * item the column buffer is recomputed with im2col_cpu. When
 * propagate_down[i] is set the bottom diff is produced via weight^T * top_diff
 * followed by col2im_cpu.
 *
 * @param top             blobs whose cpu_diff() holds the incoming gradient.
 * @param propagate_down  per-bottom flag: compute the bottom diff or not.
 * @param bottom          blobs providing cpu_data() and receiving the diff.
 */
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
  Dtype* bias_diff = NULL;
  if (bias_term_) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
  }
  // Per-group strides into the weight, column and top buffers.
  const int weight_offset = M_ * K_;
  const int col_offset = K_ * N_;
  const int top_offset = M_ * N_;
  // size_t index: avoids comparing a signed int against vector::size().
  for (size_t i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->cpu_diff();
    const Dtype* bottom_data = (*bottom)[i]->cpu_data();
    Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff();
    Dtype* col_data = col_buffer_.mutable_cpu_data();
    Dtype* col_diff = col_buffer_.mutable_cpu_diff();

    // Bias gradient, if necessary.
    if (bias_term_) {
      for (int n = 0; n < num_; ++n) {
        // Offset taken from top[i], the blob top_diff actually belongs to
        // (top[0] was used before).
        caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
            1., top_diff + top[i]->offset(n),
            static_cast<const Dtype*>(bias_multiplier_->cpu_data()), 1.,
            bias_diff);
      }
    }
    for (int n = 0; n < num_; ++n) {
      // Since we saved memory in the forward pass by not storing all col data,
      // we will need to recompute them.
      im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
                 width_, depth_, kernel_h_, kernel_w_, kernel_d_,
                 pad_h_, pad_w_, pad_d_,
                 stride_h_, stride_w_, stride_d_,
                 col_data);
      // Gradient w.r.t. weight. beta = 1, so diffs accumulate.
      for (int g = 0; g < group_; ++g) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
          (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
          col_data + col_offset * g, (Dtype)1.,
          weight_diff + weight_offset * g);
      }
      // Gradient w.r.t. bottom data, if necessary.
      if (propagate_down[i]) {
        for (int g = 0; g < group_; ++g) {
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
            (Dtype)1., weight + weight_offset * g,
            top_diff + top[i]->offset(n) + top_offset * g,
            (Dtype)0., col_diff + col_offset * g);
        }
        // col2im back to the data
        col2im_cpu(col_diff, channels_, height_, width_, depth_,
                   kernel_h_, kernel_w_, kernel_d_,
                   pad_h_, pad_w_, pad_d_, stride_h_, stride_w_, stride_d_,
                   bottom_diff + (*bottom)[i]->offset(n));
      }
    }
  }
}
Ejemplo n.º 9
0
// CPU forward pass over bottom[0] / (*top)[0].
// For every item: im2col into the column buffer, one GEMM per group writing
// (beta = 0) into the top blob, then an optional bias GEMM accumulated on top.
// Always returns Dtype(0.).
Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = (*top)[0]->mutable_cpu_data();
  Dtype* columns = col_buffer_.mutable_cpu_data();
  const Dtype* filters = this->blobs_[0]->cpu_data();

  // Per-group strides into the weight, column and output buffers.
  const int filter_stride = M_ * K_;
  const int column_stride = K_ * N_;
  const int output_stride = M_ * N_;

  for (int item = 0; item < num_; ++item) {
    // Unroll the current image.
    im2col_cpu(input + bottom[0]->offset(item), channels_, height_,
               width_, kernel_size_, pad_, stride_, columns);

    // Group-wise product of filters and columns.
    int group = 0;
    while (group < group_) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
        (Dtype)1., filters + filter_stride * group,
        columns + column_stride * group,
        (Dtype)0.,
        output + (*top)[0]->offset(item) + output_stride * group);
      ++group;
    }

    if (!bias_term_) {
      continue;
    }
    // Bias: outer product of the bias blob with bias_multiplier_, added on top.
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
        N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(),
        reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()),
        (Dtype)1., output + (*top)[0]->offset(item));
  }
  return Dtype(0.);
}
Ejemplo n.º 10
0
// Unrolls the whole bottom[0] blob into top[0] with a single im2col_cpu call;
// the number of items in the blob is passed along with the layer geometry.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  Blob<Dtype>* const source = bottom[0];
  Blob<Dtype>* const target = top[0];
  im2col_cpu(source->cpu_data(),
      source->num(), channels_, height_, width_,
      kernel_h_, kernel_w_, pad_h_, pad_w_,
      stride_h_, stride_w_, hole_h_, hole_w_,
      target->mutable_cpu_data());
}
// Unrolls each item of bottom[0] into the matching item of (*top)[0] using
// the layer's CHANNELS_/HEIGHT_/WIDTH_/KSIZE_/STRIDE_ settings.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
		vector<Blob<Dtype>*>* top) {
	const Dtype* input_base = bottom[0]->cpu_data();
	Dtype* output_base = (*top)[0]->mutable_cpu_data();
	int item = 0;
	while (item < bottom[0]->num()) {
		const Dtype* item_input = input_base + bottom[0]->offset(item);
		Dtype* item_output = output_base + (*top)[0]->offset(item);
		im2col_cpu(item_input, CHANNELS_, HEIGHT_,
				WIDTH_, KSIZE_, STRIDE_, item_output);
		++item;
	}
}
Ejemplo n.º 12
0
	//	Wraps im2col_cpu with the parameters stored in this class.
	//	`data` is one input image; `col_buff` receives the unrolled columns.
	//	Only the 2-D case is handled: when force_nd_im2col is set or there are
	//	not exactly two spatial axes, the call does nothing.
	void conv_im2col_cpu(const Dtype* data, Dtype* col_buff){
		if (force_nd_im2col || num_spatial_axes != 2){
			return;
		}
		//	The conv_* members are read on every call, so shapes changed by a
		//	reshape are picked up (per the original author's note).
		im2col_cpu(data, conv_in_channels,
			conv_input_shape.cpu_data()[1], conv_input_shape.cpu_data()[2],
			kernel_shape.cpu_data()[0], kernel_shape.cpu_data()[1],
			pad.cpu_data()[0], pad.cpu_data()[1],
			stride.cpu_data()[0], stride.cpu_data()[1],
			col_buff);
	}
Ejemplo n.º 13
0
// Unrolls each item of bottom[0] into the matching item of top[0], forwarding
// the kernel, pad, stride and hole settings of the layer to im2col_cpu.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* input_base = bottom[0]->cpu_data();
  Dtype* output_base = top[0]->mutable_cpu_data();
  int item = 0;
  while (item < bottom[0]->num()) {
    const Dtype* item_input = input_base + bottom[0]->offset(item);
    Dtype* item_output = output_base + top[0]->offset(item);
    im2col_cpu(item_input, channels_, height_,
        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
        stride_h_, stride_w_, hole_h_, hole_w_, item_output);
    ++item;
  }
}
Ejemplo n.º 14
0
// Unrolls each item of bottom[0] into the matching item of (*top)[0] with a
// square kernel (kernel_size_, pad_, stride_). Always returns Dtype(0.).
Dtype Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  const Dtype* input_base = bottom[0]->cpu_data();
  Dtype* output_base = (*top)[0]->mutable_cpu_data();
  int item = 0;
  while (item < bottom[0]->num()) {
    const Dtype* item_input = input_base + bottom[0]->offset(item);
    Dtype* item_output = output_base + (*top)[0]->offset(item);
    im2col_cpu(item_input, channels_, height_,
        width_, kernel_size_, pad_, stride_, item_output);
    ++item;
  }
  return Dtype(0.);
}
Ejemplo n.º 15
0
// CPU backward pass of the deconvolution layer.
//  - Bias diff (when bias_term_): zeroed, then top_diff * bias_multiplier_ is
//    accumulated for every item.
//  - Weight diff: zeroed, then for every item the top diff is unrolled with
//    im2col_cpu and bottom_data * col^T is accumulated per group.
//  - Bottom diff (when propagate_down): weight * col, written per group.
void DeConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* out_grad = top[0]->cpu_diff();
  const Dtype* filters = this->blobs_[0]->cpu_data();
  Dtype* filter_grad = this->blobs_[0]->mutable_cpu_diff();
  const Dtype* in_data = (*bottom)[0]->cpu_data();
  Dtype* in_grad = (*bottom)[0]->mutable_cpu_diff();
  // The data side of the column buffer is requested but not used below.
  col_buffer_.mutable_cpu_data();
  Dtype* col_grad = col_buffer_.mutable_cpu_diff();

  if (bias_term_) {
    Dtype* bias_grad = this->blobs_[1]->mutable_cpu_diff();
    memset(bias_grad, 0, sizeof(Dtype) * this->blobs_[1]->count());
    // JTS fixed gradient wrt. bias, not sure about the group stuff ...
    for (int item = 0; item < num_; ++item) {
      caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
          1., out_grad + top[0]->offset(item),
          reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()), 1.,
          bias_grad);
    }
  }

  // Per-group strides into the weight, column and bottom buffers.
  const int filter_stride = M_ * K_;
  const int column_stride = K_ * N_;
  const int bottom_stride = M_ * N_;
  memset(filter_grad, 0, sizeof(Dtype) * this->blobs_[0]->count());

  for (int item = 0; item < num_; ++item) {
    im2col_cpu(out_grad + top[0]->offset(item), channels_, height_out_,
               width_out_, kernel_size_, pad_, stride_, col_grad);

    // Gradient wrt. weights (beta = 1: accumulate).
    for (int group = 0; group < group_; ++group) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
          (Dtype)1.,
          in_data + (*bottom)[0]->offset(item) + bottom_stride * group,
          col_grad + column_stride * group, (Dtype)1.,
          filter_grad + filter_stride * group);
    }

    if (!propagate_down) {
      continue;
    }
    // Gradient wrt. bottom data (beta = 0: overwrite).
    for (int group = 0; group < group_; ++group) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
          (Dtype)1., filters + filter_stride * group,
          col_grad + column_stride * group,
          (Dtype)0.,
          in_grad + (*bottom)[0]->offset(item) + bottom_stride * group);
    }
  }
  /* debug
  for (int n = 0; n < this->blobs_[0]->count(); ++n) {
      std::cout << this->blobs_[0]->cpu_diff()[n] << std::endl;
  }
  for (int n = 0; n < col_buffer_.count(); ++n) {
      //std::cout << col_buffer_.cpu_diff()[n] <<  "  ";
      std::cout << top[0]->cpu_diff()[n] <<  "  ";
  }
  std::cout << std::endl;
  */
}
Ejemplo n.º 16
0
 // Convenience wrapper so callers need not repeat the long im2col argument
 // lists. Dispatches to im2col_nd_cpu when force_nd_im2col_ is set or the
 // number of spatial axes is not two, and to the 2-D im2col_cpu otherwise.
 inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
   if (force_nd_im2col_ || num_spatial_axes_ != 2) {
     im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
         col_buffer_shape_.data(), kernel_shape_.cpu_data(),
         pad_.cpu_data(), stride_.cpu_data(), col_buff);
     return;
   }
   // 2-D case: entries [1] and [2] of conv_input_shape_ are passed as the
   // two spatial extents.
   im2col_cpu(data, conv_in_channels_,
       conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
       kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
       pad_.cpu_data()[0], pad_.cpu_data()[1],
       stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
 }
Ejemplo n.º 17
0
// CPU forward pass of the (height x width x depth) convolution layer.
// For every bottom/top pair and every item: im2col into the column buffer,
// one GEMM per group written (beta = 0) into the top blob, then an optional
// bias GEMM accumulated on top. Always returns Dtype(0.).
Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* input = bottom[i]->cpu_data();
    Dtype* output = (*top)[i]->mutable_cpu_data();
    Dtype* columns = col_buffer_.mutable_cpu_data();
    const Dtype* filters = this->blobs_[0]->cpu_data();

    // Per-group strides into the weight, column and output buffers.
    const int filter_stride = M_ * K_;
    const int column_stride = K_ * N_;
    const int output_stride = M_ * N_;

    for (int item = 0; item < num_; ++item) {
      // Step 1: unroll the current volume.
      im2col_cpu(input + bottom[i]->offset(item), channels_, height_,
                 width_, depth_, kernel_h_, kernel_w_, kernel_d_,
                 pad_h_, pad_w_, pad_d_, stride_h_, stride_w_, stride_d_,
                 columns);

      // Step 2: group-wise product of filters and columns.
      int group = 0;
      while (group < group_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
          (Dtype)1., filters + filter_stride * group,
          columns + column_stride * group,
          (Dtype)0.,
          output + (*top)[i]->offset(item) + output_stride * group);
        ++group;
      }

      // Step 3: add the bias.
      if (bias_term_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
            N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(),
            reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()),
            (Dtype)1., output + (*top)[i]->offset(item));
      }

      // Disabled debugging aid (kept for reference):
      // if(this->layer_param_.name()=="fc"){
      //   const Dtype* bias_v =this->blobs_[1]->cpu_data();
      //   for(size_t t=0;t<this->blobs_[1]->count();t++){
      //     d_test+=top_data[t];
      //     if(top_data[t]!=0 && name_ =="fc_1"){LOG(INFO)<<top_data[t];}
      //     if(isnan(top_data[t])&&name_ =="fc_1"){
      //       LOG(INFO)<<"bias ["<< t<<"]="<<bias_v[t]<<"out of " <<this->blobs_[1]->count();
      //       sleep(100);
      //     }
      //   }
    }
  }

  return Dtype(0.);
}
Ejemplo n.º 18
0
// CPU forward pass with a shortcut for 1x1 kernels.
// For every bottom/top pair and every item, the columns are either produced by
// im2col_cpu into the column buffer or, when is_1x1_ is set, taken directly
// from the bottom blob. One GEMM per group then writes (beta = 0) into the top
// blob, followed by an optional bias GEMM accumulated on top.
void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* input = bottom[i]->cpu_data();
    Dtype* output = top[i]->mutable_cpu_data();
    // The column buffer is only touched when real unrolling is needed.
    Dtype* columns = is_1x1_ ? static_cast<Dtype*>(NULL)
                             : col_buffer_.mutable_cpu_data();
    const Dtype* filters = this->blobs_[0]->cpu_data();

    const int filter_stride = M_ * K_;  // number of filter parameters in a group
    const int column_stride = K_ * N_;  // number of values in an input region / column
    const int output_stride = M_ * N_;  // number of values in an output region / column

    for (int item = 0; item < num_; ++item) {
      if (is_1x1_) {
        // Special case for 1x1 convolution: use the input in place.
        columns = bottom[i]->mutable_cpu_data() + bottom[i]->offset(item);
      } else {
        // Unroll input regions into the column matrix for multiplication.
        im2col_cpu(input + bottom[i]->offset(item), channels_, height_,
            width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
            columns);
      }

      // Inner products, one GEMM per group.
      // caffe_cpu_gemm argument order, as noted by the original author:
      //   (TransA, TransB, M, N, K, alpha, A, B, beta, C)
      int group = 0;
      while (group < group_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
          (Dtype)1., filters + filter_stride * group,
          columns + column_stride * group,
          (Dtype)0., output + top[i]->offset(item) + output_stride * group);
        ++group;
      }

      // Bias.
      if (bias_term_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
            N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(),
            bias_multiplier_.cpu_data(),
            (Dtype)1., output + top[i]->offset(item));
      }
    }
  }
}
Ejemplo n.º 19
0
/*
 * CPU backward pass of a locally connected layer.
 *
 * Applies the activation gradient to l.delta and adds every batch item's
 * delta onto l.bias_updates. Then, per batch item:
 *   1. the input is unrolled into l.col_image and, for each output location,
 *      delta * column^T is accumulated into that location's block of
 *      l.filter_updates;
 *   2. if state.delta is set, filters^T * delta is written into l.col_image
 *      per location and folded back onto state.delta with col2im_cpu.
 */
void backward_local_layer(local_layer l, network_state state)
{
    int img, loc;
    int locations = l.out_w*l.out_h;
    /* Length of one unrolled patch. */
    int patch_len = l.size*l.size*l.c;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for(img = 0; img < l.batch; ++img){
        axpy_cpu(l.outputs, 1, l.delta + img*l.outputs, 1, l.bias_updates, 1);
    }

    for(img = 0; img < l.batch; ++img){
        float *img_input = state.input + img*l.w*l.h*l.c;
        float *img_delta = l.delta + img*l.outputs;

        im2col_cpu(img_input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);

        /* Weight gradient, one block per output location (beta = 1). */
        for(loc = 0; loc < locations; ++loc){
            float *loc_delta = img_delta + loc;
            float *loc_column = l.col_image + loc;
            float *loc_updates = l.filter_updates + loc*l.size*l.size*l.c*l.n;

            gemm(0,1,l.n,patch_len,1,1,loc_delta,locations,loc_column,locations,1,loc_updates,patch_len);
        }

        if(!state.delta) continue;

        /* Input gradient per location (beta = 0 overwrites l.col_image). */
        for(loc = 0; loc < locations; ++loc){
            float *loc_filters = l.filters + loc*l.size*l.size*l.c*l.n;
            float *loc_delta = img_delta + loc;
            float *loc_column = l.col_image + loc;

            gemm(1,0,patch_len,1,l.n,1,loc_filters,patch_len,loc_delta,locations,0,loc_column,locations);
        }

        col2im_cpu(l.col_image, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+img*l.c*l.h*l.w);
    }
}
Ejemplo n.º 20
0
/*
 * CPU backward pass of a deconvolutional layer.
 *
 * Applies the activation gradient to l.delta, then either runs the batch-norm
 * backward pass or accumulates the bias updates. For every batch item the
 * layer delta is unrolled with im2col_cpu into state.workspace and
 * alpha * input * workspace^T is accumulated into l.weight_updates, with
 * alpha = 1/l.batch. When state.delta is set, weights * workspace is
 * accumulated onto the matching slice of state.delta.
 */
void backward_deconvolutional_layer(layer l, network_state state)
{
    float alpha = 1./l.batch;
    int out_h = deconvolutional_out_height(l);
    int out_w = deconvolutional_out_width(l);
    int size = out_h*out_w;
    int i;

    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
    if(l.batch_normalize){
        backward_batchnorm_layer(l, state);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
    }

    for(i = 0; i < l.batch; ++i){
        int m = l.c;
        int n = l.size*l.size*l.n;
        int k = l.h*l.w;

        /*
         * One input image holds l.c*l.h*l.w = m*k floats (A is read as an
         * m x k matrix with lda = k), so the per-image stride is m*k.
         * The previous stride of m*n pointed at the wrong data for i > 0.
         */
        float *a = state.input + i*m*k;
        float *b = state.workspace;
        float *c = l.weight_updates;

        /* Padding is passed as 0 here. */
        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w,
                l.size, l.stride, 0, b);
        gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n);

        if(state.delta){
            /* Dimensions for the input-gradient product; these shadow the
             * outer m, n, k, a, b, c on purpose. */
            int m = l.c;
            int n = l.h*l.w;
            int k = l.size*l.size*l.n;

            float *a = l.weights;
            float *b = state.workspace;
            float *c = state.delta + i*n*m;

            /* beta = 1: accumulate onto the existing upstream delta. */
            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }
}
Ejemplo n.º 21
0
/*
 * CPU backward pass of a grouped convolutional layer.
 *
 * Applies the activation gradient to l.delta, then either runs the batch-norm
 * backward pass or accumulates the bias updates. For every (batch item,
 * group) pair:
 *   1. the group's input slice is unrolled into net.workspace and
 *      delta * workspace^T is accumulated into the group's weight updates;
 *   2. if net.delta is set, weights^T * delta is written into net.workspace
 *      and folded back onto the group's slice of net.delta with col2im_cpu.
 */
void backward_convolutional_layer(convolutional_layer l, network net)
{
    int img, grp;
    int filters_per_group = l.n/l.groups;
    int patch_len = l.size*l.size*l.c/l.groups;
    int out_area = l.out_w*l.out_h;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if(l.batch_normalize){
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, out_area);
    }

    for(img = 0; img < l.batch; ++img){
        for(grp = 0; grp < l.groups; ++grp){
            /* Linear index of this (image, group) slice. */
            int slice = img*l.groups + grp;

            float *group_delta = l.delta + slice*filters_per_group*out_area;
            float *group_updates = l.weight_updates + grp*l.nweights/l.groups;
            float *group_input = net.input+slice*l.c/l.groups*l.h*l.w;

            /* Weight gradient (beta = 1: accumulate). */
            im2col_cpu(group_input, l.c/l.groups, l.h, l.w,
                    l.size, l.stride, l.pad, net.workspace);
            gemm(0,1,filters_per_group,patch_len,out_area,1,group_delta,out_area,net.workspace,out_area,1,group_updates,patch_len);

            if(!net.delta) continue;

            /* Input gradient (beta = 0: overwrite the workspace). */
            float *group_weights = l.weights + grp*l.nweights/l.groups;
            gemm(1,0,patch_len,out_area,filters_per_group,1,group_weights,patch_len,group_delta,out_area,0,net.workspace,out_area);

            col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride,
                l.pad, net.delta + slice*l.c/l.groups*l.h*l.w);
        }
    }
}
/*
 * CPU backward pass of a deconvolutional layer.
 *
 * Applies the activation gradient to l.delta, then either runs the batch-norm
 * backward pass or accumulates the bias updates. For every batch item the
 * layer delta is unrolled into net.workspace and input * workspace^T is
 * accumulated into l.weight_updates. When net.delta is set,
 * weights * workspace is accumulated onto the item's slice of net.delta.
 */
void backward_deconvolutional_layer(layer l, network net)
{
    int img;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if(l.batch_normalize){
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
    }

    //if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));

    for(img = 0; img < l.batch; ++img){
        int in_channels = l.c;
        int filter_len = l.size*l.size*l.n;
        int in_area = l.h*l.w;

        float *img_input = net.input + img*in_channels*in_area;

        /* Weight gradient (beta = 1: accumulate across the batch). */
        im2col_cpu(l.delta + img*l.outputs, l.out_c, l.out_h, l.out_w,
                l.size, l.stride, l.pad, net.workspace);
        gemm_cpu(0,1,in_channels,filter_len,in_area,1,img_input,in_area,net.workspace,in_area,1,l.weight_updates,filter_len);

        if(net.delta){
            /* Input gradient (beta = 1: added onto the existing net.delta). */
            float *img_delta = net.delta + img*in_area*in_channels;

            gemm_cpu(0,0,in_channels,in_area,filter_len,1,l.weights,filter_len,net.workspace,in_area,1,img_delta,in_area);
        }
    }
}
Ejemplo n.º 23
0
// CPU forward pass of the locally connected layer.
// For every item the input is unrolled into the column buffer. For every
// output channel the columns are multiplied element-wise with that channel's
// weights, and the K_ rows of the product are reduced by a GEMM against a
// 1 x K_ blob filled through a ConstantFiller with value 1. The bias blob is
// added per item when bias_term_ is set.
void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {

  Dtype* columns = col_buffer_.mutable_cpu_data();
  const Dtype* weights = this->blobs_[0]->cpu_data();
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = top[0]->mutable_cpu_data();

  // Reduction vector (1 x K_), constant-filled with 1.
  Blob<Dtype> ones;
  ones.Reshape(1, 1, 1, K_);
  FillerParameter ones_param;
  ones_param.set_value(1);
  ConstantFiller<Dtype> ones_filler(ones_param);
  ones_filler.Fill(&ones);

  // Scratch space for the element-wise products (K_ x N_).
  Blob<Dtype> products;
  products.Reshape(1, 1, K_, N_);

  for (int item = 0; item < num_; ++item) {
    im2col_cpu(input + bottom[0]->offset(item), channels_, height_,
               width_, kernel_size_, kernel_size_, pad_, pad_, stride_, stride_, columns);

    int out_ch = 0;
    while (out_ch < num_output_) {
      caffe_mul(K_*N_, columns, weights+this->blobs_[0]->offset(out_ch),
                products.mutable_cpu_data());

      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
                            (Dtype)1., ones.cpu_data(),
                            products.cpu_data(),
                            (Dtype)0., output + top[0]->offset(item, out_ch));
      ++out_ch;
    }

    if (!bias_term_) {
      continue;
    }
    // In-place addition of the bias blob to this item's output.
    caffe_add(M_ * N_, this->blobs_[1]->cpu_data(),
              output + top[0]->offset(item),
              output + top[0]->offset(item));
  }
}
Ejemplo n.º 24
0
// CPU forward pass of the tied convolution layer: the same weights
// (blobs_[0]) and bias (blobs_[1]) are applied to each of the num_in_
// bottom/top pairs, each pair having its own height_/width_/N_ entry, column
// buffer and bias multiplier.
void
TiedConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype> *> &bottom,
                                         vector<Blob<Dtype> *> *top) {
  const Dtype *filters = this->blobs_[0]->cpu_data();
  const int filter_stride = M_ * K_; // number of filter parameters in a group

  for (int i = 0; i < num_in_; ++i) {
    const Dtype *input = bottom[i]->cpu_data();
    // Per-group strides depend on this pair's output size N_[i].
    const int column_stride = K_ * N_[i];
    const int output_stride = M_ * N_[i];
    Dtype *output = (*top)[i]->mutable_cpu_data();
    Dtype *columns = this->col_buffers_[i]->mutable_cpu_data();

    for (int item = 0; item < num_; ++item) {
      // Unroll input regions into the column matrix for multiplication.
      im2col_cpu(input + bottom[i]->offset(item), channels_, height_[i],
                 width_[i], kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
                 stride_w_, columns);

      // Inner product, one GEMM per group (beta = 0: overwrite).
      int group = 0;
      while (group < group_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_[i], K_,
                              (Dtype)1., filters + filter_stride * group,
                              columns + column_stride * group, (Dtype)0.,
                              output + (*top)[i]->offset(item) +
                                  output_stride * group);
        ++group;
      }

      if (!bias_term_) {
        continue;
      }
      // Bias, accumulated on top of the product (beta = 1).
      caffe_cpu_gemm<Dtype>(
          CblasNoTrans, CblasNoTrans, num_output_, N_[i], 1, (Dtype)1.,
          this->blobs_[1]->cpu_data(),
          reinterpret_cast<const Dtype *>(bias_multipliers_[i]->cpu_data()),
          (Dtype)1., output + (*top)[i]->offset(item));
    }
  }
}
Ejemplo n.º 25
0
/*
 * CPU forward pass of a grouped convolutional layer.
 *
 * The output is zeroed. With l.xnor set, the weights are binarised and
 * swapped in via swap_binary, and the input is binarised into l.binary_input.
 * Every (batch item, group) slice is then unrolled into net.workspace and
 * weights * workspace is accumulated onto the output. Batch-norm or bias is
 * applied next, followed by the activation. swap_binary is called again at
 * the end when l.binary or l.xnor is set.
 *
 * `l` and `net` are received by value, so pointer swaps and the net.input
 * redirection only affect the local copies.
 */
void forward_convolutional_layer(convolutional_layer l, network net)
{
    int img, grp;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }

    int filters_per_group = l.n/l.groups;
    int patch_len = l.size*l.size*l.c/l.groups;
    int out_area = l.out_w*l.out_h;
    for(img = 0; img < l.batch; ++img){
        for(grp = 0; grp < l.groups; ++grp){
            /* Linear index of this (image, group) slice. */
            int slice = img*l.groups + grp;

            float *group_weights = l.weights + grp*l.nweights/l.groups;
            float *group_output = l.output + slice*out_area*filters_per_group;
            float *group_input = net.input + slice*l.c/l.groups*l.h*l.w;

            im2col_cpu(group_input,
                l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
            /* beta = 1: accumulate onto the zeroed output. */
            gemm(0,0,filters_per_group,out_area,patch_len,1,group_weights,patch_len,net.workspace,out_area,1,group_output,out_area);
        }
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);
}
Ejemplo n.º 26
0
// CPU backward pass of the tied convolution layer. Works like a regular
// convolution backward pass, repeated for every bottom/top pair, with the
// weight and bias diffs accumulated across all pairs.
//  - Weight diff: computed only when param_propagate_down_[0] is set.
//  - Bias diff:   computed only when bias_term_ and param_propagate_down_[1].
//  - Bottom diff: computed for pair i only when propagate_down[i] is set; the
//    column buffer is reused (overwritten) as scratch space for it.
void
TiedConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype> *> &top,
                                          const vector<bool> &propagate_down,
                                          vector<Blob<Dtype> *> *bottom) {
  const bool update_weights = this->param_propagate_down_[0];
  // Short-circuit keeps param_propagate_down_[1] unread without a bias term.
  const bool update_bias = bias_term_ && this->param_propagate_down_[1];

  const Dtype *filters = NULL;
  Dtype *filter_grad = NULL;
  if (update_weights) {
    filters = this->blobs_[0]->cpu_data();
    filter_grad = this->blobs_[0]->mutable_cpu_diff();
    // Start the accumulated weight diff from zero.
    caffe_set(this->blobs_[0]->count(), Dtype(0), filter_grad);
  }

  Dtype *bias_grad = NULL;
  if (update_bias) {
    bias_grad = this->blobs_[1]->mutable_cpu_diff();
    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_grad);
  }

  const int filter_stride = M_ * K_;
  for (int i = 0; i < num_in_; ++i) {
    const Dtype *out_grad = NULL;

    // Bias gradient, accumulated over every item of this pair.
    if (update_bias) {
      out_grad = top[i]->cpu_diff();
      for (int item = 0; item < num_; ++item) {
        caffe_cpu_gemv<Dtype>(
            CblasNoTrans, num_output_, N_[i], 1.,
            out_grad + top[i]->offset(item),
            reinterpret_cast<const Dtype *>(bias_multipliers_[i]->cpu_data()),
            1., bias_grad);
      }
    }

    // Nothing else to do for this pair unless a weight or bottom diff is wanted.
    if (!(update_weights || propagate_down[i])) {
      continue;
    }

    if (!out_grad) {
      out_grad = top[i]->cpu_diff();
    }
    Dtype *columns = this->col_buffers_[i]->mutable_cpu_data();
    const Dtype *in_data = (*bottom)[i]->cpu_data();
    Dtype *in_grad = (*bottom)[i]->mutable_cpu_diff();

    const int column_stride = K_ * N_[i];
    const int output_stride = M_ * N_[i];

    for (int item = 0; item < num_; ++item) {
      // The forward pass does not keep the column data, so recompute it.
      im2col_cpu(in_data + (*bottom)[i]->offset(item), channels_, height_[i],
                 width_[i], kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
                 stride_w_, columns);

      // Gradient w.r.t. weight: top diff * columns^T, accumulated (beta = 1).
      if (update_weights) {
        for (int group = 0; group < group_; ++group) {
          caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_[i],
                                (Dtype)1.,
                                out_grad + top[i]->offset(item) +
                                    output_stride * group,
                                columns + column_stride * group, (Dtype)1.,
                                filter_grad + filter_stride * group);
        }
      }

      // Gradient w.r.t. bottom data: weights^T * top diff.
      if (propagate_down[i]) {
        // Weights may not have been fetched yet if no weight diff is computed.
        if (filters == NULL) {
          filters = this->blobs_[0]->cpu_data();
        }
        for (int group = 0; group < group_; ++group) {
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_[i], M_,
                                (Dtype)1., filters + filter_stride * group,
                                out_grad + top[i]->offset(item) +
                                    output_stride * group,
                                (Dtype)0., columns + column_stride * group);
        }
        // Fold the columns back into the bottom diff.
        col2im_cpu(columns, channels_, height_[i], width_[i], kernel_h_,
                   kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
                   in_grad + (*bottom)[i]->offset(item));
      }
    }
  }
}
/*
 * Forward pass of a convolutional layer on the CPU.
 *
 * l.output is first filled with zeros. Then, for each of the l.batch images,
 * the input is passed through im2col_cpu (result in l.col_image) and gemm is
 * called with the filters and that column buffer, targeting the image's slice
 * of l.output. Afterwards optional batch normalization, the bias and the
 * activation are applied to l.output.
 *
 * l     - layer description, passed by value; the buffers it points to
 *         (col_image, output, and mean/variance when training with batch
 *         normalization) are handed to the helpers as destinations.
 * state - network state, passed by value; state.input is consumed as l.batch
 *         consecutive blocks of l.c*l.h*l.w floats and state.train selects
 *         between batch statistics and the rolling statistics.
 */
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    const int out_h = convolutional_out_height(l);
    const int out_w = convolutional_out_width(l);

    /* Sizes handed to gemm for a single image. */
    const int filter_count = l.n;
    const int patch_len = l.size*l.size*l.c;
    const int out_plane = out_h*out_w;

    float *weights = l.filters;
    float *columns = l.col_image;
    float *out_cursor = l.output;
    int img;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for (img = 0; img < l.batch; ++img) {
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, columns);
        gemm(0, 0, filter_count, out_plane, patch_len, 1,
             weights, patch_len,
             columns, out_plane,
             1,
             out_cursor, out_plane);

        /* Step to the next image's output slice and input block. */
        out_cursor += out_plane*filter_count;
        state.input += l.c*l.h*l.w;
    }

    if (l.batch_normalize) {
        /* The statistics helpers use the layer's stored output size. */
        const int stored_plane = l.out_h*l.out_w;

        if (!state.train) {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.n, stored_plane);
        } else {
            mean_cpu(l.output, l.batch, l.n, stored_plane, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.n, stored_plane, l.variance);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.n, stored_plane);
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_plane);
    }

    add_bias(l.output, l.biases, l.batch, l.n, out_plane);
    activate_array(l.output, filter_count*out_plane*l.batch, l.activation);
}
Ejemplo n.º 28
0
	/**
	 * CPU forward pass of the non-local layer.
	 *
	 * Drives the internal sub-layers in a fixed order: split_layer_0, a
	 * per-image im2col / im2col_center step, split_layer_1, the euclidean
	 * layer (whose output is then scaled by 1 / bottom[0]->channels()), the
	 * smooth-threshold layer, split_layer_3, the normalize layer,
	 * split_layer_2 and, only when there are exactly three tops, the
	 * eltwise layer.
	 *
	 * Between those steps two buffers are filled by replication: for every
	 * image, the per-image block of the source blob is copied channels_
	 * times back-to-back into the destination (normalize output -> top[1],
	 * split_3_top_vec[0] -> split_2_bottom_vec[0]).
	 *
	 * @param bottom input blobs; forwarded to split_layer_0, and bottom[0]
	 *               supplies the channel count used for the scaling.
	 * @param top    output blobs; top[1] is written directly here.
	 */
	void NonLocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
		const vector<Blob<Dtype>*>& top)
	{
		split_layer_0->Forward(bottom, split_0_top_vec);

		// Per image: column buffers from the two outputs of split_layer_0.
		for (int img = 0; img < num_; ++img)
		{
			// The two literal 1s are extra im2col_cpu arguments in this
			// code base (presumably dilation -- TODO confirm).
			im2col_cpu(
				split_0_top_vec[0]->cpu_data() + split_0_top_vec[0]->offset(img),
				channels_, height_, width_,
				kernel_h_, kernel_w_,
				pad_h_, pad_w_,
				stride_h_, stride_w_,
				1, 1,
				img2col_0_top.mutable_cpu_data() + img2col_0_top.offset(img));

			im2col_center_cpu(
				split_0_top_vec[1]->cpu_data() + split_0_top_vec[1]->offset(img),
				channels_, height_, width_,
				kernel_h_, kernel_w_,
				pad_h_, pad_w_,
				stride_h_, stride_w_,
				img2col_1_top.mutable_cpu_data() + img2col_1_top.offset(img));
		}

		split_layer_1->Forward(split_1_bottom_vec, split_1_top_vec);
		euclidean_bottom_vec[0]->ShareData(*split_1_top_vec[1]);
		euclidean_layer->Forward(euclidean_bottom_vec, euclidean_top_vec);

		// Scale the euclidean output by the reciprocal of the input channel count.
		caffe_scal(euclidean_top_vec[0]->count(),
			(Dtype)(1.0 / bottom[0]->channels()),
			euclidean_top_vec[0]->mutable_cpu_data());

		smooth_threshold_layer->Forward(smooth_bottom_vec, smooth_top_vec);
		split_layer_3->Forward(split_3_bottom_vec, split_3_top_vec);
		normalize_bottom_vec[0]->ShareData(*split_3_top_vec[1]);
		normalize_layer->Forward(normalize_bottom_vec, normalize_top_vec);

		// Replicate each image's normalize output channels_ times into top[1].
		const Dtype* norm_src = normalize_top_vec[0]->cpu_data();
		Dtype* top1_dst = top[1]->mutable_cpu_data();
		const int norm_stride = normalize_top_vec[0]->offset(1);
		const int norm_images = normalize_top_vec[0]->num();

		for (int img = 0; img < norm_images; ++img)
		{
			const Dtype* image_src = norm_src + img * norm_stride;
			for (int ch = 0; ch < channels_; ++ch, top1_dst += norm_stride)
			{
				caffe_copy(norm_stride, image_src, top1_dst);
			}
		}

		// Same replication, from split_3_top_vec[0] into the input of split_layer_2.
		const int split3_stride = split_3_top_vec[0]->offset(1);
		Dtype* split2_dst = split_2_bottom_vec[0]->mutable_cpu_data();
		const Dtype* split3_src = split_3_top_vec[0]->cpu_data();
		const int split2_images = split_2_bottom_vec[0]->num();

		for (int img = 0; img < split2_images; ++img)
		{
			const Dtype* image_src = split3_src + img * split3_stride;
			for (int ch = 0; ch < channels_; ++ch, split2_dst += split3_stride)
			{
				caffe_copy(split3_stride, image_src, split2_dst);
			}
		}

		split_layer_2->Forward(split_2_bottom_vec, split_2_top_vec);

		// The eltwise stage only runs when a third top blob is present.
		if (top.size() == 3)
		{
			eltwise_layer->Forward(eltwise_bottom_vec, eltwise_top_vec);
		}
	}
Ejemplo n.º 29
0
 // Convenience wrapper: forwards to im2col_cpu with the stored input
 // channels/height/width, kernel, padding and stride values, so callers
 // only supply the source image pointer and the destination column buffer.
 inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
   im2col_cpu(data,
              conv_in_channels_, conv_in_height_, conv_in_width_,
              kernel_h_, kernel_w_,
              pad_h_, pad_w_,
              stride_h_, stride_w_,
              col_buff);
 }
Ejemplo n.º 30
0
/**
 * CPU backward pass of the local layer.
 *
 * The bias gradient (when bias_term_ is set) and the weight gradient are
 * zeroed and then accumulated over all num_ images. The bottom gradient is
 * produced per image only when propagate_down[0] is true.
 *
 * @param top            top[0] supplies the incoming gradient (cpu_diff).
 * @param propagate_down propagate_down[0] enables the bottom-gradient path.
 * @param bottom         bottom[0] supplies the forward input (cpu_data) and
 *                       receives the gradient (mutable_cpu_diff).
 */
void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

  const Dtype* grad_out = top[0]->cpu_diff();
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* grad_in = bottom[0]->mutable_cpu_diff();
  Dtype* cols = col_buffer_.mutable_cpu_data();
  Dtype* cols_grad = col_buffer_.mutable_cpu_diff();
  const Dtype* weights = this->blobs_[0]->cpu_data();
  Dtype* grad_w = this->blobs_[0]->mutable_cpu_diff();

  // Scratch storage: one row of N_ values for the bottom-gradient products,
  // and a K_ x N_ buffer for one output map's weight-gradient contribution.
  Blob<Dtype> row_scratch;
  row_scratch.Reshape(1, 1, 1, N_);

  Blob<Dtype> patch_scratch;
  patch_scratch.Reshape(1, 1, K_, N_);
  Dtype* patch_buf = patch_scratch.mutable_cpu_data();

  // Bias gradient: sum of the top gradient over the batch.
  if (bias_term_) {
    Dtype* grad_b = this->blobs_[1]->mutable_cpu_diff();
    memset(grad_b, 0, this->blobs_[1]->count() * sizeof(Dtype));
    for (int img = 0; img < num_; ++img) {
      caffe_add(M_ * N_, grad_b, grad_out + top[0]->offset(img), grad_b);
    }
  }

  memset(grad_w, 0, this->blobs_[0]->count() * sizeof(Dtype));

  for (int img = 0; img < num_; ++img) {
    // Rebuild the column buffer for this image.
    im2col_cpu(input + bottom[0]->offset(img), channels_, height_, width_,
               kernel_size_, kernel_size_, pad_, pad_, stride_, stride_,
               cols);

    // Weight gradient: for each output map, multiply its top gradient with
    // every row of the column buffer, then add the K_ x N_ result to that
    // map's slice of the weight gradient.
    for (int out = 0; out < num_output_; ++out) {
      const Dtype* map_grad = grad_out + top[0]->offset(img, out);
      for (int k = 0; k < K_; ++k) {
        caffe_mul(N_, map_grad,
                  cols + col_buffer_.offset(0, k),
                  patch_buf + patch_scratch.offset(0, 0, k));
      }
      caffe_cpu_axpby(K_ * N_, Dtype(1.0), patch_buf, Dtype(1.0),
                      grad_w + this->blobs_[0]->offset(out));
    }

    if (!propagate_down[0]) {
      continue;
    }

    // Bottom gradient: accumulate (top gradient * weight row) over all
    // output maps into the column-gradient buffer, starting from zero.
    memset(cols_grad, 0, col_buffer_.count() * sizeof(Dtype));
    for (int out = 0; out < num_output_; ++out) {
      for (int k = 0; k < K_; ++k) {
        caffe_mul(N_, grad_out + top[0]->offset(img, out),
                  weights + this->blobs_[0]->offset(out, 0, k),
                  row_scratch.mutable_cpu_data());

        caffe_cpu_axpby(N_, Dtype(1.0), row_scratch.cpu_data(), Dtype(1.0),
                        cols_grad + col_buffer_.offset(0, k));
      }
    }

    // Fold the column gradient back into this image's bottom gradient.
    col2im_cpu(cols_grad, channels_, height_, width_,
               kernel_size_, kernel_size_, pad_, pad_, stride_, stride_,
               grad_in + bottom[0]->offset(img));
  }
}