// Fills colMat for image `imNum` / channel group `cnGroup` of `inpBlob`.
//  * 1x1 kernel: colMat is constructed directly on the group's source pointer
//    (ksize rows, rows()*cols() columns) and nothing else is done.
//  * OpenCL builds with CV_32F input: the group is unrolled by im2col_ocl
//    into a temporary UMat which is then copied to colMat.
//  * Otherwise im2col_cpu is used for CV_32F and CV_64F input.
// For any other element type the non-1x1 path leaves colMat untouched.
void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup)
{
    uchar *groupData = inpBlob.ptr(imNum, cnGroup*inpGroupCn);

    if (is1x1())
    {
        colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), groupData);
        return;
    }

#ifdef HAVE_OPENCL
    if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
    {
        // Select exactly one image and one channel group; keep the remaining axes whole.
        std::vector<Range> selection(4, Range::all());
        selection[0] = Range(imNum, imNum+1);
        selection[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);

        UMat deviceSrc = inpBlob.matRef()(&selection[0]).getUMat(ACCESS_READ);
        UMat deviceCols(colMat.size(), colMat.type());
        im2col_ocl(deviceSrc, inpGroupCn, inpH, inpW, kerH, kerW,
                   padH, padW, strideH, strideW, deviceCols);
        deviceCols.copyTo(colMat);
        return;
    }
#endif // HAVE_OPENCL

    const int elemType = inpBlob.type();
    if (elemType == CV_32F)
    {
        im2col_cpu((float *)groupData, inpGroupCn, inpH, inpW, kerH, kerW,
                   padH, padW, strideH, strideW, (float *)colMat.ptr());
    }
    else if (elemType == CV_64F)
    {
        im2col_cpu((double*)groupData, inpGroupCn, inpH, inpW, kerH, kerW,
                   padH, padW, strideH, strideW, (double*)colMat.ptr());
    }
}
/*
 * Forward pass of a convolutional layer.
 *
 * The output is zeroed, then for every image in the batch the input is
 * unrolled with im2col_cpu into state.workspace and multiplied by the
 * filters: gemm_bin with l.cfilters when l.binary is set, gemm with
 * l.filters otherwise. Afterwards the binary path applies scale_bias, the
 * regular path applies forward_batchnorm_layer when l.batch_normalize is
 * set; both then add the biases and run the activation.
 *
 * `l` and `state` are passed by value, so advancing state.input here does
 * not affect the caller.
 */
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int img;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /* Disabled in the original source:
    if(l.binary){
        binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
        binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
        swap_binary(&l);
    }
    */

    int m = l.n;                    /* number of filters            */
    int k = l.size*l.size*l.c;      /* values per unrolled patch    */
    int n = out_h*out_w;            /* output positions per filter  */

    float *columns = state.workspace;
    float *out = l.output;

    for(img = 0; img < l.batch; ++img){
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, columns);
        if(l.binary){
            gemm_bin(m,n,k,1,l.cfilters,k,columns,n,out,n);
        } else {
            gemm(0,0,m,n,k,1,l.filters,k,columns,n,1,out,n);
        }
        out += n*m;
        state.input += l.c*l.h*l.w;
    }

    if(l.binary){
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
    } else if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }

    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    activate_array(l.output, m*n*l.batch, l.activation);
}
/**
 * Builds the im2col index mask.
 *
 * index_mask_ is filled with the running pixel index 0 .. height_*width_-1
 * and passed through im2col_cpu, so im2col_mask_ receives whatever
 * im2col_cpu produces for that index image. col2im_mask_ is only reshaped
 * here.
 *
 * @return false (after logging a warning) when any geometry value is not
 *         positive, true once the mask has been computed.
 */
bool BaseConvolutionLayer<Dtype>::setupMaskIM2COL() {
  // Check every dimension on its own. The previous single product of all
  // seven values overflows int for ordinary layer sizes, and an even number
  // of negative values would have produced a positive product.
  if (height_ <= 0 || width_ <= 0 || channels_ <= 0 ||
      kernel_h_ <= 0 || kernel_w_ <= 0 ||
      height_out_ <= 0 || width_out_ <= 0) {
    LOG(WARNING)<< "skipping because at least one value is zero";
    return false;
  }

  DLOG(INFO) << "num_ = " << num_;
  DLOG(INFO) << "height_ = " << height_;
  DLOG(INFO) << "width_ = " << width_;
  DLOG(INFO) << "channels_ = " << channels_;
  DLOG(INFO) << "kernel_h_ = " << kernel_h_;
  DLOG(INFO) << "kernel_w_ = " << kernel_w_;
  // These two lines used to print kernel_h_/kernel_w_ under the stride labels.
  DLOG(INFO) << "stride_h_ = " << stride_h_;
  DLOG(INFO) << "stride_w_ = " << stride_w_;
  DLOG(INFO) << "height_out_ = " << height_out_;
  DLOG(INFO) << "width_out_ = " << width_out_;

  index_mask_.Reshape(1, 1, height_, width_);
  im2col_mask_.Reshape(1, channels_*kernel_h_*kernel_w_, height_out_, width_out_);
  col2im_mask_.Reshape(1, 1, height_, width_);

  const int pixel_count = height_*width_;
  for ( int pixel = 0; pixel < pixel_count; pixel++ ) {
    index_mask_.mutable_cpu_data()[pixel] = pixel;
  }
  // iSNAPSHOT("index mask", index_mask_.cpu_data(), height_*width_);

  DLOG(INFO) << "call im2col_cpu()";
  // NOTE(review): index_mask_ is shaped with a single channel, yet
  // im2col_cpu is told there are channels_ channels. Confirm im2col_cpu does
  // not read past height_*width_ elements when channels_ > 1.
  im2col_cpu(index_mask_.cpu_data(), channels_, height_, width_,
             kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
             im2col_mask_.mutable_cpu_data());
  return true;
}
// Unrolls every image of bottom[0] into top[0].
// For each of the num_ images the shape bookkeeping is debug-checked, then
// either the 2-D im2col_cpu (two spatial axes and N-D not forced) or the
// generic im2col_nd_cpu is invoked on that image's slice.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* const input = bottom[0]->cpu_data();
  Dtype* const output = top[0]->mutable_cpu_data();

  for (int item = 0; item < num_; ++item) {
    DCHECK_EQ(bottom[0]->shape().size() - channel_axis_,
        num_spatial_axes_ + 1);
    DCHECK_EQ(top[0]->shape().size() - channel_axis_,
        num_spatial_axes_ + 1);
    DCHECK_EQ(kernel_shape_.count(), num_spatial_axes_);
    DCHECK_EQ(pad_.count(), num_spatial_axes_);
    DCHECK_EQ(stride_.count(), num_spatial_axes_);

    const Dtype* const image = input + item * bottom_dim_;
    Dtype* const columns = output + item * top_dim_;

    if (force_nd_im2col_ || num_spatial_axes_ != 2) {
      // Generic path: shapes are handed over starting at the channel axis.
      im2col_nd_cpu(image, num_spatial_axes_,
          bottom[0]->shape().data() + channel_axis_,
          top[0]->shape().data() + channel_axis_,
          kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
          columns);
      continue;
    }

    // 2-D path: element 0 / 1 of each parameter blob go to the first /
    // second spatial axis after the channel axis.
    im2col_cpu(image, channels_,
        bottom[0]->shape(channel_axis_ + 1),
        bottom[0]->shape(channel_axis_ + 2),
        kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
        pad_.cpu_data()[0], pad_.cpu_data()[1],
        stride_.cpu_data()[0], stride_.cpu_data()[1],
        columns);
  }
}
/*
 * Backward pass of a convolutional layer.
 *
 * Applies the activation gradient to l.delta, accumulates the bias updates,
 * and then for each image:
 *   - unrolls the input into l.col_image and accumulates
 *     l.filter_updates += delta * col_image^T;
 *   - if state.delta is set, overwrites l.col_image with filters^T * delta
 *     and folds it back with col2im_cpu into that image's slot of state.delta.
 */
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int filters = l.n;
    int patch = l.size*l.size*l.c;
    int positions = convolutional_out_height(l)*
        convolutional_out_width(l);
    int img;

    gradient_array(l.output, filters*positions*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, positions);

    for(img = 0; img < l.batch; ++img){
        float *out_delta = l.delta + img*filters*positions;
        float *image = state.input + img*l.c*l.h*l.w;

        im2col_cpu(image, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);
        gemm(0,1,filters,patch,positions,1,
                out_delta,positions,l.col_image,positions,1,l.filter_updates,patch);

        if(!state.delta) continue;

        /* beta = 0: col_image is reused as scratch for the column gradient. */
        gemm(1,0,patch,positions,filters,1,
                l.filters,patch,out_delta,positions,0,l.col_image,positions);
        col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad,
                state.delta + img*l.c*l.h*l.w);
    }
}
/*
 * Forward pass of a locally connected layer.
 *
 * Every image's output is first initialised with a copy of l.biases. Then
 * the input is unrolled into l.col_image and, for each output location, one
 * (l.n x patch) * (patch x 1) product is accumulated into that location's
 * output column, using the weight block belonging to the location. The
 * activation is applied to the whole batch at the end.
 */
void forward_local_layer(const local_layer l, network_state state)
{
    int out_h = local_out_height(l);
    int out_w = local_out_width(l);
    int locations = out_h * out_w;
    int patch = l.size * l.size * l.c;
    int img, loc;

    for (img = 0; img < l.batch; ++img) {
        copy_cpu(l.outputs, l.biases, 1, l.output + img * l.outputs, 1);
    }

    for (img = 0; img < l.batch; ++img) {
        float *image = state.input + img * l.w * l.h * l.c;
        float *image_out = l.output + img * l.outputs;

        im2col_cpu(image, l.c, l.h, l.w,
                   l.size, l.stride, l.pad, l.col_image);

        for (loc = 0; loc < locations; ++loc) {
            float *loc_weights = l.weights + loc * l.size * l.size * l.c * l.n;
            /* Column `loc` of col_image / output: stride between rows is `locations`. */
            gemm(0, 0, l.n, 1, patch, 1,
                 loc_weights, patch,
                 l.col_image + loc, locations,
                 1,
                 image_out + loc, locations);
        }
    }

    activate_array(l.output, l.outputs * l.batch, l.activation);
}
/*
 * Forward pass of a convolutional layer (col_image scratch variant).
 *
 * bias_output() initialises l.output, then each image is unrolled into
 * l.col_image and filters * col_image is accumulated (beta = 1) into that
 * image's output block. The activation runs over the whole batch last.
 * `state` is a by-value copy, so stepping state.input is local.
 */
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int img;

    bias_output(l.output, l.biases, l.batch, l.n, out_h*out_w);

    int filters = l.n;
    int patch = l.size*l.size*l.c;
    int positions = out_h*out_w;

    float *out = l.output;

    for(img = 0; img < l.batch; ++img){
        im2col_cpu(state.input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);
        gemm(0,0,filters,positions,patch,1,
                l.filters,patch,l.col_image,positions,1,out,positions);
        out += positions*filters;
        state.input += l.c*l.h*l.w;
    }

    activate_array(l.output, filters*positions*l.batch, l.activation);
}
// Backward pass of the (3-D capable) convolution layer on the CPU.
//
// Weight and bias diffs are zeroed once and then accumulated over every
// top/bottom pair and every image:
//   * bias diff   += top_diff(n) * bias_multiplier
//   * weight diff += top_diff(n, g) * col_data(g)^T   (col_data is recomputed
//     from the bottom data with im2col_cpu)
//   * when propagate_down[i] is set, col_diff(g) = weight(g)^T * top_diff(n, g)
//     is folded back into the bottom diff with col2im_cpu.
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);

  Dtype* bias_diff = NULL;
  if (bias_term_) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
  }

  const int weight_offset = M_ * K_;
  const int col_offset = K_ * N_;
  const int top_offset = M_ * N_;

  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->cpu_diff();
    const Dtype* bottom_data = (*bottom)[i]->cpu_data();
    Dtype* bottom_diff = (*bottom)[i]->mutable_cpu_diff();
    Dtype* col_data = col_buffer_.mutable_cpu_data();
    Dtype* col_diff = col_buffer_.mutable_cpu_diff();

    // Bias gradient, if necessary.
    if (bias_term_) {
      for (int n = 0; n < num_; ++n) {
        // top_diff belongs to top[i], so the offset is taken from top[i] as
        // well (this previously used top[0], unlike the rest of the loop).
        caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
            1., top_diff + top[i]->offset(n),
            static_cast<const Dtype*>(bias_multiplier_->cpu_data()), 1.,
            bias_diff);
      }
    }

    for (int n = 0; n < num_; ++n) {
      // Since we saved memory in the forward pass by not storing all col
      // data, we will need to recompute them.
      im2col_cpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
                 width_, depth_, kernel_h_, kernel_w_, kernel_d_,
                 pad_h_, pad_w_, pad_d_, stride_h_, stride_w_, stride_d_,
                 col_data);

      // Gradient w.r.t. weight. Note that we will accumulate diffs.
      for (int g = 0; g < group_; ++g) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
            (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
            col_data + col_offset * g, (Dtype)1.,
            weight_diff + weight_offset * g);
      }

      // Gradient w.r.t. bottom data, if necessary.
      if (propagate_down[i]) {
        for (int g = 0; g < group_; ++g) {
          caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
              (Dtype)1., weight + weight_offset * g,
              top_diff + top[i]->offset(n) + top_offset * g,
              (Dtype)0., col_diff + col_offset * g);
        }
        // col2im back to the data
        col2im_cpu(col_diff, channels_, height_, width_, depth_,
                   kernel_h_, kernel_w_, kernel_d_, pad_h_, pad_w_, pad_d_,
                   stride_h_, stride_w_, stride_d_,
                   bottom_diff + (*bottom)[i]->offset(n));
      }
    }
  }
}
// Forward pass for a single bottom/top pair.
// Per image: unroll with im2col_cpu, multiply each group's filters with its
// slice of the column buffer (overwriting the output, beta = 0), then add the
// bias through a rank-1 product with bias_multiplier_ when bias_term_ is set.
// Always returns Dtype(0.).
Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = (*top)[0]->mutable_cpu_data();
  Dtype* columns = col_buffer_.mutable_cpu_data();
  const Dtype* filters = this->blobs_[0]->cpu_data();

  const int filter_step = M_ * K_;
  const int column_step = K_ * N_;
  const int output_step = M_ * N_;

  for (int item = 0; item < num_; ++item) {
    // First, im2col
    im2col_cpu(input + bottom[0]->offset(item), channels_, height_,
               width_, kernel_size_, pad_, stride_, columns);

    // Second, inner product per group
    int group = 0;
    while (group < group_) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
          (Dtype)1., filters + filter_step * group,
          columns + column_step * group,
          (Dtype)0.,
          output + (*top)[0]->offset(item) + output_step * group);
      ++group;
    }

    // Third, add bias
    if (!bias_term_) {
      continue;
    }
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
        N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(),
        reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()),
        (Dtype)1., output + (*top)[0]->offset(item));
  }
  return Dtype(0.);
}
// Unrolls the whole bottom blob in one call: this im2col_cpu overload takes
// the batch size (bottom[0]->num()) together with the kernel, pad, stride
// and hole parameters and writes into top[0].
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* const images = bottom[0]->cpu_data();
  Dtype* const columns = top[0]->mutable_cpu_data();

  im2col_cpu(images, bottom[0]->num(), channels_, height_, width_,
             kernel_h_, kernel_w_,
             pad_h_, pad_w_,
             stride_h_, stride_w_,
             hole_h_, hole_w_,
             columns);
}
// Unrolls each image of bottom[0] into the matching slot of (*top)[0] using
// the layer's CHANNELS_/HEIGHT_/WIDTH_/KSIZE_/STRIDE_ settings.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  const Dtype* const input = bottom[0]->cpu_data();
  Dtype* const output = (*top)[0]->mutable_cpu_data();

  int item = 0;
  while (item < bottom[0]->num()) {
    const Dtype* const image = input + bottom[0]->offset(item);
    Dtype* const columns = output + (*top)[0]->offset(item);
    im2col_cpu(image, CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, STRIDE_, columns);
    ++item;
  }
}
// wrap im2col using param in this class // data is 3D(channels,height,width), col_buff is 3D(channels void conv_im2col_cpu(const Dtype* data, Dtype* col_buff){ // only implements conv2D if (!force_nd_im2col&&num_spatial_axes == 2){ // im2col transform the input into the form which is convenient for convolution // use conv_xxx cause dimensions could reverse in reshape(), we need dynamic input im2col_cpu(data, conv_in_channels, conv_input_shape.cpu_data()[1], conv_input_shape.cpu_data()[2], kernel_shape.cpu_data()[0], kernel_shape.cpu_data()[1], pad.cpu_data()[0], pad.cpu_data()[1], stride.cpu_data()[0], stride.cpu_data()[1], col_buff); } }
// Unrolls bottom[0] image by image into top[0], passing the layer's kernel,
// pad, stride and hole parameters to im2col_cpu.
void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* const input = bottom[0]->cpu_data();
  Dtype* const output = top[0]->mutable_cpu_data();

  int item = 0;
  while (item < bottom[0]->num()) {
    const Dtype* const image = input + bottom[0]->offset(item);
    Dtype* const columns = output + top[0]->offset(item);
    im2col_cpu(image, channels_, height_, width_,
               kernel_h_, kernel_w_, pad_h_, pad_w_,
               stride_h_, stride_w_, hole_h_, hole_w_,
               columns);
    ++item;
  }
}
// Unrolls bottom[0] image by image into (*top)[0] with a square kernel
// (kernel_size_), pad_ and stride_. Always returns Dtype(0.).
Dtype Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  const Dtype* const input = bottom[0]->cpu_data();
  Dtype* const output = (*top)[0]->mutable_cpu_data();

  int item = 0;
  while (item < bottom[0]->num()) {
    const Dtype* const image = input + bottom[0]->offset(item);
    Dtype* const columns = output + (*top)[0]->offset(item);
    im2col_cpu(image, channels_, height_, width_,
               kernel_size_, pad_, stride_, columns);
    ++item;
  }
  return Dtype(0.);
}
// Backward pass of the deconvolution layer on the CPU.
//
//  * Bias diff (if bias_term_): zeroed, then top_diff(n) * bias_multiplier
//    is accumulated for every image.
//  * Weight diff: zeroed, then for every image the top diff is unrolled
//    with im2col_cpu into col_diff and bottom_data(n, g) * col_diff(g)^T is
//    accumulated.
//  * Bottom diff (if propagate_down): weight(g) * col_diff(g), overwriting.
void DeConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  const Dtype* bottom_data = (*bottom)[0]->cpu_data();
  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
  // col_data is fetched as in the original but not read in this function.
  Dtype* col_data = col_buffer_.mutable_cpu_data();
  Dtype* col_diff = col_buffer_.mutable_cpu_diff();

  Dtype* bias_diff = NULL;
  if (bias_term_) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count());
    // JTS fixed gradient wrt. bias, not sure about the group stuff ...
    for (int img = 0; img < num_; ++img) {
      caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
          1., top_diff + top[0]->offset(img),
          reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()), 1.,
          bias_diff);
    }
  }

  int weight_offset = M_ * K_;
  int col_offset = K_ * N_;
  int bottom_offset = M_ * N_;

  memset(weight_diff, 0, sizeof(Dtype) * this->blobs_[0]->count());

  for (int img = 0; img < num_; ++img) {
    im2col_cpu(top_diff + top[0]->offset(img), channels_, height_out_,
               width_out_, kernel_size_, pad_, stride_, col_diff);

    // gradient wrt. weights
    for (int grp = 0; grp < group_; ++grp) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
          (Dtype)1.,
          bottom_data + (*bottom)[0]->offset(img) + bottom_offset * grp,
          col_diff + col_offset * grp, (Dtype)1.,
          weight_diff + weight_offset * grp);
    }

    if (!propagate_down) {
      continue;
    }
    for (int grp = 0; grp < group_; ++grp) {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
          (Dtype)1., weight + weight_offset * grp,
          col_diff + col_offset * grp,
          (Dtype)0.,
          bottom_diff + (*bottom)[0]->offset(img) + bottom_offset * grp);
    }
  }

  /* debug
  for (int n = 0; n < this->blobs_[0]->count(); ++n) {
    std::cout << this->blobs_[0]->cpu_diff()[n] << std::endl;
  }
  for (int n = 0; n < col_buffer_.count(); ++n) {
    //std::cout << col_buffer_.cpu_diff()[n] << " ";
    std::cout << top[0]->cpu_diff()[n] << " ";
  }
  std::cout << std::endl;
  */
}
// wrap im2col/col2im so we don't have to remember the (long) argument lists inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { if (!force_nd_im2col_ && num_spatial_axes_ == 2) { im2col_cpu(data, conv_in_channels_, conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], pad_.cpu_data()[0], pad_.cpu_data()[1], stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff); } else { im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(), col_buffer_shape_.data(), kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), col_buff); } }
// Forward pass of the 3-D capable convolution layer over every bottom/top
// pair. Per image: unroll with the depth-aware im2col_cpu, multiply each
// group's filters with its column slice (beta = 0), then add the bias via a
// rank-1 product with bias_multiplier_ when bias_term_ is set.
// Always returns Dtype(0.).
Dtype ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* input = bottom[i]->cpu_data();
    Dtype* output = (*top)[i]->mutable_cpu_data();
    Dtype* columns = col_buffer_.mutable_cpu_data();
    const Dtype* filters = this->blobs_[0]->cpu_data();

    int filter_step = M_ * K_;
    int column_step = K_ * N_;
    int output_step = M_ * N_;

    for (int item = 0; item < num_; ++item) {
      // First, im2col
      im2col_cpu(input + bottom[i]->offset(item), channels_, height_,
                 width_, depth_, kernel_h_, kernel_w_, kernel_d_,
                 pad_h_, pad_w_, pad_d_, stride_h_, stride_w_, stride_d_,
                 columns);

      // Second, inner product per group
      for (int grp = 0; grp < group_; ++grp) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
            (Dtype)1., filters + filter_step * grp,
            columns + column_step * grp,
            (Dtype)0.,
            output + (*top)[i]->offset(item) + output_step * grp);
      }

      // Third, add bias
      if (bias_term_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
            N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(),
            reinterpret_cast<const Dtype*>(bias_multiplier_->cpu_data()),
            (Dtype)1., output + (*top)[i]->offset(item));
      }

      // Disabled NaN-hunting code kept from the original:
      // if(this->layer_param_.name()=="fc"){
      //   const Dtype* bias_v =this->blobs_[1]->cpu_data();
      //   for(size_t t=0;t<this->blobs_[1]->count();t++){
      //     d_test+=top_data[t];
      //     if(top_data[t]!=0 && name_ =="fc_1"){LOG(INFO)<<top_data[t];}
      //     if(isnan(top_data[t])&&name_ =="fc_1"){
      //       LOG(INFO)<<"bias ["<< t<<"]="<<bias_v[t]<<"out of " <<this->blobs_[1]->count();
      //       sleep(100);
      //     }
      //   }
      // }
    }
  }
  return Dtype(0.);
}
// Forward pass of the convolution layer on the CPU.
//
// For every bottom/top pair and every image:
//   * non-1x1 kernels: the image is unrolled with im2col_cpu into the
//     column buffer;
//   * 1x1 kernels: no unrolling is done and the image's own data is used as
//     the column matrix;
//   * each group's filters are multiplied with its column slice (beta = 0);
//   * the bias is added through a rank-1 product with bias_multiplier_ when
//     bias_term_ is set.
void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* top_data = top[i]->mutable_cpu_data();

    // Writable scratch for im2col; only needed when unrolling happens.
    Dtype* col_scratch = NULL;
    if (!is_1x1_) {
      col_scratch = col_buffer_.mutable_cpu_data();
    }

    const Dtype* weight = this->blobs_[0]->cpu_data();
    int weight_offset = M_ * K_;  // number of filter parameters in a group
    int col_offset = K_ * N_;  // number of values in an input region / column
    int top_offset = M_ * N_;  // number of values in an output region / column

    for (int n = 0; n < num_; ++n) {
      // The GEMM only reads the column matrix, so a const pointer suffices.
      const Dtype* col_buff = NULL;
      if (!is_1x1_) {
        // im2col transformation: unroll input regions for filtering
        // into column matrix for multiplication.
        im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
            width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
            col_scratch);
        col_buff = col_scratch;
      } else {
        // Special case for 1x1 convolution: read the input in place. This
        // used to go through bottom[i]->mutable_cpu_data(), which requests
        // write access to an input blob that is never written here.
        col_buff = bottom_data + bottom[i]->offset(n);
      }

      // Take inner products for groups.
      // caffe_cpu_gemm argument order, for reference:
      //   (TransA, TransB, M, N, K, alpha, A, B, beta, C)
      for (int g = 0; g < group_; ++g) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
            (Dtype)1., weight + weight_offset * g, col_buff + col_offset * g,
            (Dtype)0., top_data + top[i]->offset(n) + top_offset * g);
      }

      // Add bias.
      if (bias_term_) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
            N_, 1, (Dtype)1., this->blobs_[1]->cpu_data(),
            bias_multiplier_.cpu_data(),
            (Dtype)1., top_data + top[i]->offset(n));
      }
    }
  }
}
/*
 * Backward pass of a locally connected layer.
 *
 * After applying the activation gradient to l.delta, the bias updates are
 * accumulated per image. Then, for every image, the input is unrolled into
 * l.col_image and each output location contributes an outer product
 * (delta column x input column) to its own block of l.filter_updates.
 * When state.delta is set, each location's column of l.col_image is
 * overwritten with filters^T * delta and the result is folded back into
 * state.delta with col2im_cpu.
 */
void backward_local_layer(local_layer l, network_state state)
{
    int locations = l.out_w*l.out_h;
    int patch = l.size*l.size*l.c;
    int img, loc;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for(img = 0; img < l.batch; ++img){
        axpy_cpu(l.outputs, 1, l.delta + img*l.outputs, 1, l.bias_updates, 1);
    }

    for(img = 0; img < l.batch; ++img){
        float *image = state.input + img*l.w*l.h*l.c;
        float *img_delta = l.delta + img*l.outputs;

        im2col_cpu(image, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);

        for(loc = 0; loc < locations; ++loc){
            float *loc_updates = l.filter_updates + loc*l.size*l.size*l.c*l.n;
            /* k = 1: a single column of delta times a single column of col_image. */
            gemm(0,1,l.n,patch,1,1,
                    img_delta + loc,locations,
                    l.col_image + loc,locations,
                    1,loc_updates,patch);
        }

        if(!state.delta) continue;

        for(loc = 0; loc < locations; ++loc){
            float *loc_filters = l.filters + loc*l.size*l.size*l.c*l.n;
            gemm(1,0,patch,1,l.n,1,
                    loc_filters,patch,
                    img_delta + loc,locations,
                    0,l.col_image + loc,locations);
        }

        col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad,
                state.delta + img*l.c*l.h*l.w);
    }
}
/*
 * Backward pass of a deconvolutional layer.
 *
 * Applies the activation gradient to l.delta, then either runs the
 * batch-norm backward pass or accumulates the bias updates. For each image
 * the output delta is unrolled with im2col_cpu (padding 0) into
 * state.workspace and
 *     weight_updates += alpha * input_i * workspace^T,  alpha = 1/batch.
 * When state.delta is set, weights * workspace is accumulated into that
 * image's slot of state.delta.
 */
void backward_deconvolutional_layer(layer l, network_state state)
{
    float alpha = 1./l.batch;
    int out_h = deconvolutional_out_height(l);
    int out_w = deconvolutional_out_width(l);
    int size = out_h*out_w;
    int i;

    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
    if(l.batch_normalize){
        backward_batchnorm_layer(l, state);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
    }

    for(i = 0; i < l.batch; ++i){
        int m = l.c;
        int n = l.size*l.size*l.n;
        int k = l.h*l.w;

        /* The GEMM below reads `a` as an m x k matrix (lda = k), i.e. one
         * image of l.c*l.h*l.w inputs, so consecutive images are m*k floats
         * apart. The stride used to be m*n, which addressed the wrong image
         * for every i > 0. */
        float *a = state.input + i*m*k;
        float *b = state.workspace;
        float *c = l.weight_updates;

        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w,
                l.size, l.stride, 0, b);
        gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n);

        if(state.delta){
            int dm = l.c;
            int dn = l.h*l.w;
            int dk = l.size*l.size*l.n;

            float *da = l.weights;
            float *db = state.workspace;
            float *dc = state.delta + i*dn*dm;

            gemm(0,0,dm,dn,dk,1,da,dk,db,dn,1,dc,dn);
        }
    }
}
/*
 * Backward pass of a grouped convolutional layer.
 *
 * Applies the activation gradient to l.delta, then either the batch-norm
 * backward pass or the bias accumulation. For every (image, group) pair the
 * group's input channels are unrolled into net.workspace and
 * delta * workspace^T is accumulated into the group's weight updates. When
 * net.delta is set, the workspace is overwritten with weights^T * delta and
 * folded back into net.delta with col2im_cpu.
 */
void backward_convolutional_layer(convolutional_layer l, network net)
{
    int img, grp;
    int m = l.n/l.groups;                   /* filters per group        */
    int n = l.size*l.size*l.c/l.groups;     /* patch values per group   */
    int k = l.out_w*l.out_h;                /* output positions         */

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if(l.batch_normalize){
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for(img = 0; img < l.batch; ++img){
        for(grp = 0; grp < l.groups; ++grp){
            int slice = img*l.groups + grp;
            float *slice_delta = l.delta + slice*m*k;
            float *grp_updates = l.weight_updates + grp*l.nweights/l.groups;
            float *image = net.input + slice*l.c/l.groups*l.h*l.w;

            im2col_cpu(image, l.c/l.groups, l.h, l.w,
                    l.size, l.stride, l.pad, net.workspace);
            gemm(0,1,m,n,k,1,slice_delta,k,net.workspace,k,1,grp_updates,n);

            if(!net.delta) continue;

            /* beta = 0: the workspace is reused for the column gradient. */
            gemm(1,0,n,k,m,1,
                    l.weights + grp*l.nweights/l.groups,n,
                    slice_delta,k,
                    0,net.workspace,k);
            col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride,
                    l.pad, net.delta + slice*l.c/l.groups*l.h*l.w);
        }
    }
}
/*
 * Backward pass of a deconvolutional layer (network variant).
 *
 * Applies the activation gradient to l.delta, then either the batch-norm
 * backward pass or the bias accumulation. For each image the output delta
 * is unrolled with im2col_cpu into net.workspace,
 * weight_updates += input_i * workspace^T, and, when net.delta is set,
 * weights * workspace is accumulated into that image's slot of net.delta.
 */
void backward_deconvolutional_layer(layer l, network net)
{
    int img;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if(l.batch_normalize){
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
    }

    //if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));

    for(img = 0; img < l.batch; ++img){
        int in_c = l.c;
        int kernel_vol = l.size*l.size*l.n;
        int in_area = l.h*l.w;

        float *image = net.input + img*in_c*in_area;
        float *columns = net.workspace;

        im2col_cpu(l.delta + img*l.outputs, l.out_c, l.out_h, l.out_w,
                l.size, l.stride, l.pad, columns);
        gemm_cpu(0,1,in_c,kernel_vol,in_area,1,
                image,in_area,columns,in_area,1,l.weight_updates,kernel_vol);

        if(net.delta){
            float *image_delta = net.delta + img*in_area*in_c;
            gemm_cpu(0,0,in_c,in_area,kernel_vol,1,
                    l.weights,kernel_vol,columns,in_area,1,image_delta,in_area);
        }
    }
}
// Forward pass of the locally connected layer on the CPU.
//
// Per image the input is unrolled with im2col_cpu. For each output channel
// the unrolled input is multiplied element-wise with that channel's weights
// and the K_ rows are summed by multiplying with a 1 x K_ blob of ones,
// giving the N_ outputs of the channel. The bias, when present, is added
// element-wise to the image's whole output.
void LocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  Dtype* columns = col_buffer_.mutable_cpu_data();
  const Dtype* weights = this->blobs_[0]->cpu_data();
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = top[0]->mutable_cpu_data();

  // Row vector of ones used to sum over the K_ dimension.
  Blob<Dtype> ones;
  ones.Reshape(1, 1, 1, K_);
  FillerParameter ones_param;
  ones_param.set_value(1);
  ConstantFiller<Dtype> ones_filler(ones_param);
  ones_filler.Fill(&ones);

  // Scratch for the element-wise product (K_ x N_).
  Blob<Dtype> product;
  product.Reshape(1, 1, K_, N_);

  for (int img = 0; img < num_; img++) {
    im2col_cpu(input + bottom[0]->offset(img), channels_, height_,
               width_, kernel_size_, kernel_size_,
               pad_, pad_, stride_, stride_, columns);

    for (int out_ch = 0; out_ch < num_output_; out_ch++) {
      caffe_mul(K_*N_, columns, weights + this->blobs_[0]->offset(out_ch),
                product.mutable_cpu_data());

      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, 1, N_, K_,
                            (Dtype)1., ones.cpu_data(),
                            product.cpu_data(),
                            (Dtype)0., output + top[0]->offset(img, out_ch));
    }

    if (!bias_term_) {
      continue;
    }
    caffe_add(M_ * N_, this->blobs_[1]->cpu_data(),
              output + top[0]->offset(img),
              output + top[0]->offset(img));
  }
}
// Forward pass of the tied convolution layer: the same weights (blobs_[0])
// and bias (blobs_[1]) are applied to each of the num_in_ bottom blobs,
// which may differ in spatial size (height_[i], width_[i], N_[i]).
void TiedConvolutionLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype> *> &bottom, vector<Blob<Dtype> *> *top) {
  const Dtype *filters = this->blobs_[0]->cpu_data();
  const int filter_step = M_ * K_;  // number of filter parameters in a group

  for (int i = 0; i < num_in_; ++i) {
    //-----Same concept as Forward_cpu of convolutionlayer-----
    const Dtype *input = bottom[i]->cpu_data();
    const int column_step = K_ * N_[i];
    const int output_step = M_ * N_[i];
    Dtype *output = (*top)[i]->mutable_cpu_data();
    Dtype *columns = this->col_buffers_[i]->mutable_cpu_data();

    for (int img = 0; img < num_; ++img) {
      // im2col transformation: unroll input regions for filtering
      // into column matrix for multiplication.
      im2col_cpu(input + bottom[i]->offset(img), channels_, height_[i],
                 width_[i], kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
                 stride_w_, columns);

      // Take inner product for groups.
      for (int grp = 0; grp < group_; ++grp) {
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_[i], K_,
                              (Dtype)1., filters + filter_step * grp,
                              columns + column_step * grp, (Dtype)0.,
                              output + (*top)[i]->offset(img) +
                                  output_step * grp);
      }

      // Add bias.
      if (!bias_term_) {
        continue;
      }
      caffe_cpu_gemm<Dtype>(
          CblasNoTrans, CblasNoTrans, num_output_, N_[i], 1, (Dtype)1.,
          this->blobs_[1]->cpu_data(),
          reinterpret_cast<const Dtype *>(bias_multipliers_[i]->cpu_data()),
          (Dtype)1., output + (*top)[i]->offset(img));
    }
    //---------------------------------------------------------
  }
}
/*
 * Forward pass of a grouped convolutional layer.
 *
 * The output is zeroed. With l.xnor set, the weights are binarized and
 * swapped in, and the input is binarized into l.binary_input, which then
 * replaces net.input. Each (image, group) slice is unrolled into
 * net.workspace and weights * workspace is accumulated into the matching
 * output slice. Batch norm or the bias add follows, then the activation.
 * swap_binary is called again at the end when l.binary or l.xnor is set.
 */
void forward_convolutional_layer(convolutional_layer l, network net)
{
    int img, grp;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }

    int m = l.n/l.groups;                   /* filters per group      */
    int k = l.size*l.size*l.c/l.groups;     /* patch values per group */
    int n = l.out_w*l.out_h;                /* output positions       */

    for(img = 0; img < l.batch; ++img){
        for(grp = 0; grp < l.groups; ++grp){
            int slice = img*l.groups + grp;
            float *grp_weights = l.weights + grp*l.nweights/l.groups;
            float *columns = net.workspace;
            float *slice_out = l.output + slice*n*m;
            float *slice_in = net.input + slice*l.c/l.groups*l.h*l.w;

            im2col_cpu(slice_in, l.c/l.groups, l.h, l.w,
                    l.size, l.stride, l.pad, columns);
            gemm(0,0,m,n,k,1,grp_weights,k,columns,n,1,slice_out,n);
        }
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);
}
// Backward pass of the tied convolution layer.
//
// Same idea as the regular convolution backward pass, but repeated for each
// of the num_in_ bottom/top pairs while accumulating into the single shared
// weight and bias diff. The column buffer of pair i is first filled with the
// unrolled bottom data (for the weight gradient) and afterwards overwritten
// with the column gradient (for the bottom gradient).
void TiedConvolutionLayer<Dtype>::Backward_cpu(
    const vector<Blob<Dtype> *> &top, const vector<bool> &propagate_down,
    vector<Blob<Dtype> *> *bottom) {
  const bool want_weight_grad = this->param_propagate_down_[0];
  const bool want_bias_grad = bias_term_ && this->param_propagate_down_[1];

  const Dtype *weight = NULL;
  Dtype *weight_diff = NULL;
  if (want_weight_grad) {
    weight = this->blobs_[0]->cpu_data();
    weight_diff = this->blobs_[0]->mutable_cpu_diff();
    // Init weight diff to all 0s.
    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
  }

  // Bias gradient if necessary.
  Dtype *bias_diff = NULL;
  if (want_bias_grad) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    caffe_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
  }

  const int weight_offset = M_ * K_;

  for (int i = 0; i < num_in_; ++i) {
    const Dtype *top_diff = NULL;

    if (want_bias_grad) {
      top_diff = top[i]->cpu_diff();
      for (int img = 0; img < num_; ++img) {
        caffe_cpu_gemv<Dtype>(
            CblasNoTrans, num_output_, N_[i], 1.,
            top_diff + top[i]->offset(img),
            reinterpret_cast<const Dtype *>(bias_multipliers_[i]->cpu_data()),
            1., bias_diff);
      }
    }

    // Nothing else to do for this pair unless a weight or bottom gradient
    // is requested.
    if (!want_weight_grad && !propagate_down[i]) {
      continue;
    }

    if (!top_diff) {
      top_diff = top[i]->cpu_diff();
    }
    Dtype *col_data = this->col_buffers_[i]->mutable_cpu_data();
    const Dtype *bottom_data = (*bottom)[i]->cpu_data();
    Dtype *bottom_diff = (*bottom)[i]->mutable_cpu_diff();

    const int col_offset = K_ * N_[i];
    const int top_offset = M_ * N_[i];

    for (int img = 0; img < num_; ++img) {
      // Since we saved memory in the forward pass by not storing all col
      // data, we will need to recompute them.
      im2col_cpu(bottom_data + (*bottom)[i]->offset(img), channels_,
                 height_[i], width_[i], kernel_h_, kernel_w_, pad_h_, pad_w_,
                 stride_h_, stride_w_, col_data);

      // Gradient w.r.t. weight. Note that we will accumulate diffs.
      // AJ: propagate error Delta W_ij = error from above * this_activation^T
      if (want_weight_grad) {
        for (int grp = 0; grp < group_; ++grp) {
          caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_[i],
                                (Dtype)1.,
                                top_diff + top[i]->offset(img) +
                                    top_offset * grp,
                                col_data + col_offset * grp, (Dtype)1.,
                                weight_diff + weight_offset * grp);
        }
      }

      // Gradient w.r.t. bottom data, if necessary.
      // AJ: error here = W * error from above
      if (!propagate_down[i]) {
        continue;
      }
      if (weight == NULL) {
        weight = this->blobs_[0]->cpu_data();
      }
      for (int grp = 0; grp < group_; ++grp) {
        caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_[i], M_,
                              (Dtype)1., weight + weight_offset * grp,
                              top_diff + top[i]->offset(img) +
                                  top_offset * grp,
                              (Dtype)0., col_data + col_offset * grp);
      }
      // col2im back to the data
      col2im_cpu(col_data, channels_, height_[i], width_[i], kernel_h_,
                 kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
                 bottom_diff + (*bottom)[i]->offset(img));
    }
    //---------------------------------------------------------
  }
}
/*
 * Forward pass of a convolutional layer with inline batch normalisation.
 *
 * The output is zeroed; each image is unrolled into l.col_image and
 * filters * col_image is accumulated into its output block. With
 * l.batch_normalize set the output is normalised (batch statistics when
 * state.train is set, rolling statistics otherwise) and scaled. The biases
 * are then added and the activation applied.
 *
 * A hand-written direct convolution ("add by fanghao") is kept below in
 * disabled form, as in the original.
 */
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int img;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    int filters = l.n;
    int patch = l.size*l.size*l.c;
    int positions = out_h*out_w;

    float *out = l.output;

    // printf("the l.size is %i \n", l.size);
    //printf("the m,k,n is %i,%i,%i \n", m,k,n);
    for(img = 0; img < l.batch; ++img){
        im2col_cpu(state.input, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);
        gemm(0,0,filters,positions,patch,1,
                l.filters,patch,l.col_image,positions,1,out,positions);
        out += positions*filters;
        state.input += l.c*l.h*l.w;
    }

    //add by fanghao
    /* Disabled direct convolution (a = l.filters, c = l.output, m = l.n):
    int ii,jj,kk,mm,pp,tt;
    int lcc = l.c;
    int lhh = l.h;
    int lww = l.w;
    int kernel = l.size;
    int pad;
    if(l.pad) pad = l.size/2;
    else pad = l.pad;
    lhh += 2*pad;
    lww += 2*pad;
    float *dataP;
    dataP = (float *)calloc(lcc*lhh*lww, sizeof(float));
    //printf("the l.h is %i \n", l.h);
    //printf("the l.w is %i \n", l.w);
    //printf("the lhh is %i \n", lhh);
    //printf("the lww is %i \n", lww);
    //printf("the pad is %i \n", pad);
    for(ii=0; ii < lcc; ii++)
        for(jj=pad; jj<lhh-pad; jj++)
            for(kk=pad; kk<lww-pad; kk++)
                dataP[ii*lhh*lww + jj*lww + kk] = state.input[ii*(lhh - 2*pad)*(lww-2*pad) + (jj - pad)*(lww - 2*pad) + kk-pad];
    for(ii=0; ii<m; ii++)
        for(jj=0; jj<out_h; jj++)
            for(kk=0; kk<out_w; kk++) {
                float tempAcc = 0.0;
                for(mm=0; mm<lcc; mm++)
                    for(pp=0; pp<kernel; pp++)
                        for(tt=0; tt<kernel; tt++)
                            tempAcc += a[ii*lcc*kernel*kernel+mm*kernel*kernel+pp*kernel+tt]*dataP[mm*lhh*lww+(l.stride*jj+pp)*lww+l.stride*kk+tt];
                c[ii*out_h*out_w+jj*out_w+kk] = tempAcc;
            }
    // c += n*m;
    //state.input += l.c*l.h*l.w;
    */

    if(l.batch_normalize){
        if(!state.train){
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance,
                    l.batch, l.n, l.out_h*l.out_w);
        } else {
            mean_cpu(l.output, l.batch, l.n, l.out_h*l.out_w, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.n, l.out_h*l.out_w, l.variance);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.n, l.out_h*l.out_w);
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
    }

    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    activate_array(l.output, filters*positions*l.batch, l.activation);
}
// Forward pass of the non-local layer: a fixed pipeline of internal layers.
//
//  1. split_layer_0 duplicates the bottom.
//  2. Per image, copy 0 goes through im2col_cpu into img2col_0_top and
//     copy 1 through im2col_center_cpu into img2col_1_top.
//  3. split_layer_1, the euclidean layer (its output is scaled by
//     1 / bottom channels), the smooth-threshold layer, split_layer_3 and
//     the normalize layer run in that order.
//  4. top[1] receives the normalize output, each image's block repeated
//     channels_ times.
//  5. split_2's bottom receives split_3_top_vec[0], each image's block
//     repeated channels_ times; split_layer_2 runs, and the eltwise layer
//     runs only when there are exactly three tops.
void NonLocalLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  split_layer_0->Forward(bottom, split_0_top_vec);

  for (int n = 0; n < num_; ++n) {
    im2col_cpu(split_0_top_vec[0]->cpu_data() + split_0_top_vec[0]->offset(n),
        channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
        stride_h_, stride_w_, 1, 1,
        img2col_0_top.mutable_cpu_data() + img2col_0_top.offset(n));
    im2col_center_cpu(
        split_0_top_vec[1]->cpu_data() + split_0_top_vec[1]->offset(n),
        channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
        stride_h_, stride_w_,
        img2col_1_top.mutable_cpu_data() + img2col_1_top.offset(n));
  }

  split_layer_1->Forward(split_1_bottom_vec, split_1_top_vec);

  euclidean_bottom_vec[0]->ShareData(*split_1_top_vec[1]);
  euclidean_layer->Forward(euclidean_bottom_vec, euclidean_top_vec);
  caffe_scal(euclidean_top_vec[0]->count(),
      (Dtype)(1.0 / bottom[0]->channels()),
      euclidean_top_vec[0]->mutable_cpu_data());

  smooth_threshold_layer->Forward(smooth_bottom_vec, smooth_top_vec);
  split_layer_3->Forward(split_3_bottom_vec, split_3_top_vec);

  normalize_bottom_vec[0]->ShareData(*split_3_top_vec[1]);
  normalize_layer->Forward(normalize_bottom_vec, normalize_top_vec);

  // top[1]: replicate every image's normalize output channels_ times.
  // (A plain top[1]->ShareData(*normalize_top_vec[0]) was disabled in the
  // original.) The source pointer was previously named split_3_top_data_1
  // although it reads normalize_top_vec[0]; a second, unused copy of the
  // same pointer has been removed.
  const Dtype* normalized_data = normalize_top_vec[0]->cpu_data();
  Dtype* top_1_data = top[1]->mutable_cpu_data();
  const int norm_offset = normalize_top_vec[0]->offset(1);
  for (int n = 0; n < normalize_top_vec[0]->num(); ++n) {
    for (int ch = 0; ch < channels_; ++ch) {
      caffe_copy(norm_offset, normalized_data, top_1_data);
      top_1_data += norm_offset;
    }
    normalized_data += norm_offset;
  }

  // split_2 bottom: replicate every image's split_3 output channels_ times.
  // (The original had a disabled variant that copied from smooth_top_vec[0].)
  const int tmp_offset = split_3_top_vec[0]->offset(1);
  Dtype* split_2_bottom_data = split_2_bottom_vec[0]->mutable_cpu_data();
  const Dtype* split_3_top_data = split_3_top_vec[0]->cpu_data();
  for (int n = 0; n < split_2_bottom_vec[0]->num(); ++n) {
    for (int ch = 0; ch < channels_; ++ch) {
      caffe_copy(tmp_offset, split_3_top_data, split_2_bottom_data);
      split_2_bottom_data += tmp_offset;
    }
    split_3_top_data += tmp_offset;
  }

  split_layer_2->Forward(split_2_bottom_vec, split_2_top_vec);
  if (top.size() == 3) {
    eltwise_layer->Forward(eltwise_bottom_vec, eltwise_top_vec);
  }
}
// wrap im2col/col2im so we don't have to remember the (long) argument lists inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); }
// Backward pass of the locally connected layer on the CPU.
//
//  * Bias diff (if bias_term_): zeroed, then every image's top diff is added
//    element-wise.
//  * Weight diff: zeroed, then for every image and output channel the
//    element-wise product of the channel's top diff with each of the K_ rows
//    of the unrolled input is accumulated.
//  * Bottom diff (if propagate_down[0]): the column diff is zeroed,
//    accumulated over all output channels as top diff times weights, and
//    folded back with col2im_cpu.
void LocalLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* input_diff = bottom[0]->mutable_cpu_diff();
  Dtype* columns = col_buffer_.mutable_cpu_data();
  Dtype* columns_diff = col_buffer_.mutable_cpu_diff();
  const Dtype* weights = this->blobs_[0]->cpu_data();
  Dtype* weights_diff = this->blobs_[0]->mutable_cpu_diff();
  Dtype* bias_diff = NULL;

  // Scratch: one row of N_ values, and a K_ x N_ product buffer.
  Blob<Dtype> row_scratch;
  row_scratch.Reshape(1, 1, 1, N_);

  Blob<Dtype> product;
  product.Reshape(1, 1, K_, N_);
  Dtype* product_data = product.mutable_cpu_data();

  if (bias_term_) {
    bias_diff = this->blobs_[1]->mutable_cpu_diff();
    memset(bias_diff, 0, sizeof(Dtype) * this->blobs_[1]->count());
    for (int img = 0; img < num_; ++img) {
      caffe_add(M_ * N_, bias_diff,
                top_diff + top[0]->offset(img),
                bias_diff);
    }
  }

  memset(weights_diff, 0, sizeof(Dtype) * this->blobs_[0]->count());

  for (int img = 0; img < num_; img++) {
    im2col_cpu(input + bottom[0]->offset(img), channels_, height_,
               width_, kernel_size_, kernel_size_,
               pad_, pad_, stride_, stride_, columns);

    // gradient wrt weight
    for (int out_ch = 0; out_ch < num_output_; out_ch++) {
      Dtype* channel_weights_diff =
          weights_diff + this->blobs_[0]->offset(out_ch);
      for (int row = 0; row < K_; row++) {
        caffe_mul(N_, top_diff + top[0]->offset(img, out_ch),
                  columns + col_buffer_.offset(0, row),
                  product_data + product.offset(0, 0, row));
      }
      caffe_cpu_axpby(K_*N_, Dtype(1.0), product_data,
                      Dtype(1.0), channel_weights_diff);
    }

    // gradient wrt bottom data
    if (!propagate_down[0]) {
      continue;
    }
    memset(columns_diff, 0, col_buffer_.count() * sizeof(Dtype));
    for (int out_ch = 0; out_ch < num_output_; out_ch++) {
      for (int row = 0; row < K_; row++) {
        caffe_mul(N_, top_diff + top[0]->offset(img, out_ch),
                  weights + this->blobs_[0]->offset(out_ch, 0, row),
                  row_scratch.mutable_cpu_data());

        caffe_cpu_axpby(N_, Dtype(1.0),
                        row_scratch.cpu_data(), Dtype(1.0),
                        columns_diff + col_buffer_.offset(0, row));
      }
    }

    // col2im back to the data
    col2im_cpu(columns_diff, channels_, height_, width_,
               kernel_size_, kernel_size_,
               pad_, pad_, stride_, stride_,
               input_diff + bottom[0]->offset(img));
  }
}