void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
    vector<Blob<Dtype>*>* bottom, vector<Blob<Dtype>*>* top,
    int check_bottom, int top_id, int top_data_id, bool element_wise) {
  if (element_wise) {
    CHECK_EQ(0, layer->blobs().size());
    CHECK_LE(0, top_id);
    CHECK_LE(0, top_data_id);
    const int top_count = (*top)[top_id]->count();
    for (int blob_id = 0; blob_id < bottom->size(); ++blob_id) {
      CHECK_EQ(top_count, (*bottom)[blob_id]->count());
    }
  }
  // First, figure out what blobs we need to check against.
  vector<Blob<Dtype>*> blobs_to_check;
  for (int i = 0; i < layer->blobs().size(); ++i) {
    blobs_to_check.push_back(layer->blobs()[i].get());
  }
  if (check_bottom < 0) {
    for (int i = 0; i < bottom->size(); ++i) {
      blobs_to_check.push_back((*bottom)[i]);
    }
  } else {
    CHECK(check_bottom < bottom->size());
    blobs_to_check.push_back((*bottom)[check_bottom]);
  }
  // Compute the gradient analytically using Backward.
  Caffe::set_random_seed(seed_);
  // Get any loss from the layer.
  Dtype computed_objective = layer->Forward(*bottom, top);
  // Get additional loss from the objective.
  computed_objective += GetObjAndGradient(top, top_id, top_data_id);
  layer->Backward(*top, true, bottom);
  // Store computed gradients for all checked blobs.
  vector<shared_ptr<Blob<Dtype> > >
      computed_gradient_blobs(blobs_to_check.size());
  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
    computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
    computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
    const int count = blobs_to_check[blob_id]->count();
    const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
    Dtype* computed_gradients =
        computed_gradient_blobs[blob_id]->mutable_cpu_data();
    caffe_copy(count, diff, computed_gradients);
  }
  // Compute derivative of top w.r.t. each bottom and parameter input using
  // finite differencing.
  // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
    const Dtype* computed_gradients =
        computed_gradient_blobs[blob_id]->cpu_data();
    // LOG(ERROR) << "Blob " << blob_id << ": checking "
    //     << current_blob->count() << " parameters.";
    for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
      // For an element-wise layer, we only need to do finite differencing to
      // compute the derivative of (*top)[top_id][top_data_id] w.r.t.
      // (*bottom)[blob_id][i] only for i == top_data_id.  For any other
      // i != top_data_id, we know the derivative is 0 by definition, and
      // simply check that that's true.
      Dtype estimated_gradient = 0;
      if (!element_wise || (feat_id == top_data_id)) {
        // Do finite differencing.
        // Compute loss with stepsize_ added to input.
        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
        Caffe::set_random_seed(seed_);
        Dtype positive_objective = layer->Forward(*bottom, top);
        positive_objective += GetObjAndGradient(top, top_id, top_data_id);
        // Compute loss with stepsize_ subtracted from input.
        current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
        Caffe::set_random_seed(seed_);
        Dtype negative_objective = layer->Forward(*bottom, top);
        negative_objective += GetObjAndGradient(top, top_id, top_data_id);
        // Recover original input value.
        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
        estimated_gradient = (positive_objective - negative_objective) /
            stepsize_ / 2.;
      }
      Dtype computed_gradient = computed_gradients[feat_id];
      Dtype feature = current_blob->cpu_data()[feat_id];
      // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
      //     << current_blob->cpu_diff()[feat_id];
      if (kink_ - kink_range_ > fabs(feature)
          || fabs(feature) > kink_ + kink_range_) {
        // We check relative accuracy, but for too small values, we threshold
        // the scale factor by 1.
        Dtype scale = max(
            max(fabs(computed_gradient), fabs(estimated_gradient)), 1.);
        EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
            << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id
            << "," << top_data_id << "," << blob_id << "," << feat_id;
      }
      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
      // LOG(ERROR) << "computed gradient: " << computed_gradient
      //     << " estimated_gradient: " << estimated_gradient;
    }
  }
}
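
// ---------------------------------------------------------------------------
// The estimated_gradient above is the standard two-sided (central)
// finite-difference estimate f'(x) ~= (f(x + h) - f(x - h)) / (2 * h).
// Below is a minimal, self-contained sketch of that estimate on a toy
// objective; it is an illustration only and not part of GradientChecker.
// ---------------------------------------------------------------------------
template <typename Dtype, typename Objective>
Dtype CentralDifferenceSketch(const Objective& f, Dtype x, Dtype h) {
  // Perturb the input up by h, then down by h, and rescale by the step,
  // exactly as the loop above does with stepsize_ on a single blob entry.
  return (f(x + h) - f(x - h)) / h / Dtype(2);
}
// Example: for f(x) = x * x,
//   CentralDifferenceSketch<double>([](double x) { return x * x; }, 3.0, 1e-3)
// is approximately 6.0, matching the analytic derivative 2 * x.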
void GradientChecker<Dtype>::CheckGradientSingle(
    Layer<Dtype>* layer, const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top, int_tp check_bottom, int_tp top_id,
    int_tp top_data_id, bool element_wise) {
  if (element_wise) {
    CHECK_EQ(0, layer->blobs().size());
    CHECK_LE(0, top_id);
    CHECK_LE(0, top_data_id);
    const int_tp top_count = top[top_id]->count();
    for (int_tp blob_id = 0; blob_id < bottom.size(); ++blob_id) {
      CHECK_EQ(top_count, bottom[blob_id]->count());
    }
  }
  // First, figure out what blobs we need to check against, and zero init
  // parameter blobs.
  vector<Blob<Dtype>*> blobs_to_check;
  vector<bool> propagate_down(bottom.size(), check_bottom == -1);
  for (int_tp i = 0; i < layer->blobs().size(); ++i) {
    Blob<Dtype>* blob = layer->blobs()[i].get();
    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
    blobs_to_check.push_back(blob);
  }
  if (check_bottom == -1) {
    for (int_tp i = 0; i < bottom.size(); ++i) {
      blobs_to_check.push_back(bottom[i]);
    }
  } else if (check_bottom >= 0) {
    CHECK_LT(check_bottom, bottom.size());
    blobs_to_check.push_back(bottom[check_bottom]);
    propagate_down[check_bottom] = true;
  }
  CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check.";
  // Compute the gradient analytically using Backward.
  Caffe::set_random_seed(seed_, Caffe::GetDefaultDevice());
  // Ignore the loss from the layer (it's just the weighted sum of the losses
  // from the top blobs, whose gradients we may want to test individually).
  layer->Forward(bottom, top);
  // Get additional loss from the objective.
  GetObjAndGradient(*layer, top, top_id, top_data_id);
  layer->Backward(top, propagate_down, bottom);
  // Store computed gradients for all checked blobs.
  vector<shared_ptr<Blob<Dtype> > >
      computed_gradient_blobs(blobs_to_check.size());
  for (int_tp blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
    computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
    computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
    const int_tp count = blobs_to_check[blob_id]->count();
    const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
    Dtype* computed_gradients =
        computed_gradient_blobs[blob_id]->mutable_cpu_data();
    caffe_cpu_copy(count, diff, computed_gradients);
  }
  // Compute derivative of top w.r.t. each bottom and parameter input using
  // finite differencing.
  // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
  for (int_tp blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
    const Dtype* computed_gradients =
        computed_gradient_blobs[blob_id]->cpu_data();
    // LOG(ERROR) << "Blob " << blob_id << ": checking "
    //     << current_blob->count() << " parameters.";
    for (int_tp feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
      // For an element-wise layer, we only need to do finite differencing to
      // compute the derivative of top[top_id][top_data_id] w.r.t.
      // bottom[blob_id][i] only for i == top_data_id.  For any other
      // i != top_data_id, we know the derivative is 0 by definition, and
      // simply check that that's true.
      Dtype estimated_gradient = 0;
      Dtype positive_objective = 0;
      Dtype negative_objective = 0;
      if (!element_wise || (feat_id == top_data_id)) {
        // Do finite differencing.
        // Compute loss with stepsize_ added to input.
        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
        Caffe::set_random_seed(seed_, Caffe::GetDefaultDevice());
        layer->Forward(bottom, top);
        positive_objective =
            GetObjAndGradient(*layer, top, top_id, top_data_id);
        // Compute loss with stepsize_ subtracted from input.
        current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
        Caffe::set_random_seed(seed_, Caffe::GetDefaultDevice());
        layer->Forward(bottom, top);
        negative_objective =
            GetObjAndGradient(*layer, top, top_id, top_data_id);
        // Recover original input value.
        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
        estimated_gradient = (positive_objective - negative_objective) /
            stepsize_ / 2.;
      }
      Dtype computed_gradient = computed_gradients[feat_id];
      Dtype feature = current_blob->cpu_data()[feat_id];
      // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
      //     << current_blob->cpu_diff()[feat_id];
      if (kink_ - kink_range_ > fabs(feature)
          || fabs(feature) > kink_ + kink_range_) {
        // We check relative accuracy, but for too small values, we threshold
        // the scale factor by 1.
        Dtype scale = std::max<Dtype>(
            std::max(fabs(computed_gradient), fabs(estimated_gradient)),
            Dtype(1.));
        EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
            << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id
            << "," << top_data_id << "," << blob_id << "," << feat_id
            << "; feat = " << feature
            << "; objective+ = " << positive_objective
            << "; objective- = " << negative_objective;
      }
      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
      // LOG(ERROR) << "computed gradient: " << computed_gradient
      //     << " estimated_gradient: " << estimated_gradient;
    }
  }
}
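
// ---------------------------------------------------------------------------
// The kink_/kink_range_ guard above skips any feature whose magnitude lies in
// the band [kink_ - kink_range_, kink_ + kink_range_], where the layer may be
// non-differentiable (e.g. a ReLU around zero) and the finite-difference
// estimate is unreliable. A minimal sketch of that predicate, for
// illustration only; OutsideKinkBandSketch is not part of GradientChecker.
// ---------------------------------------------------------------------------
template <typename Dtype>
bool OutsideKinkBandSketch(Dtype feature, Dtype kink, Dtype kink_range) {
  // Mirrors the condition guarding EXPECT_NEAR above: the gradient is only
  // compared when |feature| falls outside the kink band.
  return (kink - kink_range > fabs(feature))
      || (fabs(feature) > kink + kink_range);
}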
void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
    vector<Blob<Dtype>*>* bottom, vector<Blob<Dtype>*>* top,
    int check_bottom, int top_id, int top_data_id, bool element_wise) {
  if (element_wise) {
    CHECK_EQ(0, layer->blobs().size());
    CHECK_LE(0, top_id);
    CHECK_LE(0, top_data_id);
    const int top_count = (*top)[top_id]->count();
    for (int blob_id = 0; blob_id < bottom->size(); ++blob_id) {
      CHECK_EQ(top_count, (*bottom)[blob_id]->count());
    }
  }
  // First, figure out what blobs we need to check against.
  vector<Blob<Dtype>*> blobs_to_check;
  vector<bool> propagate_down(bottom->size(), check_bottom < 0);
  for (int i = 0; i < layer->blobs().size(); ++i) {
    blobs_to_check.push_back(layer->blobs()[i].get());
  }
  if (check_bottom < 0) {
    for (int i = 0; i < bottom->size(); ++i) {
      blobs_to_check.push_back((*bottom)[i]);
    }
  } else {
    CHECK_LT(check_bottom, bottom->size());
    blobs_to_check.push_back((*bottom)[check_bottom]);
    propagate_down[check_bottom] = true;
  }
  // Compute the gradient analytically using Backward.
  Caffe::set_random_seed(seed_);
  // Ignore the loss from the layer (it's just the weighted sum of the losses
  // from the top blobs, whose gradients we may want to test individually).
  layer->Forward(*bottom, top);
  // Get additional loss from the objective.
  GetObjAndGradient(*layer, top, top_id, top_data_id);
  layer->Backward(*top, propagate_down, bottom);
  // Store computed gradients for all checked blobs.
  vector<shared_ptr<Blob<Dtype> > >
      computed_gradient_blobs(blobs_to_check.size());
  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
    computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
    computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
    const int count = blobs_to_check[blob_id]->count();
    const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
    Dtype* computed_gradients =
        computed_gradient_blobs[blob_id]->mutable_cpu_data();
    caffe_copy(count, diff, computed_gradients);
  }
  // Compute derivative of top w.r.t. each bottom and parameter input using
  // finite differencing.
  // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
    const Dtype* computed_gradients =
        computed_gradient_blobs[blob_id]->cpu_data();
    // LOG(ERROR) << "Blob " << blob_id << ": checking "
    //     << current_blob->count() << " parameters.";
    for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
      // For an element-wise layer, we only need to do finite differencing to
      // compute the derivative of (*top)[top_id][top_data_id] w.r.t.
      // (*bottom)[blob_id][i] only for i == top_data_id.  For any other
      // i != top_data_id, we know the derivative is 0 by definition, and
      // simply check that that's true.
      Dtype estimated_gradient = 0;
      Dtype positive_objective = 0;
      Dtype negative_objective = 0;
      if (!element_wise || (feat_id == top_data_id)) {
        // Do finite differencing.
        // Compute loss with stepsize_ added to input.
        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
        Caffe::set_random_seed(seed_);
        layer->Forward(*bottom, top);
        positive_objective =
            GetObjAndGradient(*layer, top, top_id, top_data_id);
        // Compute loss with stepsize_ subtracted from input.
        current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
        Caffe::set_random_seed(seed_);
        layer->Forward(*bottom, top);
        negative_objective =
            GetObjAndGradient(*layer, top, top_id, top_data_id);
        // Recover original input value.
        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
        estimated_gradient = (positive_objective - negative_objective) /
            stepsize_ / 2.;
      }
      Dtype computed_gradient = computed_gradients[feat_id];
      Dtype feature = current_blob->cpu_data()[feat_id];
      // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
      //     << current_blob->cpu_diff()[feat_id];
      if (kink_ - kink_range_ > fabs(feature)
          || fabs(feature) > kink_ + kink_range_) {
        // We check relative accuracy, but for too small values, we threshold
        // the scale factor by 1.
        Dtype scale = std::max(
            std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.);
        EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
            << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id
            << "," << top_data_id << "," << blob_id << "," << feat_id
            << "; feat = " << feature
            << "; objective+ = " << positive_objective
            << "; objective- = " << negative_objective;
      }
      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
      // LOG(ERROR) << "computed gradient: " << computed_gradient
      //     << " estimated_gradient: " << estimated_gradient;
    }
  }
}