bool Solver::Solve(const char* resume_file) {
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();

  // Initialize to false every time we start solving.
  requested_early_exit_ = false;

  if (resume_file != nullptr) {
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }
  callback_soft_barrier();
  if (Caffe::restored_iter() != -1) {
    iter_ = Caffe::restored_iter();
    iterations_restored_ = iter_;  // for correct benchmarking
    iterations_last_ = -1;
  }

  // For a network that is trained by the solver, no bottom or top vecs
  // should be given, and we will just provide dummy vecs.
  int start_iter = iter_;
  Step(param_.max_iter() - iter_);

  // If we haven't already, save a snapshot after optimization, unless
  // overridden by setting snapshot_after_train := false
  if (param_.snapshot_after_train() &&
      (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    if (Caffe::root_solver()) {
      Snapshot();
    }
  }
  Caffe::set_restored_iter(-1);
  iterations_restored_ = 0;
  iterations_last_ = 0;

  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return true;
  }

  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs if appropriate (based on the
  // display and test_interval settings, respectively). Unlike in the rest of
  // training, for the train net we only run a forward pass as we've already
  // updated the parameters "max_iter" times -- this final pass is only done to
  // display the loss, which is computed in the forward pass.
  if (this->display()) {
    int average_loss = this->param_.average_loss();
    float loss;
    net_->Forward(&loss);
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
        << ", loss = " << smoothed_loss_;
  }
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    bool use_multi_gpu_testing = Caffe::solver_count() > 1;
    TestAll(0, use_multi_gpu_testing);
    callback_soft_barrier();
  }
  return false;
}
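The this->display() helper called above is defined elsewhere in the class and is not part of this section. A plausible reconstruction, consistent with the inline check param_.display() && iter_ % param_.display() == 0 used by the templated variant below:

// Hypothetical sketch -- the real helper lives elsewhere in Solver.
bool Solver::display() const {
  return param_.display() && iter_ % param_.display() == 0;
}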
template <typename Dtype>
void Solver<Dtype>::Solve(const char* resume_file) {
  CHECK(Caffe::root_solver());
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();

  // Initialize to false every time we start solving.
  requested_early_exit_ = false;

  if (resume_file) {
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }

  // For a network that is trained by the solver, no bottom or top vecs
  // should be given, and we will just provide dummy vecs.
  int start_iter = iter_;
  Step(param_.max_iter() - iter_);

  // If we haven't already, save a snapshot after optimization, unless
  // overridden by setting snapshot_after_train := false
  if (param_.snapshot_after_train() &&
      (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    Snapshot();
  }
  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return;
  }

  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs if appropriate (based on the
  // display and test_interval settings, respectively). Unlike in the rest of
  // training, for the train net we only run a forward pass as we've already
  // updated the parameters "max_iter" times -- this final pass is only done to
  // display the loss, which is computed in the forward pass.
  if (param_.display() && iter_ % param_.display() == 0) {
    int average_loss = this->param_.average_loss();
    Dtype loss;
    net_->Forward(&loss);
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss_;
  }
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    TestAll();
  }
  LOG(INFO) << "Optimization Done.";
}
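Both variants of Solve(), and the Step() implementations below, delegate loss smoothing to UpdateSmoothedLoss(), whose body is not part of this section. A sketch of what it plausibly looks like, based on how losses_, smoothed_loss_, and average_loss are used here: it keeps a circular window of the last average_loss iteration losses and maintains their running mean in O(1) per update.

template <typename Dtype>
void Solver<Dtype>::UpdateSmoothedLoss(Dtype loss, int start_iter,
    int average_loss) {
  if (losses_.size() < average_loss) {
    // Window not full yet: append and fold the new loss into the mean.
    losses_.push_back(loss);
    int size = losses_.size();
    smoothed_loss_ = (smoothed_loss_ * (size - 1) + loss) / size;
  } else {
    // Window full: overwrite the oldest sample and adjust the mean in O(1).
    int idx = (iter_ - start_iter) % average_loss;
    smoothed_loss_ += (loss - losses_[idx]) / average_loss;
    losses_[idx] = loss;
  }
}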
template <typename Dtype>
void Solver<Dtype>::Step(int iters) {
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  losses_.clear();
  smoothed_loss_ = 0;

  while (iter_ < stop_iter) {
    // zero-init the params
    net_->ClearParamDiffs();
    if (param_.test_interval() && iter_ % param_.test_interval() == 0 &&
        (iter_ > 0 || param_.test_initialization()) &&
        Caffe::root_solver()) {
      TestAll();
      if (requested_early_exit_) {
        // Break out of the while loop because stop was requested while testing.
        break;
      }
    }

    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_start();
    }
    const bool display = param_.display() && iter_ % param_.display() == 0;
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    Dtype loss = 0;
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward();
    }
    loss /= param_.iter_size();
    // average the loss across iterations for smoothed reporting
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    if (display) {
      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
          << ", loss = " << smoothed_loss_;
      const vector<Blob<Dtype>*>& result = net_->output_blobs();
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];
        const Dtype loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight
                            << " = " << loss_weight * result_vec[k] << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
    }
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_gradients_ready();
    }
    ApplyUpdate();

    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;

    SolverAction::Enum request = GetRequestedAction();

    // Save a snapshot if needed.
    if ((param_.snapshot()
         && iter_ % param_.snapshot() == 0
         && Caffe::root_solver()) ||
        (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      // Break out of training loop.
      break;
    }
  }
}
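Step() polls GetRequestedAction() once per weight update to decide between snapshotting and stopping; the action values it branches on are declared in caffe/solver.hpp, essentially as follows:

// Declared in caffe/solver.hpp (upstream BVLC Caffe); shown here for
// reference since Step() branches on these values.
namespace SolverAction {
  enum Enum {
    NONE = 0,     // take no special action
    STOP = 1,     // stop training; snapshot_after_train controls snapshotting
    SNAPSHOT = 2  // take a snapshot and keep training
  };
}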
void Solver::Step(int iters) {
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  losses_.clear();
  smoothed_loss_ = 0;
  const Caffe::Brew mode = Caffe::mode();
  const int solver_count = Caffe::solver_count();
  const bool root_solver = this->is_root();
  net_->set_solver(this);

#ifndef CPU_ONLY
  for (const shared_ptr<Blob>& param : net_->learnable_params()) {
    // To prevent allocations inside on_start call:
    param->allocate_data(mode == Caffe::GPU);
  }
  net_->InitializeLearnableDiffSpace();

  if (solver_count > 1) {
    // we need to sync all threads before starting, otherwise some cuda init,
    // malloc or other cuda stuff could interlock with in-loop cuda GPU sync
    // called in on_start.
    callback_soft_barrier();
    {
      unique_ptr<unique_lock<shared_mutex>> lock;
      if (root_solver) {
        lock.reset(new unique_lock<shared_mutex>(GPUMemory::read_write_mutex()));
      }
      callback_soft_barrier();
      callback_->on_start(net_->learnable_params());
    }
    callback_soft_barrier();
    LOG(INFO) << "Starting Optimization on GPU " << Caffe::current_device();
  }
  const bool use_multi_gpu_testing = Caffe::solver_count() > 1;
  const string mgpu_str = use_multi_gpu_testing ? "[MultiGPU] " : "";
#else
  const bool use_multi_gpu_testing = false;
  const string mgpu_str;
#endif

  uint64_t random_seed = param_.random_seed() >= 0 ?
      static_cast<uint64_t>(param_.random_seed()) : Caffe::next_seed();
  reduce_thread_.reset(new boost::thread(&Solver::Reduce, this,
      Caffe::current_device(), mode, random_seed, solver_count, root_solver));

  while (iter_ < stop_iter) {
    if (param_.snapshot_diff()) {
      net_->ClearParamDiffs();
    }  // we clean them in ApplyUpdate otherwise

    // Just started or restored?
    const bool first_loop = iter_ == 0 || iterations_last_ < 0;
    if (iter_ == 0) {
      if (TestAll(1, use_multi_gpu_testing)) {
        break;
      }
      callback_soft_barrier();
      LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Initial Test completed";
    } else if (param_.test_interval() && iter_ % param_.test_interval() == 0 &&
        iterations_last_ >= 0) {
      test_timer_.Start();
      if (TestAll(0, use_multi_gpu_testing)) {
        break;
      }
      callback_soft_barrier();
      float lapse = test_timer_.Seconds();
      LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Tests completed in "
          << lapse << "s";
    }
    if (requested_early_exit_) {
      // Break out of the while loop because stop was requested while testing.
      break;
    }

    const bool display = this->display();
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    float loss = 0.F;
    if (first_loop) {
      iterations_last_ = iter_;
      iteration_timer_.Start();
      init_flag_.set();
    }
    iteration_start_signal();
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward(i + 1 == param_.iter_size());
      if (i == 0) {
        if (first_loop) {
          iter0_flag_.set();
          net_->wait_layers_init();
        }
        iter_size_complete_ = true;
      }
    }
    loss /= param_.iter_size();
    iteration_wait();
    if (requested_early_exit_) {
      total_lapse_ += iteration_timer_.Seconds();
      break;
    }

    // average the loss across iterations for smoothed reporting
    UpdateSmoothedLoss(loss, start_iter, average_loss);

    if (display || iter_ <= 2 || iter_ + 1 >= stop_iter) {
      float lapse = iteration_timer_.Seconds();
      if (iter_ >= 2) {  // we skip 0th and 1st for correct benchmarking
        total_lapse_ += lapse;
        float per_s = (iter_ - iterations_last_) / (lapse > 0.F ? lapse : 1.F);
        LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
            << " (" << per_s << " iter/s, " << lapse << "s/"
            << param_.display() << " iter), loss = " << smoothed_loss_;
      } else {
        LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
            << " (" << lapse << " s), loss = " << smoothed_loss_;
      }
      const vector<Blob*>& result = net_->output_blobs();
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const float* result_vec = result[j]->cpu_data<float>();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];
        const float loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight << " = "
                << (loss_weight * result_vec[k]) << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
      PrintRate();
      iterations_last_ = iter_;
      iteration_timer_.Start();
    }

    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;

    SolverAction::Enum request = GetRequestedAction();
    // Save a snapshot if needed.
    if ((param_.snapshot() && iter_ % param_.snapshot() == 0 &&
        Caffe::root_solver()) || (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      total_lapse_ += iteration_timer_.Seconds();
      // Break out of training loop.
      break;
    }
  }
  Finalize();
}
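For context, these entry points are typically driven the way BVLC Caffe's caffe train tool drives them. A minimal sketch of such driver code; the solver.prototxt path is a placeholder:

#include <memory>
#include "caffe/caffe.hpp"

int main() {
  caffe::SolverParameter solver_param;
  // Parse the solver configuration; "solver.prototxt" is a placeholder path.
  caffe::ReadSolverParamsFromTextFileOrDie("solver.prototxt", &solver_param);

  // Instantiate the solver type named in the config (e.g. SGD) and run it.
  std::shared_ptr<caffe::Solver<float>> solver(
      caffe::SolverRegistry<float>::CreateSolver(solver_param));
  solver->Solve();  // trains to max_iter, snapshotting as configured
  return 0;
}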