Example #1
bool Solver::Solve(const char* resume_file) {
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
  // Initialize to false every time we start solving.
  requested_early_exit_ = false;

  if (resume_file != nullptr) {
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }
  callback_soft_barrier();
  if (Caffe::restored_iter() != -1) {
    iter_ = Caffe::restored_iter();
    iterations_restored_ = iter_;  // for correct benchmarking
    iterations_last_ = -1;
  }

  // For a network that is trained by the solver, no bottom or top vecs
  // should be given, and we will just provide dummy vecs.
  int start_iter = iter_;
  Step(param_.max_iter() - iter_);
  // If we haven't already, save a snapshot after optimization, unless
  // overridden by setting snapshot_after_train := false
  if (param_.snapshot_after_train()
      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    if (Caffe::root_solver()) {
      Snapshot();
    }
  }
  Caffe::set_restored_iter(-1);
  iterations_restored_ = 0;
  iterations_last_ = 0;
  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return true;
  }
  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs if appropriate (based on the
  // display and test_interval settings, respectively).  Unlike in the rest of
  // training, for the train net we only run a forward pass as we've already
  // updated the parameters "max_iter" times -- this final pass is only done to
  // display the loss, which is computed in the forward pass.
  if (this->display()) {
    int average_loss = this->param_.average_loss();
    float loss;
    net_->Forward(&loss);

    UpdateSmoothedLoss(loss, start_iter, average_loss);

    LOG_IF(INFO, Caffe::root_solver()) << "Iteration "
        << iter_ << ", loss = " << smoothed_loss_;
  }
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    bool use_multi_gpu_testing = Caffe::solver_count() > 1;
    TestAll(0, use_multi_gpu_testing);
    callback_soft_barrier();
  }
  return false;
}
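In this variant, Solve reports through its boolean return value whether training was interrupted early. A minimal caller sketch, assuming a solver instance and a resume_file path obtained elsewhere (both names are illustrative):

const char* resume_file = nullptr;  // or the path to a saved solver state
bool stopped_early = solver->Solve(resume_file);
if (stopped_early) {
  LOG(INFO) << "Training stopped before reaching max_iter.";
}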
Example #2
template <typename Dtype>
void Solver<Dtype>::Solve(const char* resume_file) {
  CHECK(Caffe::root_solver());
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();

  // Initialize to false every time we start solving.
  requested_early_exit_ = false;

  if (resume_file) {
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }

  // For a network that is trained by the solver, no bottom or top vecs
  // should be given, and we will just provide dummy vecs.
  int start_iter = iter_;
  Step(param_.max_iter() - iter_);
  // If we haven't already, save a snapshot after optimization, unless
  // overridden by setting snapshot_after_train := false
  if (param_.snapshot_after_train()
      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    Snapshot();
  }
  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return;
  }
  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs if appropriate (based on the
  // display and test_interval settings, respectively).  Unlike in the rest of
  // training, for the train net we only run a forward pass as we've already
  // updated the parameters "max_iter" times -- this final pass is only done to
  // display the loss, which is computed in the forward pass.
  if (param_.display() && iter_ % param_.display() == 0) {
    int average_loss = this->param_.average_loss();
    Dtype loss;
    net_->Forward(&loss);

    UpdateSmoothedLoss(loss, start_iter, average_loss);

    LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss_;
  }
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    TestAll();
  }
  LOG(INFO) << "Optimization Done.";
}
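For context, a typical driver constructs the solver from a SolverParameter and then calls Solve with no arguments (or with a snapshot path to resume). The sketch below follows the pattern of Caffe's command-line tool; the solver prototxt path is an illustrative assumption:

#include "caffe/caffe.hpp"

caffe::SolverParameter solver_param;
caffe::ReadSolverParamsFromTextFileOrDie("solver.prototxt", &solver_param);
boost::shared_ptr<caffe::Solver<float> > solver(
    caffe::SolverRegistry<float>::CreateSolver(solver_param));
solver->Solve();  // or pass a snapshot path to restore and continue training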
Example #3
template <typename Dtype>
void Solver<Dtype>::Step(int iters) {
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  losses_.clear();
  smoothed_loss_ = 0;

  while (iter_ < stop_iter) {
    // zero-init the params
    net_->ClearParamDiffs();
    if (param_.test_interval() && iter_ % param_.test_interval() == 0
        && (iter_ > 0 || param_.test_initialization())
        && Caffe::root_solver()) {
      TestAll();
      if (requested_early_exit_) {
        // Break out of the while loop because stop was requested while testing.
        break;
      }
    }

    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_start();
    }
    const bool display = param_.display() && iter_ % param_.display() == 0;
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    Dtype loss = 0;
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward();
    }
    loss /= param_.iter_size();
    // average the loss across iterations for smoothed reporting
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    if (display) {
      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
          << ", loss = " << smoothed_loss_;
      const vector<Blob<Dtype>*>& result = net_->output_blobs();
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];
        const Dtype loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight
                            << " = " << loss_weight * result_vec[k] << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
    }
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_gradients_ready();
    }
    ApplyUpdate();

    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;

    SolverAction::Enum request = GetRequestedAction();

    // Save a snapshot if needed.
    if ((param_.snapshot()
         && iter_ % param_.snapshot() == 0
         && Caffe::root_solver()) ||
         (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      // Break out of training loop.
      break;
    }
  }
}
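The GetRequestedAction() call near the end of the loop is how external stop or snapshot requests (for example, from a signal handler) reach the training loop. A minimal, hypothetical wiring using the solver's SetActionFunction hook; the stop_requested flag is an assumption for illustration:

static volatile bool stop_requested = false;  // e.g. set from a SIGINT handler

solver->SetActionFunction([]() {
  return stop_requested ? caffe::SolverAction::STOP
                        : caffe::SolverAction::NONE;
});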
Example #4
void Solver::Step(int iters) {
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  losses_.clear();
  smoothed_loss_ = 0;
  const Caffe::Brew mode = Caffe::mode();
  const int solver_count = Caffe::solver_count();
  const bool root_solver = this->is_root();

  net_->set_solver(this);

#ifndef CPU_ONLY
  for (const shared_ptr<Blob>& param : net_->learnable_params()) {
    // To prevent allocations inside on_start call:
    param->allocate_data(mode == Caffe::GPU);
  }

  net_->InitializeLearnableDiffSpace();

  if (solver_count > 1) {
    // we need to sync all threads before starting, otherwise some cuda init,
    // malloc or other cuda stuff could interlock with in-loop cuda GPU sync
    // called in on_start.
    callback_soft_barrier();
    {
      unique_ptr<unique_lock<shared_mutex>> lock;
      if (root_solver) {
        lock.reset(new unique_lock<shared_mutex>(GPUMemory::read_write_mutex()));
      }
      callback_soft_barrier();
      callback_->on_start(net_->learnable_params());
    }
    callback_soft_barrier();
    LOG(INFO) << "Starting Optimization on GPU " << Caffe::current_device();
  }
  const bool use_multi_gpu_testing = Caffe::solver_count() > 1;
  const string mgpu_str = use_multi_gpu_testing ? "[MultiGPU] " : "";
#else
  const bool use_multi_gpu_testing = false;
  const string mgpu_str;
#endif

  uint64_t random_seed = param_.random_seed() >= 0 ?
      static_cast<uint64_t>(param_.random_seed()) : Caffe::next_seed();

  reduce_thread_.reset(new boost::thread(&Solver::Reduce, this,
      Caffe::current_device(), mode, random_seed, solver_count, root_solver));

  while (iter_ < stop_iter) {
    if (param_.snapshot_diff()) {
      net_->ClearParamDiffs();
    }  // we clean them in ApplyUpdate otherwise

    // Just started or restored?
    const bool first_loop = iter_ == 0 || iterations_last_ < 0;
    if (iter_ == 0) {
      if (TestAll(1, use_multi_gpu_testing)) {
        break;
      }
      callback_soft_barrier();
      LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Initial Test completed";
    } else if (param_.test_interval()
        && iter_ % param_.test_interval() == 0
        && iterations_last_ >= 0) {
      test_timer_.Start();
      if (TestAll(0, use_multi_gpu_testing)) {
        break;
      }
      callback_soft_barrier();
      float lapse = test_timer_.Seconds();
      LOG_IF(INFO, Caffe::root_solver()) << mgpu_str << "Tests completed in "
                                         << lapse << "s";
    }
    if (requested_early_exit_) {
      // Break out of the while loop because stop was requested while testing.
      break;
    }

    const bool display = this->display();
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    float loss = 0.F;
    if (first_loop) {
      iterations_last_ = iter_;
      iteration_timer_.Start();
      init_flag_.set();
    }

    iteration_start_signal();
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward(i + 1 == param_.iter_size());

      if (i == 0) {
        if (first_loop) {
          iter0_flag_.set();
          net_->wait_layers_init();
        }
        iter_size_complete_ = true;
      }
    }
    loss /= param_.iter_size();
    iteration_wait();
    if (requested_early_exit_) {
      total_lapse_ += iteration_timer_.Seconds();
      break;
    }

    // average the loss across iterations for smoothed reporting
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    if (display || iter_ <= 2 || iter_ + 1 >= stop_iter) {
      float lapse = iteration_timer_.Seconds();
      if (iter_ >= 2) {  // we skip 0th and 1st for correct benchmarking
        total_lapse_ += lapse;
        float per_s = (iter_ - iterations_last_) / (lapse > 0.F ? lapse : 1.F);
        LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
                                           << " (" << per_s << " iter/s, " << lapse << "s/"
                                           << param_.display() << " iter), loss = "
                                           << smoothed_loss_;
      } else {
        LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
                                           << " (" << lapse << " s), loss = " << smoothed_loss_;
      }
      const vector<Blob*>& result = net_->output_blobs();
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const float* result_vec = result[j]->cpu_data<float>();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];
        const float loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight
                << " = " << (loss_weight * result_vec[k]) << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
      PrintRate();
      iterations_last_ = iter_;
      iteration_timer_.Start();
    }
    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;

    SolverAction::Enum request = GetRequestedAction();
    // Save a snapshot if needed.
    if ((param_.snapshot()
         && iter_ % param_.snapshot() == 0
         && Caffe::root_solver()) ||
         (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      total_lapse_ += iteration_timer_.Seconds();
      // Break out of training loop.
      break;
    }
  }
  Finalize();
}