Esempio n. 1
0
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    timer->Start();
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  // Send gradients to parent
  if (parent_) {
    timer->Start();
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

#ifdef DEBUG
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
    *timing << " send_grad: " << timer->MilliSeconds();
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}
void SocketSync<Dtype>::on_gradients_ready() {
  // Reduce gradients from local GPUs
  P2PSync<Dtype>::on_gradients_ready();
  // Send gradients to corresponding parameter server node
  int peer = rank_ + 1;
  for (int n = 0; n < peers_.size() - 1; ++n) {
    if (peer == peers_.size()) {
      peer = 0;
    }
    diff_send_[peer]->Write();
    peer++;
  }
  // Sum gradients as they are received
  peer = rank_ + 1;
  for (int n = 0; n < peers_.size() - 1; ++n) {
    if (peer == peers_.size()) {
      peer = 0;
    }
#ifndef CPU_ONLY
    SocketBuffer * buffer = diff_recv_[peer]->Read();
    Dtype* src = reinterpret_cast<Dtype*>(buffer->addr());
    Dtype* dst = diff_ + own_offs_;
    caffe_gpu_add(own_size_, src, dst, dst);
#else
    diff_recv_[peer]->Read();
#endif
    peer++;
  }
}
Esempio n. 3
0
void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) { return; }
  if (top.size() == 1) {
    caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
    return;
  }
  caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
                bottom[0]->mutable_gpu_diff());
  // Add remaining top blob diffs.
  for (int i = 2; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->gpu_diff();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
  }
}
Esempio n. 4
0
void Tensor<Dtype>::AddFrom(const Tensor& source) {
  if (source.count() != count_ || source.shape() != shape_) {
    ASSERT(false, "Trying to add blobs of different sizes: "
      << source.count() << " != " << count_);
  }
  switch (mode()) {
  case Caffe::CPU:
    caffe_add(count_, source.cpu_mem(),
        this->cpu_mem(),
        this->mutable_cpu_mem());
    break;
  case Caffe::GPU:
#ifndef CPU_ONLY
    caffe_gpu_add(count_, source.gpu_mem(),
        this->gpu_mem(),
        this->mutable_gpu_mem());
#else
    NO_GPU;
#endif
    break;
  default:
    ASSERT(false, "Unknown caffe mode.");
  }
}
Esempio n. 5
0
void AdaGradSolver<Dtype>::ComputeUpdateValue() {
  vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
  vector<float>& net_params_lr = this->net_->params_lr();
  vector<float>& net_params_weight_decay = this->net_->params_weight_decay();
  // get the learning rate
  Dtype rate = this->GetLearningRate();
  Dtype delta = this->param_.delta();
  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
  }
  Dtype weight_decay = this->param_.weight_decay();
  string regularization_type = this->param_.regularization_type();
  switch (Caffe::mode()) {
  case Caffe::CPU:
    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
      Dtype local_rate = rate * net_params_lr[param_id];
      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

      if (local_decay) {
        if (regularization_type == "L2") {
          // add weight decay
          caffe_axpy(net_params[param_id]->count(),
              local_decay,
              net_params[param_id]->cpu_data(),
              net_params[param_id]->mutable_cpu_diff());
        } else if (regularization_type == "L1") {
          caffe_cpu_sign(net_params[param_id]->count(),
              net_params[param_id]->cpu_data(),
              this->temp_[param_id]->mutable_cpu_data());
          caffe_axpy(net_params[param_id]->count(),
              local_decay,
              this->temp_[param_id]->cpu_data(),
              net_params[param_id]->mutable_cpu_diff());
        } else {
          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
        }
      }

      // compute square of gradient in update
      caffe_powx(net_params[param_id]->count(),
          net_params[param_id]->cpu_diff(), Dtype(2),
          this->update_[param_id]->mutable_cpu_data());

      // update history
      caffe_add(net_params[param_id]->count(),
          this->update_[param_id]->cpu_data(),
          this->history_[param_id]->cpu_data(),
          this->history_[param_id]->mutable_cpu_data());

      // prepare update
      caffe_powx(net_params[param_id]->count(),
                this->history_[param_id]->cpu_data(), Dtype(0.5),
                this->update_[param_id]->mutable_cpu_data());

      caffe_add_scalar(net_params[param_id]->count(),
                delta, this->update_[param_id]->mutable_cpu_data());

      caffe_div(net_params[param_id]->count(),
                net_params[param_id]->cpu_diff(),
                this->update_[param_id]->cpu_data(),
                this->update_[param_id]->mutable_cpu_data());

      // scale and copy
      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
          this->update_[param_id]->cpu_data(), Dtype(0),
          net_params[param_id]->mutable_cpu_diff());
    }
    break;
  case Caffe::GPU:
#ifndef CPU_ONLY
    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
      Dtype local_rate = rate * net_params_lr[param_id];
      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

      if (local_decay) {
        if (regularization_type == "L2") {
          // add weight decay
          caffe_gpu_axpy(net_params[param_id]->count(),
              local_decay,
              net_params[param_id]->gpu_data(),
              net_params[param_id]->mutable_gpu_diff());
        } else if (regularization_type == "L1") {
          caffe_gpu_sign(net_params[param_id]->count(),
              net_params[param_id]->gpu_data(),
              this->temp_[param_id]->mutable_gpu_data());
          caffe_gpu_axpy(net_params[param_id]->count(),
              local_decay,
              this->temp_[param_id]->gpu_data(),
              net_params[param_id]->mutable_gpu_diff());
        } else {
          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
        }
      }

      // compute square of gradient in update
      caffe_gpu_powx(net_params[param_id]->count(),
          net_params[param_id]->gpu_diff(), Dtype(2),
          this->update_[param_id]->mutable_gpu_data());

      // update history
      caffe_gpu_add(net_params[param_id]->count(),
          this->update_[param_id]->gpu_data(),
          this->history_[param_id]->gpu_data(),
          this->history_[param_id]->mutable_gpu_data());

      // prepare update
      caffe_gpu_powx(net_params[param_id]->count(),
                this->history_[param_id]->gpu_data(), Dtype(0.5),
                this->update_[param_id]->mutable_gpu_data());

      caffe_gpu_add_scalar(net_params[param_id]->count(),
                delta, this->update_[param_id]->mutable_gpu_data());

      caffe_gpu_div(net_params[param_id]->count(),
                net_params[param_id]->gpu_diff(),
                this->update_[param_id]->gpu_data(),
                this->update_[param_id]->mutable_gpu_data());

      // scale and copy
      caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
          this->update_[param_id]->gpu_data(), Dtype(0),
          net_params[param_id]->mutable_gpu_diff());
    }
#else
    NO_GPU;
#endif
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}
Esempio n. 6
0
void AdaGradSolver<Dtype>::ComputeUpdateValue(uint_tp param_id, Dtype rate) {
  CHECK(Caffe::root_solver());
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype delta = this->param_.delta();
  Dtype local_rate = rate * net_params_lr[param_id];
  switch (Caffe::mode()) {
    case Caffe::CPU: {
      // compute square of gradient in update
      caffe_powx(net_params[param_id]->count(),
                 net_params[param_id]->cpu_diff(), Dtype(2),
                 this->update_[param_id]->mutable_cpu_data());

      // update history
      caffe_add(net_params[param_id]->count(),
                this->update_[param_id]->cpu_data(),
                this->history_[param_id]->cpu_data(),
                this->history_[param_id]->mutable_cpu_data());

      // prepare update
      caffe_powx(net_params[param_id]->count(),
                 this->history_[param_id]->cpu_data(), Dtype(0.5),
                 this->update_[param_id]->mutable_cpu_data());

      caffe_add_scalar(net_params[param_id]->count(), delta,
                       this->update_[param_id]->mutable_cpu_data());

      caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
                this->update_[param_id]->cpu_data(),
                this->update_[param_id]->mutable_cpu_data());

      // scale and copy
      caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
                      this->update_[param_id]->cpu_data(), Dtype(0),
                      net_params[param_id]->mutable_cpu_diff());
      break;
    }
    case Caffe::GPU: {
#ifndef CPU_ONLY
      if (this->device_->backend() == BACKEND_CUDA) {
#ifdef USE_CUDA
        // compute square of gradient in update
        caffe_gpu_powx(net_params[param_id]->count(),
                       net_params[param_id]->gpu_diff(), Dtype(2),
                       this->update_[param_id]->mutable_gpu_data());

        // update history
        caffe_gpu_add(net_params[param_id]->count(),
                      this->update_[param_id]->gpu_data(),
                      this->history_[param_id]->gpu_data(),
                      this->history_[param_id]->mutable_gpu_data());

        // prepare update
        caffe_gpu_powx(net_params[param_id]->count(),
                       this->history_[param_id]->gpu_data(), Dtype(0.5),
                       this->update_[param_id]->mutable_gpu_data());

        caffe_gpu_add_scalar(net_params[param_id]->count(), delta,
                             this->update_[param_id]->mutable_gpu_data());

        caffe_gpu_div(net_params[param_id]->count(),
                      net_params[param_id]->gpu_diff(),
                      this->update_[param_id]->gpu_data(),
                      this->update_[param_id]->mutable_gpu_data());

        // scale and copy
        caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
                        this->update_[param_id]->gpu_data(), Dtype(0),
                        net_params[param_id]->mutable_gpu_diff());
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        // compute square of gradient in update
        greentea_gpu_powx<Dtype>(
            this->device_->id(), net_params[param_id]->count(),
            (cl_mem) (net_params[param_id]->gpu_diff()), 0, Dtype(2),
            (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0);

        // update history
        greentea_gpu_add<Dtype>(
            this->device_->id(), net_params[param_id]->count(),
            (cl_mem) (this->update_[param_id]->gpu_data()), 0,
            (cl_mem) (this->history_[param_id]->gpu_data()), 0,
            (cl_mem) (this->history_[param_id]->mutable_gpu_data()), 0);

        // prepare update
        greentea_gpu_powx<Dtype>(
            this->device_->id(), net_params[param_id]->count(),
            (cl_mem) (this->history_[param_id]->gpu_data()), 0, Dtype(0.5),
            (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0);

        greentea_gpu_add_scalar<Dtype>(
            this->device_->id(), net_params[param_id]->count(), delta,
            (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0);

        greentea_gpu_div<Dtype>(
            this->device_->id(), net_params[param_id]->count(),
            (cl_mem) (net_params[param_id]->gpu_diff()), 0,
            (cl_mem) (this->update_[param_id]->gpu_data()), 0,
            (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0);

        // scale and copy
        greentea_gpu_axpby<Dtype>(
            this->device_->id(), net_params[param_id]->count(),
            local_rate, (cl_mem) (this->update_[param_id]->gpu_data()), 0,
            Dtype(0), (cl_mem) (net_params[param_id]->mutable_gpu_diff()), 0);
#endif  // USE_GREENTEA
      }
#else
      NO_GPU;
#endif
      break;
    }
    default:
      LOG(FATAL)<< "Unknown caffe mode: " << Caffe::mode();
    }
  }