void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) { #ifndef CPU_ONLY #ifdef DEBUG int device; CUDA_CHECK(cudaGetDevice(&device)); CHECK(device == solver_->param().device_id()); #endif // Sum children gradients as they appear in the queue for (int i = 0; i < children_.size(); ++i) { timer->Start(); P2PSync<Dtype> *child = queue_.pop(); Dtype* src = child->parent_grads_; Dtype* dst = diff_; #ifdef DEBUG bool ok = false; for (int j = 0; j < children_.size(); ++j) { if (child == children_[j]) { ok = true; } } CHECK(ok); cudaPointerAttributes attributes; CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); CHECK(attributes.device == device); CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); CHECK(attributes.device == device); #endif caffe_gpu_add(size_, src, dst, dst); *timing << " add_grad: " << timer->MilliSeconds(); } // Send gradients to parent if (parent_) { timer->Start(); Dtype* src = diff_; Dtype* dst = parent_grads_; #ifdef DEBUG cudaPointerAttributes attributes; CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); CHECK(attributes.device == device); CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); CHECK(attributes.device == parent_->solver_->param().device_id()); #endif CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // cudaMemcpyDeviceToDevice, cudaStreamDefault)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); parent_->queue_.push(this); *timing << " send_grad: " << timer->MilliSeconds(); } else { // Loss functions divide gradients by the batch size, so to compensate // for split batch, the root solver divides by number of solvers. caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_); } #endif }
void SocketSync<Dtype>::on_gradients_ready() { // Reduce gradients from local GPUs P2PSync<Dtype>::on_gradients_ready(); // Send gradients to corresponding parameter server node int peer = rank_ + 1; for (int n = 0; n < peers_.size() - 1; ++n) { if (peer == peers_.size()) { peer = 0; } diff_send_[peer]->Write(); peer++; } // Sum gradients as they are received peer = rank_ + 1; for (int n = 0; n < peers_.size() - 1; ++n) { if (peer == peers_.size()) { peer = 0; } #ifndef CPU_ONLY SocketBuffer * buffer = diff_recv_[peer]->Read(); Dtype* src = reinterpret_cast<Dtype*>(buffer->addr()); Dtype* dst = diff_ + own_offs_; caffe_gpu_add(own_size_, src, dst, dst); #else diff_recv_[peer]->Read(); #endif peer++; } }
void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) { if (!propagate_down[0]) { return; } if (top.size() == 1) { caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); return; } caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), bottom[0]->mutable_gpu_diff()); // Add remaining top blob diffs. for (int i = 2; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); } }
void Tensor<Dtype>::AddFrom(const Tensor& source) { if (source.count() != count_ || source.shape() != shape_) { ASSERT(false, "Trying to add blobs of different sizes: " << source.count() << " != " << count_); } switch (mode()) { case Caffe::CPU: caffe_add(count_, source.cpu_mem(), this->cpu_mem(), this->mutable_cpu_mem()); break; case Caffe::GPU: #ifndef CPU_ONLY caffe_gpu_add(count_, source.gpu_mem(), this->gpu_mem(), this->mutable_gpu_mem()); #else NO_GPU; #endif break; default: ASSERT(false, "Unknown caffe mode."); } }
void AdaGradSolver<Dtype>::ComputeUpdateValue() { vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params(); vector<float>& net_params_lr = this->net_->params_lr(); vector<float>& net_params_weight_decay = this->net_->params_weight_decay(); // get the learning rate Dtype rate = this->GetLearningRate(); Dtype delta = this->param_.delta(); if (this->param_.display() && this->iter_ % this->param_.display() == 0) { LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; } Dtype weight_decay = this->param_.weight_decay(); string regularization_type = this->param_.regularization_type(); switch (Caffe::mode()) { case Caffe::CPU: for (int param_id = 0; param_id < net_params.size(); ++param_id) { Dtype local_rate = rate * net_params_lr[param_id]; Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; if (local_decay) { if (regularization_type == "L2") { // add weight decay caffe_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } else if (regularization_type == "L1") { caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), this->temp_[param_id]->mutable_cpu_data()); caffe_axpy(net_params[param_id]->count(), local_decay, this->temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } } // compute square of gradient in update caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), Dtype(2), this->update_[param_id]->mutable_cpu_data()); // update history caffe_add(net_params[param_id]->count(), this->update_[param_id]->cpu_data(), this->history_[param_id]->cpu_data(), this->history_[param_id]->mutable_cpu_data()); // prepare update caffe_powx(net_params[param_id]->count(), this->history_[param_id]->cpu_data(), Dtype(0.5), this->update_[param_id]->mutable_cpu_data()); caffe_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_cpu_data()); caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); // scale and copy caffe_cpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->cpu_data(), Dtype(0), net_params[param_id]->mutable_cpu_diff()); } break; case Caffe::GPU: #ifndef CPU_ONLY for (int param_id = 0; param_id < net_params.size(); ++param_id) { Dtype local_rate = rate * net_params_lr[param_id]; Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; if (local_decay) { if (regularization_type == "L2") { // add weight decay caffe_gpu_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); } else if (regularization_type == "L1") { caffe_gpu_sign(net_params[param_id]->count(), net_params[param_id]->gpu_data(), this->temp_[param_id]->mutable_gpu_data()); caffe_gpu_axpy(net_params[param_id]->count(), local_decay, this->temp_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } } // compute square of gradient in update caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_add(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), this->history_[param_id]->gpu_data(), this->history_[param_id]->mutable_gpu_data()); // prepare update caffe_gpu_powx(net_params[param_id]->count(), this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); caffe_gpu_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); caffe_gpu_div(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); // scale and copy caffe_gpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->gpu_data(), Dtype(0), net_params[param_id]->mutable_gpu_diff()); } #else NO_GPU; #endif break; default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } }
void AdaGradSolver<Dtype>::ComputeUpdateValue(uint_tp param_id, Dtype rate) { CHECK(Caffe::root_solver()); const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params(); const vector<float>& net_params_lr = this->net_->params_lr(); Dtype delta = this->param_.delta(); Dtype local_rate = rate * net_params_lr[param_id]; switch (Caffe::mode()) { case Caffe::CPU: { // compute square of gradient in update caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), Dtype(2), this->update_[param_id]->mutable_cpu_data()); // update history caffe_add(net_params[param_id]->count(), this->update_[param_id]->cpu_data(), this->history_[param_id]->cpu_data(), this->history_[param_id]->mutable_cpu_data()); // prepare update caffe_powx(net_params[param_id]->count(), this->history_[param_id]->cpu_data(), Dtype(0.5), this->update_[param_id]->mutable_cpu_data()); caffe_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_cpu_data()); caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); // scale and copy caffe_cpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->cpu_data(), Dtype(0), net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY if (this->device_->backend() == BACKEND_CUDA) { #ifdef USE_CUDA // compute square of gradient in update caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_add(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), this->history_[param_id]->gpu_data(), this->history_[param_id]->mutable_gpu_data()); // prepare update caffe_gpu_powx(net_params[param_id]->count(), this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); caffe_gpu_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); caffe_gpu_div(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); // scale and copy caffe_gpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->gpu_data(), Dtype(0), net_params[param_id]->mutable_gpu_diff()); #endif // USE_CUDA } else { #ifdef USE_GREENTEA // compute square of gradient in update greentea_gpu_powx<Dtype>( this->device_->id(), net_params[param_id]->count(), (cl_mem) (net_params[param_id]->gpu_diff()), 0, Dtype(2), (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0); // update history greentea_gpu_add<Dtype>( this->device_->id(), net_params[param_id]->count(), (cl_mem) (this->update_[param_id]->gpu_data()), 0, (cl_mem) (this->history_[param_id]->gpu_data()), 0, (cl_mem) (this->history_[param_id]->mutable_gpu_data()), 0); // prepare update greentea_gpu_powx<Dtype>( this->device_->id(), net_params[param_id]->count(), (cl_mem) (this->history_[param_id]->gpu_data()), 0, Dtype(0.5), (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0); greentea_gpu_add_scalar<Dtype>( this->device_->id(), net_params[param_id]->count(), delta, (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0); greentea_gpu_div<Dtype>( this->device_->id(), net_params[param_id]->count(), (cl_mem) (net_params[param_id]->gpu_diff()), 0, (cl_mem) (this->update_[param_id]->gpu_data()), 0, (cl_mem) (this->update_[param_id]->mutable_gpu_data()), 0); // scale and copy greentea_gpu_axpby<Dtype>( this->device_->id(), net_params[param_id]->count(), local_rate, (cl_mem) (this->update_[param_id]->gpu_data()), 0, Dtype(0), (cl_mem) (net_params[param_id]->mutable_gpu_diff()), 0); #endif // USE_GREENTEA } #else NO_GPU; #endif break; } default: LOG(FATAL)<< "Unknown caffe mode: " << Caffe::mode(); } }