void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const int count = bottom[0]->count();
    const Dtype* top_diff = top[0]->gpu_diff();
    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
      caffe_gpu_set(count, diff_scale_, bottom_diff);
    } else {
      const Dtype* bottom_data = bottom[0]->gpu_data();
      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
      //               = diff_scale * y / (shift + scale * x)
      if (power_ == Dtype(2)) {
        // Special case for y = (shift + scale * x)^2
        //     -> dy/dx = 2 * scale * (shift + scale * x)
        //              = diff_scale * shift + diff_scale * scale * x
        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
            Dtype(0), bottom_diff);
        if (shift_ != Dtype(0)) {
          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
        }
      } else if (shift_ == Dtype(0)) {
        // Special case for y = (scale * x)^power
        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
        //              = power * y / x
        const Dtype* top_data = top[0]->gpu_data();
        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
        caffe_gpu_scal(count, power_, bottom_diff);
      } else {
        caffe_copy(count, bottom_data, bottom_diff);
        if (scale_ != Dtype(1)) {
          caffe_gpu_scal(count, scale_, bottom_diff);
        }
        if (shift_ != Dtype(0)) {
          caffe_gpu_add_scalar(count, shift_, bottom_diff);
        }
        const Dtype* top_data = top[0]->gpu_data();
        caffe_gpu_div<Dtype>(count, top_data, bottom_diff, bottom_diff);
        if (diff_scale_ != Dtype(1)) {
          caffe_gpu_scal(count, diff_scale_, bottom_diff);
        }
      }
    }
    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
  }
}
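// Hedged reference sketch (not part of Caffe): a scalar CPU version of the
// general-case gradient above, assuming diff_scale = power * scale as in
// PowerLayer's setup. Useful for checking the GPU path element by element.
#include <cmath>

template <typename Dtype>
Dtype power_backward_ref(Dtype top_diff, Dtype x, Dtype power,
                         Dtype scale, Dtype shift) {
  // y = (shift + scale * x)^power
  const Dtype y = std::pow(shift + scale * x, power);
  // dy/dx = power * scale * (shift + scale * x)^(power - 1)
  //       = (power * scale) * y / (shift + scale * x)
  const Dtype diff_scale = power * scale;
  return top_diff * diff_scale * y / (shift + scale * x);
}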
void Blob<Dtype>::scale_diff(Dtype scale_factor) {
  Dtype* diff;
  if (!diff_) { return; }
  switch (diff_->head()) {
  case SyncedMemory::SYNCED_PRV:
  case SyncedMemory::HEAD_AT_PRV:
    diff = mutable_prv_diff();
    caffe_scal(prv_diff_count(), scale_factor, diff);
    break;
  case SyncedMemory::HEAD_AT_CPU:
    diff = mutable_cpu_diff();
    caffe_scal(count_, scale_factor, diff);
    return;
  case SyncedMemory::HEAD_AT_GPU:
  case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
    diff = mutable_gpu_diff();
    caffe_gpu_scal(count_, scale_factor, diff);
    return;
#else
    NO_GPU;
#endif
  case SyncedMemory::UNINITIALIZED:
    return;
  default:
    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
  }
}
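// Hedged usage sketch (modeled on Caffe's SGDSolver::ClipGradients; the
// free-function form and its name are assumptions, not Caffe API):
// scale_diff() lets a solver rescale every parameter gradient in place, on
// whichever device currently holds the diff, when the global L2 norm
// exceeds a clip threshold.
#include <cmath>
#include <vector>
#include "caffe/blob.hpp"

template <typename Dtype>
void clip_gradients_sketch(const std::vector<caffe::Blob<Dtype>*>& net_params,
                           Dtype clip_gradients) {
  if (clip_gradients <= 0) { return; }
  Dtype sumsq_diff = 0;
  for (size_t i = 0; i < net_params.size(); ++i) {
    sumsq_diff += net_params[i]->sumsq_diff();
  }
  const Dtype l2norm_diff = std::sqrt(sumsq_diff);
  if (l2norm_diff > clip_gradients) {
    const Dtype scale_factor = clip_gradients / l2norm_diff;
    for (size_t i = 0; i < net_params.size(); ++i) {
      net_params[i]->scale_diff(scale_factor);
    }
  }
}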
void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
      solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    timer->Start();
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  // Send gradients to parent
  if (parent_) {
    timer->Start();
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

#ifdef DEBUG
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
    *timing << " send_grad: " << timer->MilliSeconds();
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}
void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) { return; }
  const int count = bottom[0]->count();
  const Dtype* bottom_data = bottom[0]->gpu_data();
  const Dtype* top_diff = top[0]->gpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  caffe_copy(count, bottom_data, bottom_diff);
  if (input_scale_ != Dtype(1)) {
    caffe_gpu_scal(count, input_scale_, bottom_diff);
  }
  if (input_shift_ != Dtype(0)) {
    caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
  }
  caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
  if (backward_num_scale_ != Dtype(1)) {
    caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
  }
  caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
}
void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const int count = bottom[0]->count();
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
    caffe_gpu_log(count, bottom_data, top_data);
  } else {
    caffe_copy(count, bottom_data, top_data);
    if (input_scale_ != Dtype(1)) {
      caffe_gpu_scal(count, input_scale_, top_data);
    }
    if (input_shift_ != Dtype(0)) {
      caffe_gpu_add_scalar(count, input_shift_, top_data);
    }
    caffe_gpu_log(count, top_data, top_data);
  }
  if (base_scale_ != Dtype(1)) {
    caffe_gpu_scal(count, base_scale_, top_data);
  }
}
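// Hedged reference sketch (not part of Caffe): the scalar math implemented by
// the LogLayer Forward/Backward kernels above, assuming the layer sets
// base_scale = 1 / ln(base) and backward_num_scale = input_scale * base_scale.
#include <cmath>

template <typename Dtype>
Dtype log_forward_ref(Dtype x, Dtype input_scale, Dtype input_shift,
                      Dtype base_scale) {
  // y = log_base(input_scale * x + input_shift)
  //   = base_scale * ln(input_scale * x + input_shift)
  return base_scale * std::log(input_scale * x + input_shift);
}

template <typename Dtype>
Dtype log_backward_ref(Dtype top_diff, Dtype x, Dtype input_scale,
                       Dtype input_shift, Dtype backward_num_scale) {
  // dy/dx = input_scale * base_scale / (input_scale * x + input_shift)
  return top_diff * backward_num_scale / (input_scale * x + input_shift);
}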
void NCCL<Dtype>::on_gradients_ready() {
  if (solver_->param().layer_wise_reduce()) {
    CHECK_EQ(solver_->net()->params().size(),
             solver_->net()->learnable_params().size())
        << "Layer-wise reduce is not supported for nets with shared weights.";
    // Make sure reduction is done before applying gradients
    CUDA_CHECK(cudaStreamSynchronize(stream_));
  } else {
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(diff_, diff_, static_cast<int>(size_),
                             nccl::dataType<Dtype>::type, ncclSum, comm_,
                             cudaStreamDefault));
    caffe_gpu_scal(static_cast<int>(size_),
                   (Dtype) 1.0 / Caffe::solver_count(), diff_);
  }
}
void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (propagate_down[0]) {
    // First, compute the diff
    const int count = bottom[0]->count();
    const int num = bottom[0]->num();
    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
    const Dtype* target = bottom[1]->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    caffe_copy(count, sigmoid_output_data, bottom_diff);
    caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
    // Scale down gradient
    const Dtype loss_weight = top[0]->cpu_diff()[0];
    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
  }
}
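// Hedged reference sketch (not part of Caffe): the per-element math behind
// the caffe_copy + caffe_gpu_axpy pair above. With p = sigmoid(x) and target
// t, the per-element loss is L = -(t * log(p) + (1 - t) * log(1 - p)), and
// dL/dx = p - t, which is why the kernel copies the sigmoid output and
// subtracts the target before the loss_weight / num scaling.
#include <cmath>

template <typename Dtype>
Dtype sigmoid_xent_backward_ref(Dtype x, Dtype t) {
  const Dtype p = Dtype(1) / (Dtype(1) + std::exp(-x));  // sigmoid(x)
  return p - t;  // dL/dx before the loss_weight / num scaling
}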
void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = bottom[0]->count();
  // Special case where we can ignore the input: scale or power is 0.
  if (diff_scale_ == Dtype(0)) {
    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
    caffe_gpu_set(count, value, top_data);
    return;
  }
  const Dtype* bottom_data = bottom[0]->gpu_data();
  caffe_copy(count, bottom_data, top_data);
  if (scale_ != Dtype(1)) {
    caffe_gpu_scal(count, scale_, top_data);
  }
  if (shift_ != Dtype(0)) {
    caffe_gpu_add_scalar(count, shift_, top_data);
  }
  if (power_ != Dtype(1)) {
    caffe_gpu_powx(count, top_data, power_, top_data);
  }
}
void Blob<Dtype>::scale_data(Dtype scale_factor) {
  Dtype* data;
  if (!data_) { return; }
  switch (data_->head()) {
  case SyncedMemory::HEAD_AT_CPU:
    data = mutable_cpu_data();
    caffe_scal(count_, scale_factor, data);
    return;
  case SyncedMemory::HEAD_AT_GPU:
  case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
    data = mutable_gpu_data();
    caffe_gpu_scal(count_, scale_factor, data);
    return;
#else
    NO_GPU;
#endif
  case SyncedMemory::UNINITIALIZED:
    return;
  default:
    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
  }
}
void InnerProductLayer<Dtype>::normalize_weights(Dtype mnorm) {
  Dtype *weight = 0;
  int M = this->blobs_[0]->height();
  int N = this->blobs_[0]->width();
  int off = this->blobs_[0]->offset(0, 0, 0, 1);
  switch (Caffe::mode()) {
  case Caffe::CPU:
    // apply the constraint to each column
    weight = this->blobs_[0]->mutable_cpu_data();
    for (int i = 0; i < N; ++i) {
      // compute l2 norm
      Dtype nrm = caffe_cpu_norm2(M, weight, N);
      if (nrm > mnorm) {
        // and scale
        caffe_scal(M, mnorm / (nrm + Dtype(1e-7)), weight, N);
      }
      weight += off;
    }
    break;
  case Caffe::GPU:
    // apply the constraint to each column
    weight = this->blobs_[0]->mutable_gpu_data();
    for (int i = 0; i < N; ++i) {
      // compute l2 norm
      Dtype nrm = caffe_gpu_norm2(M, weight, N);
      if (nrm > mnorm) {
        // and scale
        caffe_gpu_scal(M, mnorm / (nrm + Dtype(1e-7)), weight, N);
      }
      weight += off;
    }
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode.";
    break;
  }
}
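// Hedged reference sketch (plain loops rather than the fork's strided BLAS
// helpers): the same column-wise max-norm constraint, assuming
// off == offset(0, 0, 0, 1) == 1 so successive columns start one element
// apart in a row-major M x N weight matrix; column j is
// weight[j], weight[j + N], ..., weight[j + (M - 1) * N].
#include <cmath>

template <typename Dtype>
void normalize_columns_ref(Dtype* weight, int M, int N, Dtype mnorm) {
  for (int j = 0; j < N; ++j) {
    Dtype sumsq = 0;
    for (int i = 0; i < M; ++i) {
      sumsq += weight[i * N + j] * weight[i * N + j];
    }
    const Dtype nrm = std::sqrt(sumsq);
    if (nrm > mnorm) {
      const Dtype s = mnorm / (nrm + Dtype(1e-7));
      for (int i = 0; i < M; ++i) {
        weight[i * N + j] *= s;
      }
    }
  }
}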
void SGDSolver<Dtype>::Normalize(int param_id) {
  if (this->param_.iter_size() == 1) { return; }
  // Scale gradient to counterbalance accumulation.
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
  switch (Caffe::mode()) {
    case Caffe::CPU: {
      caffe_scal(net_params[param_id]->count(), accum_normalization,
                 net_params[param_id]->mutable_cpu_diff());
      break;
    }
    case Caffe::GPU: {
#ifndef CPU_ONLY
      if (this->device_->backend() == BACKEND_CUDA) {
#ifdef USE_CUDA
        caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
                       net_params[param_id]->mutable_gpu_diff());
#endif  // USE_CUDA
      } else {
#ifdef USE_GREENTEA
        greentea_gpu_scal(this->device_->id(), net_params[param_id]->count(),
                          accum_normalization,
                          (cl_mem) (net_params[param_id]->mutable_gpu_diff()),
                          0);
#endif  // USE_GREENTEA
      }
#else
      NO_GPU;
#endif
      break;
    }
    default:
      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}
void SGDSolver<Dtype>::Normalize(int param_id) {
  if (this->param_.iter_size() == 1) { return; }
  // Scale gradient to counterbalance accumulation.
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
  switch (Caffe::mode()) {
    case Caffe::CPU: {
      caffe_scal(net_params[param_id]->count(), accum_normalization,
                 net_params[param_id]->mutable_cpu_diff());
      break;
    }
    case Caffe::GPU: {
#ifndef CPU_ONLY
      caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
                     net_params[param_id]->mutable_gpu_diff());
#else
      NO_GPU;
#endif
      break;
    }
    default:
      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}
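// Hedged usage sketch (illustrative, not verbatim Caffe): Normalize() above
// assumes the solver has already accumulated gradients over iter_size
// forward/backward passes, roughly as below, so scaling by 1/iter_size makes
// the update equivalent to one pass over a batch iter_size times larger.
// ForwardBackward() and ClearParamDiffs() are real Net methods; the loop is a
// simplification of Solver::Step().
#include "caffe/net.hpp"

template <typename Dtype>
Dtype accumulate_gradients_sketch(caffe::Net<Dtype>* net, int iter_size) {
  net->ClearParamDiffs();            // start each step from zero diffs
  Dtype loss = 0;
  for (int i = 0; i < iter_size; ++i) {
    loss += net->ForwardBackward();  // param diffs accumulate across passes
  }
  return loss / iter_size;           // Normalize() rescales the diffs the same way
}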
void Blob<Dtype>::scale_diff(Dtype scale_factor) {
  Dtype* diff;
  if (!diff_) { return; }
  switch (diff_->head()) {
    case SyncedMemory::HEAD_AT_CPU: {
      diff = mutable_cpu_diff();
      caffe_scal(count_, scale_factor, diff);
      return;
    }
    case SyncedMemory::HEAD_AT_GPU:
    case SyncedMemory::SYNCED: {
#ifndef CPU_ONLY
      diff = mutable_gpu_diff();
      if (device_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
        caffe_gpu_scal(count_, scale_factor, diff);
#endif
      } else {
#ifdef USE_GREENTEA
        greentea_gpu_scal(device_->id(), count_, scale_factor,
                          (cl_mem) diff, 0);
#endif
      }
      return;
#else
      NO_GPU;
#endif
    }
    case SyncedMemory::UNINITIALIZED:
      return;
    default:
      LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
  }
}