void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                        const ParameterConfig& config,
                                        size_t sparseId) const {
  CHECK(sparseId == -1UL) << "Sparse update is not supported";

  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  // Running averages of the squared gradient and of the squared update.
  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];

  real learningRate = learningRate_ * config.learning_rate();
  real momentum = config.momentum();
  real decayRate = applyDecay_ ? config.decay_rate() : 0;
  adadeltaApply(value,
                grad,
                mom,
                accum,
                accum_update,
                lr,
                rou_,
                epsilon_,
                learningRate,
                momentum,
                decayRate);
}
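adadeltaApply itself lives in the matrix library; per element it presumably follows the standard AdaDelta rule (Zeiler, 2012), with the two accumulators holding E[g^2] and E[dx^2]. A minimal scalar sketch under that assumption; the function and parameter names here are illustrative, not PaddlePaddle's API:

#include <cmath>

// Scalar sketch of one AdaDelta step; a simplification, not the actual kernel.
void adadeltaStep(float& value, float grad, float& mom,
                  float& accumG2,   // E[g^2], running average of grad^2
                  float& accumDx2,  // E[dx^2], running average of update^2
                  float rou, float epsilon,
                  float learningRate, float momentum, float decayRate) {
  accumG2 = rou * accumG2 + (1 - rou) * grad * grad;
  // Per-element effective learning rate: RMS of past updates over RMS of
  // past gradients.
  float lr = std::sqrt((accumDx2 + epsilon) / (accumG2 + epsilon));
  accumDx2 = rou * accumDx2 + (1 - rou) * (grad * lr) * (grad * lr);
  mom = momentum * mom - learningRate * lr * (grad + decayRate * value);
  value += mom;
}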
void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[],
                                              const ParameterConfig& paraConfig,
                                              size_t sparseId) const {
  if (sparseId != -1UL) {
    CHECK_LT(sparseId, t0Vec_.size());
    // First time this row is touched: initialize V_t from the current value.
    if (t0Vec_[sparseId] == 0) {
      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
      t0Vec_[sparseId] = 1;
    }
    // Lazy momentum: the scalar coefficients alpha_, beta_, gamma_ and tau_
    // are advanced once per batch elsewhere, so only the rows with a nonzero
    // gradient need to be touched here.
    vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
                                     -alpha_ * gamma_ * learningRate_);
    vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
                                     tau_ * alpha_ * gamma_ * learningRate_);
    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
                               tau_ / beta_ + 1.0 / alpha_,
                               *vecs[PARAMETER_MOMENTUM_VT],
                               1.0 / beta_);
  } else {
    // Dense fallback: plain SGD with momentum and optional weight decay.
    vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                     *vecs[PARAMETER_MOMENTUM],
                                     learningRate_ * paraConfig.learning_rate(),
                                     paraConfig.momentum(),
                                     applyDecay_ ? paraConfig.decay_rate() : 0);
  }
}
void AdagradParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];

  real epsilon = optConfig_.ada_epsilon();
  real learningRate = learningRate_ * config.learning_rate();
  real momentum = config.momentum();
  real decayRate = applyDecay_ ? config.decay_rate() : 0;
  adagradApply(value,
               grad,
               mom,
               accum_buffer,
               accum,
               lr,
               epsilon,
               learningRate,
               momentum,
               decayRate);
}
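adagradApply presumably implements the standard Adagrad rule (Duchi et al., 2011), with the squared-gradient history split across the two accumulators: one folded-in long-term buffer plus a recent sum. A minimal scalar sketch under that assumption, with illustrative names:

#include <cmath>

// Scalar sketch of one Adagrad step; a simplification, not the actual kernel.
void adagradStep(float& value, float grad, float& mom,
                 float accumBuffer,  // squared-gradient sum folded in earlier
                 float& accum,       // squared-gradient sum since the last fold
                 float epsilon, float learningRate,
                 float momentum, float decayRate) {
  accum += grad * grad;
  // The per-element learning rate shrinks as squared gradients accumulate.
  float lr = 1.0f / std::sqrt(accumBuffer + accum + epsilon);
  mom = momentum * mom - learningRate * lr * (grad + decayRate * value);
  value += mom;
}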
void OptimizerWithRegularizerEveryNumBatches::doTraversal(
    const VectorPtr vecs[], const ParameterConfig& config) const {
  // Apply the regularization owed for the last num_batches_regularization
  // batches, but never re-apply batches older than baseTimer_.
  int32_t base =
      std::max(baseTimer_, (timer_ + 1 - config.num_batches_regularization()));
  regularizer_->update(
      vecs, config, optimizer_->getLearningRate(), base, timer_ + 1);
}
void AdamParameterOptimizer::update(const VectorPtr vecs[],
                                    const ParameterConfig& config,
                                    size_t sparseId) const {
  CHECK(sparseId == -1UL) << "Sparse update is not supported";

  // Powers of beta1/beta2 at the current step, used for bias correction.
  real beta1_power = std::pow(beta1_, step_);
  real beta2_power = std::pow(beta2_, step_);
  real learningRate = config.learning_rate() * learningRate_;

  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];

  adamApply(value,
            grad,
            mom,
            v,
            beta1_,
            beta2_,
            beta1_power,
            beta2_power,
            epsilon_,
            learningRate);
}
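The beta powers let the kernel fold both of Adam's bias corrections into the step size. A minimal scalar sketch of the standard Adam rule (Kingma & Ba, 2015) that adamApply presumably implements; names are illustrative:

#include <cmath>

// Scalar sketch of one Adam step; a simplification, not the actual kernel.
void adamStep(float& value, float grad, float& mom, float& v,
              float beta1, float beta2,
              float beta1Power, float beta2Power,  // beta1^t, beta2^t
              float epsilon, float learningRate) {
  mom = beta1 * mom + (1 - beta1) * grad;     // first moment estimate
  v = beta2 * v + (1 - beta2) * grad * grad;  // second moment estimate
  // Fold both bias corrections into the effective step size.
  float alpha = learningRate * std::sqrt(1 - beta2Power) / (1 - beta1Power);
  value -= alpha * mom / (std::sqrt(v) + epsilon);
}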
void OptimizerWithRegularizerEveryNumBatches::catchUpWith(
    const VectorPtr vecs[],
    const ParameterConfig& config,
    size_t sparseId) const {
  // Round timer_ down to the start of the current regularization window.
  int32_t base = timer_ - timer_ % config.num_batches_regularization();
  regularizer_->update(vecs,
                       config,
                       optimizer_->getLearningRate(),
                       std::max(base, baseTimer_),
                       timer_);
}
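As a worked illustration of the window arithmetic: with num_batches_regularization = 100, timer_ = 250, and baseTimer_ = 0, base is 250 - 250 % 100 = 200, so catchUpWith applies the regularization owed for batches [200, 250) that the periodic doTraversal has not yet covered.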
void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
                                      const ParameterConfig& config,
                                      size_t sparseId) const {
  CHECK(sparseId == -1UL) << "Sparse update is not supported";
  real learningRate = config.learning_rate() * learningRate_;

  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];

  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
}
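As PARAMETER_WEIGHTED_INFINITY_NORM suggests, Adamax replaces Adam's second moment with an exponentially weighted infinity norm. A minimal scalar sketch of the standard Adamax rule that adamaxApply presumably implements; names are illustrative:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of one Adamax step; a simplification, not the actual kernel.
void adamaxStep(float& value, float grad, float& mom, float& u,
                float beta1, float beta2, int64_t step, float learningRate) {
  mom = beta1 * mom + (1 - beta1) * grad;
  // Exponentially weighted infinity norm of past gradients.
  u = std::max(beta2 * u, std::fabs(grad));
  // Only the first moment needs bias correction in Adamax.
  value -= (learningRate / (1 - std::pow(beta1, step))) * mom / u;
}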
void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];

  real accumulatedRou = rou_;
  bool firstTime = timer_ == 0;
  if (sparseId != -1UL) {
    CHECK_LT(sparseId, t0Vec_.size());
    // For a sparsely updated row, raise rou to the number of batches since
    // the row was last touched, so the decay catches up in a single step.
    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
    firstTime = t0Vec_[sparseId] == 0;
    t0Vec_[sparseId] = timer_ + 1;
  }

  real epsilon = optConfig_.ada_epsilon();
  real learningRate = learningRate_ * config.learning_rate();
  real momentum = config.momentum();
  real decayRate = applyDecay_ ? config.decay_rate() : 0;
  rmspropApply(value,
               grad,
               mom,
               sum,
               sum1,
               lr,
               accumulatedRou,
               rou_,
               epsilon,
               learningRate,
               momentum,
               decayRate,
               firstTime);
}
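The two accumulators suggest the Graves-style RMSProp variant, which also tracks the mean gradient and normalizes by an estimate of the gradient's standard deviation. A scalar sketch under that assumption; this is a guess at the rule rmspropApply implements, and all names are illustrative:

#include <cmath>

// Scalar sketch of one Graves-style RMSProp step; a simplification, not the
// actual kernel.
void rmspropStep(float& value, float grad, float& mom,
                 float& sumG2,  // running average of grad^2
                 float& sumG,   // running average of grad
                 float accumulatedRou, float rou, float epsilon,
                 float learningRate, float momentum, float decayRate,
                 bool firstTime) {
  sumG2 = firstTime ? grad * grad
                    : accumulatedRou * sumG2 + (1 - rou) * grad * grad;
  sumG = accumulatedRou * sumG + (1 - rou) * grad;
  // Normalize by an estimate of the gradient's standard deviation.
  float lr = 1.0f / std::sqrt(sumG2 - sumG * sumG + epsilon);
  mom = momentum * mom - learningRate * lr * (grad + decayRate * value);
  value += mom;
}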
void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
                                           const ParameterConfig& config,
                                           size_t sparseId) const {
  real globalThreshold = optConfig_.gradient_clipping_threshold();
  real localThreshold = config.gradient_clipping_threshold();

  // Use the per-parameter gradient clipping threshold if it is enabled,
  // otherwise fall back to the global one.
  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
  std::string field = localThreshold > 0.0f ? "local" : "global";

  real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
  if (maxAbsGrad > threshold) {
    if (FLAGS_log_clipping) {
      real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
                        vecs[PARAMETER_GRADIENT]->getSize();
      LOG(INFO) << "parameter=" << config.name() << " needs clipping by "
                << field << " threshold=" << threshold
                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
    }
    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
  }
  optimizer_->update(vecs, config, sparseId);
}
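Clipping is a decorator here: it clamps the gradient in place and then delegates to whatever optimizer it wraps, so it composes with any of the update rules above. Elementwise, BaseMatrix::clip amounts to the following sketch (illustrative helper, not the actual API):

#include <algorithm>

// Clamp a single gradient element into [-threshold, threshold].
inline float clipGrad(float g, float threshold) {
  return std::max(-threshold, std::min(g, threshold));
}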
std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
    const ParameterConfig& paramConfig, int idx) {
  // Hooks are cached by (parameter name, hook index), so parameters that
  // share a configuration share a single hook instance.
  std::pair<std::string, int> key = {paramConfig.name(), idx};
  return g_hookCache_.get(
      key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
}
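g_hookCache_.get takes a key and a factory, returning a cached instance if one is still alive and invoking the factory otherwise. A minimal sketch of that pattern with std::weak_ptr; the class and names are illustrative assumptions, not PaddlePaddle's actual implementation, and a pair key needs a user-supplied hasher:

#include <memory>
#include <mutex>
#include <unordered_map>

// Illustrative weak-pointer memo cache; entries expire automatically once
// the last shared_ptr to a value is released.
template <typename K, typename V, typename Hash = std::hash<K>>
class WeakKVCache {
 public:
  template <typename Creator>
  std::shared_ptr<V> get(const K& key, Creator create) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto& slot = map_[key];
    if (auto cached = slot.lock()) return cached;  // still alive: reuse it
    std::shared_ptr<V> fresh = create();
    slot = fresh;  // store weakly so the cache never extends lifetime
    return fresh;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<K, std::weak_ptr<V>, Hash> map_;
};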