void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[],
                                              const ParameterConfig& paraConfig,
                                              size_t sparseId) const {
  if (sparseId != -1LU) {
    // Sparse row: apply the lazily-evaluated momentum scheme.
    CHECK_LT(sparseId, t0Vec_.size());
    if (t0Vec_[sparseId] == 0) {
      // First touch of this row: seed the v accumulator from the value.
      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
      t0Vec_[sparseId] = 1;
    }
    vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
                                     -alpha_ * gamma_ * learningRate_);
    vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
                                     tau_ * alpha_ * gamma_ * learningRate_);
    // Reconstruct the parameter value from the two accumulators.
    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
                               tau_ / beta_ + 1.0 / alpha_,
                               *vecs[PARAMETER_MOMENTUM_VT],
                               1.0 / beta_);
  } else {
    // Dense parameter: plain momentum SGD with optional weight decay.
    vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                     *vecs[PARAMETER_MOMENTUM],
                                     learningRate_ * paraConfig.learning_rate(),
                                     paraConfig.momentum(),
                                     applyDecay_ ? paraConfig.decay_rate() : 0);
  }
}
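// A minimal scalar sketch (not part of the original source) of the lazy
// momentum scheme above: two per-element accumulators u and v are kept so
// that the momentum-SGD iterate can be reconstructed as
//   w = (tau/beta + 1/alpha) * u + (1/beta) * v,
// touching only rows whose gradients are nonzero. The names below
// (SparseMomentumSketchState, sparseMomentumSketchStep) are hypothetical,
// and the alpha/beta/tau schedule is maintained elsewhere in the optimizer.
struct SparseMomentumSketchState {
  double u = 0.0;        // scaled momentum accumulator (PARAMETER_MOMENTUM_UT)
  double v = 0.0;        // scaled value accumulator (PARAMETER_MOMENTUM_VT)
  bool touched = false;  // plays the role of the t0Vec_ entry
};

inline double sparseMomentumSketchStep(SparseMomentumSketchState& s, double w,
                                       double g, double alpha, double beta,
                                       double tau, double gamma,
                                       double learningRate) {
  if (!s.touched) {
    s.v = w;  // first touch: seed v from the current value, as above
    s.touched = true;
  }
  s.u -= alpha * gamma * learningRate * g;
  s.v += tau * alpha * gamma * learningRate * g;
  return (tau / beta + 1.0 / alpha) * s.u + s.v / beta;  // reconstructed w
}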
void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                        const ParameterConfig& config,
                                        size_t sparseId) const {
  CHECK(sparseId == -1LU) << "Sparse update is not supported";
  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  // accum tracks the running average of squared gradients; accum_update
  // tracks the running average of squared parameter updates.
  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];

  real learningRate = learningRate_ * config.learning_rate();
  real momentum = config.momentum();
  real decayRate = applyDecay_ ? config.decay_rate() : 0;
  adadeltaApply(value, grad, mom, accum, accum_update, lr, rou_, epsilon_,
                learningRate, momentum, decayRate);
}
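#include <cmath>  // std::sqrt, needed only by the sketch below

// A hedged scalar sketch of the AdaDelta rule (Zeiler, 2012) that the
// adadeltaApply call above implements in vectorized form, with the
// momentum and weight-decay terms folded in the same way as the Paddle
// parameterization. This is an illustrative rewrite, not the library's
// adadeltaApply; the names below are hypothetical.
struct AdaDeltaSketchState {
  double accumGrad = 0.0;    // E[g^2], role of PARAMETER_GRADIENT_SQURESUM
  double accumUpdate = 0.0;  // E[dx^2], role of PARAMETER_GRADIENT_SQURESUM1
  double mom = 0.0;          // momentum buffer
};

inline double adaDeltaSketchStep(AdaDeltaSketchState& s, double w, double g,
                                 double rho, double eps, double lr,
                                 double momentum, double decay) {
  s.accumGrad = rho * s.accumGrad + (1.0 - rho) * g * g;
  double scale = std::sqrt((s.accumUpdate + eps) / (s.accumGrad + eps));
  double step = scale * g;
  s.accumUpdate = rho * s.accumUpdate + (1.0 - rho) * step * step;
  s.mom = momentum * s.mom - lr * scale * (g + decay * w);
  return w + s.mom;
}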
void AdagradParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  // The squared-gradient sum is split across two buffers that are merged
  // periodically; both contribute to the effective AdaGrad denominator.
  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];

  real epsilon = optConfig_.ada_epsilon();
  real learningRate = learningRate_ * config.learning_rate();
  real momentum = config.momentum();
  real decayRate = applyDecay_ ? config.decay_rate() : 0;
  adagradApply(value, grad, mom, accum_buffer, accum, lr, epsilon,
               learningRate, momentum, decayRate);
}
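#include <cmath>

// A hedged scalar sketch of the AdaGrad step (Duchi et al., 2011) as
// parameterized above. For simplicity it collapses the two square-sum
// buffers (accum_buffer and accum) into a single running sum; the real
// adagradApply keeps them separate. Names below are hypothetical.
struct AdaGradSketchState {
  double sumSq = 0.0;  // running sum of squared gradients
  double mom = 0.0;    // momentum buffer
};

inline double adaGradSketchStep(AdaGradSketchState& s, double w, double g,
                                double eps, double lr, double momentum,
                                double decay) {
  s.sumSq += g * g;
  double scale = 1.0 / std::sqrt(s.sumSq + eps);
  s.mom = momentum * s.mom - lr * scale * (g + decay * w);
  return w + s.mom;
}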
void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];

  real accumulatedRou = rou_;
  bool firstTime = timer_ == 0;
  if (sparseId != -1LU) {
    CHECK_LT(sparseId, t0Vec_.size());
    // For a sparse row, fold the rho decay of every step skipped since the
    // row was last touched into a single accumulated factor.
    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
    firstTime = t0Vec_[sparseId] == 0;
    t0Vec_[sparseId] = timer_ + 1;
  }

  real epsilon = optConfig_.ada_epsilon();
  real learningRate = learningRate_ * config.learning_rate();
  real momentum = config.momentum();
  real decayRate = applyDecay_ ? config.decay_rate() : 0;
  rmspropApply(value, grad, mom, sum, sum1, lr, accumulatedRou, rou_, epsilon,
               learningRate, momentum, decayRate, firstTime);
}
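#include <cmath>

// A hedged scalar sketch of the centered RMSProp variant used above, which
// keeps both E[g^2] (sum) and E[g] (sum1) and normalizes the gradient by
// sqrt(E[g^2] - E[g]^2 + eps). accumulatedRho stands in for rho^k when k
// steps were skipped for a sparse row. The firstTime seeding here is an
// assumption for illustration; this is not the real rmspropApply, and the
// names below are hypothetical.
struct RmsPropSketchState {
  double meanSq = 0.0;  // E[g^2], role of PARAMETER_GRADIENT_SQURESUM
  double mean = 0.0;    // E[g],   role of PARAMETER_GRADIENT_SQURESUM1
  double mom = 0.0;     // momentum buffer
};

inline double rmsPropSketchStep(RmsPropSketchState& s, double w, double g,
                                double accumulatedRho, double rho, double eps,
                                double lr, double momentum, double decay,
                                bool firstTime) {
  if (firstTime) {
    // Assumed seeding: initialize the moments from the first gradient so
    // the early variance estimate is not vanishingly small.
    s.meanSq = g * g;
    s.mean = g;
  } else {
    s.meanSq = accumulatedRho * s.meanSq + (1.0 - rho) * g * g;
    s.mean = accumulatedRho * s.mean + (1.0 - rho) * g;
  }
  double scale = 1.0 / std::sqrt(s.meanSq - s.mean * s.mean + eps);
  s.mom = momentum * s.mom - lr * scale * (g + decay * w);
  return w + s.mom;
}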