template <typename M>
QFunction ValueIterationGeneral<M>::computeImmediateRewards(const M & model) const {
    QFunction pr = makeQFunction(S, A);

    // Collapse the reward function over next states:
    // R(s,a) = sum over s' of T(s,a,s') * ER(s,a,s')
    for ( size_t s = 0; s < S; ++s )
        for ( size_t a = 0; a < A; ++a )
            for ( size_t s1 = 0; s1 < S; ++s1 )
                pr(s, a) += model.getTransitionProbability(s,a,s1) * model.getExpectedReward(s,a,s1);

    return pr;
}
template <typename M>
std::tuple<bool, ValueFunction, QFunction> ValueIterationGeneral<M>::operator()(const M & model) {
    // Extract necessary knowledge from model so we don't have to pass it around
    S = model.getS();
    A = model.getA();
    discount_ = model.getDiscount();

    {
        // Verify that the parameter value function is compatible.
        size_t size = std::get<VALUES>(vParameter_).size();
        if ( size != S ) {
            if ( size != 0 )
                std::cerr << "AIToolbox: Size of starting value function in ValueIteration::solve() is incorrect, ignoring...\n";
            // Defaulting
            v1_ = makeValueFunction(S);
        }
        else
            v1_ = vParameter_;
    }

    auto ir = computeImmediateRewards(model);

    unsigned timestep = 0;
    double variation = epsilon_ * 2; // Make it bigger than epsilon so the loop can start

    Values val0;
    QFunction q = makeQFunction(S, A);

    bool useEpsilon = checkDifferentSmall(epsilon_, 0.0);
    while ( timestep < horizon_ && (!useEpsilon || variation > epsilon_) ) {
        ++timestep;

        auto & val1 = std::get<VALUES>(v1_);
        val0 = val1;

        q = computeQFunction(model, ir);
        bellmanOperator(q, &v1_);

        // We do this only if the epsilon specified is positive; otherwise we
        // continue for all the timesteps.
        if ( useEpsilon )
            variation = (val1 - val0).cwiseAbs().maxCoeff();
    }

    // We do not guarantee that the Value/QFunctions are optimal, since we stop
    // as soon as the variation is within epsilon.
    return std::make_tuple(variation <= epsilon_, v1_, q);
}
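// Hedged usage sketch, not part of the library code above: it only relies on
// the tuple returned by operator() (converged flag, ValueFunction, QFunction).
// The (horizon, epsilon) constructor and the `MyModel` type are assumptions,
// since neither appears in this section; any MDP type exposing getS(), getA(),
// getDiscount(), getTransitionProbability() and getExpectedReward() would fit.
template <typename MyModel>
void valueIterationUsageSketch(const MyModel & model) {
    ValueIterationGeneral<MyModel> solver(1000, 0.001); // assumed constructor signature
    const auto [converged, values, qfun] = solver(model);
    if ( !converged )
        std::cerr << "Stopped at the horizon before the variation dropped below epsilon.\n";
    (void)values; (void)qfun; // e.g. feed qfun to a greedy policy
}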
SARSA::SARSA(size_t ss, size_t aa, double discount, double alpha) :
        S(ss), A(aa), alpha_(alpha), discount_(discount), q_(makeQFunction(S, A))
{
    if ( discount_ <= 0.0 || discount_ > 1.0 )
        throw std::invalid_argument("Discount parameter must be in (0,1]");
    if ( alpha_ <= 0.0 || alpha_ > 1.0 )
        throw std::invalid_argument("Learning rate parameter must be in (0,1]");
}
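// Minimal sketch of the on-policy update the SARSA class applies to q_ at each
// step; the rule itself is standard, but `sarsaUpdateSketch` is a hypothetical
// free function written only for illustration and is not part of the class
// above. It assumes QFunction is an S x A matrix indexed as q(s, a), matching
// its use elsewhere in this section.
inline void sarsaUpdateSketch(QFunction & q, size_t s, size_t a, size_t s1, size_t a1,
                              double reward, double discount, double alpha) {
    // The TD target bootstraps on the action actually chosen in the next state.
    const double target = reward + discount * q(s1, a1);
    q(s, a) += alpha * ( target - q(s, a) );
}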
template <typename M>
QLearning<M>::QLearning(const M& model, double alpha) :
        model_(model), S(model_.getS()), A(model_.getA()), alpha_(alpha),
        discount_(model_.getDiscount()), q_(makeQFunction(S,A))
{
    if ( alpha_ <= 0.0 || alpha_ > 1.0 )
        throw std::invalid_argument("Learning rate parameter must be in (0,1]");
}
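// Companion sketch for the off-policy Q-learning update applied to q_;
// `qLearningUpdateSketch` is again a hypothetical free function, not a member
// of QLearning. It assumes QFunction is an Eigen-style S x A matrix so that
// q.row(s1).maxCoeff() is available (consistent with the cwiseAbs()/maxCoeff()
// usage elsewhere in this section).
inline void qLearningUpdateSketch(QFunction & q, size_t s, size_t a, size_t s1,
                                  double reward, double discount, double alpha) {
    // Unlike SARSA, the TD target bootstraps on the best next action,
    // regardless of what the behaviour policy actually does next.
    const double target = reward + discount * q.row(s1).maxCoeff();
    q(s, a) += alpha * ( target - q(s, a) );
}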
template <typename M>
PrioritizedSweeping<M>::PrioritizedSweeping(const M & m, double theta, unsigned n) :
        S(m.getS()), A(m.getA()), N(n), theta_(theta), model_(m),
        qfun_(makeQFunction(S,A)), vfun_(makeValueFunction(S)) {}
        }
        else
            v1_ = vParameter_;
    }

    // In this tolerance-based variant, Eigen-aware models hand over their
    // reward function directly; other models fall back to computing the
    // immediate rewards as before.
    const auto & ir = [&]{
        if constexpr (is_model_eigen_v<M>) return model.getRewardFunction();
        else return computeImmediateRewards(model);
    }();

    unsigned timestep = 0;
    double variation = tolerance_ * 2; // Make it bigger than the tolerance so the loop can start

    Values val0;
    auto & val1 = v1_.values;
    QFunction q = makeQFunction(S, A);

    const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);
    while ( timestep < horizon_ && (!useTolerance || variation > tolerance_) ) {
        ++timestep;
        AI_LOGGER(AI_SEVERITY_DEBUG, "Processing timestep " << timestep);

        val0 = val1;

        // We apply the discount directly on the values vector.
        val1 *= model.getDiscount();
        q = computeQFunction(model, val1, ir);

        // Compute the new value function (note that val1 is overwritten as well).
        bellmanOperatorInline(q, &v1_);

        // As above, the variation is tracked only when a positive tolerance was given.
        if ( useTolerance )
            variation = (val1 - val0).cwiseAbs().maxCoeff();
    }

    return std::make_tuple(variation <= tolerance_, v1_, q);
}