template <typename M>
double RTBSS<M>::simulate(const Belief & b, unsigned horizon) {
    if ( horizon == 0 ) return 0;

    std::vector<size_t> actionList(A);
    // Here we use no heuristic to sort the actions. If you want one,
    // add it here!
    std::iota(std::begin(actionList), std::end(actionList), 0);

    double max = -std::numeric_limits<double>::infinity();

    for ( auto a : actionList ) {
        double rew = beliefExpectedReward(model_, b, a);

        double uBound = rew + upperBound(b, a, horizon - 1);
        // Expand this action only if its upper bound can still beat the incumbent.
        if ( uBound > max ) {
            for ( size_t o = 0; o < O; ++o ) {
                double p = beliefObservationProbability(model_, b, a, o);
                // Only recurse if this observation can actually occur.
                if ( checkDifferentSmall(p, 0.0) )
                    rew += model_.getDiscount() * p * simulate(updateBelief(model_, b, a, o), horizon - 1);
            }
        }
        if ( rew > max ) {
            max = rew;
            if ( horizon == maxDepth_ ) maxA_ = a;
        }
    }
    return max;
}
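The `uBound > max` test is what makes this a branch-and-bound search: an action is only expanded when its optimistic estimate can still beat the incumbent. Below is a minimal standalone sketch of that pruning test; the names (`exact`, `upper`) are illustrative and not part of the library.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <limits>

// exact(a) stands in for the expensive recursive evaluation of action a;
// upper(a) is a cheap optimistic estimate, with upper(a) >= exact(a) for all a.
double bestValue(size_t A,
                 const std::function<double(size_t)> & exact,
                 const std::function<double(size_t)> & upper) {
    double max = -std::numeric_limits<double>::infinity();
    for (size_t a = 0; a < A; ++a) {
        if (upper(a) <= max) continue;   // prune: cannot beat the incumbent
        max = std::max(max, exact(a));   // expand only promising actions
    }
    return max;
}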
template <typename M>
void PrioritizedSweeping<M>::stepUpdateQ(size_t s, size_t a) {
    auto & values = std::get<VALUES>(vfun_);
    { // Update q[s][a]
        double newQValue = 0;
        for ( size_t s1 = 0; s1 < S; ++s1 ) {
            double probability = model_.getTransitionProbability(s,a,s1);
            if ( checkDifferentSmall( probability, 0.0 ) )
                newQValue += probability * ( model_.getExpectedReward(s,a,s1) + model_.getDiscount() * values[s1] );
        }
        qfun_(s, a) = newQValue;
    }

    double p = values[s];
    { // Update value and action
        values[s] = qfun_.row(s).maxCoeff(&std::get<ACTIONS>(vfun_)[s]);
    }
    p = std::fabs(values[s] - p);

    // If it changed enough, we're going to update its parents.
    if ( p > theta_ ) {
        auto it = queueHandles_.find(s);

        if ( it != std::end(queueHandles_) && std::get<PRIORITY>(*(it->second)) < p )
            queue_.increase(it->second, std::make_tuple(p, s));
        else
            queueHandles_[s] = queue_.push(std::make_tuple(p, s));
    }
}
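The queue bookkeeping above relies on a handle-based heap so a state's priority can be raised in place. Here is a small self-contained sketch of that pattern, assuming boost::heap::fibonacci_heap; note that, unlike the code above, this variant also skips the push when the state is already queued at an equal or higher priority, so duplicate entries cannot accumulate.

#include <boost/heap/fibonacci_heap.hpp>
#include <cstddef>
#include <tuple>
#include <unordered_map>

using Entry = std::tuple<double, size_t>;        // (priority, state)
using Queue = boost::heap::fibonacci_heap<Entry>;

// Raise the priority of s in place if it is already queued; otherwise push it.
void pushOrRaise(Queue & queue,
                 std::unordered_map<size_t, Queue::handle_type> & handles,
                 double p, size_t s) {
    auto it = handles.find(s);
    if (it == handles.end())
        handles[s] = queue.push(std::make_tuple(p, s));
    else if (std::get<0>(*(it->second)) < p)
        queue.increase(it->second, std::make_tuple(p, s));
}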
template <typename E>
void SparseRLModel<E>::sync(const size_t s, const size_t a, const size_t s1) {
    const auto visitSum = experience_.getVisitsSum(s, a);
    // This check is here to contain numerical errors. Once in a
    // while we reset them by forcing a true update using real data.
    if ( !(visitSum % 10000ul) ) return sync(s, a);

    if ( visitSum == 1ul ) {
        transitions_[a].coeffRef(s, s) = 0.0;
        transitions_[a].coeffRef(s, s1) = 1.0;
        if (checkDifferentSmall(0.0, experience_.getRewardSum(s, a)))
            rewards_.coeffRef(s, a) = experience_.getRewardSum(s, a);
    } else {
        const double newVisits = static_cast<double>(experience_.getVisits(s, a, s1));

        const double rewValue = experience_.getRewardSum(s, a) / visitSum;
        if (checkDifferentGeneral(rewValue, rewards_.coeff(s, a)))
            rewards_.coeffRef(s, a) = rewValue;

        const double newTransitionValue = newVisits / static_cast<double>(visitSum - 1);
        const double newVectorSum = 1.0 + (newTransitionValue - transitions_[a].coeff(s, s1));
        // This works because, as long as all the values in the transition row share the same
        // denominator (here visitSum - 1), the numerators do not matter: we can simply normalize.
        // At the end of the process the new values are the same as if we had updated directly
        // with the increased denominator, so this function can be called again correctly.
        transitions_[a].coeffRef(s, s1) = newTransitionValue;
        transitions_[a].row(s) /= newVectorSum;
    }
}
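The normalization trick in the comment can be checked numerically. A minimal standalone example with made-up counts, showing that bumping one numerator over the old denominator and then renormalizing matches a full recomputation from the raw counts:

#include <cassert>
#include <cmath>
#include <vector>

int main() {
    // Old raw counts: {3, 1}, so the old denominator (visitSum - 1) is 4
    // and the old probabilities are {3/4, 1/4}. A new visit to s1 == 1
    // arrives, so visitSum == 5 and the raw count for s1 becomes 2.
    std::vector<double> row = {3.0 / 4.0, 1.0 / 4.0};
    const double visitSum  = 5.0;
    const double newVisits = 2.0;

    const double newTransitionValue = newVisits / (visitSum - 1.0);   // 2/4
    const double newVectorSum = 1.0 + (newTransitionValue - row[1]);  // 5/4

    row[1] = newTransitionValue;
    for (auto & v : row) v /= newVectorSum;

    // Same result as recomputing directly from the raw counts: {3/5, 2/5}.
    assert(std::fabs(row[0] - 0.6) < 1e-12);
    assert(std::fabs(row[1] - 0.4) < 1e-12);
}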
template <typename M>
void BeliefGenerator<M>::expandBeliefList(size_t max, BeliefList * blp) const {
    assert(blp);
    auto & bl = *blp;
    size_t size = bl.size();

    std::vector<Belief> newBeliefs(A);
    std::vector<double> distances(A);
    auto dBegin = std::begin(distances), dEnd = std::end(distances);

    // L1 distance
    auto computeDistance = [this](const Belief & lhs, const Belief & rhs) {
        double distance = 0.0;
        for ( size_t i = 0; i < S; ++i )
            distance += std::abs(lhs[i] - rhs[i]);
        return distance;
    };

    Belief helper;
    double distance;
    // We apply the discovery process also to all beliefs we discover
    // along the way.
    for ( auto it = std::begin(bl); it != std::end(bl); ++it ) {
        // Compute all new beliefs
        for ( size_t a = 0; a < A; ++a ) {
            distances[a] = 0.0;
            for ( int j = 0; j < 20; ++j ) {
                size_t s = sampleProbability(S, *it, rand_);

                size_t o;
                std::tie(std::ignore, o, std::ignore) = model_.sampleSOR(s, a);
                helper = updateBelief(model_, *it, a, o);

                // Compute distance (here we compare also against elements we just added!)
                distance = computeDistance(helper, bl.front());
                for ( auto jt = std::next(std::begin(bl)); jt != std::end(bl); ++jt ) {
                    if ( checkEqualSmall(distance, 0.0) ) break; // We already have it!
                    distance = std::min(distance, computeDistance(helper, *jt));
                }
                // Keep the best candidate found over the 20 samples.
                if ( distance > distances[a] ) {
                    distances[a] = distance;
                    newBeliefs[a] = helper;
                }
            }
        }
        // Find the one furthest away; add it only if it is new.
        size_t id = std::distance( dBegin, std::max_element(dBegin, dEnd) );
        if ( checkDifferentSmall(distances[id], 0.0) ) {
            bl.emplace_back(std::move(newBeliefs[id]));
            ++size;
            if ( size == max ) break;
        }
    }
}
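The selection criterion above is essentially farthest-point sampling under the L1 metric: each candidate is scored by its distance to the nearest belief already collected, and the best-scoring candidate wins. A small standalone sketch of that score (illustrative helper names, not library code):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

using Belief = std::vector<double>;

// L1 distance between two beliefs, as in the lambda above.
double l1(const Belief & lhs, const Belief & rhs) {
    double d = 0.0;
    for (size_t i = 0; i < lhs.size(); ++i)
        d += std::abs(lhs[i] - rhs[i]);
    return d;
}

// Distance from a candidate to its nearest neighbor in the current list;
// candidates maximizing this score are the most "novel" beliefs, and a
// score of zero means the candidate is already in the list.
double noveltyScore(const Belief & candidate, const std::vector<Belief> & list) {
    double d = l1(candidate, list.front());
    for (size_t i = 1; i < list.size() && d > 0.0; ++i)
        d = std::min(d, l1(candidate, list[i]));
    return d;
}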
template <typename T>
bool isProbability(const size_t d, const T & in) {
    double p = 0.0;
    for ( size_t i = 0; i < d; ++i ) {
        const double value = static_cast<double>(in[i]);
        if ( value < 0.0 ) return false;
        p += value;
    }
    if ( checkDifferentSmall(p, 1.0) ) return false;

    return true;
}
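A short usage sketch, assuming the template above is in scope:

#include <cassert>
#include <vector>

int main() {
    const std::vector<double> good{0.5, 0.25, 0.25};
    const std::vector<double> bad {0.5, 0.25, 0.30};

    assert( isProbability(3, good)); // non-negative, sums to 1.0
    assert(!isProbability(3, bad));  // sums to 1.05
}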
template <typename M>
std::tuple<bool, ValueFunction, QFunction> ValueIterationGeneral<M>::operator()(const M & model) {
    // Extract the necessary knowledge from the model so we don't have to pass it around.
    S = model.getS();
    A = model.getA();
    discount_ = model.getDiscount();

    { // Verify that the parameter value function is compatible.
        size_t size = std::get<VALUES>(vParameter_).size();
        if ( size != S ) {
            if ( size != 0 )
                std::cerr << "AIToolbox: Size of starting value function in ValueIteration::solve() is incorrect, ignoring...\n";
            // Defaulting
            v1_ = makeValueFunction(S);
        }
        else
            v1_ = vParameter_;
    }

    auto ir = computeImmediateRewards(model);

    unsigned timestep = 0;
    double variation = epsilon_ * 2; // Make it bigger

    Values val0;
    QFunction q = makeQFunction(S, A);

    bool useEpsilon = checkDifferentSmall(epsilon_, 0.0);
    while ( timestep < horizon_ && (!useEpsilon || variation > epsilon_) ) {
        ++timestep;

        auto & val1 = std::get<VALUES>(v1_);
        val0 = val1;

        q = computeQFunction(model, ir);
        bellmanOperator(q, &v1_);

        // We update the variation only if the epsilon specified is positive;
        // otherwise we simply continue for all the timesteps.
        if ( useEpsilon )
            variation = (val1 - val0).cwiseAbs().maxCoeff();
    }

    // We do not guarantee that the returned Value/QFunctions are the optimal ones,
    // since we stop as soon as we are within epsilon.
    return std::make_tuple(variation <= epsilon_, v1_, q);
}
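The stopping test used in the loop is the sup-norm distance between two successive value functions, compared against the target epsilon. A small Eigen sketch of that test in isolation:

#include <Eigen/Core>

// True when two successive value functions differ by at most epsilon in
// every component (the same cwiseAbs().maxCoeff() check as above).
bool hasConverged(const Eigen::VectorXd & val0,
                  const Eigen::VectorXd & val1, double epsilon) {
    return (val1 - val0).cwiseAbs().maxCoeff() <= epsilon;
}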
template <typename M>
void PrioritizedSweeping<M>::batchUpdateQ() {
    for ( unsigned i = 0; i < N; ++i ) {
        if ( queue_.empty() ) return;

        // The state we extract has been processed already,
        // so it is the future we have to backtrack from.
        size_t s1;
        std::tie(std::ignore, s1) = queue_.top();

        queue_.pop();
        queueHandles_.erase(s1);

        // Update all (s, a) pairs that can transition into s1.
        for ( size_t s = 0; s < S; ++s )
            for ( size_t a = 0; a < A; ++a )
                if ( checkDifferentSmall(model_.getTransitionProbability(s,a,s1), 0.0) )
                    stepUpdateQ(s, a);
    }
}
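The inner double loop rediscovers the predecessors of s1 on every pop, at O(S * A) cost each time. A common alternative (illustrative only, not what this class does) is to precompute predecessor lists once from the transition model:

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Builds, for every state s1, the list of (s, a) pairs with T(s, a, s1) > 0.
// With this table the scan above becomes a single lookup per popped state,
// at the cost of one O(S * A * S) preprocessing pass and extra memory.
std::vector<std::vector<std::pair<size_t, size_t>>>
makePredecessors(size_t S, size_t A,
                 const std::function<double(size_t, size_t, size_t)> & T) {
    std::vector<std::vector<std::pair<size_t, size_t>>> predecessors(S);
    for (size_t s = 0; s < S; ++s)
        for (size_t a = 0; a < A; ++a)
            for (size_t s1 = 0; s1 < S; ++s1)
                if (T(s, a, s1) > 0.0)
                    predecessors[s1].emplace_back(s, a);
    return predecessors;
}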
template <typename M, typename>
std::tuple<double, VList> BlindStrategies::operator()(const M & m, const bool fasterConvergence) {
    const MDP::QFunction ir = [&]{
        if constexpr(MDP::is_model_eigen_v<M>) return m.getRewardFunction().transpose();
        else return MDP::computeImmediateRewards(m).transpose();
    }();
    // This function produces a very simple lower bound for the POMDP. The
    // bound for each action is computed assuming we take the same action forever
    // (so the bound for action 0 assumes we forever take action 0, the bound for
    // action 1 assumes we forever take action 1, and so on).
    VList retval;

    const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);

    double maxVariation = 0.0;
    for (size_t a = 0; a < m.getA(); ++a) {
        auto newAlpha = Vector(m.getS());
        auto oldAlpha = Vector(m.getS());
        // Note that here we can take the minimum for each action
        // separately, since the implied policy will take that action
        // forever anyway, so there cannot be "cross-pollination" between
        // different actions.
        if (fasterConvergence)
            oldAlpha.fill(ir.row(a).minCoeff() / std::max(0.0001, 1.0 - m.getDiscount()));
        else
            oldAlpha = ir.row(a);

        unsigned timestep = 0;
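The iteration that follows (truncated here) repeatedly applies the fixed-action Bellman backup until the alpha vector stabilizes: always taking action a, the value vector satisfies alpha = rA + discount * tA * alpha, and iterating from a vector that starts everywhere below the fixed point (such as the min-reward fill above) approaches the bound from below. A minimal Eigen sketch of one such backup, with illustrative names:

#include <Eigen/Core>

// One backup for the blind policy that always takes action a.
// rA: immediate rewards for action a; tA: transition matrix for action a.
Eigen::VectorXd blindBackup(const Eigen::VectorXd & rA,
                            const Eigen::MatrixXd & tA,
                            const Eigen::VectorXd & alpha, double discount) {
    return rA + discount * (tA * alpha);
}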
template <typename M>
std::tuple<double, ValueFunction> IncrementalPruning::operator()(const M & model) {
    // Initialize "global" variables
    S = model.getS();
    A = model.getA();
    O = model.getO();

    auto v = makeValueFunction(S); // TODO: May take user input

    unsigned timestep = 0;

    Pruner prune(S);
    Projecter projecter(model);

    const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);
    double variation = tolerance_ * 2; // Make it bigger

    while ( timestep < horizon_ && ( !useTolerance || variation > tolerance_ ) ) {
        ++timestep;

        // Compute all possible outcomes, from our previous results.
        // This means that for each action-observation pair, we are going
        // to obtain the same number of possible outcomes as the number
        // of entries in our initial vector w.
        auto projs = projecter(v[timestep-1]);

        size_t finalWSize = 0;
        // In this method we split the work by action, which will then
        // be joined again at the end of the loop.
        for ( size_t a = 0; a < A; ++a ) {
            // We prune each outcome separately to be sure
            // we do not replicate work later.
            for ( size_t o = 0; o < O; ++o ) {
                const auto begin = std::begin(projs[a][o]);
                const auto end   = std::end  (projs[a][o]);
                projs[a][o].erase(prune(begin, end, unwrap), end);
            }

            // Here we minimize the cost of the cross-sums by alternating
            // merges. We pick matches like a reverse binary tree, so that
            // we always pick lists that have been merged the least.
            //
            // Example for O==7:
            //
            //   0 <- 1    2 <- 3    4 <- 5    6
            //   0 ------> 2         4 ------> 6
            //             2 <---------------- 6
            //
            // In particular, the variables are:
            //
            // - oddOld:   Whether our starting step has an odd number of elements.
            //             If so, we skip the last one.
            // - front:    The id of the element at the "front" of our current pass.
            //             Note that since passes can be backwards this can be high.
            // - back:     Opposite of front; it excludes the last element if we
            //             have an odd number of elements.
            // - stepsize: The space between each "first" of each new merge.
            // - diff:     The space between each "first" and its match to merge.
            // - elements: The number of elements we have left to merge.
            bool oddOld = O % 2;
            int i, front = 0, back = O - oddOld, stepsize = 2, diff = 1, elements = O;
            while ( elements > 1 ) {
                for ( i = front; i != back; i += stepsize ) {
                    projs[a][i] = crossSum(projs[a][i], projs[a][i + diff], a, stepsize > 0);

                    const auto begin = std::begin(projs[a][i]);
                    const auto end   = std::end  (projs[a][i]);
                    projs[a][i].erase(prune(begin, end, unwrap), end);
                    --elements;
                }

                const bool oddNew = elements % 2;

                const int tmp = back;
                back  = front - ( oddNew ? 0 : stepsize );
                front = tmp   - ( oddOld ? 0 : stepsize );

                stepsize *= -2;
                diff     *= -2;

                oddOld = oddNew;
            }
            // Put the result where we can find it
            if (front != 0)
                projs[a][0] = std::move(projs[a][front]);
            finalWSize += projs[a][0].size();
        }

        VList w;
        w.reserve(finalWSize);
        // Here we don't have to do fancy merging since no cross-summing is involved.
        for ( size_t a = 0; a < A; ++a )
            w.insert(std::end(w), std::make_move_iterator(std::begin(projs[a][0])),
                                  std::make_move_iterator(std::end(projs[a][0])));

        // We have them all, and we prune one final time to be sure we have
        // computed the parsimonious set of value functions.
        const auto begin = std::begin(w);
        const auto end   = std::end  (w);
        w.erase(prune(begin, end, unwrap), end);

        v.emplace_back(std::move(w));

        // Check convergence
        if ( useTolerance )
            variation = weakBoundDistance(v[timestep-1], v[timestep]);
    }

    return std::make_tuple(useTolerance ? variation : 0.0, v);
}
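The merge schedule in the comment can be traced with a standalone program. For O == 7 it prints exactly the pairs in the diagram and ends with the result in slot 2, which is why the final move from projs[a][front] is needed:

#include <cstdio>

int main() {
    const int O = 7;
    bool oddOld = O % 2;
    int front = 0, back = O - oddOld, stepsize = 2, diff = 1, elements = O;
    while (elements > 1) {
        for (int i = front; i != back; i += stepsize) {
            std::printf("%d <- %d\n", i, i + diff); // merge i+diff into i
            --elements;
        }
        const bool oddNew = elements % 2;
        const int tmp = back;
        back  = front - (oddNew ? 0 : stepsize);
        front = tmp   - (oddOld ? 0 : stepsize);
        stepsize *= -2;
        diff     *= -2;
        oddOld = oddNew;
    }
    // Prints: 0 <- 1, 2 <- 3, 4 <- 5, 6 <- 4, 2 <- 0, 2 <- 6,
    // and finally "final result is in slot 2".
    std::printf("final result is in slot %d\n", front);
}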