size_t QSoftmaxPolicy::sampleAction(const size_t & s) const { if ( temperature_ == 0.0 ) return greedy_.sampleAction(s); Vector actionValues(A); unsigned infinities = 0; for ( size_t a = 0; a < A; ++a ) { actionValues(a) = std::exp(q_(s, a) / temperature_); if ( std::isinf(actionValues(a)) ) infinities++; } if (infinities) { auto pickDistribution = std::uniform_int_distribution<unsigned>(0, infinities-1); unsigned selection = pickDistribution(rand_); size_t retval = 0; for ( ; retval < A - 1; ++retval) { if ( std::isinf(actionValues(retval)) && !selection ) break; --selection; } return retval; } else { actionValues.normalize(); return sampleProbability(A, actionValues, rand_); } }
/// @brief Expands the given belief list with new, maximally-distant beliefs.
///
/// For every belief already in the list (including ones appended during this
/// very call), and for every action, this samples 20 (state, observation)
/// pairs from the model, computes the resulting updated belief, and keeps the
/// candidate whose L1 distance from every belief currently in the list is
/// largest. The single best candidate across all actions is appended, if it
/// is not (numerically) already present. Expansion stops once `max` beliefs
/// are reached or every belief has been processed.
///
/// @param max Maximum size the belief list may grow to.
/// @param blp The belief list to expand; must be non-null and non-empty
///            (bl.front() and ++std::begin(bl) below assume at least one
///            element — NOTE(review): precondition not checked here, confirm
///            callers guarantee it).
void BeliefGenerator<M>::expandBeliefList(size_t max, BeliefList * blp) const {
    assert(blp);
    auto & bl = *blp;
    size_t size = bl.size();
    // Per-action best candidate belief and its distance from the list.
    std::vector<Belief> newBeliefs(A);
    std::vector<double> distances(A);
    auto dBegin = std::begin(distances), dEnd = std::end(distances);

    // L1 distance
    auto computeDistance = [this](const Belief & lhs, const Belief & rhs) {
        double distance = 0.0;
        for ( size_t i = 0; i < S; ++i )
            distance += std::abs(lhs[i] - rhs[i]);
        return distance;
    };

    Belief helper; double distance;
    // We apply the discovery process also to all beliefs we discover
    // along the way.
    // NOTE(review): `it` is held across bl.emplace_back() below — if
    // BeliefList is a std::vector this invalidates `it` on reallocation;
    // confirm the container type or reserve(max) up front.
    for ( auto it = std::begin(bl); it != std::end(bl); ++it ) {
        // Compute all new beliefs
        for ( size_t a = 0; a < A; ++a ) {
            distances[a] = 0.0;
            // 20 Monte-Carlo trials per action to find a far-away belief.
            for ( int j = 0; j < 20; ++j ) {
                size_t s = sampleProbability(S, *it, rand_);

                size_t o;
                std::tie(std::ignore, o, std::ignore) = model_.sampleSOR(s, a);
                helper = updateBelief(model_, *it, a, o);

                // Compute distance (here we compare also against elements we just added!)
                distance = computeDistance(helper, bl.front());
                for ( auto jt = ++std::begin(bl); jt != std::end(bl); ++jt ) {
                    if ( checkEqualSmall(distance, 0.0) ) break; // We already have it!
                    distance = std::min(distance, computeDistance(helper, *jt));
                }
                // Select the best found over 20 times
                if ( distance > distances[a] ) {
                    distances[a] = distance;
                    newBeliefs[a] = helper;
                }
            }
        }
        // Find furthest away, add only if it is new.
        size_t id = std::distance( dBegin, std::max_element(dBegin, dEnd) );
        if ( checkDifferentSmall(distances[id], 0.0) ) {
            bl.emplace_back(std::move(newBeliefs[id]));
            ++size;
            if ( size == max ) break;
        }
    }
}
/// @brief Samples a (next state, reward) pair for the given state-action.
///
/// The successor state is drawn from the transition distribution stored in
/// row `s` of the sparse matrix for action `a`; the reward is the value
/// recorded for the (s, a) entry of the sparse reward matrix.
///
/// @param s The current state.
/// @param a The action performed.
/// @return A tuple of the sampled next state and its reward.
std::tuple<size_t, double> SparseRLModel<E>::sampleSR(const size_t s, const size_t a) const {
    const size_t nextState = sampleProbability(S, transitions_[a].row(s), rand_);
    const double reward = rewards_.coeff(s, a);
    return std::make_tuple(nextState, reward);
}
/// @brief Samples a (next state, reward) pair for the given state-action.
///
/// Draws the successor state from row `s` of the transition matrix for
/// action `a`, then looks up the reward attached to the full
/// (state, action, next state) triple.
///
/// @param s The current state.
/// @param a The action performed.
/// @return A tuple of the sampled next state and its reward.
std::tuple<size_t, double> Model::sampleSR(size_t s, size_t a) const {
    const size_t sampledState = sampleProbability(S, transitions_[a].row(s), rand_);
    return std::make_tuple(sampledState, rewards_[a](s, sampledState));
}
std::pair<size_t, double> RLModel::sample(size_t s, size_t a) const { size_t s1 = sampleProbability(transitions_[s][a], S, rand_); return std::make_pair(s1, rewards_[s][a][s1]); }