// Incrementally refreshes the model for the pair (s, a) after one more
// transition towards s1 has been recorded in the experience.
//
// Only the reward entry (s, a) and row s of the transition matrix of
// action a are touched. The incremental path presumably relies on the row
// having been kept in sync up to the previous visit -- confirm with callers.
void SparseRLModel<E>::sync(const size_t s, const size_t a, const size_t s1) {
    const auto totalVisits = experience_.getVisitsSum(s, a);

    // Incremental updates accumulate floating point error, so every 10000
    // visits (and also when there are no visits at all) we fall back to a
    // full recomputation from the raw experience counts.
    if ( totalVisits % 10000ul == 0ul ) {
        sync(s, a);
        return;
    }

    auto & matrix = transitions_[a];

    // Very first visit: swap the initial self-transition for the observed one.
    if ( totalVisits == 1ul ) {
        matrix.coeffRef(s, s)  = 0.0;
        matrix.coeffRef(s, s1) = 1.0;

        const auto rewardSum = experience_.getRewardSum(s, a);
        if ( checkDifferentSmall(0.0, rewardSum) )
            rewards_.coeffRef(s, a) = rewardSum;
        return;
    }

    // Average reward, written only when it actually changed.
    const double meanReward = experience_.getRewardSum(s, a) / totalVisits;
    if ( checkDifferentGeneral(meanReward, rewards_.coeff(s, a)) )
        rewards_.coeffRef(s, a) = meanReward;

    // Every entry of the row shares the denominator (totalVisits - 1), so it
    // is enough to bump the entry for s1 using that old denominator and then
    // renormalize the row: the outcome equals recomputing all the entries
    // with the new denominator, which keeps the row ready for the next call.
    const double visitsToS1 = static_cast<double>(experience_.getVisits(s, a, s1));
    const double updatedEntry = visitsToS1 / static_cast<double>(totalVisits - 1);

    // The old entry must be read before it gets overwritten below.
    const double rowSum = 1.0 + (updatedEntry - matrix.coeff(s, s1));

    matrix.coeffRef(s, s1) = updatedEntry;
    matrix.row(s) /= rowSum;
}
// Returns the probability with which the greedy policy picks action a in
// state s: all actions whose Q-value ties with the best one (up to the
// tolerance of checkEqualGeneral) share the probability uniformly, every
// other action gets zero.
double QGreedyPolicy::getActionProbability(const size_t & s, size_t a) const {
    double best = q_(s, 0);
    unsigned ties = 1;

    for ( size_t action = 1; action < A; ++action ) {
        const double value = q_(s, action);

        // Approximately equal to the running best: one more tied action.
        if ( checkEqualGeneral(value, best) ) {
            ++ties;
            continue;
        }
        // Strictly better: it becomes the new (so far unique) best.
        if ( value > best ) {
            best = value;
            ties = 1;
        }
    }

    return checkDifferentGeneral(q_(s, a), best) ? 0.0 : 1.0 / ties;
}
// Fully recomputes the model for the pair (s, a) from the raw experience:
// row s of the transition matrix of action a becomes visits / visitSum for
// every visited next state, and the reward entry (s, a) becomes the average
// recorded reward. Does nothing when the pair has never been visited.
void SparseRLModel<E>::sync(const size_t s, const size_t a) {
    // Nothing to do
    const auto visitSum = experience_.getVisitsSum(s, a);
    if ( visitSum == 0ul ) return;

    // Clear the placeholder self-transition (the transitions apparently start
    // out as identity matrices -- confirm against the constructor). This must
    // not be limited to visitSum == 1: when the first full sync of this pair
    // happens after several recorded visits, none of which landed on s, the
    // leftover 1.0 on the diagonal would make the row sum exceed one. If s has
    // been visited as next state the loop below overwrites the entry anyway,
    // and checking coeff() first avoids inserting a new explicit zero.
    if ( experience_.getVisits(s, a, s) == 0 && transitions_[a].coeff(s, s) != 0.0 )
        transitions_[a].coeffRef(s, s) = 0.0;

    // Create reciprocal for fast division
    const double visitSumReciprocal = 1.0 / visitSum;

    // Normalize; unvisited next states are skipped so that no new
    // entries are created for them in the sparse matrix.
    for ( size_t s1 = 0; s1 < S; ++s1 ) {
        const auto visits = experience_.getVisits(s, a, s1);
        if ( visits > 0 )
            transitions_[a].coeffRef(s, s1) = static_cast<double>(visits) * visitSumReciprocal;
    }

    // Average reward, written only when it actually changed.
    const double rewValue = experience_.getRewardSum(s, a) * visitSumReciprocal;
    if ( checkDifferentGeneral(rewValue, rewards_.coeff(s, a)) )
        rewards_.coeffRef(s, a) = rewValue;
}