Example #1
        template <typename M>
        double RTBSS<M>::simulate(const Belief & b, unsigned horizon) {
            if ( horizon == 0 ) return 0;

            std::vector<size_t> actionList(A);

            // Here we use no heuristic to sort the actions. If you want
            // one, add it here!
            std::iota(std::begin(actionList), std::end(actionList), 0);

            double max = -std::numeric_limits<double>::infinity();

            for ( auto a : actionList ) {
                double rew = beliefExpectedReward(model_, b, a);

                // Expand this action only if its optimistic bound can still
                // beat the best value found so far.
                double uBound = rew + upperBound(b, a, horizon - 1);
                if ( uBound > max ) {
                    for ( size_t o = 0; o < O; ++o ) {
                        double p = beliefObservationProbability(model_, b, a, o);
                        // Only recurse when this observation can actually occur.
                        if ( checkDifferentSmall(p, 0.0) )
                            rew += model_.getDiscount() * p * simulate(updateBelief(model_, b, a, o), horizon - 1);
                    }
                }
                if ( rew > max ) {
                    max = rew;
                    // At the root of the search, remember the best action found.
                    if ( horizon == maxDepth_ ) maxA_ = a;
                }
            }
            return max;
        }
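For context, beliefExpectedReward averages the model's immediate rewards over the current belief. Below is a minimal sketch of such a helper, assuming the same model interface seen in the other examples (getS, getTransitionProbability, getExpectedReward) and a belief indexable by state; the name and exact signature are illustrative, not the library's:

    // Illustrative sketch: expected immediate reward of taking action a in
    // belief b, weighing each transition's reward by the belief probability
    // of its source state. Names and signature are assumptions.
    template <typename M, typename B>
    double beliefExpectedRewardSketch(const M & model, const B & b, const size_t a) {
        double rew = 0.0;
        for ( size_t s = 0; s < model.getS(); ++s )
            for ( size_t s1 = 0; s1 < model.getS(); ++s1 )
                rew += b[s] * model.getTransitionProbability(s, a, s1)
                            * model.getExpectedReward(s, a, s1);
        return rew;
    }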
Example #2
        template <typename M>
        void PrioritizedSweeping<M>::stepUpdateQ(size_t s, size_t a) {
            auto & values = std::get<VALUES>(vfun_);
            { // Update q[s][a]
                double newQValue = 0;
                for ( size_t s1 = 0; s1 < S; ++s1 ) {
                    double probability = model_.getTransitionProbability(s,a,s1);
                    if ( checkDifferentSmall( probability, 0.0 ) )
                        newQValue += probability * ( model_.getExpectedReward(s,a,s1) + model_.getDiscount() * values[s1] );
                }
                qfun_(s, a) = newQValue;
            }

            // Save the old value so we can measure how much it changes.
            double p = values[s];
            {
                // Update the value and best action for this state.
                values[s] = qfun_.row(s).maxCoeff(&std::get<ACTIONS>(vfun_)[s]);
            }

            p = std::fabs(values[s] - p);

            // If it changed enough, we're going to update its parents.
            if ( p > theta_ ) {
                auto it = queueHandles_.find(s);

                if ( it != std::end(queueHandles_) && std::get<PRIORITY>(*(it->second)) < p )
                    queue_.increase(it->second, std::make_tuple(p, s));
                else
                    queueHandles_[s] = queue_.push(std::make_tuple(p, s));
            }
        }
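The first block above is a one-state Bellman backup written with Eigen containers. The same computation on plain arrays, with hypothetical T, R and V standing in for model_ and vfun_, looks like this:

    // Plain-array sketch of the Q update performed in stepUpdateQ. The
    // containers T[s][a][s1] and R[s][a][s1] stand in for the model's
    // transition probabilities and expected rewards; all names are
    // illustrative, not the library's.
    #include <cstddef>
    #include <vector>

    double bellmanQ(const std::vector<std::vector<std::vector<double>>> & T,
                    const std::vector<std::vector<std::vector<double>>> & R,
                    const std::vector<double> & V,
                    const double discount, const size_t s, const size_t a) {
        double q = 0.0;
        for ( size_t s1 = 0; s1 < V.size(); ++s1 )
            q += T[s][a][s1] * ( R[s][a][s1] + discount * V[s1] );
        return q;
    }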
Example #3
    template <typename E>
    void SparseRLModel<E>::sync(const size_t s, const size_t a, const size_t s1) {
        const auto visitSum = experience_.getVisitsSum(s, a);
        // Once in a while we reset accumulated numerical errors by forcing
        // a full update using the real visit counts.
        if ( !(visitSum % 10000ul) ) return sync(s, a);
        if ( visitSum == 1ul ) {
            // First observed transition: move all the probability mass from
            // the initial self-loop onto the transition we just saw.
            transitions_[a].coeffRef(s, s) = 0.0;
            transitions_[a].coeffRef(s, s1) = 1.0;
            if (checkDifferentSmall(0.0, experience_.getRewardSum(s, a)))
                rewards_.coeffRef(s, a) = experience_.getRewardSum(s, a);
        } else {
            const double newVisits = static_cast<double>(experience_.getVisits(s, a, s1));
            const double rewValue = experience_.getRewardSum(s, a) / visitSum;

            if (checkDifferentGeneral(rewValue, rewards_.coeff(s, a)))
                rewards_.coeffRef(s, a) = rewValue;

            const double newTransitionValue = newVisits / static_cast<double>(visitSum - 1);
            const double newVectorSum = 1.0 + (newTransitionValue - transitions_[a].coeff(s, s1));
            // This works because, as long as all the values in the transition row share
            // the same denominator (in this case visitSum - 1), the numerators do not
            // matter: we can simply normalize. At the end of the process the new values
            // are the same as if we had updated directly with the increased denominator,
            // so this function can be called again correctly.
            transitions_[a].coeffRef(s, s1) = newTransitionValue;
            transitions_[a].row(s) /= newVectorSum;
        }
    }
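The normalization argument in the comment above can be checked in isolation. Here is a small sketch on a plain probability row that currently sums to one (all names illustrative):

    // Applies the same trick to a std::vector row summing to 1: write the
    // new numerator over the old denominator, then divide the whole row by
    // its new sum, 1 + (newValue - oldValue). The result is identical to
    // recomputing every entry with the increased denominator.
    #include <cstddef>
    #include <vector>

    void incrementalRowUpdate(std::vector<double> & row, const size_t s1,
                              const double newVisits, const double visitSum) {
        const double newValue = newVisits / (visitSum - 1.0);
        const double newSum = 1.0 + (newValue - row[s1]);
        row[s1] = newValue;
        for ( auto & p : row ) p /= newSum;
    }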
Example #4
        template <typename M>
        void BeliefGenerator<M>::expandBeliefList(size_t max, BeliefList * blp) const {
            assert(blp);
            auto & bl = *blp;
            size_t size = bl.size();

            std::vector<Belief> newBeliefs(A);
            std::vector<double> distances(A);
            auto dBegin = std::begin(distances), dEnd = std::end(distances);

            // L1 distance
            auto computeDistance = [this](const Belief & lhs, const Belief & rhs) {
                double distance = 0.0;
                for ( size_t i = 0; i < S; ++i )
                    distance += std::abs(lhs[i] - rhs[i]);
                return distance;
            };

            Belief helper; double distance;
            // We apply the discovery process also to all beliefs we discover
            // along the way. We loop by index because the emplace_back below
            // may invalidate iterators into bl.
            for ( size_t i = 0; i < bl.size(); ++i ) {
                // Compute all new beliefs
                for ( size_t a = 0; a < A; ++a ) {
                    distances[a] = 0.0;
                    for ( int j = 0; j < 20; ++j ) {
                        size_t s = sampleProbability(S, bl[i], rand_);

                        size_t o;
                        std::tie(std::ignore, o, std::ignore) = model_.sampleSOR(s, a);
                        helper = updateBelief(model_, bl[i], a, o);

                        // Compute distance (here we compare also against elements we just added!)
                        distance = computeDistance(helper, bl.front());
                        for ( auto jt = std::next(std::begin(bl)); jt != std::end(bl); ++jt ) {
                            if ( checkEqualSmall(distance, 0.0) ) break; // We already have it!
                            distance = std::min(distance, computeDistance(helper, *jt));
                        }
                        // Keep the farthest belief found across the 20 samples.
                        if ( distance > distances[a] ) {
                            distances[a] = distance;
                            newBeliefs[a] = helper;
                        }
                    }
                }
                // Find furthest away, add only if it is new.
                size_t id = std::distance( dBegin, std::max_element(dBegin, dEnd) );
                if ( checkDifferentSmall(distances[id], 0.0) ) {
                    bl.emplace_back(std::move(newBeliefs[id]));
                    ++size;
                    if ( size == max ) break;
                }
            }
        }
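Here sampleProbability draws a state index from the discrete distribution defined by the belief. A minimal stand-in, assuming only a standard uniform generator; this is an illustrative sketch, not the library's implementation:

    // Draws an index in [0, d) according to the probabilities in `in`,
    // consuming one uniform sample from the generator. Illustrative
    // stand-in for the library's sampleProbability utility.
    #include <cstddef>
    #include <random>

    template <typename T, typename G>
    size_t sampleProbabilitySketch(const size_t d, const T & in, G & generator) {
        std::uniform_real_distribution<double> dist(0.0, 1.0);
        double p = dist(generator);
        for ( size_t i = 0; i + 1 < d; ++i ) {
            if ( p < static_cast<double>(in[i]) ) return i;
            p -= static_cast<double>(in[i]);
        }
        return d - 1; // The last index absorbs any floating-point leftover.
    }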
Example #5
    // Returns whether the first d entries of in are nonnegative and sum,
    // up to a small tolerance, to 1.
    template <typename T>
    bool isProbability(const size_t d, const T & in) {
        double p = 0.0;
        for ( size_t i = 0; i < d; ++i ) {
            const double value = static_cast<double>(in[i]);
            if ( value < 0.0 ) return false;
            p += value;
        }
        if ( checkDifferentSmall(p, 1.0) )
            return false;

        return true;
    }
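A typical call site, assuming a std::vector<double> input and the definition above being visible:

    #include <cassert>
    #include <vector>

    int main() {
        const std::vector<double> belief{0.25, 0.25, 0.5};
        // checkDifferentSmall allows a little floating-point drift around 1.0.
        assert(isProbability(belief.size(), belief));
    }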
Example #6
        template <typename M>
        std::tuple<bool, ValueFunction, QFunction> ValueIterationGeneral<M>::operator()(const M & model) {
            // Extract necessary knowledge from model so we don't have to pass it around
            S = model.getS();
            A = model.getA();
            discount_ = model.getDiscount();

            {
                // Verify that parameter value function is compatible.
                size_t size = std::get<VALUES>(vParameter_).size();
                if ( size != S ) {
                    if ( size != 0 )
                        std::cerr << "AIToolbox: Size of starting value function in ValueIteration::solve() is incorrect, ignoring...\n";
                    // Defaulting
                    v1_ = makeValueFunction(S);
                }
                else
                    v1_ = vParameter_;
            }

            auto ir = computeImmediateRewards(model);

            unsigned timestep = 0;
            double variation = epsilon_ * 2; // Bigger than epsilon_, so the loop runs at least once

            Values val0;
            QFunction q = makeQFunction(S, A);

            bool useEpsilon = checkDifferentSmall(epsilon_, 0.0);
            while ( timestep < horizon_ && (!useEpsilon || variation > epsilon_) ) {
                ++timestep;

                auto & val1 = std::get<VALUES>(v1_);
                val0 = val1;

                q = computeQFunction(model, ir);
                bellmanOperator(q, &v1_);

                // We do this only if the epsilon specified is positive, otherwise we
                // continue for all the timesteps.
                if ( useEpsilon )
                    variation = (val1 - val0).cwiseAbs().maxCoeff();
            }

            // We do not guarantee that the Value/QFunctions are the optimal ones, as we stop as soon as we are within epsilon.
            return std::make_tuple(variation <= epsilon_, v1_, q);
        }
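A hedged call-site sketch; the class name and (horizon, epsilon) constructor follow the library's ValueIteration solver, but treat the exact signature as an assumption:

    // Hypothetical usage of the solver above; model is any MDP model type M
    // accepted by operator().
    ValueIteration solver(100, 0.001);           // horizon, epsilon (assumed ctor)
    auto result = solver(model);
    const bool converged = std::get<0>(result);  // true when variation <= epsilon
    const auto & vf      = std::get<1>(result);  // ValueFunction
    const auto & q       = std::get<2>(result);  // QFunction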
Example #7
        template <typename M>
        void PrioritizedSweeping<M>::batchUpdateQ() {
            for ( unsigned i = 0; i < N; ++i ) {
                if ( queue_.empty() ) return;

                // The state we extract has already been processed, so it is
                // the future state we have to backtrack from.
                size_t s1;
                std::tie(std::ignore, s1) = queue_.top();

                queue_.pop();
                queueHandles_.erase(s1);

                for ( size_t s = 0; s < S; ++s )
                    for ( size_t a = 0; a < A; ++a )
                        if ( checkDifferentSmall(model_.getTransitionProbability(s,a,s1), 0.0) )
                            stepUpdateQ(s, a);
            }
        }
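Note that the double loop above rescans every (s, a) pair for each popped state. Purely as an illustration (this is not part of the snippet), one could build a predecessor index once and look it up instead:

    // Illustrative predecessor cache: for each state s1, the (s, a) pairs
    // that can reach it with nonzero probability. Built once, it would
    // replace the full S x A scan inside batchUpdateQ. Model mirrors the
    // interface used in the examples above.
    #include <cstddef>
    #include <utility>
    #include <vector>

    template <typename Model>
    std::vector<std::vector<std::pair<size_t, size_t>>>
    makePredecessors(const Model & model, const size_t S, const size_t A) {
        std::vector<std::vector<std::pair<size_t, size_t>>> predecessors(S);
        for ( size_t s = 0; s < S; ++s )
            for ( size_t a = 0; a < A; ++a )
                for ( size_t s1 = 0; s1 < S; ++s1 )
                    if ( model.getTransitionProbability(s, a, s1) > 0.0 )
                        predecessors[s1].emplace_back(s, a);
        return predecessors;
    }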
Example #8
    template <typename M, typename>
    std::tuple<double, VList> BlindStrategies::operator()(const M & m, const bool fasterConvergence) {
        const MDP::QFunction ir = [&]{
            if constexpr(MDP::is_model_eigen_v<M>) return m.getRewardFunction().transpose();
            else return MDP::computeImmediateRewards(m).transpose();
        }();
        // This function produces a very simple lower bound for the POMDP. The
        // bound for each action is computed assuming to take the same action forever
        // (so the bound for action 0 assumes to forever take action 0, the bound for
        // action 1 assumes to take action 1, etc.).
        VList retval;

        const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);

        double maxVariation = 0.0;
        for (size_t a = 0; a < m.getA(); ++a) {
            auto newAlpha = Vector(m.getS());
            auto oldAlpha = Vector(m.getS());
            // Note that here we can take the minimum for each action
            // separately, since the implied policy will take that action
            // forever anyway so there cannot be "cross-pollination" between
            // different actions.
            if (fasterConvergence)
                oldAlpha.fill(ir.row(a).minCoeff() / std::max(0.0001, 1.0 - m.getDiscount()));
            else
                oldAlpha = ir.row(a);

            unsigned timestep = 0;
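The snippet is truncated here, but the remainder of the loop conceptually iterates each alpha vector to the fixed point of its blind policy, alpha = r_a + discount * T_a * alpha. Below is a standalone Eigen sketch of that iteration, with assumed type aliases:

    // Illustrative fixed-point iteration for a single blind policy that
    // always takes action a. T is that action's S x S transition matrix and
    // r its immediate reward vector; the aliases are assumptions.
    #include <Eigen/Dense>

    using Vector   = Eigen::VectorXd;
    using Matrix2D = Eigen::MatrixXd;

    Vector blindAlpha(const Matrix2D & T, const Vector & r,
                      const double discount, const double tolerance) {
        Vector oldAlpha = r, newAlpha(r.size());
        double variation;
        do {
            newAlpha = r + discount * (T * oldAlpha);
            variation = (newAlpha - oldAlpha).cwiseAbs().maxCoeff();
            oldAlpha = newAlpha;
        } while ( variation > tolerance );
        return newAlpha;
    }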
Example #9
    template <typename M, typename>
    std::tuple<double, ValueFunction> IncrementalPruning::operator()(const M & model) {
        // Initialize "global" variables
        S = model.getS();
        A = model.getA();
        O = model.getO();

        auto v = makeValueFunction(S); // TODO: May take user input

        unsigned timestep = 0;

        Pruner prune(S);
        Projecter projecter(model);

        const bool useTolerance = checkDifferentSmall(tolerance_, 0.0);
        double variation = tolerance_ * 2; // Bigger than the tolerance, so the loop runs at least once
        while ( timestep < horizon_ && ( !useTolerance || variation > tolerance_ ) ) {
            ++timestep;

            // Compute all possible outcomes, from our previous results.
            // This means that for each action-observation pair, we are going
            // to obtain the same number of possible outcomes as the number
            // of entries in our initial vector w.
            auto projs = projecter(v[timestep-1]);

            size_t finalWSize = 0;
            // In this method we split the work by action, which will then
            // be joined again at the end of the loop.
            for ( size_t a = 0; a < A; ++a ) {
                // We prune each outcome separately to be sure
                // we do not replicate work later.
                for ( size_t o = 0; o < O; ++o ) {
                    const auto begin = std::begin(projs[a][o]);
                    const auto end   = std::end  (projs[a][o]);
                    projs[a][o].erase(prune(begin, end, unwrap), end);
                }

                // Here we keep the cross-summing to a minimum by alternating
                // merges. We pick matches like a reverse binary tree, so that
                // we always pick lists that have been merged the least.
                //
                // Example for O==7:
                //
                //  0 <- 1    2 <- 3    4 <- 5    6
                //  0 ------> 2         4 ------> 6
                //            2 <---------------- 6
                //
                // In particular, the variables are:
                //
                // - oddOld:   Whether our starting step has an odd number of elements.
                //             If so, we skip the last one.
                // - front:    The id of the element at the "front" of our current pass.
                //             Note that since passes can be backwards this can be high.
                // - back:     Opposite of front, which excludes the last element if we
                //             have odd elements.
                // - stepsize: The space between each "first" of each new merge.
                // - diff:     The space between each "first" and its match to merge.
                // - elements: The number of elements we have left to merge.

                bool oddOld = O % 2;
                int i, front = 0, back = O - oddOld, stepsize = 2, diff = 1, elements = O;
                while ( elements > 1 ) {
                    for ( i = front; i != back; i += stepsize ) {
                        projs[a][i] = crossSum(projs[a][i], projs[a][i + diff], a, stepsize > 0);
                        const auto begin = std::begin(projs[a][i]);
                        const auto end   = std::end  (projs[a][i]);
                        projs[a][i].erase(prune(begin, end, unwrap), end);
                        --elements;
                    }

                    const bool oddNew = elements % 2;

                    const int tmp   = back;
                    back      = front - ( oddNew ? 0 : stepsize );
                    front     = tmp   - ( oddOld ? 0 : stepsize );
                    stepsize *= -2;
                    diff     *= -2;

                    oddOld = oddNew;
                }
                // Put the result where we can find it
                if (front != 0)
                    projs[a][0] = std::move(projs[a][front]);
                finalWSize += projs[a][0].size();
            }
            VList w;
            w.reserve(finalWSize);

            // Here we don't have to do fancy merging since no cross-summing is involved
            for ( size_t a = 0; a < A; ++a )
                w.insert(std::end(w), std::make_move_iterator(std::begin(projs[a][0])), std::make_move_iterator(std::end(projs[a][0])));

            // We have them all, and we prune one final time to be sure we have
            // computed the parsimonious set of value functions.
            const auto begin = std::begin(w);
            const auto end   = std::end  (w);
            w.erase(prune(begin, end, unwrap), end);

            v.emplace_back(std::move(w));

            // Check convergence
            if ( useTolerance )
                variation = weakBoundDistance(v[timestep-1], v[timestep]);
        }

        return std::make_tuple(useTolerance ? variation : 0.0, v);
    }
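Finally, a hedged call-site sketch for this solver; the (horizon, tolerance) constructor matches how the library's POMDP solvers are typically configured, but treat the exact signature as an assumption:

    // Hypothetical usage; model is any POMDP model type M accepted by the
    // operator() above.
    IncrementalPruning solver(15, 0.01);     // horizon, tolerance (assumed ctor)
    auto solution = solver(model);
    const double variation = std::get<0>(solution);
    const auto & vf        = std::get<1>(solution);
    // vf[t] holds the alpha vectors (a VList) computed for horizon t.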