Example #1
// Boost.Random types are assumed here: mt19937, uniform_int and
// variate_generator come from <boost/random.hpp> (namespace boost).
int SampleUniform (int min, int max, int seed)
{
    // Create a Mersenne twister random number generator,
    // seeded once with the supplied seed value
    static mt19937 rng(static_cast<unsigned> (seed));
 
    // select a uniform integer distribution over [min, max]
    uniform_int<> uni_dist(min, max);
 
    // bind random number generator to distribution, forming a function
    variate_generator<mt19937&, uniform_int<> >  uniform_sampler(rng, uni_dist);
 
    // sample from the distribution
    return uniform_sampler();
}
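
For reference, here is a minimal sketch of the same sampler written against the C++11 <random> header instead of Boost.Random. SampleUniformStd is a hypothetical name introduced only for this comparison; like boost::uniform_int, std::uniform_int_distribution samples the closed range [min, max].

#include <random>

int SampleUniformStd (int min, int max, int seed)
{
    // Mersenne twister seeded once, as in the Boost version
    static std::mt19937 rng(static_cast<unsigned>(seed));

    // uniform integer distribution over the closed range [min, max]
    std::uniform_int_distribution<int> uni_dist(min, max);

    return uni_dist(rng);
}
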
    // Member function: relies on class members such as environment, valueFunction,
    // bestAction, eligibility, lastState/currentState, lastAction, lastReward,
    // lastValue, gamma, lambda, alpha, beta and the samplers.
    bool update()
    {
        bool terminal = environment.query_state( currentState.begin() );
        if (terminal)
        {
            // V_{t+1}(s_t) <- r_t
            valueFunction.update( lastState.begin(), 
                                  lastState.end(), 
                                  lastReward, 
                                  beta );
            return true;
        }
        else  // nonterminal state
        {
            // V_{t+1}(s_t) <- r_t + gamma*V_t(s_{t+1})
            ValueType delta = lastReward
                            + gamma * valueFunction.compute( currentState.begin(), currentState.end() )
                            - lastValue;

            /*valueFunction.update( lastState.begin(), 
                                  lastState.end(), 
                                  value,
                                  beta );
            value = valueFunction.compute( lastState.begin(), lastState.end() );*/
            // accumulate the eligibility trace: e <- gamma*lambda*e + grad V(s_t)
            vector_type gradient(valueFunction.num_params());
            valueFunction.get_gradient(lastState.begin(), lastState.end(), gradient.begin());
            eligibility = gamma*lambda*eligibility + gradient;

            // TD(lambda) parameter update: theta <- theta + beta*delta*e
            vector_type params(valueFunction.num_params());
            valueFunction.get_params(params.begin());
            params += beta*delta*eligibility;
            valueFunction.set_params(params.begin(), params.end());

            ValueType value = valueFunction.compute( lastState.begin(), lastState.end() );

            if (value > lastValue)
            {
                // update best action
                bestAction.update( lastState.begin(), 
                                   lastState.end(), 
                                   lastAction.begin(), 
                                   lastAction.end(), 
                                   alpha );
            }
            lastValue = valueFunction.compute( currentState.begin(), currentState.end() );
        }

        // select new action
        bestAction.compute( currentState.begin(), 
                            currentState.end(), 
                            lastAction.begin() );
        
        // add exploration noise to the selected action
        for (size_t i = 0; i < actionSize; ++i)
        {
            //lastAction[i] += (*normal_sampler[i])();
            lastAction[i] += normal_sampler();
        }

        // distance between current and previous state, summed as
        // per-component absolute differences
        value_type distance = value_type();
        for (size_t i = 0; i < currentState.size(); ++i) {
            distance += sqrt( (currentState[i] - lastState[i]) * (currentState[i] - lastState[i]) );
        }

        // if the state barely changed, fall back to a purely random action
        if (distance < 0.1)
        {
            std::cerr << "random action" << std::endl;
            for (size_t i = 0; i < actionSize; ++i)
            {
                //lastAction[i] += (*normal_sampler[i])();
                lastAction[i] = uniform_sampler();
            }
        }

        // discounted running sum of reward
        averaged_reward = averaged_reward*gamma + lastReward;
        //fileStat << averaged_reward << std::endl;

        lastState.swap(currentState);
        lastReward = environment.perform_action( lastAction.begin(), 
                                                 lastAction.end() );
        ++numActions;

        // after 20000 actions, shrink the exploration noise
        if (numActions == 20000)
        {
            normal_sampler = normal_variate_generator( rng, boost::normal_distribution<ValueType>(0, value_type(0.05)) );
        }

        return false;
    }
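
Taken as a whole, the non-terminal branch of update() matches a TD(lambda) value update with an eligibility trace. Assuming valueFunction represents a parametric approximator V_theta and get_gradient returns its gradient at the last state, the three steps above correspond to the usual formulation (a sketch, not taken from the original source):

\delta_t = r_t + \gamma\,V_\theta(s_{t+1}) - V_\theta(s_t)
e_t = \gamma\lambda\,e_{t-1} + \nabla_\theta V_\theta(s_t)
\theta \leftarrow \theta + \beta\,\delta_t\,e_t

where beta is the learning rate. The subsequent value > lastValue test then reinforces the last action through bestAction.update only when the freshly updated estimate of the previous state exceeds the value stored before the update.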