#include <boost/random.hpp>  // boost::mt19937, normal_distribution, variate_generator
#include <ctime>             // std::time

double normRand(double mean, double sigma)
{
    // Create a Mersenne twister random number generator
    // that is seeded once with #seconds since 1970
    static boost::mt19937 rng(static_cast<unsigned>(std::time(0)));

    // Select the Gaussian probability distribution
    boost::normal_distribution<double> norm_dist(mean, sigma);

    // Bind the generator to the distribution, forming a nullary function
    boost::variate_generator<boost::mt19937&, boost::normal_distribution<double> >
        normal_sampler(rng, norm_dist);

    // Sample from the distribution
    return normal_sampler();
}
// Same idea, but seeded explicitly and returning an int
// (assumes a prior "using namespace boost;").
int SampleNormal(double mean, double sigma, int seed)
{
    // Create a Mersenne twister random number generator.
    // Because rng is static, it is seeded exactly once, with the seed
    // passed on the first call; later seed arguments are ignored.
    static mt19937 rng(static_cast<unsigned>(seed));

    // Select the Gaussian probability distribution
    normal_distribution<double> norm_dist(mean, sigma);

    // Bind the generator to the distribution, forming a nullary function
    variate_generator<mt19937&, normal_distribution<double> >
        normal_sampler(rng, norm_dist);

    // Sample from the distribution and truncate toward zero
    return static_cast<int>(normal_sampler());
}
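For reference, here is a minimal sketch of how these samplers might be exercised; the main driver and the sample counts are illustrative assumptions, not part of the original code.

#include <iostream>

// Assumes the definitions of normRand and SampleNormal above are visible.
double normRand(double mean, double sigma);
int SampleNormal(double mean, double sigma, int seed);

int main()
{
    // normRand: the static rng inside is seeded once, on the first call
    for (int i = 0; i < 5; ++i)
        std::cout << normRand(0.0, 1.0) << '\n';

    // SampleNormal truncates toward zero; only the seed passed on the
    // FIRST call takes effect, since its rng is also static
    for (int i = 0; i < 5; ++i)
        std::cout << SampleNormal(100.0, 15.0, 42) << '\n';

    return 0;
}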
void new_episode()
{
    numActions = 0;
    numEpisodes += 1;

    // After 300 episodes, shrink the exploration noise to sigma = 0.1
    if (numEpisodes > 300)
        normal_sampler = normal_variate_generator(
            rng, boost::normal_distribution<ValueType>(0, value_type(0.1)));

    environment.query_state(lastState.begin());
    lastValue = valueFunction.compute(lastState.begin(), lastState.end());

    // Select the first action and perturb it with Gaussian exploration noise
    bestAction.compute(lastState.begin(), lastState.end(), lastAction.begin());
    for (size_t i = 0; i < actionSize; ++i)
        lastAction[i] += normal_sampler();

    lastReward = environment.perform_action(lastAction.begin(), lastAction.end());

    // Reset the eligibility traces to zero
    std::fill(eligibility.begin(), eligibility.end(), value_type());
}
bool update()
{
    bool terminal = environment.query_state(currentState.begin());

    if (terminal)
    {
        // Terminal update: V_{t+1}(s_t) <- r_t
        valueFunction.update(lastState.begin(), lastState.end(), lastReward, beta);
        return true;
    }
    else // nonterminal state
    {
        // TD error: delta = r_t + gamma*V_t(s_{t+1}) - V_t(s_t)
        ValueType delta = lastReward
            + gamma * valueFunction.compute(currentState.begin(), currentState.end())
            - lastValue;

        // Accumulate eligibility traces: e <- gamma*lambda*e + grad V(s_t)
        vector_type gradient(valueFunction.num_params());
        valueFunction.get_gradient(lastState.begin(), lastState.end(), gradient.begin());
        eligibility = gamma * lambda * eligibility + gradient;

        // TD(lambda) parameter update: theta <- theta + beta*delta*e
        vector_type params(valueFunction.num_params());
        valueFunction.get_params(params.begin());
        params += beta * delta * eligibility;
        valueFunction.set_params(params.begin(), params.end());

        ValueType value = valueFunction.compute(lastState.begin(), lastState.end());
        if (value > lastValue)
        {
            // The last action raised the value estimate, so move the
            // actor (bestAction) toward it
            bestAction.update(lastState.begin(), lastState.end(),
                              lastAction.begin(), lastAction.end(), alpha);
        }

        lastValue = valueFunction.compute(currentState.begin(), currentState.end());
    }

    // Select a new action and perturb it with Gaussian exploration noise
    bestAction.compute(currentState.begin(), currentState.end(), lastAction.begin());
    for (size_t i = 0; i < actionSize; ++i)
        lastAction[i] += normal_sampler();

    // L1 distance between consecutive states
    // (|x| replaces the original per-component sqrt(x*x))
    value_type distance = value_type();
    for (size_t i = 0; i < currentState.size(); ++i)
        distance += std::abs(currentState[i] - lastState[i]);

    // If the agent is barely moving, replace the action with a uniform one
    if (distance < 0.1)
    {
        std::cerr << "random action" << std::endl;
        for (size_t i = 0; i < actionSize; ++i)
            lastAction[i] = uniform_sampler();
    }

    // Exponentially averaged reward, kept for monitoring
    averaged_reward = averaged_reward * gamma + lastReward;

    lastState.swap(currentState);
    lastReward = environment.perform_action(lastAction.begin(), lastAction.end());

    ++numActions;
    // After 20000 actions, shrink the exploration noise to sigma = 0.05
    if (numActions == 20000)
        normal_sampler = normal_variate_generator(
            rng, boost::normal_distribution<ValueType>(0, value_type(0.05)));

    return false;
}
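To show how new_episode() and update() fit together, here is a minimal sketch of an outer training loop; the run_training name and the episode cap are hypothetical, assumed only for illustration.

// Hypothetical driver for the actor-critic agent above: new_episode()
// resets per-episode state and takes the first action, while update()
// performs one TD(lambda) step and returns true on a terminal state.
void run_training(size_t maxEpisodes)
{
    for (size_t episode = 0; episode < maxEpisodes; ++episode)
    {
        new_episode();
        while (!update())
        {
            // one environment transition per iteration
        }
    }
}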