// Sample one possible sequence of future events, up to 'dfr' cycles.
reward_t SearchNode::sample(Agent &agent, unsigned int dfr) {
    double newReward;

    if (dfr == 0) {
        return 0;
    } else if (m_chance_node) {
        // Generate whole observation-reward percept,
        // according to the agent's model of the environment.
        percept_t obs;
        percept_t rew;
        agent.genPerceptAndUpdate(obs, rew);

        // Calculate the index of whole percept
        percept_t percept = (rew << agent.numObsBits()) | obs;

        if (m_child.count(percept) == 0) {
            m_child[percept] = new SearchNode(false, agent.numActions());
        }
        newReward = rew + m_child[percept]->sample(agent, dfr - 1);
    } else if (m_visits == 0) {
        newReward = playout(agent, dfr);
    } else {
        // Select an action to sample.
        action_t action = selectAction(agent, dfr);
        agent.modelUpdate(action);
        newReward = m_child[action]->sample(agent, dfr);
    }

    // Update our estimate of the future reward.
    m_mean = (1.0 / (double) (m_visits + 1)) * (newReward + m_visits * m_mean);
    ++m_visits;

    return newReward;
}
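The call to selectAction above is not listed in this section. The sketch below shows one plausible UCB1-style implementation; the exploration constant ExploreBias and the accessors agent.maxReward() and agent.minReward(), used only to normalise the exploitation term, are assumptions and should be replaced by whatever reward bounds the Agent class actually exposes.

#include <cmath>
#include <limits>

// Hypothetical exploration constant; tune to the reward scale of the problem.
static const double ExploreBias = 2.0;

// A UCB1-style sketch of action selection for a decision node, assuming the
// Agent exposes maxReward()/minReward() (hypothetical names).
action_t SearchNode::selectAction(Agent &agent, unsigned int dfr) {
    action_t best_action = 0;
    double best_value = -std::numeric_limits<double>::infinity();

    for (action_t a = 0; a < agent.numActions(); ++a) {
        // Ensure a (chance-node) child exists for this action, since
        // sample() dereferences m_child[action] immediately afterwards.
        if (m_child.count(a) == 0) {
            m_child[a] = new SearchNode(true, agent.numActions());
        }
        SearchNode *child = m_child[a];

        double value;
        if (child->m_visits == 0) {
            // Untried actions get infinite value, so they are explored first.
            value = std::numeric_limits<double>::infinity();
        } else {
            // Normalise the empirical mean by the (assumed) reward range
            // times the remaining search depth, then add the UCB1 bonus.
            double range = dfr * (agent.maxReward() - agent.minReward());
            double exploit = child->m_mean / range;
            double explore = ExploreBias *
                std::sqrt(std::log((double) m_visits) / (double) child->m_visits);
            value = exploit + explore;
        }

        if (value > best_value) {
            best_value = value;
            best_action = a;
        }
    }
    return best_action;
}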
// Simulate a sequence of random actions, returning the accumulated reward.
static reward_t playout(Agent &agent, unsigned int playout_len) {
    reward_t r = 0;

    for (unsigned int i = 0; i < playout_len; ++i) {
        // Pick a random action
        action_t a = agent.genRandomAction();
        agent.modelUpdate(a);

        // Generate a random percept distributed according to the agent's
        // internal model of the environment.
        percept_t obs;
        percept_t rew;
        agent.genPerceptAndUpdate(obs, rew);

        r = r + rew;
    }

    return r;
}
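To show how sample() and playout() fit into the overall search, the sketch below gives a minimal driver. Everything in it apart from SearchNode::sample itself is a hypothetical placeholder: the simulation budget, agent.horizon(), the model save/restore pair, and root.bestAction() stand in for whatever interface the surrounding code actually provides, since each call to sample() advances the agent's internal model and that model must be restored before the next simulation.

// Minimal driver sketch (all names other than SearchNode::sample are
// hypothetical placeholders for the real Agent/SearchNode interface).
action_t search(Agent &agent) {
    const unsigned int NUM_SIMULATIONS = 300;      // illustrative budget

    // The root of the search tree is a decision node.
    SearchNode root(false, agent.numActions());

    for (unsigned int sim = 0; sim < NUM_SIMULATIONS; ++sim) {
        ModelState snapshot = agent.saveModelState();  // hypothetical
        root.sample(agent, agent.horizon());           // hypothetical horizon()
        agent.restoreModelState(snapshot);             // hypothetical
    }

    // Act greedily with respect to the value estimates accumulated
    // in the root's children (selection code omitted).
    return root.bestAction();                          // hypothetical helper
}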