Example #1
// Sample one possible sequence of future events, up to 'dfr' cycles.
reward_t SearchNode::sample(Agent &agent, unsigned int dfr) {
    double newReward;
    if (dfr == 0) {
        return 0;
    } else if (m_chance_node) {
        // Generate a whole observation-reward percept
        // according to the agent's model of the environment.
        percept_t obs;
        percept_t rew;
        agent.genPerceptAndUpdate(obs, rew);

        // Combine the observation and reward into a single percept index.
        percept_t percept = (rew << agent.numObsBits()) | obs;
        
        if (m_child.count(percept) == 0) {
            m_child[percept] = new SearchNode(false, agent.numActions());
        }
        newReward = rew + m_child[percept]->sample(agent, dfr - 1);
    } else if (m_visits == 0) {
        newReward = playout(agent, dfr);
    } else {
        // Select an action to sample.
        action_t action = selectAction(agent, dfr);
        agent.modelUpdate(action);
        newReward = m_child[action]->sample(agent, dfr);
    }
    // Update our estimate of the future reward.
    m_mean = (1.0 / (double) (m_visits + 1)) * (newReward + m_visits * m_mean);
    ++m_visits;
    return newReward;
}
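
For context, sample() is normally driven by a top-level search loop that runs many simulations from a root decision node, undoes the speculative model updates after each one, and finally commits to the best-looking action. The sketch below is only an assumption about how such a driver could look; ModelUndo, modelRevert(), and bestAction() are hypothetical helpers that do not appear in the examples on this page.

// A minimal sketch of a driver for sample(). ModelUndo, modelRevert(), and
// bestAction() are assumed helpers, not part of the code above.
action_t search(Agent &agent, unsigned int horizon, unsigned int simulations) {
    SearchNode root(false, agent.numActions());   // the root is a decision node
    for (unsigned int i = 0; i < simulations; ++i) {
        ModelUndo undo(agent);                    // snapshot the agent's model state
        root.sample(agent, horizon);              // simulate one future, updating node statistics
        agent.modelRevert(undo);                  // discard the speculative model updates
    }
    return root.bestAction();                     // e.g. the action whose child has the highest mean
}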
Example #2
// Simulate a sequence of random actions, returning the accumulated reward.
static reward_t playout(Agent &agent, unsigned int playout_len) {
    reward_t r = 0;
    for (unsigned int i = 0; i < playout_len; ++i) {
        // Pick a random action.
        action_t a = agent.genRandomAction();
        agent.modelUpdate(a);

        // Generate a random percept distributed according to the agent's
        // internal model of the environment.
        percept_t obs;
        percept_t rew;
        agent.genPerceptAndUpdate(obs, rew);

        r += rew;
    }
    return r;
}
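
Example #1 also calls selectAction(), which is not shown here. Below is a minimal sketch of a UCB1-style selection rule, a common choice in Monte Carlo tree search; the exploration constant and the direct use of m_mean as the value estimate are assumptions, and an actual implementation might instead normalise rewards by the remaining horizon dfr. The sketch needs <cmath> and <limits>.

// A UCB1-style sketch of selectAction(). The exploration constant C and the use
// of the unnormalised m_mean are assumptions. Requires <cmath> and <limits>.
action_t SearchNode::selectAction(Agent &agent, unsigned int dfr) {
    const double C = 1.41;  // assumed exploration constant
    action_t best_action = 0;
    double best_value = -std::numeric_limits<double>::infinity();
    for (action_t a = 0; a < agent.numActions(); ++a) {
        // Try every action at least once before comparing UCB values.
        if (m_child.count(a) == 0) {
            m_child[a] = new SearchNode(true, agent.numActions());  // child is a chance node
            return a;
        }
        SearchNode *child = m_child[a];
        // dfr could be used here to normalise rewards by the remaining horizon;
        // this sketch leaves the value estimate unnormalised.
        double ucb = child->m_mean
            + C * std::sqrt(std::log((double) m_visits) / (double) child->m_visits);
        if (ucb > best_value) {
            best_value = ucb;
            best_action = a;
        }
    }
    return best_action;
}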