Action SMDPAgent::step(double reward, const State &state, std::ostream &output)
{
	giveReward(reward, output);
	Action action = nextAction(state, output);
	applyAction(action, output);
	return action;
}
void SMDPAgent::endEpisode(double reward, std::ostream &output)
{
	giveReward(reward, output);
	finalize(output);
}
Example #3
void MRLAgent::giveMotivationReward(){
    //Initialise novelty
    float novelty=0;
    //Initialise the state-action vector to be fed to expectation map.
    vector<float> observation_action(num_inputs, 0);

    //If fewer than 2 observations have been made, leave observation_action as the zero vector. Otherwise use the second-to-last observation.
    if ((int)observations.size()>=2){
        observation_action = observations[observations.size()-2];
    }
    //Append the action taken in that state (guard against fewer than 2 recorded actions to avoid an out-of-range access).
    if ((int)actions.size()>=2){
        observation_action.push_back(actions[actions.size()-2]);
    } else {
        observation_action.push_back(0);
    }
    //Get expected observation and actual observation.
    vector<float> expected_observation = expectation_map->getValues(observation_action);
    vector<float> actual_observation = observations[observations.size()-1];

    //Setup values for learning. Moves the expected_observation along the straight line toward actual_observation in R^n.
    vector<vector<float> > obs(1,observation_action);//Setup vector with one input to be used for expectation_map learning.
    vector<vector<float> > val(1,expected_observation);//setup unit list of values.
    //Change values:
    for (int j=0; j<expected_observation.size();j++){
        float diff = actual_observation[j]-expected_observation[j];

        val[0][j]+= lambda*diff;
    }


    /*If expected observation is the zero vector it is highly likely that the state has not been visited before.
    In this case set the novelty to zero.
    */
    float sum_exp = 0;
    for (int j=0; j<expected_observation.size();j++){
        sum_exp += expected_observation[j]*expected_observation[j];
    }
    if(sum_exp == 0){
        //Do learning for expectation_map
        expectation_map->doLearningEpisode(obs,val,1.0,1);

        float motivation = wundtFunction(0);
        //cout<<"MRLAGENT:: giveMotivationReward - novelty = "<<0<<endl;
        giveReward(motivation);
        return;

    }

    //If expected observation non-zero, calculate error in prediction to get novelty:

    for (int j=0; j<expected_observation.size();j++){
        float diff = 0;
        diff = (expected_observation[j]-actual_observation[j]);
        novelty+= diff*diff;
    }

    //cout<<"Novelty = "<<novelty<<endl;
    //Do learning for expectation_map
    expectation_map->doLearningEpisode(obs,val,1.0,1);

    float motivation = wundtFunction(novelty);
    //cout<<"MRLAGENT:: giveMotivationReward - novelty = "<<novelty<< " Reward = "<< motivation<<endl;
    giveReward(motivation);

   /*OLD NOVELTY

    //Debug
    // cout<<"obs list size = "<<observations.size()<<endl;
    float memory_decay = 0.5;
    int count=0;
    //Iterate backwards over all observations and calculate a sum of euclidean metrics of latest observation with old observations.
    for (int i=observations.size()-2; i>=std::max(0,((int)observations.size()-memory_length-1));i--){


        for (int j=0; j<observations[i].size();j++){
            float diff = observations[observations.size()-1][j]-observations[i][j];
            novelty+= memory_decay*diff*diff;
        }

        count++;
        memory_decay*=memory_decay;

    }

    END OLD NOVELTY
*/ 
}
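The code above predicts the next observation with expectation_map, nudges that prediction a fraction lambda of the way toward what was actually observed, and scores novelty as the squared prediction error over all components. The motivation signal then comes from wundtFunction(novelty), which is not shown here. In the motivated reinforcement learning literature the Wundt curve is commonly modelled as a difference of two sigmoids, giving an inverted-U shape: little reward for very familiar or wildly surprising observations, peak reward for moderately novel ones. Below is a minimal sketch under that assumption; the function name, constants and slopes are illustrative only and are not the class's actual implementation.

#include <cmath>

// Hedged sketch of one plausible Wundt curve as a difference of two sigmoids.
// All constants (gains, thresholds, slopes) are illustrative placeholders.
float wundtCurveSketch(float novelty)
{
    const float maxReward = 1.0f, maxPenalty = 1.0f;
    const float rewardSlope = 10.0f, penaltySlope = 10.0f;
    const float rewardThresh = 0.5f, penaltyThresh = 1.5f;

    // Rising sigmoid: reward grows once novelty passes rewardThresh.
    float positive = maxReward / (1.0f + std::exp(-rewardSlope * (novelty - rewardThresh)));
    // Second sigmoid: a penalty kicks in for very high novelty.
    float negative = maxPenalty / (1.0f + std::exp(-penaltySlope * (novelty - penaltyThresh)));

    // Inverted-U: peaks for moderately novel observations.
    return positive - negative;
}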