Exemple #1
0
/**
 * Executes one decision of the agent and reports the two rewards it produced.
 *
 * An action index below numBasicActions is sent directly to the emulator; any
 * other index is treated as an option and delegated to playOption(). The RAM
 * feature vectors taken before and after acting are handed to
 * updateTransitionVector(), after which each entry of `transitions` is
 * standardized with `mean` and `std`.
 *
 * reward[0] receives the dot product of eigVector with the standardized
 * transitions, reward[1] the reward returned by the emulator / option.
 */
void Learner::act(ALEInterface& ale, int action, vector<float> &reward, vector<vector<vector<float> > > &learnedOptions){

	//RAM features of the state we are leaving:
	FRam.clear();
	ramFeatures.getCompleteFeatureVector(ale.getRAM(), FRam);

	float envReward = 0.0;
	const bool isPrimitive = action < numBasicActions;
	if(isPrimitive){
		envReward = ale.act(actions[action]);
	}
	else{
		envReward = playOption(ale, action - numBasicActions, learnedOptions);
	}

	//RAM features of the state we arrived at:
	FnextRam.clear();
	ramFeatures.getCompleteFeatureVector(ale.getRAM(), FnextRam);
	updateTransitionVector(FRam, FnextRam);

	//Standardize the transition vector in place and project it on eigVector:
	float projectedReward = 0.0;
	for(int k = 0; k < transitions.size(); k++){
		transitions[k] = (transitions[k] - mean[k])/std[k];
		projectedReward += eigVector[k] * transitions[k];
	}

	reward[0] = projectedReward;
	reward[1] = envReward;
}
Exemple #2
0
/**
 * Follows the option with index `option` until it terminates.
 *
 * Before every step one rand() draw decides whether the option continues
 * (it continues while rand()%1000 > 1000 * PROB_TERMINATION); the option also
 * stops when the game is over. While it runs, the Q-value of each basic action
 * is the sum of learnedOptions[option][action] over the active screen
 * features, and the action to execute is picked by epsilonGreedy().
 *
 * Returns the sum of the rewards the emulator returned during the option.
 */
int Learner::playOption(ALEInterface& ale, int option, vector<vector<vector<float> > > &learnedOptions){

	int optionReward = 0;
	vector<int> activeFeatures;
	vector<float> Q(numBasicActions, 0.0);

	for(;;){
		//Termination draw is evaluated first, the game-over test second:
		if(!(rand()%1000 > 1000 * PROB_TERMINATION)){
			break;
		}
		if(ale.game_over()){
			break;
		}

		//Features active on the current screen:
		activeFeatures.clear();
		bproFeatures.getActiveFeaturesIndices(ale.getScreen(), activeFeatures);

		//Linear Q-value of every basic action under this option's weights:
		for(int a = 0; a < numBasicActions; a++){
			float total = 0;
			for(unsigned int k = 0; k < activeFeatures.size(); k++){
				total += learnedOptions[option][a][activeFeatures[k]];
			}
			Q[a] = total;
		}

		const int chosen = epsilonGreedy(Q);
		//Execute the action and accumulate the reward:
		optionReward += ale.act((Action) chosen);
	}
	return optionReward;
}
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " rom_file" << std::endl;
        return 1;
    }

    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);
    //The default is already 0.25, this is just an example
    ale.setFloat("repeat_action_probability", 0.25);

#ifdef __USE_SDL
    ale.setBool("display_screen", true);
    ale.setBool("sound", true);
#endif

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM(argv[1]);

    // Get the vector of legal actions
    ActionVect legal_actions = ale.getLegalActionSet();

    // Play 10 episodes
    for (int episode=0; episode<1; episode++) {
        float totalReward = 0;
        while (!ale.game_over()) {
            /*
            const ALEScreen screen = ale.getScreen();
            float row_sum = 0, column_sum = 0, tot = 0.0;
            for (int i = 0; i < screen.height(); ++i)
            {
                for (int j = 0; j < screen.width(); ++j)
                {
                    pixel_t tmp = screen.get(i, j);
                    if(tmp >= 41 && tmp <= 43) {
                        row_sum += i;
                        column_sum += j;
                        tot++;
                    }
                }
            }
            cout << (row_sum/tot) << "," << (column_sum/tot) << endl;
            */
            Action a = legal_actions[rand() % legal_actions.size()];
            // Apply the action and get the resulting reward
            float reward = ale.act(a);
            totalReward += reward;
        }
        cout << "Episode " << episode << " ended with score: " << totalReward << endl;
        ale.reset_game();
    }

    return 0;
}
Exemple #4
0
/**
 * Repeats `action` for numStepsPerAction frames and accumulates two rewards
 * into `reward` (the caller is expected to have zeroed both entries):
 *
 *  - reward[0] is the one used by Sarsa. Depending on the configuration it is
 *    the sign of the emulator reward, or the emulator reward normalized by the
 *    magnitude of the first non-zero reward ever seen (optionally shifted for
 *    optimistic initialization).
 *  - reward[1] passes additional information to the running algorithm: the
 *    'real score' returned by the emulator, which matters when reward[0] is a
 *    surrogate reward.
 *
 * The member `frame` is advanced once per emulated frame.
 */
void RLLearner::act(ALEInterface& ale, int action, vector<double> &reward){
	double r_real = 0.0;
	
	for(int i = 0; i < numStepsPerAction; i++){
		//The surrogate reward must start from zero on every frame. It used to
		//be declared outside the loop, so in sign mode a frame with zero
		//reward re-added the +1/-1 of an earlier frame of the same action.
		double r_alg = 0.0;

		r_real = ale.act(actions[action]);
		if(toUseOnlyRewardSign){
			if(r_real > 0){ 
				r_alg = 1.0;
			}
			else if(r_real < 0){
				r_alg = -1.0;
			}
		//Normalizing reward according to the first
		//reward, Marc did this in his JAIR paper:
		} else{
			if(r_real != 0.0 && !sawFirstReward){
				firstReward = std::abs(r_real);
				sawFirstReward = 1;
			}
			if(sawFirstReward){
				if(toBeOptimistic){
					r_alg = (r_real - firstReward)/firstReward + gamma;
				}
				else{
					r_alg = r_real/firstReward;	
				}
			}
			else if(toBeOptimistic){
				//No reward seen yet: every step is worth gamma - 1.
				r_alg = gamma - 1.0;
			}
		}
		frame++;
		reward[0] += r_alg;
		reward[1] += r_real;
	}
	//If doing optimistic initialization, to avoid the agent
	//to "die" soon to avoid -1 as reward at each step, when
	//the agent dies we give him -1 for each time step remaining,
	//this would be the worst case ever...
	//NOTE(review): pow(gamma, missedSteps) - 1 is <= 0 for gamma in (0, 1]
	//and missedSteps >= 0, so subtracting it raises reward[0]; confirm that
	//this sign is the intended one.
	if(ale.game_over() && toBeOptimistic){
		int missedSteps = episodeLength - frame + 1;
		double penalty = pow(gamma, missedSteps) - 1;
		reward[0] -= penalty;
	}
}
Exemple #5
0
/**
 * Plays one episode after resetting the game. Actions come from
 * getNextAction() and are executed through actUpdatingAvg(); the episode ends
 * when the game is over or when totalNumFrames plus the frames of this
 * episode reaches MAX_NUM_FRAMES.
 *
 * Prints a one-line summary and returns totalNumFrames increased by the
 * number of frames of this episode.
 */
int playGame(ALEInterface& ale, RAMFeatures *ram, BPROFeatures *bpro, 
	vector<vector<vector<float> > > &w, Parameters param, int totalNumFrames, int gameId){
	ale.reset_game();

	vector<bool> F(NUM_BITS, 0); //Currently active features
	vector<bool> Fprev;          //Features of the previous step

	int episodeScore = 0;
	for(;;){
		if(ale.game_over()){
			break;
		}
		if(!(totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES)){
			break;
		}
		const int chosen = getNextAction(ale, param.numOptions);
		episodeScore += actUpdatingAvg(ale, ram, bpro, chosen, w, param, totalNumFrames, gameId, F, Fprev);
	}

	totalNumFrames += ale.getEpisodeFrameNumber();
	printf("Episode: %d, Final score: %d, Total Num. Frames: %d\n", gameId+1, episodeScore, totalNumFrames);
	return totalNumFrames;
}
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " rom_file" << std::endl;
        return 1;
    }

    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);
    //The default is already 0.25, this is just an example
    ale.setFloat("repeat_action_probability", 0.25);

#ifdef __USE_SDL
    ale.setBool("display_screen", true);
    ale.setBool("sound", true);
#endif

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM(argv[1]);

    // Get the vector of legal actions
    ActionVect legal_actions = ale.getLegalActionSet();

    // Play 10 episodes
    for (int episode=0; episode<10; episode++) {
        float totalReward = 0;
        while (!ale.game_over()) {
            Action a = legal_actions[rand() % legal_actions.size()];
            // Apply the action and get the resulting reward
            float reward = ale.act(a);
            totalReward += reward;
            if (reward != 0){
              cout<<"totalReward: "<<totalReward<<endl;
            }
        }
        cout << "Episode " << episode << " ended with score: " << totalReward << endl;
        ale.reset_game();
    }

    return 0;
}
/**
 * Runs numEpisodesEval episodes with the current weights, choosing actions
 * with epsilonGreedy() and repeating each one for up to numStepsPerAction
 * frames. No weights are updated here.
 *
 * After every episode one line is printed: the episode number, the average
 * return over the episodes played so far and the return of that episode.
 */
void TrueOnlineSarsaLearner::evaluatePolicy(ALEInterface& ale, Features *features){
	double totalReturn = 0;
	double returnBeforeEpisode = 0;

	for(int episode = 0; episode < numEpisodesEval; episode++){
		//An episode ends on game over or after episodeLength decisions:
		int step = 0;
		while(!ale.game_over() && step < episodeLength){
			//Features active in the current state and their Q-values:
			F.clear();
			features->getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), F);
			updateQValues(F, Q);
			currentAction = epsilonGreedy(Q);

			//Repeat the action, stopping early if the game ends:
			double stepReward = 0;
			int repeat = 0;
			while(repeat < numStepsPerAction && !ale.game_over()){
				stepReward += ale.act(actions[currentAction]);
				repeat++;
			}
			totalReturn += stepReward;
			step++;
		}
		ale.reset_game();
		sanityCheck();

		printf("%d, %f, %f \n", episode + 1, (double)totalReturn/(episode + 1.0), totalReturn-returnBeforeEpisode);

		returnBeforeEpisode = totalReturn;
	}
}
/**
 * Tells whether the RAM byte at `addr` after one emulator step depends on
 * which of `possible_actions` is taken.
 *
 * The value obtained with the first action is the reference; each remaining
 * action is tried from the same saved state until one yields a different
 * value. The saved state is restored before returning, and RAM and screen are
 * re-processed afterwards.
 *
 * possible_actions must be non-empty (at(0) throws otherwise).
 */
bool does_value_change(ALEInterface &ale,
					   const vector<Action> &possible_actions,
					   unsigned int addr) {
	ALEState snapshot = ale.cloneSystemState();

	//Reference value: the byte after taking the first action.
	ale.environment->oneStepAct(possible_actions.at(0), PLAYER_B_NOOP);
	const byte_t reference = ale.getRAM().get(addr);

	bool differs = false;
	for(size_t k = 1; k < possible_actions.size(); k++) {
		ale.restoreSystemState(snapshot);
		ale.environment->oneStepAct(possible_actions.at(k), PLAYER_B_NOOP);
		if(ale.getRAM().get(addr) != reference) {
			differs = true;
			break;
		}
	}

	//Leave the emulator as we found it.
	ale.restoreSystemState(snapshot);
	ale.environment->processRAM();
	ale.environment->processScreen();
	return differs;
}
Exemple #9
0
/**
 * Copies the learning settings out of `param`, zeroes the frame and
 * random-action counters, and selects the action set (minimal or legal) the
 * learner is going to use.
 */
RLLearner::RLLearner(ALEInterface& ale, Parameters *param){
	Parameters &cfg = *param;

	//Counters:
	frame             = 0;
	randomActionTaken = 0;

	//Learning settings:
	gamma               = cfg.getGamma();
	epsilon             = cfg.getEpsilon();
	toUseOnlyRewardSign = cfg.getUseRewardSign();
	numStepsPerAction   = cfg.getNumStepsPerAction();
	toBeOptimistic      = cfg.getOptimisticInitialization();

	//Episode limits:
	episodeLength    = cfg.getEpisodeLength();
	numEpisodesEval  = cfg.getNumEpisodesEval();
	numEpisodesLearn = cfg.getNumEpisodesLearn();

	//Actions available to the agent:
	const bool useMinimalSet = cfg.isMinimalAction();
	if(useMinimalSet){
		actions = ale.getMinimalActionSet();
	}
	else{
		actions = ale.getLegalActionSet();
	}
	numActions = actions.size();
}
Exemple #10
0
/**
 * Sets up the learner: zeroes the running statistics, sizes the transition
 * vector, reads the reward description (eigVector) and its mean / std files,
 * and allocates Q-values, traces and weights for every basic action and
 * option.
 */
Learner::Learner(ALEInterface& ale, Parameters *param) : bproFeatures(param->gameName){
	//Running statistics:
	delta             = 0.0;
	cumReward         = 0;
	prevCumReward     = 0;
	cumIntrReward     = 0;
	prevCumIntrReward = 0;
	maxFeatVectorNorm = 1;
	pathToSaveLearnedWeights = param->outputPath;

	//Two transition entries for each RAM feature except one:
	for(int k = 0; k < (ramFeatures.getNumberOfFeatures() - 1) * 2; k++){
		transitions.push_back(0);
	}

	actions = ale.getLegalActionSet();

	numOptions      = param->numOptions;
	numBasicActions = actions.size();
	numTotalActions = numBasicActions + numOptions;

	//File holding the vector that describes the reward for option learning.
	//The first X positions encode the transition 0->1 and the other X encode 1->0.
	float parsed;
	pathToRewardDescription = param->eigVectorPath;
	std::ifstream eigFile(pathToRewardDescription.c_str());
	while(eigFile >> parsed){
		eigVector.push_back(parsed);
	}

	//Statistics read from "<statEigVectorPath>_mean.out" and "<statEigVectorPath>_std.out":
	pathToStatsDescription = param->statEigVectorPath;
	std::ifstream meanFile((pathToStatsDescription + "_mean.out").c_str());
	while(meanFile >> parsed){
		mean.push_back(parsed);
	}
	std::ifstream stdFile((pathToStatsDescription + "_std.out").c_str());
	while(stdFile >> parsed){
		std.push_back(parsed);
	}

	numFeatures = bproFeatures.getNumberOfFeatures();

	//One Q entry, trace row, weight row and non-zero list per action/option:
	int slot = 0;
	while(slot < numTotalActions){
		Q.push_back(0);
		Qnext.push_back(0);
		e.push_back(vector<float>(numFeatures, 0.0));
		w.push_back(vector<float>(numFeatures, 0.0));
		nonZeroElig.push_back(vector<int>());
		slot++;
	}
}
Exemple #11
0
/**
 * Executes `nextAction` and, after every emulated step, refreshes the RAM
 * feature vector F (keeping the previous one in Fprev) and calls
 * updateAverage() with both.
 *
 * An index below NUM_ACTIONS is a primitive action, repeated for up to
 * FRAME_SKIP steps. Any other index selects an option whose policy is read
 * from `w`; it runs until its random termination test fires or the game ends.
 * Both cases also stop once totalNumFrames plus the episode frame count
 * reaches MAX_NUM_FRAMES.
 *
 * Returns the sum of the rewards returned by the emulator.
 */
int actUpdatingAvg(ALEInterface& ale, RAMFeatures *ram, BPROFeatures *features, int nextAction, 
	vector<vector<vector<float> > > &w, Parameters param, int totalNumFrames, int gameId,
	vector<bool> &F, vector<bool> &Fprev){

	int collected = 0;

	//Primitive action: repeat it and return.
	if(nextAction < NUM_ACTIONS){
		int repeat = 0;
		while(repeat < FRAME_SKIP && totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES){
			collected += ale.act((Action) nextAction);
			Fprev.swap(F);
			F.clear();
			ram->getCompleteFeatureVector(ale.getRAM(), F);
			F.pop_back();
			updateAverage(Fprev, F, ale.getEpisodeFrameNumber(), param, gameId);
			repeat++;
		}
		return collected;
	}

	//Otherwise the selected action is an option.
	vector<int> activeFeatures;                //Screen features active
	vector<float> optionQ(NUM_ACTIONS, 0.0);   //Q(a) entries
	const int option = nextAction - NUM_ACTIONS;

	for(;;){
		//Same test order as a single && chain: termination draw, game over, frame budget.
		if(!(rand()%1000 > 1000 * PROB_TERMINATION)){
			break;
		}
		if(ale.game_over()){
			break;
		}
		if(!(totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES)){
			break;
		}

		activeFeatures.clear();
		features->getActiveFeaturesIndices(ale.getScreen(), activeFeatures);
		updateQValues(activeFeatures, optionQ, w, option);
		const int chosen = epsilonGreedy(optionQ);

		collected += ale.act((Action) chosen);
		Fprev.swap(F);
		F.clear();
		ram->getCompleteFeatureVector(ale.getRAM(), F);
		F.pop_back();
		updateAverage(Fprev, F, ale.getEpisodeFrameNumber(), param, gameId);
	}
	return collected;
}
Exemple #12
0
//Run the Arcade Learning Environment using the DQN agent.
//
//argv[1] must hold the path of the Atari ROM to play; argc is not inspected
//here, so the caller has to make sure that argument exists.
//
//The emulator lives on the stack. It used to be allocated with `new` and was
//never deleted, which leaked it on every call.
void run_ale(int argc, char** argv)
{
	//Create Arcade Learning Environment
	ALEInterface ale(false);
	//Load the Atari Rom we are going to play
	ale.loadROM(argv[1]);
	//Fixed set of three actions (instead of ale.getMinimalActionSet())
	ActionVect action_set = {PLAYER_A_LEFTFIRE,PLAYER_A_FIRE,PLAYER_A_RIGHTFIRE};
	//Create action descriptor
	ActionDescriptor descriptor({action_set.size()},{});
	//Create Learning System
	DACN system(descriptor,0.9,4,1000000,100,32);
	//Set exploration rate
	system.exploration_rate(0.9);

	//Profiling: one outer range for the whole run, one per episode, one per step
	cudaProfilerStart(); nvtxRangePushA("2 Atari Games");
	for(int episode=0; episode<2; episode++)
	{	
		string tmp = "episode: " + to_string(episode);
		nvtxRangePushA(tmp.c_str());

		//Restart the game
		ale.reset_game();

		//Game Loop
		while(!ale.game_over())
		{
			nvtxRangePushA("step");
				//Convert screen to input
				ALEScreen screen = ale.getScreen();
				gray8_image_t img = to_image(screen);
				vector<unsigned char> input = to_input(img);
				float raw_action = system.forward( input )[0];
				//cast action; it is used as an index into action_set
				int action = static_cast<int>(raw_action);
				//Execute the action and get the reward
				float reward = ale.act(action_set[action]);
				//Clamp the reward to [-1, 1]
				float normalized_reward = max(min(1.0f,reward),-1.0f);
				//Backward the result
				system.backward(normalized_reward,ale.game_over());
			nvtxRangePop();
		}
		nvtxRangePop();
	}
	cudaProfilerStop(); nvtxRangePop();
}
Exemple #13
0
/**
 * Plays one episode and returns its total score.
 *
 * Until deepRL::kInputFrameCount frames have been collected the agent sends
 * PLAYER_A_NOOP; afterwards the action is chosen by dqlearner from the most
 * recent frames. Every action is repeated for skip_frame + 1 emulator steps.
 * When `update` is true each transition is stored in the replay memory and a
 * mini-batch update is run whenever the memory holds at least
 * replay_start_size transitions and numSteps() is a multiple of
 * update_frequency. The game is reset before returning.
 */
double EpisodeLearning( ALEInterface& ale, deepRL::DeepQLearner& dqlearner, const bool update) {
  assert(!ale.game_over());
  std::deque<deepRL::FrameDataSp> past_frames;

  auto total_score = 0.0;
  for (auto frame = 0; !ale.game_over(); ++frame) {
    const auto current_frame = deepRL::PreprocessScreen(ale.getScreen());
    past_frames.push_back(current_frame);

    // Not enough history for a DQN input yet: just send NOOP.
    if (past_frames.size() < deepRL::kInputFrameCount) {
      const int noop_repeats = argmap["skip_frame"].as<int>() + 1;
      for (auto i = 0; i < noop_repeats && !ale.game_over(); ++i) {
        total_score += ale.act(PLAYER_A_NOOP);
      }
      continue;
    }

    // Keep exactly the most recent kInputFrameCount frames.
    if (past_frames.size() > deepRL::kInputFrameCount) {
      past_frames.pop_front();
    }
    deepRL::InputFrames input_frames;
    std::copy(past_frames.begin(), past_frames.end(), input_frames.begin());
    const auto action = dqlearner.SelectAction(input_frames);

    // The chosen action is repeated on the skipped frames.
    auto immediate_score = 0.0;
    const int action_repeats = argmap["skip_frame"].as<int>() + 1;
    for (auto i = 0; i < action_repeats && !ale.game_over(); ++i) {
      immediate_score += ale.act(action);
    }
    total_score += immediate_score;

    // Reward clipping for robust gradient updates:
    // 1 for any positive score, -1 for any negative score, otherwise 0.
    const double reward =
        (immediate_score == 0) ? 0.0 : immediate_score / std::abs(immediate_score);

    if (!update) {
      continue;
    }

    // Store the transition; a terminal one carries no next frame.
    const auto transition = ale.game_over() ?
        deepRL::Transition(input_frames, action, reward, boost::none) :
        deepRL::Transition(
            input_frames,
            action,
            reward,
            deepRL::PreprocessScreen(ale.getScreen()));
    dqlearner.replay_memory_.addTransition(transition);

    // Train only once the replay memory is large enough, and only every
    // update_frequency steps.
    if (dqlearner.replay_memory_.memory_size() >= argmap["replay_start_size"].as<int>()) {
      if (dqlearner.numSteps()%argmap["update_frequency"].as<int>()==0) {
        dqlearner.MiniBatchUpdate();
      }
    }
  }
  ale.reset_game();
  return total_score;
}
Exemple #14
0
/**
 * Learns the weights w with a Sarsa-style update driven by reward[0], the
 * reward that act() computes from the RAM transition vector. reward[1], the
 * emulator reward, is only accumulated for reporting.
 *
 * Episodes are played until totalNumberFrames reaches MAX_NUM_FRAMES. After
 * each episode a summary line is printed, and once learning stops the weights
 * are saved through saveWeightsToFile() with the episode count as argument.
 *
 * learnedOptions is not read here; it is only forwarded to act().
 */
void Learner::learnPolicy(ALEInterface& ale, vector<vector<vector<float> > > &learnedOptions){
	
	vector<float> reward;
	//Repeat (for each episode):
	int episode, totalNumberFrames = 0;
	//This is going to be interrupted by the ALE code since I set max_num_frames beforehand
	for(episode = 0; totalNumberFrames < MAX_NUM_FRAMES; episode++){ 
		//We have to clean the traces every episode. Only the entries listed
		//in nonZeroElig are touched:
		for(unsigned int a = 0; a < nonZeroElig.size(); a++){
			for(unsigned int i = 0; i < nonZeroElig[a].size(); i++){
				int idx = nonZeroElig[a][i];
				e[a][idx] = 0.0;
			}
			nonZeroElig[a].clear();
		}
		//Pick the first action of the episode from the initial screen:
		F.clear();
		bproFeatures.getActiveFeaturesIndices(ale.getScreen(), F);
		updateQValues(F, Q);
		currentAction = epsilonGreedy(Q);
		//Repeat(for each step of episode) until game is over:
		gettimeofday(&tvBegin, NULL);

		//This also stops when the maximum number of steps per episode is reached
		while(!ale.game_over()){
			//reward[0]: reward used for learning, reward[1]: emulator reward
			reward.clear();
			reward.push_back(0.0);
			reward.push_back(0.0);
			//Q is recomputed because the weights changed in the previous step:
			updateQValues(F, Q);
			sanityCheck();
			//Take action, observe reward and next state:
			act(ale, currentAction, reward, learnedOptions);
			cumIntrReward += reward[0];
			cumReward  += reward[1];
			if(!ale.game_over()){
				//Obtain active features in the new state:
				Fnext.clear();
				bproFeatures.getActiveFeaturesIndices(ale.getScreen(), Fnext);
				updateQValues(Fnext, Qnext);     //Update Q-values for the new active features
				nextAction = epsilonGreedy(Qnext);
			}
			else{
				//Terminal state: every next Q-value is zero
				nextAction = 0;
				for(unsigned int i = 0; i < Qnext.size(); i++){
					Qnext[i] = 0;
				}
			}
			//To ensure the learning rate will never increase along
			//the time, Marc used such approach in his JAIR paper
			if (F.size() > maxFeatVectorNorm){
				maxFeatVectorNorm = F.size();
			}

			//Temporal-difference error of the transition just observed:
			delta = reward[0] + GAMMA * Qnext[nextAction] - Q[currentAction];
			updateReplTrace(currentAction, F);

			//Update weights vector (only entries with a non-zero trace):
			float stepSize = ALPHA/maxFeatVectorNorm;
			for(unsigned int a = 0; a < nonZeroElig.size(); a++){
				for(unsigned int i = 0; i < nonZeroElig[a].size(); i++){
					int idx = nonZeroElig[a][i];
					w[a][idx] = w[a][idx] + stepSize * delta * e[a][idx];
				}
			}
			//The next state/action become the current ones:
			F = Fnext;
			FRam = FnextRam;
			currentAction = nextAction;
		}
		gettimeofday(&tvEnd, NULL);
		timeval_subtract(&tvDiff, &tvEnd, &tvBegin);
		elapsedTime = float(tvDiff.tv_sec) + float(tvDiff.tv_usec)/1000000.0;
		
		//NOTE(review): elapsedTime is used as a divisor without a zero check.
		float fps = float(ale.getEpisodeFrameNumber())/elapsedTime;
		printf("episode: %d,\t%.0f points,\tavg. return: %.1f,\tnovelty reward: %.2f (%.2f),\t%d frames,\t%.0f fps\n",
			episode + 1, cumReward - prevCumReward, (float)cumReward/(episode + 1.0),
			cumIntrReward - prevCumIntrReward, cumIntrReward/(episode + 1.0), ale.getEpisodeFrameNumber(), fps);
		//The frame count has to be read before reset_game() below:
		totalNumberFrames += ale.getEpisodeFrameNumber();
		prevCumReward = cumReward;
		prevCumIntrReward = cumIntrReward;
		ale.reset_game();
	}
	
	//The number of episodes played is passed on when saving the weights:
	stringstream ss;
	ss << episode;
	saveWeightsToFile(ss.str());
}
/**
 * Keeps sending `action` until one of these happens: a life is lost, the
 * RAM-based control test fails, moving along the other axis becomes possible
 * when it was not at the start, the watched coordinate has not changed for
 * NOT_MOVING_FRAMES frames, or MAX_FRAMES iterations have run.
 *
 * If the run ended with a lost life or loss of control and more than
 * N_BACK_FRAMES frames were recorded, the emulator is rolled back by
 * N_BACK_FRAMES frames and those frames are dropped from the result.
 *
 * Outputs:
 *  - all_steps: one (reward, state id before the step) pair is appended per
 *    frame kept; entries already present on entry are preserved.
 *  - phi_reward / nophi_reward: undiscounted sums over the frames kept.
 *  - elapsed_time: increased by the number of frames kept.
 *
 * Returns the sum of the frame rewards discounted by discount_rate per frame.
 */
static hexq::Reward
move_to_the(ALEInterface &ale, DisplayScreen *display, const Action action, const hexq::Reward discount_rate, hexq::MontezumaOptionsMdp &mdp, size_t &elapsed_time, hexq::Reward &nophi_reward, hexq::Reward &phi_reward, vector<pair<hexq::Reward,hexq::State> > &all_steps) {
	//For a horizontal move we watch X and probe whether Y could change, and
	//the other way around for any other action.
	const vector<Action> *axis_actions;
	unsigned int unchanging_addr, changing_addr;
	if(action == PLAYER_A_LEFT || action == PLAYER_A_RIGHT) {
		axis_actions = &vertical_actions;
		unchanging_addr = ADDR_Y;
		changing_addr = ADDR_X;
	} else {
		axis_actions = &horizontal_actions;
		unchanging_addr = ADDR_X;
		changing_addr = ADDR_Y;
	}

	//Entries the caller already stored in all_steps must survive a rollback.
	const size_t steps_on_entry = all_steps.size();

	const bool initial_cannot_change_axis =
		!does_value_change(ale, *axis_actions, unchanging_addr);
	hexq::State prev_s = mdp.StateUniqueID();
	phi_reward = mdp.ComputeState(ale.act(action), nophi_reward);

	//Per frame: (reward, nophi reward) plus the emulator state after the frame.
	vector<pair<pair<hexq::Reward, hexq::Reward>, ALEState> > frames;
	frames.push_back(make_pair(make_pair(phi_reward, nophi_reward), ale.cloneSystemState()));
	all_steps.push_back(make_pair(phi_reward, prev_s));
	byte_t prev_changing = ale.getRAM().get(changing_addr);
	const int initial_lives = ale.lives();
	//Must start at zero: it is incremented before any other assignment when
	//the very first loop iteration finds the coordinate unchanged. It used to
	//be left uninitialized, which is undefined behaviour.
	int n_frames_unchanged = 0;
	bool controllable = true;
	bool lost_life =  false;
	for(size_t max_n_iterations=0; !lost_life && max_n_iterations<MAX_FRAMES; max_n_iterations++) {
		hexq::Reward nophi_r;
		prev_s = mdp.StateUniqueID();
		hexq::Reward reward = mdp.ComputeState(ale.act(action), nophi_r);

		frames.push_back(make_pair(make_pair(reward, nophi_r), ale.cloneSystemState()));
		all_steps.push_back(make_pair(reward, prev_s));
		DISPLAY(display);
		//NOTE(review): RAM 0xd8 == 0x00 and 0xd6 == 0xff is taken to mean the
		//player can be controlled - confirm against the game's RAM map.
		controllable = !(ale.getRAM().get(0xd8) != 0x00 || ale.getRAM().get(0xd6) != 0xff);
		if(!controllable)break;
		bool stop_for_axis_change = initial_cannot_change_axis &&
			does_value_change(ale, *axis_actions, unchanging_addr);
		if(stop_for_axis_change) {
			break;
		}
		byte_t new_changing = ale.getRAM().get(changing_addr);
		if(new_changing == prev_changing && controllable) {
			n_frames_unchanged++;
			if(n_frames_unchanged >= NOT_MOVING_FRAMES) {
				break;
			}
		} else {
			n_frames_unchanged = 0;
			prev_changing = new_changing;
		}
		lost_life = ale.lives() < initial_lives;
	}
	if((lost_life || !controllable) && frames.size() > N_BACK_FRAMES) {
		size_t new_size = frames.size() - N_BACK_FRAMES;
		frames.resize(new_size);
		//Drop the same number of steps from the end of all_steps. Resizing to
		//new_size alone would also cut entries that existed before this call.
		all_steps.resize(steps_on_entry + new_size);
		printf("went back\n");
		ale.restoreSystemState(frames.rbegin()->second);
		ale.environment->processRAM();
		ale.environment->processScreen();
		DISPLAY(display);
		hexq::Reward r;
		(void)mdp.ComputeState(0, r);
	}
	hexq::Reward discount = 1.;
	hexq::Reward total_reward = 0;
	phi_reward = nophi_reward = 0;
	for(size_t i=0; i<frames.size(); i++) {
		total_reward += discount*frames.at(i).first.first;
		discount *= discount_rate;
		nophi_reward += frames.at(i).first.second;
		phi_reward += frames.at(i).first.first;
	}
	elapsed_time += frames.size();
	return total_reward;
}
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cout << "Usage: " << argv[0] << " rom_file" << std::endl;
        return 1;
    }

    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);

    // We enable both screen and sound, which we will need for recording.
    ale.setBool("display_screen", true);
    // You may leave sound disabled (by setting this flag to false) if so desired.
    ale.setBool("sound", true);

    std::string recordPath = "record";
    std::cout << std::endl;

    // Set record flags
    ale.setString("record_screen_dir", recordPath.c_str());
    ale.setString("record_sound_filename", (recordPath + "/sound.wav").c_str());
    // We set fragsize to 64 to ensure proper sound sync
    ale.setInt("fragsize", 64);

    // Not completely portable, but will work in most cases
    std::string cmd = "mkdir ";
    cmd += recordPath;
    system(cmd.c_str());

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM(argv[1]);

    // Get the vector of legal actions
    ActionVect legal_actions = ale.getLegalActionSet();

    // Play a single episode, which we record.
    while (!ale.game_over()) {

        Action a = legal_actions[rand() % legal_actions.size()];
        // Apply the action (discard the resulting reward)
        ale.act(a);
    }

    std::cout << std::endl;
    std::cout << "Recording complete. To create a video, you may want to run \n"
              "  doc/scripts/videoRecordingExampleJoinXXX.sh. See manual for details.." << std::endl;

    return 0;
}
/**
 * Plays 200 episodes of "gravitar.bin" with the actions chosen by a Decision
 * object, timing each episode, and prints the per-episode score and time
 * followed by the averages over all episodes.
 *
 * On every step the Decision object is told whether a life has just been
 * lost (the `reset` flag).
 */
int main(int argc, char** argv) {
    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);
    //The default is now 0 because we don't want stochasticity
    ale.setFloat("repeat_action_probability", 0);

#ifdef __USE_SDL
    ale.setBool("display_screen", false);
    ale.setBool("sound", false);
#endif

    /// Uncomment to Record
       // std::string recordPath = "record";
       // std::cout << std::endl;
    
       // // Set record flags
       // ale.setString("record_screen_dir", recordPath.c_str());
       // ale.setString("record_sound_filename", (recordPath + "/sound.wav").c_str());
       // // We set fragsize to 64 to ensure proper sound sync
       // ale.setInt("fragsize", 64);
    
       // // Not completely portable, but will work in most cases
       // std::string cmd = "mkdir ";
       // cmd += recordPath;
       // system(cmd.c_str());


    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM("gravitar.bin");

    // Totals over all episodes
    float allRewards = 0;
    double allTimes = 0;
    Timer timer;

    const int episodes = 200;
    int lastLives = ale.lives();

    Decision decision = Decision(ale.getMinimalActionSet(), ale.getScreen());

    for (int episode=0; episode<episodes; episode++) {
        float totalReward = 0;
        double episodeTime = 0;
        timer.start();
        while (!ale.game_over()) {
            // A drop in the life counter since the previous step is a death.
            const bool reset = ale.lives() < lastLives;
            if (reset) {
                lastLives = ale.lives();
            }

            // Apply the action and get the resulting reward
            float reward = ale.act(decision.getDecision(ale.getScreen(), ale.lives(), reset));
            totalReward += reward;
        }
        timer.stop();
        episodeTime = timer();
        timer.reset();
        allRewards += totalReward;
        allTimes += episodeTime;
        cout << "Episode " << episode << " ended with score: " << totalReward << " with time: "<< episodeTime <<endl;
        ale.reset_game();
        // The life counter is back at its starting value after the reset.
        // Without refreshing lastLives here it would keep the low value of
        // the finished episode and no later death would be detected.
        lastLives = ale.lives();
    }

    // Display average reward per game
    cout << "Average Reward: " << (allRewards / episodes) << " Average Time: " << (allTimes/episodes) << endl;

    return 0;
}
/**
 * Learns the weights for numEpisodesLearn episodes. Each step calls, in this
 * order, updateTrace(), updateWeights() and decayTrace(); the formulas they
 * are expected to implement are quoted next to the calls.
 *
 * An episode ends when the game is over or when `frame` reaches
 * episodeLength. After each episode a summary line is printed with the
 * episode return, the average return, the frame count and the frames per
 * second.
 */
void TrueOnlineSarsaLearner::learnPolicy(ALEInterface& ale, Features *features){
	
	struct timeval tvBegin, tvEnd, tvDiff;
	vector<double> reward;
	double elapsedTime;
	double norm_a;
	double q_old, delta_q;
	double cumReward = 0, prevCumReward = 0;
	unsigned int maxFeatVectorNorm = 1;
	//State of the reward normalization, restarted for this learning run:
	sawFirstReward = 0; firstReward = 1.0;

	//Repeat (for each episode):
	for(int episode = 0; episode < numEpisodesLearn; episode++){
		//Zero the trace entries listed in nonZeroElig and empty the lists:
		for(unsigned int a = 0; a < nonZeroElig.size(); a++){
			for(unsigned int i = 0; i < nonZeroElig[a].size(); i++){
				int idx = nonZeroElig[a][i];
				e[a][idx] = 0.0;
			}
			nonZeroElig[a].clear();
		}
		//We have to clean the traces every episode. This dense pass zeroes
		//every entry of e, including the ones already zeroed above:
		for(unsigned int i = 0; i < e.size(); i++){
			for(unsigned int j = 0; j < e[i].size(); j++){
				e[i][j] = 0.0;
			}
		}
		//Pick the first action of the episode from the initial state:
		F.clear();
		features->getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), F);
		updateQValues(F, Q);
		currentAction = epsilonGreedy(Q);
		
		q_old = Q[currentAction];

		//Repeat(for each step of episode) until game is over:
		gettimeofday(&tvBegin, NULL);
		//frame is not modified in this function; presumably act() advances it
		frame = 0;
		while(frame < episodeLength && !ale.game_over()){
			//reward[0]: reward used for learning, reward[1]: emulator reward
			reward.clear();
			reward.push_back(0.0);
			reward.push_back(0.0);
			//Q is recomputed because the weights changed in the previous step:
			updateQValues(F, Q);
			sanityCheck();

			//Take action, observe reward and next state:
			act(ale, currentAction, reward);
			cumReward  += reward[1];
			if(!ale.game_over()){
				//Obtain active features in the new state:
				Fnext.clear();
				features->getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), Fnext);
				updateQValues(Fnext, Qnext);     //Update Q-values for the new active features
				nextAction = epsilonGreedy(Qnext);
			}
			else{
				//Terminal state: every next Q-value is zero
				nextAction = 0;
				for(unsigned int i = 0; i < Qnext.size(); i++){
					Qnext[i] = 0;
				}
			}
			//To ensure the learning rate will never increase along
			//the time, Marc used such approach in his JAIR paper
			if (F.size() > maxFeatVectorNorm){
				maxFeatVectorNorm = F.size();
			}

			//Step size scaled by the largest number of active features seen:
			norm_a = alpha/maxFeatVectorNorm;
			//Difference between Q(S,A) under the current weights and the
			//value stored for the same pair on the previous step:
			delta_q =  Q[currentAction] - q_old;
			q_old   = Qnext[nextAction];
			delta   = reward[0] + gamma * Qnext[nextAction] - Q[currentAction];
			//e <- e + [1 - alpha * e^T phi(S,A)] phi(S,A)
			updateTrace(currentAction, norm_a);
			//theta <- theta + alpha * delta * e + alpha * delta_q (e - phi(S,A))
			updateWeights(currentAction, norm_a, delta_q);
			//e <- gamma * lambda * e
			decayTrace();

			//The next state/action become the current ones:
			F = Fnext;
			currentAction = nextAction;
		}
		//The reset happens before the end timestamp, so its duration is
		//part of elapsedTime:
		ale.reset_game();
		gettimeofday(&tvEnd, NULL);
		timeval_subtract(&tvDiff, &tvEnd, &tvBegin);
		elapsedTime = double(tvDiff.tv_sec) + double(tvDiff.tv_usec)/1000000.0;
		
		//NOTE(review): elapsedTime is used as a divisor without a zero check.
		double fps = double(frame)/elapsedTime;
		printf("episode: %d,\t%.0f points,\tavg. return: %.1f,\t%d frames,\t%.0f fps\n", 
			episode + 1, (cumReward-prevCumReward), (double)cumReward/(episode + 1.0), frame, fps);
		prevCumReward = cumReward;
	}
}