void Learner::act(ALEInterface& ale, int action, vector<float> &reward,
    vector<vector<vector<float> > > &learnedOptions){

    float r_alg = 0.0, r_real = 0.0;

    FRam.clear();
    ramFeatures.getCompleteFeatureVector(ale.getRAM(), FRam);

    if(action < numBasicActions){
        r_real = ale.act(actions[action]);
    }
    else{
        int option_idx = action - numBasicActions;
        r_real = playOption(ale, option_idx, learnedOptions);
    }

    FnextRam.clear();
    ramFeatures.getCompleteFeatureVector(ale.getRAM(), FnextRam);

    updateTransitionVector(FRam, FnextRam);

    for(int i = 0; i < transitions.size(); i++){
        transitions[i] = (transitions[i] - mean[i])/std[i];
        r_alg += eigVector[i] * transitions[i];
    }

    reward[0] = r_alg;
    reward[1] = r_real;
}
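// A hypothetical sketch of updateTransitionVector as called above (the real
// implementation is not shown here). It assumes FRam/FnextRam are RAM bit
// vectors whose last entry is a bias bit, and follows the convention stated in
// the Learner constructor below: the first half of `transitions` flags bits
// flipping 0->1 and the second half flags bits flipping 1->0.
void Learner::updateTransitionVector(vector<bool> &F, vector<bool> &Fnext){
    int numBits = F.size() - 1; // drop the bias bit (assumption)
    for(int i = 0; i < numBits; i++){
        transitions[i]           = (!F[i] &&  Fnext[i]) ? 1.0 : 0.0; // 0 -> 1
        transitions[numBits + i] = ( F[i] && !Fnext[i]) ? 1.0 : 0.0; // 1 -> 0
    }
}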
int Learner::playOption(ALEInterface& ale, int option,
    vector<vector<vector<float> > > &learnedOptions){

    int r_real = 0;
    int currentAction;
    vector<int> Fbpro;
    vector<float> Q(numBasicActions, 0.0);

    while(rand()%1000 > 1000 * PROB_TERMINATION && !ale.game_over()){
        //Get state and features active on that state:
        Fbpro.clear();
        bproFeatures.getActiveFeaturesIndices(ale.getScreen(), Fbpro);

        //Update Q-values for each possible action
        for(int a = 0; a < numBasicActions; a++){
            float sumW = 0;
            for(unsigned int i = 0; i < Fbpro.size(); i++){
                sumW += learnedOptions[option][a][Fbpro[i]];
            }
            Q[a] = sumW;
        }

        currentAction = epsilonGreedy(Q);
        //Take action, observe reward and next state:
        r_real += ale.act((Action) currentAction);
    }
    return r_real;
}
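// A minimal sketch of the epsilon-greedy selection used in playOption above.
// The repository's implementation is not shown; the EPSILON constant and the
// greedy tie-breaking are assumptions.
int Learner::epsilonGreedy(vector<float> &QValues){
    if(rand() % 1000 < 1000 * EPSILON){      // explore with probability EPSILON
        return rand() % QValues.size();
    }
    int best = 0;                             // otherwise act greedily
    for(unsigned int a = 1; a < QValues.size(); a++){
        if(QValues[a] > QValues[best]){
            best = a;
        }
    }
    return best;
}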
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " rom_file" << std::endl;
        return 1;
    }

    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);
    //The default is already 0.25, this is just an example
    ale.setFloat("repeat_action_probability", 0.25);

#ifdef __USE_SDL
    ale.setBool("display_screen", true);
    ale.setBool("sound", true);
#endif

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM(argv[1]);

    // Get the vector of legal actions
    ActionVect legal_actions = ale.getLegalActionSet();

    // Play a single episode
    for (int episode = 0; episode < 1; episode++) {
        float totalReward = 0;
        while (!ale.game_over()) {
            /*
            // Scan the screen for pixels in a colour range and print the
            // centroid of the matching pixels:
            const ALEScreen screen = ale.getScreen();
            float row_sum = 0, column_sum = 0, tot = 0.0;
            for (int i = 0; i < screen.height(); ++i) {
                for (int j = 0; j < screen.width(); ++j) {
                    pixel_t tmp = screen.get(i, j);
                    if (tmp >= 41 && tmp <= 43) {
                        row_sum += i;
                        column_sum += j;
                        tot++;
                    }
                }
            }
            cout << (row_sum/tot) << "," << (column_sum/tot) << endl;
            */
            Action a = legal_actions[rand() % legal_actions.size()];
            // Apply the action and get the resulting reward
            float reward = ale.act(a);
            totalReward += reward;
        }
        cout << "Episode " << episode << " ended with score: " << totalReward << endl;
        ale.reset_game();
    }

    return 0;
}
/**
 * The first parameter is the one used by Sarsa. The second is used to pass
 * additional information to the running algorithm (such as the 'real score'
 * when a surrogate reward function is being used).
 */
void RLLearner::act(ALEInterface& ale, int action, vector<double> &reward){
    double r_alg = 0.0, r_real = 0.0;

    for(int i = 0; i < numStepsPerAction; i++){
        r_real = ale.act(actions[action]);
        if(toUseOnlyRewardSign){
            if(r_real > 0){
                r_alg = 1.0;
            }
            else if(r_real < 0){
                r_alg = -1.0;
            }
        }
        //Normalizing the reward by the first reward seen,
        //as Marc did in his JAIR paper:
        else{
            if(r_real != 0.0){
                if(!sawFirstReward){
                    firstReward = std::abs(r_real);
                    sawFirstReward = 1;
                }
            }
            if(sawFirstReward){
                if(toBeOptimistic){
                    r_alg = (r_real - firstReward)/firstReward + gamma;
                }
                else{
                    r_alg = r_real/firstReward;
                }
            }
            else{
                if(toBeOptimistic){
                    r_alg = gamma - 1.0;
                }
            }
        }
        frame++;
        reward[0] += r_alg;
        reward[1] += r_real;
    }

    //With optimistic initialization the agent could learn to "die" early just to
    //stop collecting -1 at every step. To avoid this, when the agent dies we
    //charge it -1 for every remaining time step of the episode (the worst case).
    if(ale.game_over() && toBeOptimistic){
        int missedSteps = episodeLength - frame + 1;
        double penalty = pow(gamma, missedSteps) - 1;
        reward[0] -= penalty;
    }
}
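// Illustrative numbers for the shaping above (not from the source): with
// gamma = 0.99 and a first non-zero reward of 100, a later reward of 100 gives
// r_alg = (100 - 100)/100 + 0.99 = 0.99 under optimistic initialization, or
// r_alg = 100/100 = 1.0 otherwise; a step with zero reward after the first
// reward gives r_alg = (0 - 100)/100 + 0.99 = -0.01 when optimistic.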
int playGame(ALEInterface& ale, RAMFeatures *ram, BPROFeatures *bpro,
    vector<vector<vector<float> > > &w, Parameters param, int totalNumFrames, int gameId){

    ale.reset_game();

    vector<bool> F(NUM_BITS, 0); //Set of active features
    vector<bool> Fprev;

    int score = 0;
    while(!ale.game_over() && totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES){
        int nextAction = getNextAction(ale, param.numOptions);
        score += actUpdatingAvg(ale, ram, bpro, nextAction, w, param,
            totalNumFrames, gameId, F, Fprev);
    }
    totalNumFrames += ale.getEpisodeFrameNumber();
    printf("Episode: %d, Final score: %d, Total Num. Frames: %d\n",
        gameId+1, score, totalNumFrames);
    return totalNumFrames;
}
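// A hypothetical sketch of getNextAction as used in playGame. The behaviour
// policy is assumed here to be uniformly random over the NUM_ACTIONS primitive
// actions and the numOptions options (indices >= NUM_ACTIONS select an option,
// matching actUpdatingAvg below); the real implementation may differ.
int getNextAction(ALEInterface& ale, int numOptions){
    (void) ale; // unused in this sketch
    return rand() % (NUM_ACTIONS + numOptions);
}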
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " rom_file" << std::endl;
        return 1;
    }

    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);
    //The default is already 0.25, this is just an example
    ale.setFloat("repeat_action_probability", 0.25);

#ifdef __USE_SDL
    ale.setBool("display_screen", true);
    ale.setBool("sound", true);
#endif

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM(argv[1]);

    // Get the vector of legal actions
    ActionVect legal_actions = ale.getLegalActionSet();

    // Play 10 episodes
    for (int episode = 0; episode < 10; episode++) {
        float totalReward = 0;
        while (!ale.game_over()) {
            Action a = legal_actions[rand() % legal_actions.size()];
            // Apply the action and get the resulting reward
            float reward = ale.act(a);
            totalReward += reward;
            if (reward != 0){
                cout << "totalReward: " << totalReward << endl;
            }
        }
        cout << "Episode " << episode << " ended with score: " << totalReward << endl;
        ale.reset_game();
    }

    return 0;
}
void TrueOnlineSarsaLearner::evaluatePolicy(ALEInterface& ale, Features *features){
    double reward = 0;
    double cumReward = 0;
    double prevCumReward = 0;

    //Repeat (for each episode):
    for(int episode = 0; episode < numEpisodesEval; episode++){
        //Repeat (for each step of episode) until game is over:
        for(int step = 0; !ale.game_over() && step < episodeLength; step++){
            //Get state and features active on that state:
            F.clear();
            features->getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), F);
            updateQValues(F, Q);              //Update Q-values for each possible action
            currentAction = epsilonGreedy(Q);
            //Take action, observe reward and next state:
            reward = 0;
            for(int i = 0; i < numStepsPerAction && !ale.game_over(); i++){
                reward += ale.act(actions[currentAction]);
            }
            cumReward += reward;
        }
        ale.reset_game();
        sanityCheck();
        printf("%d, %f, %f \n", episode + 1,
            (double)cumReward/(episode + 1.0), cumReward - prevCumReward);
        prevCumReward = cumReward;
    }
}
bool does_value_change(ALEInterface &ale, const vector<Action> &possible_actions,
        unsigned int addr) {
    ALEState s0 = ale.cloneSystemState();
    ale.environment->oneStepAct(possible_actions.at(0), PLAYER_B_NOOP);
    // printf("initial X: %d\n", ale.getRAM().get(addr));
    const byte_t x0 = ale.getRAM().get(addr);
    bool controllable = false;
    for(size_t i = 1; !controllable && i < possible_actions.size(); i++) {
        ale.restoreSystemState(s0);
        ale.environment->oneStepAct(possible_actions.at(i), PLAYER_B_NOOP);
        // printf("X: %zu %d\n", i, ale.getRAM().get(addr));
        const byte_t xi = ale.getRAM().get(addr);
        if(x0 != xi) {
            controllable = true;
        }
    }
    ale.restoreSystemState(s0);
    ale.environment->processRAM();
    ale.environment->processScreen();
    return controllable;
}
RLLearner::RLLearner(ALEInterface& ale, Parameters *param){
    frame = 0;
    randomActionTaken = 0;

    gamma = param->getGamma();
    epsilon = param->getEpsilon();
    toUseOnlyRewardSign = param->getUseRewardSign();
    numStepsPerAction = param->getNumStepsPerAction();
    toBeOptimistic = param->getOptimisticInitialization();

    episodeLength = param->getEpisodeLength();
    numEpisodesEval = param->getNumEpisodesEval();
    numEpisodesLearn = param->getNumEpisodesLearn();

    //Get the number of effective actions:
    if(param->isMinimalAction()){
        actions = ale.getMinimalActionSet();
    }
    else{
        actions = ale.getLegalActionSet();
    }
    numActions = actions.size();
}
Learner::Learner(ALEInterface& ale, Parameters *param) : bproFeatures(param->gameName){
    delta = 0.0;
    cumReward = 0;
    prevCumReward = 0;
    cumIntrReward = 0;
    prevCumIntrReward = 0;
    maxFeatVectorNorm = 1;
    pathToSaveLearnedWeights = param->outputPath;

    for(int i = 0; i < (ramFeatures.getNumberOfFeatures() - 1) * 2; i++){
        transitions.push_back(0);
    }

    actions = ale.getLegalActionSet();
    numOptions = param->numOptions;
    numBasicActions = actions.size();
    numTotalActions = numBasicActions + numOptions;

    //Reading file containing the vector that describes the reward for the option learning.
    //The first X positions encode the transition 0->1 and the other X encode 1->0.
    pathToRewardDescription = param->eigVectorPath;
    std::ifstream infile1(pathToRewardDescription.c_str());
    float value;
    while(infile1 >> value){
        eigVector.push_back(value);
    }

    pathToStatsDescription = param->statEigVectorPath;
    std::ifstream infile2((pathToStatsDescription + "_mean.out").c_str());
    while(infile2 >> value){
        mean.push_back(value);
    }
    std::ifstream infile3((pathToStatsDescription + "_std.out").c_str());
    while(infile3 >> value){
        std.push_back(value);
    }

    numFeatures = bproFeatures.getNumberOfFeatures();

    for(int i = 0; i < numTotalActions; i++){
        //Initialize Q:
        Q.push_back(0);
        Qnext.push_back(0);
        //Initialize e:
        e.push_back(vector<float>(numFeatures, 0.0));
        w.push_back(vector<float>(numFeatures, 0.0));
        nonZeroElig.push_back(vector<int>());
    }
}
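// Expected input file format (an assumption, inferred from the stream reads
// above): plain-text files of whitespace-separated floats, one value per
// transition feature, e.g. for 2X transition features:
//   <eigVectorPath>                 -> e_0 e_1 ... e_{2X-1}
//   <statEigVectorPath>_mean.out    -> mean_0 ... mean_{2X-1}
//   <statEigVectorPath>_std.out     -> std_0 ... std_{2X-1}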
int actUpdatingAvg(ALEInterface& ale, RAMFeatures *ram, BPROFeatures *features,
        int nextAction, vector<vector<vector<float> > > &w, Parameters param,
        int totalNumFrames, int gameId, vector<bool> &F, vector<bool> &Fprev){

    int reward = 0;

    //If the selected action was one of the primitive actions
    if(nextAction < NUM_ACTIONS){
        for(int i = 0; i < FRAME_SKIP && totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES; i++){
            reward += ale.act((Action) nextAction);
            Fprev.swap(F);
            F.clear();
            ram->getCompleteFeatureVector(ale.getRAM(), F);
            F.pop_back();
            updateAverage(Fprev, F, ale.getEpisodeFrameNumber(), param, gameId);
        }
    }
    //If the selected action was one of the options
    else{
        int currentAction;
        vector<int> Fbpro;                  //Set of active features
        vector<float> Q(NUM_ACTIONS, 0.0);  //Q(a) entries
        int option = nextAction - NUM_ACTIONS;
        while(rand()%1000 > 1000 * PROB_TERMINATION && !ale.game_over()
                && totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES){
            //Get state and features active on that state:
            Fbpro.clear();
            features->getActiveFeaturesIndices(ale.getScreen(), Fbpro);
            updateQValues(Fbpro, Q, w, option); //Update Q-values for each possible action
            currentAction = epsilonGreedy(Q);
            //Take action, observe reward and next state:
            reward += ale.act((Action) currentAction);
            Fprev.swap(F);
            F.clear();
            ram->getCompleteFeatureVector(ale.getRAM(), F);
            F.pop_back();
            updateAverage(Fprev, F, ale.getEpisodeFrameNumber(), param, gameId);
        }
    }
    return reward;
}
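// A sketch of the four-argument updateQValues called above, mirroring the
// inline computation in Learner::playOption: a linear value estimate that, for
// each primitive action, sums the option's weights of the active B-PRO
// features. The signature is inferred from the call site and may differ from
// the repository's version.
void updateQValues(vector<int> &Features, vector<float> &QValues,
        vector<vector<vector<float> > > &w, int option){
    for(unsigned int a = 0; a < QValues.size(); a++){
        float sumW = 0.0;
        for(unsigned int i = 0; i < Features.size(); i++){
            sumW += w[option][a][Features[i]];
        }
        QValues[a] = sumW;
    }
}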
//Run the Arcade Learning Environment using the DQN agent.
void run_ale(int argc, char** argv) {
    //Create Arcade Learning Environment
    ALEInterface* ale = new ALEInterface(false);

    //Load the Atari ROM we are going to play
    ale->loadROM(argv[1]);

    //Get the set of possible actions from the ALE ROM
    ActionVect action_set = {PLAYER_A_LEFTFIRE, PLAYER_A_FIRE, PLAYER_A_RIGHTFIRE}; //ale->getMinimalActionSet();

    //Create action descriptor
    ActionDescriptor descriptor({action_set.size()}, {});

    //Create Learning System
    DACN system(descriptor, 0.9, 4, 1000000, 100, 32);

    //Set exploration rate
    system.exploration_rate(0.9);

    cudaProfilerStart();
    nvtxRangePushA("2 Atari Games");
    for(int episode = 0; episode < 2; episode++) {
        string tmp = "episode: " + to_string(episode);
        nvtxRangePushA(tmp.c_str());

        //Restart the game
        ale->reset_game();

        //Game Loop
        while(!ale->game_over()) {
            nvtxRangePushA("step");

            //Convert screen to input
            ALEScreen screen = ale->getScreen();
            gray8_image_t img = to_image(screen);
            vector<unsigned char> input = to_input(img);

            float raw_action = system.forward(input)[0];

            //Cast action
            int action = static_cast<int>(raw_action);

            //Execute the action and get the reward
            float reward = ale->act(action_set[action]);

            //Normalize (clip) the reward to [-1, 1]
            float normalized_reward = max(min(1.0f, reward), -1.0f);

            //Backward the result
            system.backward(normalized_reward, ale->game_over());

            nvtxRangePop();
        }
        nvtxRangePop();
    }
    cudaProfilerStop();
    nvtxRangePop();
}
/**
 * Runs one learning episode and returns the total score.
 */
double EpisodeLearning(ALEInterface& ale, deepRL::DeepQLearner& dqlearner, const bool update) {
    assert(!ale.game_over());
    std::deque<deepRL::FrameDataSp> past_frames;
    //dqlearner.replay_memory_.resetPool();
    auto total_score = 0.0;

    for (auto frame = 0; !ale.game_over(); ++frame) {
        //std::cout << "frame: " << frame << std::endl;
        const auto current_frame = deepRL::PreprocessScreen(ale.getScreen());
        past_frames.push_back(current_frame);

        if (past_frames.size() < deepRL::kInputFrameCount) {
            // If there are not enough past frames for the DQN input yet, just select NOOP
            for (auto i = 0; i < argmap["skip_frame"].as<int>() + 1 && !ale.game_over(); ++i) {
                total_score += ale.act(PLAYER_A_NOOP);
            }
        } else {
            if (past_frames.size() > deepRL::kInputFrameCount) {
                past_frames.pop_front();
            }
            deepRL::InputFrames input_frames;
            std::copy(past_frames.begin(), past_frames.end(), input_frames.begin());
            const auto action = dqlearner.SelectAction(input_frames);

            auto immediate_score = 0.0;
            for (auto i = 0; i < argmap["skip_frame"].as<int>() + 1 && !ale.game_over(); ++i) {
                // The last action is repeated on skipped frames
                immediate_score += ale.act(action);
            }
            total_score += immediate_score;

            // Clip the reward for a robust gradient update. Rewards for DQN are
            // normalized as follows: 1 for any positive score, -1 for any
            // negative score, and 0 otherwise.
            const auto reward = immediate_score == 0
                ? 0.0
                : immediate_score / std::abs(immediate_score);

            if (update) {
                // Add the current transition to replay memory
                const auto transition = ale.game_over()
                    ? deepRL::Transition(input_frames, action, reward, boost::none)
                    : deepRL::Transition(input_frames, action, reward,
                          deepRL::PreprocessScreen(ale.getScreen()));
                dqlearner.replay_memory_.addTransition(transition);
                //std::cout << "Memorypool Size: " << dqlearner.replay_memory_.memory_size() << std::endl;

                // If the replay memory is large enough, update the DQN
                if (dqlearner.replay_memory_.memory_size() >= argmap["replay_start_size"].as<int>()
                        and dqlearner.numSteps() % argmap["update_frequency"].as<int>() == 0) {
                    dqlearner.MiniBatchUpdate();
                }
            }
        }
    }
    ale.reset_game();
    return total_score;
}
void Learner::learnPolicy(ALEInterface& ale, vector<vector<vector<float> > > &learnedOptions){

    vector<float> reward;

    //Repeat (for each episode):
    int episode, totalNumberFrames = 0;
    //This loop is interrupted by the ALE code, since max_num_frames was set beforehand
    for(episode = 0; totalNumberFrames < MAX_NUM_FRAMES; episode++){
        //We have to clear the traces every episode:
        for(unsigned int a = 0; a < nonZeroElig.size(); a++){
            for(unsigned int i = 0; i < nonZeroElig[a].size(); i++){
                int idx = nonZeroElig[a][i];
                e[a][idx] = 0.0;
            }
            nonZeroElig[a].clear();
        }

        F.clear();
        bproFeatures.getActiveFeaturesIndices(ale.getScreen(), F);
        updateQValues(F, Q);
        currentAction = epsilonGreedy(Q);

        //Repeat (for each step of episode) until game is over:
        gettimeofday(&tvBegin, NULL);

        //This also stops when the maximum number of steps per episode is reached
        while(!ale.game_over()){
            reward.clear();
            reward.push_back(0.0);
            reward.push_back(0.0);
            updateQValues(F, Q);
            sanityCheck();

            //Take action, observe reward and next state:
            act(ale, currentAction, reward, learnedOptions);
            cumIntrReward += reward[0];
            cumReward += reward[1];
            if(!ale.game_over()){
                //Obtain active features in the new state:
                Fnext.clear();
                bproFeatures.getActiveFeaturesIndices(ale.getScreen(), Fnext);
                updateQValues(Fnext, Qnext);  //Update Q-values for the new active features
                nextAction = epsilonGreedy(Qnext);
            }
            else{
                nextAction = 0;
                for(unsigned int i = 0; i < Qnext.size(); i++){
                    Qnext[i] = 0;
                }
            }
            //To ensure the learning rate never increases over time;
            //Marc used this approach in his JAIR paper:
            if (F.size() > maxFeatVectorNorm){
                maxFeatVectorNorm = F.size();
            }

            delta = reward[0] + GAMMA * Qnext[nextAction] - Q[currentAction];

            updateReplTrace(currentAction, F);
            //Update weights vector:
            float stepSize = ALPHA/maxFeatVectorNorm;
            for(unsigned int a = 0; a < nonZeroElig.size(); a++){
                for(unsigned int i = 0; i < nonZeroElig[a].size(); i++){
                    int idx = nonZeroElig[a][i];
                    w[a][idx] = w[a][idx] + stepSize * delta * e[a][idx];
                }
            }
            F = Fnext;
            FRam = FnextRam;
            currentAction = nextAction;
        }
        gettimeofday(&tvEnd, NULL);
        timeval_subtract(&tvDiff, &tvEnd, &tvBegin);
        elapsedTime = float(tvDiff.tv_sec) + float(tvDiff.tv_usec)/1000000.0;

        float fps = float(ale.getEpisodeFrameNumber())/elapsedTime;
        printf("episode: %d,\t%.0f points,\tavg. return: %.1f,\tnovelty reward: %.2f (%.2f),\t%d frames,\t%.0f fps\n",
            episode + 1, cumReward - prevCumReward, (float)cumReward/(episode + 1.0),
            cumIntrReward - prevCumIntrReward, cumIntrReward/(episode + 1.0),
            ale.getEpisodeFrameNumber(), fps);
        totalNumberFrames += ale.getEpisodeFrameNumber();
        prevCumReward = cumReward;
        prevCumIntrReward = cumIntrReward;
        ale.reset_game();
    }

    stringstream ss;
    ss << episode;
    saveWeightsToFile(ss.str());
}
static hexq::Reward move_to_the(ALEInterface &ale, DisplayScreen *display, const Action action,
        const hexq::Reward discount_rate, hexq::MontezumaOptionsMdp &mdp, size_t &elapsed_time,
        hexq::Reward &nophi_reward, hexq::Reward &phi_reward,
        vector<pair<hexq::Reward,hexq::State> > &all_steps) {
    const vector<Action> *axis_actions;
    unsigned int unchanging_addr, changing_addr;
    if(action == PLAYER_A_LEFT || action == PLAYER_A_RIGHT) {
        axis_actions = &vertical_actions;
        unchanging_addr = ADDR_Y;
        changing_addr = ADDR_X;
    } else {
        axis_actions = &horizontal_actions;
        unchanging_addr = ADDR_X;
        changing_addr = ADDR_Y;
    }
    const bool initial_cannot_change_axis = !does_value_change(ale, *axis_actions, unchanging_addr);

    hexq::State prev_s = mdp.StateUniqueID();
    phi_reward = mdp.ComputeState(ale.act(action), nophi_reward);
    vector<pair<pair<hexq::Reward, hexq::Reward>, ALEState> > frames;
    frames.push_back(make_pair(make_pair(phi_reward, nophi_reward), ale.cloneSystemState()));
    all_steps.push_back(make_pair(phi_reward, prev_s));

    byte_t prev_changing = ale.getRAM().get(changing_addr);
    const int initial_lives = ale.lives();
    int n_frames_unchanged = 0; // must start at zero; it is incremented below
    bool controllable = true;
    bool lost_life = false;
    for(size_t max_n_iterations = 0; !lost_life && max_n_iterations < MAX_FRAMES; max_n_iterations++) {
        hexq::Reward nophi_r;
        prev_s = mdp.StateUniqueID();
        hexq::Reward reward = mdp.ComputeState(ale.act(action), nophi_r);
        frames.push_back(make_pair(make_pair(reward, nophi_r), ale.cloneSystemState()));
        all_steps.push_back(make_pair(reward, prev_s));
        DISPLAY(display);

        controllable = !(ale.getRAM().get(0xd8) != 0x00 || ale.getRAM().get(0xd6) != 0xff);
        if(!controllable) break;

        bool stop_for_axis_change = initial_cannot_change_axis &&
            does_value_change(ale, *axis_actions, unchanging_addr);
        if(stop_for_axis_change) {
            // printf("Break because axis change possibility %d\n", ABHG++);
            break;
        }
        // printf("X: %d Y: %d\n", ale.getRAM().get(ADDR_X), ale.getRAM().get(ADDR_Y));
        byte_t new_changing = ale.getRAM().get(changing_addr);
        if(new_changing == prev_changing && controllable) {
            n_frames_unchanged++;
            if(n_frames_unchanged >= NOT_MOVING_FRAMES) {
                // printf("Break because not moving %d\n", ABHG++);
                break;
            }
        } else {
            n_frames_unchanged = 0;
            prev_changing = new_changing;
        }
        lost_life = ale.lives() < initial_lives;
    }

    if((lost_life || !controllable) && frames.size() > N_BACK_FRAMES) {
        size_t new_size = frames.size() - N_BACK_FRAMES;
        frames.resize(new_size);
        all_steps.resize(new_size);
        printf("went back\n");
        ale.restoreSystemState(frames.rbegin()->second);
        ale.environment->processRAM();
        ale.environment->processScreen();
        DISPLAY(display);
        hexq::Reward r;
        (void)mdp.ComputeState(0, r);
    }

    hexq::Reward discount = 1.;
    hexq::Reward total_reward = 0;
    phi_reward = nophi_reward = 0;
    for(size_t i = 0; i < frames.size(); i++) {
        total_reward += discount*frames.at(i).first.first;
        discount *= discount_rate;
        nophi_reward += frames.at(i).first.second;
        phi_reward += frames.at(i).first.first;
    }
    elapsed_time += frames.size();
    return total_reward;
}
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cout << "Usage: " << argv[0] << " rom_file" << std::endl;
        return 1;
    }

    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);

    // We enable both screen and sound, which we will need for recording.
    ale.setBool("display_screen", true);
    // You may leave sound disabled (by setting this flag to false) if so desired.
    ale.setBool("sound", true);

    std::string recordPath = "record";
    std::cout << std::endl;

    // Set record flags
    ale.setString("record_screen_dir", recordPath.c_str());
    ale.setString("record_sound_filename", (recordPath + "/sound.wav").c_str());
    // We set fragsize to 64 to ensure proper sound sync
    ale.setInt("fragsize", 64);

    // Not completely portable, but will work in most cases
    std::string cmd = "mkdir ";
    cmd += recordPath;
    system(cmd.c_str());

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM(argv[1]);

    // Get the vector of legal actions
    ActionVect legal_actions = ale.getLegalActionSet();

    // Play a single episode, which we record.
    while (!ale.game_over()) {
        Action a = legal_actions[rand() % legal_actions.size()];
        // Apply the action (discard the resulting reward)
        ale.act(a);
    }

    std::cout << std::endl;
    std::cout << "Recording complete. To create a video, you may want to run \n"
                 "  doc/scripts/videoRecordingExampleJoinXXX.sh. See manual for details."
              << std::endl;

    return 0;
}
int main(int argc, char** argv) {
    ALEInterface ale;

    // Get & Set the desired settings
    ale.setInt("random_seed", 123);
    //The default is now 0 because we don't want stochasticity
    ale.setFloat("repeat_action_probability", 0);

#ifdef __USE_SDL
    ale.setBool("display_screen", false);
    ale.setBool("sound", false);
#endif

    /// Uncomment to Record
    // std::string recordPath = "record";
    // std::cout << std::endl;
    // // Set record flags
    // ale.setString("record_screen_dir", recordPath.c_str());
    // ale.setString("record_sound_filename", (recordPath + "/sound.wav").c_str());
    // // We set fragsize to 64 to ensure proper sound sync
    // ale.setInt("fragsize", 64);
    // // Not completely portable, but will work in most cases
    // std::string cmd = "mkdir ";
    // cmd += recordPath;
    // system(cmd.c_str());

    // Load the ROM file. (Also resets the system for new settings to
    // take effect.)
    ale.loadROM("gravitar.bin");

    // Get the vector of minimal actions
    const ActionVect minimal_actions = ale.getMinimalActionSet();
    // Erase actions that move, but don't fire
    //minimal_actions.erase(minimal_actions.begin() + 2, minimal_actions.begin() + 10);

    // Store all rewards earned in all episodes
    float allRewards = 0;
    double allTimes = 0;
    Timer timer;

    // Play a fixed number of episodes
    int episodes = 200;
    int number = 0;
    int count = 0;
    int lastLives = ale.lives();
    bool reset = false;
    Decision decision = Decision(ale.getMinimalActionSet(), ale.getScreen());

    for (int episode = 0; episode < episodes; episode++) {
        float totalReward = 0;
        double episodeTime = 0;
        timer.start();
        while (!ale.game_over()) {
            if (ale.lives() < lastLives){
                lastLives = ale.lives();
                number = 0;
                count = 0;
                reset = true;
                //cout << " DIE " << endl;
            }
            else{
                reset = false;
            }
            // Apply the action and get the resulting reward
            float reward = ale.act(decision.getDecision(ale.getScreen(), ale.lives(), reset));
            //decision.print();
            totalReward += reward;
        }
        timer.stop();
        episodeTime = timer();
        timer.reset();
        count = 0;
        number = 0;
        allRewards += totalReward;
        allTimes += episodeTime;
        cout << "Episode " << episode << " ended with score: " << totalReward
             << " with time: " << episodeTime << endl;
        ale.reset_game();
    }

    // Display average reward and time per game
    cout << "Average Reward: " << (allRewards / episodes)
         << " Average Time: " << (allTimes / episodes) << endl;

    return 0;
}
void TrueOnlineSarsaLearner::learnPolicy(ALEInterface& ale, Features *features){
    struct timeval tvBegin, tvEnd, tvDiff;
    vector<double> reward;
    double elapsedTime;
    double norm_a;
    double q_old, delta_q;
    double cumReward = 0, prevCumReward = 0;
    unsigned int maxFeatVectorNorm = 1;
    sawFirstReward = 0;
    firstReward = 1.0;

    //Repeat (for each episode):
    for(int episode = 0; episode < numEpisodesLearn; episode++){
        //We have to clear the traces every episode:
        for(unsigned int a = 0; a < nonZeroElig.size(); a++){
            for(unsigned int i = 0; i < nonZeroElig[a].size(); i++){
                int idx = nonZeroElig[a][i];
                e[a][idx] = 0.0;
            }
            nonZeroElig[a].clear();
        }
        for(unsigned int i = 0; i < e.size(); i++){
            for(unsigned int j = 0; j < e[i].size(); j++){
                e[i][j] = 0.0;
            }
        }

        F.clear();
        features->getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), F);
        updateQValues(F, Q);
        currentAction = epsilonGreedy(Q);
        q_old = Q[currentAction];

        //Repeat (for each step of episode) until game is over:
        gettimeofday(&tvBegin, NULL);
        frame = 0;
        while(frame < episodeLength && !ale.game_over()){
            reward.clear();
            reward.push_back(0.0);
            reward.push_back(0.0);
            updateQValues(F, Q);
            sanityCheck();

            //Take action, observe reward and next state:
            act(ale, currentAction, reward);
            cumReward += reward[1];
            if(!ale.game_over()){
                //Obtain active features in the new state:
                Fnext.clear();
                features->getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), Fnext);
                updateQValues(Fnext, Qnext);  //Update Q-values for the new active features
                nextAction = epsilonGreedy(Qnext);
            }
            else{
                nextAction = 0;
                for(unsigned int i = 0; i < Qnext.size(); i++){
                    Qnext[i] = 0;
                }
            }
            //To ensure the learning rate never increases over time;
            //Marc used this approach in his JAIR paper:
            if (F.size() > maxFeatVectorNorm){
                maxFeatVectorNorm = F.size();
            }
            norm_a = alpha/maxFeatVectorNorm;

            delta_q = Q[currentAction] - q_old;
            q_old = Qnext[nextAction];
            delta = reward[0] + gamma * Qnext[nextAction] - Q[currentAction];

            //e <- e + [1 - alpha * e^T phi(S,A)] phi(S,A)
            updateTrace(currentAction, norm_a);
            //theta <- theta + alpha * delta * e + alpha * delta_q * (e - phi(S,A))
            updateWeights(currentAction, norm_a, delta_q);
            //e <- gamma * lambda * e
            decayTrace();

            F = Fnext;
            currentAction = nextAction;
        }
        ale.reset_game();
        gettimeofday(&tvEnd, NULL);
        timeval_subtract(&tvDiff, &tvEnd, &tvBegin);
        elapsedTime = double(tvDiff.tv_sec) + double(tvDiff.tv_usec)/1000000.0;

        double fps = double(frame)/elapsedTime;
        printf("episode: %d,\t%.0f points,\tavg. return: %.1f,\t%d frames,\t%.0f fps\n",
            episode + 1, (cumReward - prevCumReward), (double)cumReward/(episode + 1.0),
            frame, fps);
        prevCumReward = cumReward;
    }
}