int main(int argc, char** argv) { ALEInterface ale; // Get & Set the desired settings ale.setInt("random_seed", 123); //The default is now 0 because we don't want stochasity ale.setFloat("repeat_action_probability", 0); #ifdef __USE_SDL ale.setBool("display_screen", false); ale.setBool("sound", false); #endif /// Uncomment to Record // std::string recordPath = "record"; // std::cout << std::endl; // // Set record flags // ale.setString("record_screen_dir", recordPath.c_str()); // ale.setString("record_sound_filename", (recordPath + "/sound.wav").c_str()); // // We set fragsize to 64 to ensure proper sound sync // ale.setInt("fragsize", 64); // // Not completely portable, but will work in most cases // std::string cmd = "mkdir "; // cmd += recordPath; // system(cmd.c_str()); // Load the ROM file. (Also resets the system for new settings to // take effect.) ale.loadROM("gravitar.bin"); // Get the vector of minimal actions const ActionVect minimal_actions = ale.getMinimalActionSet(); // Erase actions that move, but don't fire //minimal_actions.erase(minimal_actions.begin() + 2, minimal_actions.begin() + 10); // Store all rewards earned in all episodes float allRewards = 0; double allTimes = 0; Timer timer; // Play 10 episodes int episodes = 200; int number = 0; int count = 0; int lastLives = ale.lives(); bool reset = false; Decision decision = Decision(ale.getMinimalActionSet(), ale.getScreen()); for (int episode=0; episode<episodes; episode++) { float totalReward = 0; double episodeTime = 0; timer.start(); while (!ale.game_over()) { if (ale.lives() < lastLives){ lastLives = ale.lives(); number = 0; count = 0; reset = true; //cout << " DIE " << endl; } else{ reset = false; } // Apply the action and get the resulting reward float reward = ale.act(decision.getDecision(ale.getScreen(), ale.lives(), reset)); //decision.print(); totalReward += reward; } timer.stop(); episodeTime = timer(); timer.reset(); count = 0; number = 0; allRewards += totalReward; allTimes += 
episodeTime; cout << "Episode " << episode << " ended with score: " << totalReward << " with time: "<< episodeTime <<endl; ale.reset_game(); } // Display average reward per game cout << "Average Reward: " << (allRewards / episodes) << " Average Time: " << (allTimes/episodes) << endl; return 0; }
/**
 * Repeats `action` frame after frame and accumulates the rewards reported by
 * `mdp.ComputeState` until one of the stop conditions is met:
 *   - MAX_FRAMES loop iterations have run;
 *   - a life was lost (ale.lives() dropped below its value after the first step);
 *   - RAM bytes 0xd8 / 0xd6 leave the values 0x00 / 0xff, which this function
 *     treats as "not controllable";
 *   - the other axis could not be changed at the start (per
 *     does_value_change) but now can;
 *   - the RAM byte at the moving axis stayed the same for NOT_MOVING_FRAMES
 *     consecutive frames.
 *
 * If the run ended by losing a life or control and more than N_BACK_FRAMES
 * frames were recorded, the emulator is restored to the state saved
 * N_BACK_FRAMES frames before the end and those frames are discarded.
 *
 * @param ale            Emulator interface that is stepped (and possibly restored).
 * @param display        Handed to the DISPLAY macro after each loop step.
 * @param action         Action applied on every frame. LEFT/RIGHT track
 *                       ADDR_X as the moving axis; anything else tracks ADDR_Y.
 * @param discount_rate  Per-frame discount factor for the returned reward.
 * @param mdp            Turns the emulator reward into the two reward values.
 * @param elapsed_time   Incremented by the number of retained frames.
 * @param nophi_reward   Out: undiscounted sum of ComputeState's out-parameter
 *                       over the retained frames.
 * @param phi_reward     Out: undiscounted sum of ComputeState's return value
 *                       over the retained frames.
 * @param all_steps      One (reward, state id before the step) entry is
 *                       appended per executed frame; rewound frames are
 *                       removed from its tail again.
 * @return Discounted sum of ComputeState's return values over the retained frames.
 */
static hexq::Reward move_to_the(ALEInterface &ale, DisplayScreen *display, const Action action, const hexq::Reward discount_rate, hexq::MontezumaOptionsMdp &mdp, size_t &elapsed_time, hexq::Reward &nophi_reward, hexq::Reward &phi_reward, vector<pair<hexq::Reward,hexq::State> > &all_steps) {
    // Select which RAM byte is expected to move with `action` and which
    // action set is used to probe the other axis.
    const vector<Action> *axis_actions;
    unsigned int unchanging_addr, changing_addr;
    if (action == PLAYER_A_LEFT || action == PLAYER_A_RIGHT) {
        axis_actions = &vertical_actions;
        unchanging_addr = ADDR_Y;
        changing_addr = ADDR_X;
    } else {
        axis_actions = &horizontal_actions;
        unchanging_addr = ADDR_X;
        changing_addr = ADDR_Y;
    }
    const bool initial_cannot_change_axis =
        !does_value_change(ale, *axis_actions, unchanging_addr);

    // Per-frame ((phi reward, no-phi reward), emulator snapshot) so the run
    // can be rewound later.
    vector<pair<pair<hexq::Reward, hexq::Reward>, ALEState> > frames;

    // First step is taken unconditionally.
    hexq::State prev_s = mdp.StateUniqueID();
    phi_reward = mdp.ComputeState(ale.act(action), nophi_reward);
    frames.push_back(make_pair(make_pair(phi_reward, nophi_reward), ale.cloneSystemState()));
    all_steps.push_back(make_pair(phi_reward, prev_s));

    byte_t prev_changing = ale.getRAM().get(changing_addr);
    const int initial_lives = ale.lives();
    // Must start at zero: it is incremented as soon as the first loop frame
    // shows no movement.
    int n_frames_unchanged = 0;
    bool controllable = true;
    bool lost_life = false;

    for (size_t iteration = 0; !lost_life && iteration < MAX_FRAMES; iteration++) {
        hexq::Reward nophi_r;
        prev_s = mdp.StateUniqueID();
        const hexq::Reward reward = mdp.ComputeState(ale.act(action), nophi_r);
        frames.push_back(make_pair(make_pair(reward, nophi_r), ale.cloneSystemState()));
        all_steps.push_back(make_pair(reward, prev_s));
        DISPLAY(display);

        // NOTE(review): meaning of RAM 0xd8 / 0xd6 is not visible here;
        // any other values are treated as loss of control.
        controllable = (ale.getRAM().get(0xd8) == 0x00 && ale.getRAM().get(0xd6) == 0xff);
        if (!controllable) {
            break;
        }

        // Stop as soon as the other axis becomes available when it was not
        // available at the start.
        if (initial_cannot_change_axis &&
            does_value_change(ale, *axis_actions, unchanging_addr)) {
            break;
        }

        // Stop when the tracked byte has not moved for long enough.
        const byte_t new_changing = ale.getRAM().get(changing_addr);
        if (new_changing == prev_changing) {
            n_frames_unchanged++;
            if (n_frames_unchanged >= NOT_MOVING_FRAMES) {
                break;
            }
        } else {
            n_frames_unchanged = 0;
            prev_changing = new_changing;
        }

        lost_life = ale.lives() < initial_lives;
    }

    if ((lost_life || !controllable) && frames.size() > N_BACK_FRAMES) {
        // Rewind: drop the last N_BACK_FRAMES frames from both records.
        // all_steps may already hold entries from before this call, so only
        // its tail is trimmed instead of resizing it to the local frame count.
        frames.resize(frames.size() - N_BACK_FRAMES);
        all_steps.resize(all_steps.size() - N_BACK_FRAMES);
        printf("went back\n");
        ale.restoreSystemState(frames.rbegin()->second);
        ale.environment->processRAM();
        ale.environment->processScreen();
        DISPLAY(display);
        // Recompute the mdp state from the restored emulator; the reward
        // values of this call are discarded.
        hexq::Reward r;
        (void)mdp.ComputeState(0, r);
    }

    // Aggregate the rewards of the frames that were kept.
    hexq::Reward discount = 1.;
    hexq::Reward total_reward = 0;
    phi_reward = nophi_reward = 0;
    for (size_t i = 0; i < frames.size(); i++) {
        const pair<hexq::Reward, hexq::Reward> &rewards = frames.at(i).first;
        total_reward += discount * rewards.first;
        discount *= discount_rate;
        nophi_reward += rewards.second;
        phi_reward += rewards.first;
    }
    elapsed_time += frames.size();
    return total_reward;
}