Example #1
void ALEState::reset(int numResetSteps) {
  // reset the rom
  m_settings->reset();
  
  // Reset the paddles
  default_paddles();

  // Reset the emulator
  m_osystem->console().system().reset();

  // NOOP for 60 steps
  for (int i = 0; i < 60; i++) {
    apply_action(PLAYER_A_NOOP, PLAYER_B_NOOP);
    simulate();
    // Don't count these frames
    frame_number--;
  }

  // reset for n steps
  for (int i = 0; i < numResetSteps; i++) {
    apply_action(RESET, PLAYER_B_NOOP);
    simulate();
    // Don't count these frames
    frame_number--;
  }

  // Apply necessary actions specified by the rom itself
  ActionVect startingActions = m_settings->getStartingActions();
  for (size_t i = 0; i < startingActions.size(); i++) {
    apply_action(startingActions[i], PLAYER_B_NOOP);
    simulate();
    frame_number--;
  }
}
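For reference, the internal reset above is what ultimately runs when an agent begins a new episode through ALE's public interface. A minimal sketch of that calling pattern, mirroring the API used in the later examples on this page (the ROM path "breakout.bin" is made up, and the usual ale_interface.hpp header is assumed):

#include <cstdio>
#include <cstdlib>
#include <ale_interface.hpp>

int main() {
  ALEInterface ale;
  ale.setInt("random_seed", 123);
  ale.loadROM("breakout.bin");              // hypothetical ROM path
  ActionVect actions = ale.getLegalActionSet();

  ale.reset_game();                         // runs the reset sequence shown above
  int reward = 0;
  while (!ale.game_over())
    reward += ale.act(actions[rand() % actions.size()]);
  printf("Score: %d\n", reward);
  return 0;
}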
Example #2
/** Resets the system to its start state. */
void StellaEnvironment::reset() {
  // RNG for generating environments
  Random randGen;

  // Reset the paddles
  m_state.resetVariables(m_osystem->event());

  // Reset the emulator
  m_osystem->console().system().reset();

  // NOOP for 60 steps in the deterministic environment setting, or some random amount otherwise 
  int noopSteps;
  if (m_stochastic_start)
    noopSteps = 60 + rand() % NUM_RANDOM_ENVIRONMENTS;
  else
    noopSteps = 60;

  emulate(PLAYER_A_NOOP, PLAYER_B_NOOP, noopSteps);
  // reset for n steps
  emulate(RESET, PLAYER_B_NOOP, m_num_reset_steps);

  // reset the rom (after emulating, in case the NOOPs led to reward)
  m_settings->reset();
  
  // Apply necessary actions specified by the rom itself
  if (m_use_starting_actions) {
    ActionVect startingActions = m_settings->getStartingActions();
    for (size_t i = 0; i < startingActions.size(); i++)
      emulate(startingActions[i], PLAYER_B_NOOP);
  }
}
/** Resets the system to its start state. */
void StellaEnvironment::reset() {
    m_state.resetEpisodeFrameNumber();
    // Reset the paddles
    m_state.resetPaddles(m_osystem->event());

    // Reset the emulator
    m_osystem->console().system().reset();

    // NOOP for 60 steps (this variant always uses the deterministic setting)
    int noopSteps = 60;

    emulate(PLAYER_A_NOOP, PLAYER_B_NOOP, noopSteps);
    // reset for n steps
    emulate(RESET, PLAYER_B_NOOP, m_num_reset_steps);

    // reset the rom (after emulating, in case the NOOPs led to reward)
    m_settings->reset();

    // Apply necessary actions specified by the rom itself
    ActionVect startingActions = m_settings->getStartingActions();
    for (size_t i = 0; i < startingActions.size(); i++) {
        emulate(startingActions[i], PLAYER_B_NOOP);
    }
}
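The first of the two reset() variants above draws a random number of extra no-op frames when m_stochastic_start is set, while the second always uses exactly 60. When the environment only provides the deterministic reset, a similar randomized-start protocol can be approximated from the agent side. A minimal sketch, assuming an already configured ALEInterface named ale and a caller-chosen cap of 30 extra no-ops (both assumptions, not taken from the examples):

// Hedged sketch: agent-side random no-op starts on top of a deterministic reset.
const int MAX_NOOP_STARTS = 30;             // assumption, chosen by the caller
ale.reset_game();
int extraNoops = rand() % (MAX_NOOP_STARTS + 1);
for (int i = 0; i < extraNoops && !ale.game_over(); i++)
  ale.act(PLAYER_A_NOOP);                   // warm-up frames; any reward here is discarded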
Example #4
double RandomAgent::evaluatePolicy(Environment<bool>& env){
	int reward = 0;
	int totalReward = 0;
	int cumulativeReward = 0;
	int numActions;
	ActionVect actions;
	//Check if one wants to sample from all possible actions or only the valid ones:
	if(useMinActions){
		actions = env.getMinimalActionSet();
	}
	else{
		actions = env.getLegalActionSet();
	}
	numActions = actions.size();
	printf("Number of Actions: %d\n\n", numActions);
	//Repeat (for each episode):
	for(int episode = 0; episode < numEpisodesToEval; episode++){
		int step = 0;
		while(!env.game_over() && step < maxStepsInEpisode) {
			reward = env.act(actions[rand()%numActions]);
			cumulativeReward += reward;
			step++;
		}
		printf("Episode %d, Cumulative Reward: %d\n", episode + 1, cumulativeReward);
		totalReward += cumulativeReward;
		cumulativeReward = 0;
		env.reset_game(); //Start the game again when the episode is over
	}
	return double(totalReward)/numEpisodesToEval;
}
Action getAction(ActionVect av, ALEState state, ALEInterface& ale) {
    float bestReward = 0;
    Action bestAction = av[rand() % av.size()];
    //	for(int i = 0; i < av.size(); i++) {
    //		float reward = ale.act(av[i]);
    //		if(reward > bestReward) {
    //			bestAction = av[i];
    //			bestReward = reward;
    //		}
    //		ale.restoreState(state);
    //	}
    return bestAction;
}
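The commented-out block above sketches a one-step greedy lookahead that was left disabled, so the function simply returns a random action. A working version of the same idea, under the assumption that ale.restoreState(state) rolls the emulator back after each probe (as the comment implies), might look like this (getGreedyAction is a hypothetical name):

Action getGreedyAction(const ActionVect& av, const ALEState& state, ALEInterface& ale) {
    float bestReward = 0;
    Action bestAction = av[rand() % av.size()];   // fall back to a random action
    for (size_t i = 0; i < av.size(); i++) {
        float reward = ale.act(av[i]);            // probe one step with action i
        if (reward > bestReward) {
            bestAction = av[i];
            bestReward = reward;
        }
        ale.restoreState(state);                  // undo the probe
    }
    return bestAction;
}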
Example #6
int main(int argc, char** argv){
	ALEInterface ale(1);

	if(argc != 4){
		printf("Usage: %s rom_file path_to_save_bits seed\n", argv[0]);
		exit(1);
	}

	int seed = atoi(argv[3]);
	
	ale.setInt("frame_skip", 5);
	ale.setInt("random_seed", seed);
	ale.setInt("max_num_frames_per_episode", 18000);
	ale.setFloat("repeat_action_prob", 0.00);

	ale.loadROM(argv[1]);
	string outputFile = argv[2];
	srand(seed);

	ActionVect actions = ale.getLegalActionSet();
    RAMFeatures features;
    vector<bool> F;

    ofstream outFile;
    outFile.open(outputFile);

	int reward = 0;
	
	F.clear();
    features.getCompleteFeatureVector(ale.getRAM(), F);
    for(size_t i = 0; i < F.size(); i++){
        outFile << F[i] << ",";
    }
    outFile << endl;

	while(!ale.game_over()) {
		reward += ale.act(actions[rand() % actions.size()]);
        F.clear();
        features.getCompleteFeatureVector(ale.getRAM(), F);

        for(size_t i = 0; i < F.size(); i++){
            outFile << F[i] << ",";
        }
        outFile << endl;
	}
	printf("Episode ended with a score of %d points\n", reward);
	outFile.close();
    return 0;
}
Example #7
std::string PrintQValues(
    const std::vector<float>& q_values, const ActionVect& actions) {
  assert(!q_values.empty());
  assert(!actions.empty());
  assert(q_values.size() == actions.size());
  std::ostringstream actions_buf;
  std::ostringstream q_values_buf;
  for (std::size_t i = 0; i < q_values.size(); ++i) {
    const auto a_str =
        boost::algorithm::replace_all_copy(
            action_to_string(actions[i]), "PLAYER_A_", "");
    const auto q_str = std::to_string(q_values[i]);
    const auto column_size = std::max(a_str.size(), q_str.size()) + 1;
    actions_buf.width(column_size);
    actions_buf << a_str;
    q_values_buf.width(column_size);
    q_values_buf << q_str;
  }
  actions_buf << std::endl;
  q_values_buf << std::endl;
  return actions_buf.str() + q_values_buf.str();
}
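A small usage sketch for the helper above, with made-up Q-values and actions:

// Hedged usage sketch: format three made-up Q-values next to their actions.
std::vector<float> q_values = {0.12f, -0.03f, 0.57f};
ActionVect actions = {PLAYER_A_NOOP, PLAYER_A_FIRE, PLAYER_A_RIGHT};
std::cout << PrintQValues(q_values, actions);
// Output is two aligned rows: action names (PLAYER_A_ prefix stripped)
// on the first line and the corresponding Q-values on the second.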
int main(int argc, char** argv){

	//For the use of options:
	RAMFeatures ramFeatures;
	vector<bool> FRam, FnextRam;
	vector<double> transitions((ramFeatures.getNumberOfFeatures() - 1)*2, 0);

	readParameters(argc, argv);
	readParamFiles();
	srand(seed);

	//Initializing ALE:
	ALEInterface ale(1);
	ale.setFloat("frame_skip", NUM_STEPS_PER_ACTION);
	ale.setFloat("stochasticity", STOCHASTICITY);
	ale.setInt("random_seed", seed);
	ale.setInt("max_num_frames_per_episode", MAX_LENGTH_EPISODE);
	ale.loadROM(romPath.c_str());

	//Initializing useful things to agent:
	BPROFeatures features;
	actions     = ale.getLegalActionSet();
	numActions  = actions.size();
	numFeatures = NUM_COLUMNS * NUM_ROWS * NUM_COLORS 
					+ (2 * NUM_COLUMNS - 1) * (2 * NUM_ROWS - 1) * NUM_COLORS * NUM_COLORS + 1;

	for(int i = 0; i < numActions; i++){
		//Initialize Q;
		Q.push_back(0);
		w.push_back(vector<double>(numFeatures, 0.0));
	}

	loadWeights(wgtPath);
	int reward = 0;
	double intr_reward = 0.0;
	
	FRam.clear();
	ramFeatures.getCompleteFeatureVector(ale.getScreen(), ale.getRAM(), FRam);
	while(!ale.game_over()){
		//Get state and features active on that state:		
		F.clear();
		features.getActiveFeaturesIndices(ale.getScreen(), ale.getRAM(), F);
		
		updateQValues();       //Update Q-values for each possible action
		currentAction = epsilonGreedy();
		//Take action, observe reward and next state:
		reward += ale.act(actions[currentAction]);

		FnextRam.clear();
		ramFeatures.getCompleteFeatureVector(ale.getScreen(), ale.getRAM(), FnextRam);
		updateTransitionVector(FRam, FnextRam, transitions);
		//Calculating intrinsic reward:
		for(size_t i = 0; i < transitions.size(); i++){
			transitions[i] = (transitions[i] - mean[i])/var[i];
		}
		intr_reward = 0.0;
		for(size_t i = 0; i < transitions.size(); i++){
			intr_reward += option[i] * transitions[i];
		}
		printf("%f\n", intr_reward);
		FRam = FnextRam;
	}

	printf("Final score: %d\n", reward);

	return 0;
}