Ejemplo n.º 1
0
// Plays one episode from a fresh reset and reports its outcome.
//
// Each step picks an action with getNextAction() and runs it through
// actUpdatingAvg(), accumulating the returned reward into the episode score.
// The episode stops when the emulator reports game over or when the frames
// already consumed plus this episode's frames reach MAX_NUM_FRAMES.
//
//   ale             emulator interface; reset_game() is called on entry.
//   ram, bpro, w    forwarded unchanged to actUpdatingAvg().
//   param           run parameters; param.numOptions goes to getNextAction().
//   totalNumFrames  frames consumed before this episode starts.
//   gameId          episode index; printed as gameId + 1.
//
// Returns totalNumFrames plus the number of frames played in this episode.
int playGame(ALEInterface& ale, RAMFeatures *ram, BPROFeatures *bpro, 
	vector<vector<vector<float> > > &w, Parameters param, int totalNumFrames, int gameId){
	ale.reset_game();

	vector<bool> F(NUM_BITS, 0);   // features of the current state
	vector<bool> Fprev;            // features of the previous state
	int score = 0;

	for(;;){
		if(ale.game_over()){
			break;
		}
		// Frame budget is global across episodes, hence the running total.
		if(!(totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES)){
			break;
		}
		const int chosenAction = getNextAction(ale, param.numOptions);
		score += actUpdatingAvg(ale, ram, bpro, chosenAction, w, param, totalNumFrames, gameId, F, Fprev);
	}

	totalNumFrames += ale.getEpisodeFrameNumber();
	printf("Episode: %d, Final score: %d, Total Num. Frames: %d\n", gameId+1, score, totalNumFrames);
	return totalNumFrames;
}
Ejemplo n.º 2
0
// Executes nextAction in the emulator and, after every emulator step,
// refreshes the RAM feature vectors and calls updateAverage() on the
// (previous, current) pair.
//
// nextAction < NUM_ACTIONS is treated as a primitive action and repeated for
// up to FRAME_SKIP steps. Any other value selects option
// (nextAction - NUM_ACTIONS), which is followed with an epsilon-greedy policy
// over the Q-values computed from w until the random termination test fires.
//
// Both branches stop as soon as the episode is over or the global frame
// budget (totalNumFrames + episode frames < MAX_NUM_FRAMES) is exhausted.
//
//   ale             emulator interface that is stepped.
//   ram             builds the RAM feature vector stored in F.
//   features        supplies active screen-feature indices (option branch only).
//   nextAction      primitive action id, or NUM_ACTIONS + option index.
//   w               weights handed to updateQValues() for the chosen option.
//   param, gameId   forwarded to updateAverage().
//   totalNumFrames  frames consumed before the current episode.
//   F, Fprev        in/out: current and previous RAM feature vectors.
//
// Returns the sum of the rewards returned by ale.act() during this call.
int actUpdatingAvg(ALEInterface& ale, RAMFeatures *ram, BPROFeatures *features, int nextAction, 
	vector<vector<vector<float> > > &w, Parameters param, int totalNumFrames, int gameId,
	vector<bool> &F, vector<bool> &Fprev){

	int reward = 0;

	//If the selected action was one of the primitive actions
	if(nextAction < NUM_ACTIONS){ 
		// Stop the frame-skip repetition once the game is over, as the option
		// branch below does, so no act()/updateAverage() call happens after
		// the episode has ended.
		for(int i = 0; i < FRAME_SKIP && !ale.game_over() && totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES; i++){
			reward += ale.act(static_cast<Action>(nextAction));
			// The old current vector becomes the previous one; F is rebuilt from RAM.
			Fprev.swap(F);
			F.clear();
			ram->getCompleteFeatureVector(ale.getRAM(), F);
			// Drop the last entry of the vector just built
			// (presumably a trailing bias bit -- TODO confirm against RAMFeatures).
			F.pop_back();
			updateAverage(Fprev, F, ale.getEpisodeFrameNumber(), param, gameId);
		}
	}
	//If the selected action was one of the options
	else{
		vector<int> Fbpro;	                  //Set of features active
		vector<float> Q(NUM_ACTIONS, 0.0);    //Q(a) entries

		const int option = nextAction - NUM_ACTIONS;
		// rand() is evaluated first on every iteration: the option keeps running
		// while a draw in [0, 999] is greater than 1000 * PROB_TERMINATION.
		while(rand()%1000 > 1000 * PROB_TERMINATION && !ale.game_over() && totalNumFrames + ale.getEpisodeFrameNumber() < MAX_NUM_FRAMES){
			//Get state and features active on that state:
			Fbpro.clear();
			features->getActiveFeaturesIndices(ale.getScreen(), Fbpro);
			updateQValues(Fbpro, Q, w, option);       //Update Q-values for each possible action
			const int currentAction = epsilonGreedy(Q);
			//Take action, observe reward and next state:
			reward += ale.act(static_cast<Action>(currentAction));
			Fprev.swap(F);
			F.clear();
			ram->getCompleteFeatureVector(ale.getRAM(), F);
			// Same trailing-entry removal as in the primitive branch.
			F.pop_back();
			updateAverage(Fprev, F, ale.getEpisodeFrameNumber(), param, gameId);
		}
	}
	return reward;
}
Ejemplo n.º 3
0
// Learns the policy online, episode after episode, until the accumulated
// number of frames reaches MAX_NUM_FRAMES, then saves the weights under a
// name built from the number of episodes run.
//
// Per step: the current action is executed through act(), the next action is
// chosen epsilon-greedily from the Q-values of the next state, a TD error is
// formed from reward[0], and the weights of every entry listed in
// nonZeroElig are moved along their eligibility trace.
//
//   ale             emulator interface; reset at the end of each episode.
//   learnedOptions  forwarded to act().
void Learner::learnPolicy(ALEInterface& ale, vector<vector<vector<float> > > &learnedOptions){

	// Filled in by act(): reward[0] feeds the TD error and cumIntrReward,
	// reward[1] is accumulated into cumReward.
	vector<float> reward;
	int totalNumberFrames = 0;
	int episode = 0;

	// Original note: this is going to be interrupted by the ALE code, since
	// max_num_frames is set beforehand.
	while(totalNumberFrames < MAX_NUM_FRAMES){
		// Traces must start from zero in every episode. Only the entries
		// recorded in nonZeroElig are touched.
		for(unsigned int actionIdx = 0; actionIdx < nonZeroElig.size(); actionIdx++){
			for(unsigned int slot = 0; slot < nonZeroElig[actionIdx].size(); slot++){
				const int featIdx = nonZeroElig[actionIdx][slot];
				e[actionIdx][featIdx] = 0.0;
			}
			nonZeroElig[actionIdx].clear();
		}

		// First state of the episode and the first action taken in it.
		F.clear();
		bproFeatures.getActiveFeaturesIndices(ale.getScreen(), F);
		updateQValues(F, Q);
		currentAction = epsilonGreedy(Q);

		gettimeofday(&tvBegin, NULL);

		// Step loop. Original note: this also stops when the maximum number
		// of steps per episode is reached.
		while(!ale.game_over()){
			reward.assign(2, 0.0f);
			updateQValues(F, Q);
			sanityCheck();

			// Take action, observe reward and next state.
			act(ale, currentAction, reward, learnedOptions);
			cumIntrReward += reward[0];
			cumReward += reward[1];

			if(ale.game_over()){
				// Terminal state: every next-state value is zero.
				nextAction = 0;
				for(unsigned int k = 0; k < Qnext.size(); k++){
					Qnext[k] = 0;
				}
			}
			else{
				// Active features and Q-values of the new state.
				Fnext.clear();
				bproFeatures.getActiveFeaturesIndices(ale.getScreen(), Fnext);
				updateQValues(Fnext, Qnext);
				nextAction = epsilonGreedy(Qnext);
			}

			// maxFeatVectorNorm only ever grows, so the step size below never
			// increases over time (the original comment attributes this
			// approach to Marc's JAIR paper).
			if(F.size() > maxFeatVectorNorm){
				maxFeatVectorNorm = F.size();
			}

			delta = reward[0] + GAMMA * Qnext[nextAction] - Q[currentAction];
			updateReplTrace(currentAction, F);

			// Weight update along the non-zero traces.
			float stepSize = ALPHA/maxFeatVectorNorm;
			for(unsigned int actionIdx = 0; actionIdx < nonZeroElig.size(); actionIdx++){
				for(unsigned int slot = 0; slot < nonZeroElig[actionIdx].size(); slot++){
					const int featIdx = nonZeroElig[actionIdx][slot];
					w[actionIdx][featIdx] += stepSize * delta * e[actionIdx][featIdx];
				}
			}

			// The next state/action become the current ones.
			F = Fnext;
			FRam = FnextRam;
			currentAction = nextAction;
		}

		gettimeofday(&tvEnd, NULL);
		timeval_subtract(&tvDiff, &tvEnd, &tvBegin);
		elapsedTime = float(tvDiff.tv_sec) + float(tvDiff.tv_usec)/1000000.0;

		// Per-episode report. Per-episode figures are differences of the
		// cumulative counters against their value at the previous episode.
		float framesPerSec = float(ale.getEpisodeFrameNumber())/elapsedTime;
		printf("episode: %d,\t%.0f points,\tavg. return: %.1f,\tnovelty reward: %.2f (%.2f),\t%d frames,\t%.0f fps\n",
			episode + 1, cumReward - prevCumReward, (float)cumReward/(episode + 1.0),
			cumIntrReward - prevCumIntrReward, cumIntrReward/(episode + 1.0), ale.getEpisodeFrameNumber(), framesPerSec);

		totalNumberFrames += ale.getEpisodeFrameNumber();
		prevCumReward = cumReward;
		prevCumIntrReward = cumIntrReward;
		ale.reset_game();

		episode++;
	}

	// The saved file is tagged with the number of episodes that were run.
	stringstream episodeTag;
	episodeTag << episode;
	saveWeightsToFile(episodeTag.str());
}