// virtual void GQLearner::refinePolicyAndChooseNextAction(const double* pSenses, double* pOutActions) { double reward; if(m_teleported) reward = UNKNOWN_REAL_VALUE; else reward = rewardFromLastAction(); if(reward != UNKNOWN_REAL_VALUE) { // Find the best next action double maxQ = 0; double q; m_pActionIterator->reset(pSenses); int i; for(i = 0; i < m_actionCap; i++) { if(!m_pActionIterator->nextAction(pOutActions)) break; q = getQValue(pSenses, pOutActions); if(q > maxQ) maxQ = q; } // Update the Q-values q = reward + m_discountFactor * maxQ; setQValue(m_pSenses, m_pAction, (1.0 - m_learningRate) * getQValue(m_pSenses, m_pAction) + m_learningRate * q); } // Decide what to do next GVec::copy(m_pSenses, pSenses, m_senseDims); chooseAction(pSenses, pOutActions); GVec::copy(m_pAction, pOutActions, m_actionDims); m_teleported = false; }
// virtual void GIncrementalLearnerQAgent::chooseAction(const double* pSenses, double* pActions) { m_pActionIterator->reset(pSenses); if(m_explore && m_pRand->uniform() >= m_softMaxThresh) { // Explore m_pActionIterator->randomAction(pActions, m_pRand); } else { // Exploit double bestQ = -1e200; double q; int i; GTEMPBUF(double, pCand, m_actionDims); for(i = 1; i < m_actionCap; i++) { if(!m_pActionIterator->nextAction(pCand)) break; q = getQValue(pSenses, pCand); if(q > bestQ) { bestQ = q; GVec::copy(pActions, pCand, m_actionDims); } } } }
/// Performs a single Q-learning iteration: optionally jumps to a random state
/// (with probability nu), selects an action (random with probability rho,
/// otherwise the current best), applies it, and folds the observed reward plus
/// the discounted best Q-value of the resulting state into the stored Q-value.
/// Returns the state the agent should continue from (a random state when the
/// current state has no actions available).
LearningProblemState* QLearner::doLearningIteration(LearningProblemState * state)
{
    // Pick a new state once in a while (random restarts keep the walk from
    // getting stuck in one region of the state space)
    if (randomReal() < nu) {
        state = problem->getRandomState();
    }

    // Get the list of actions
    LearningProblemAction* actions = problem->getActions(state);
    LearningProblemAction* action = NULL;

    // Check if we should use a random action, or the best one
    if (randomReal() < rho) {
        // BUG FIX: guard against a state with no actions — previously this
        // branch dereferenced a NULL actions list via actions->getCount().
        if (actions != NULL) {
            unsigned randPos = randomInt(actions->getCount());
            action = actions->getAtPositionInList(randPos);
        }
    } else {
        action = getBestAction(state);
    }

    // Make sure we've got something to do
    if (action != NULL) {
        // Carry out the action
        LearningProblemActionResult result = problem->getResult(state, action);

        // Get the current q value
        real q = getQValue(state, action);

        // Get the q of the best action from the new state
        real maxQ = getBestQValue(result.state);

        // Standard Q-learning update: blend the reward plus the discounted
        // best follow-up value into the old estimate at learning rate alpha
        q = ((real)1.0-alpha) * q + alpha * (result.reward + gamma * maxQ);

        // Store the new Q value
        storeQValue(state, action, q);

        return result.state;
    }
    // Otherwise we need to get a new state - we've reached the
    // end of the road.
    else {
        return problem->getRandomState();
    }
}
/// Walks the linked list of actions available from the given state and returns
/// the one with the highest Q-value, or NULL when the state has no actions.
LearningProblemAction* QLearner::getBestAction(LearningProblemState *state)
{
    // Get the actions
    LearningProblemAction* action = problem->getActions(state);

    // No actions available: preserve the old contract of returning NULL
    if (action == NULL) {
        return NULL;
    }

    // BUG FIX: 'best' was previously seeded with zero, so when every action
    // had a negative Q-value the first action was always returned even if a
    // later action had a higher (less negative) Q-value. Seed it from the
    // first action's actual Q-value instead.
    LearningProblemAction* bestAction = action;
    real best = getQValue(state, action);

    // Check the remaining actions in turn
    action = action->next;
    while (action != NULL) {
        real q = getQValue(state, action);
        if (q > best) {
            best = q;
            bestAction = action;
        }
        action = action->next;
    }
    return bestAction;
}