void CRLModule::UpdateQEntry(CWorldState *pWorldState, bool bInmediateReward) { int idQ_s_a, idQ_s_a2; Q_VALUE_TYPE *pQ_s_a= 0; double maxQ; double reward; if (bInmediateReward)//IsConstraint() && bConstraints) { reward= GetReward(pWorldState); m_lastReward= reward; idQ_s_a= m_pState->GetStateId(); pQ_s_a= m_pQMatrix->GetQEntry(idQ_s_a); idQ_s_a2= m_pNextState->GetStateId(); if (idQ_s_a2>=0) maxQ= m_pQMatrix->GetMaxQ(idQ_s_a2); else maxQ= m_negativeReward; pQ_s_a[m_lastAction]+= m_alpha*(reward + m_gamma*maxQ - pQ_s_a[m_lastAction]); UpdateState(); } else if (!IsConstraint())// && !bConstraints) { reward= GetReward(pWorldState); m_lastReward= reward; idQ_s_a= m_lastStateId;//m_pState->GetStateId(); if (idQ_s_a>=0) pQ_s_a= m_pQMatrix->GetQEntry(idQ_s_a); idQ_s_a2= m_pState->GetStateId();//m_pNextState->GetStateId(); if (idQ_s_a2>=0 && pQ_s_a) { maxQ= m_pQMatrix->GetMaxQ(idQ_s_a2); pQ_s_a[m_lastAction]+= m_alpha*(reward + m_gamma*maxQ - pQ_s_a[m_lastAction]); } m_lastStateId= idQ_s_a2; } }
double TDErrorCritic::GetCritique( const ros::Time& time ) const { double reward = GetReward( time ); double currValue = _valueFunction->GetCritique( time ); double nextValue = _valueFunction->GetCritique( time + _timestep ); double tdError = reward + _discountFactor * nextValue - currValue; return tdError; }