Exemplo n.º 1
0
void CRLModule::UpdateQEntry(CWorldState *pWorldState, bool bInmediateReward)
{
	int idQ_s_a, idQ_s_a2;
	Q_VALUE_TYPE *pQ_s_a= 0;
	double maxQ;
	double reward;

	
	if (bInmediateReward)//IsConstraint() && bConstraints)
	{
		reward= GetReward(pWorldState);
		m_lastReward= reward;

		idQ_s_a= m_pState->GetStateId();

		pQ_s_a= m_pQMatrix->GetQEntry(idQ_s_a);

		idQ_s_a2= m_pNextState->GetStateId();

		if (idQ_s_a2>=0) maxQ= m_pQMatrix->GetMaxQ(idQ_s_a2);
		else maxQ= m_negativeReward;

		pQ_s_a[m_lastAction]+= m_alpha*(reward + m_gamma*maxQ - pQ_s_a[m_lastAction]);

		UpdateState();
	}
	else if (!IsConstraint())// && !bConstraints)
	{
		reward= GetReward(pWorldState);
		m_lastReward= reward;

		idQ_s_a= m_lastStateId;//m_pState->GetStateId();
		if (idQ_s_a>=0)
			pQ_s_a= m_pQMatrix->GetQEntry(idQ_s_a);

		idQ_s_a2= m_pState->GetStateId();//m_pNextState->GetStateId();


		if (idQ_s_a2>=0 && pQ_s_a)
		{
			maxQ= m_pQMatrix->GetMaxQ(idQ_s_a2);
			pQ_s_a[m_lastAction]+= m_alpha*(reward + m_gamma*maxQ - pQ_s_a[m_lastAction]);
		}
		m_lastStateId= idQ_s_a2;
	}
}
Exemplo n.º 2
0
double TDErrorCritic::GetCritique( const ros::Time& time ) const
{
	double reward = GetReward( time );
	double currValue = _valueFunction->GetCritique( time );
	double nextValue = _valueFunction->GetCritique( time + _timestep );

	double tdError = reward + _discountFactor * nextValue - currValue;
	return tdError;
}