Example #1
0
// Simulates taking `action` from `state` through the action node `qnode`.
//
// Steps the simulator once, appends (action, observation) to History, then
// either descends into the existing child value node, or expands a new one
// and estimates its value with a rollout on a copy of the state. A terminal
// step still makes sure a child node exists and records 0.0 on it.
//
// state  - simulator state; advanced in place by Simulator.Step.
// qnode  - action node whose statistics and children are updated.
// action - the action being simulated.
// depth  - current depth; depth + 1 is passed on to SimulateV / Rollout.
//
// Returns the immediate reward plus the discounted delayed reward.
double FlatMCTS::SimulateQ(STATE &state, QNODE &qnode, int action, int depth) {
  int obs;
  double reward;
  const bool isTerminal = Simulator.Step(state, action, obs, reward);

  // In Thompson-sampling mode the action node is updated right after the
  // step with the observation and immediate reward.
  if (Params.ThompsonSampling)
    qnode.Update(obs, reward, 1);

  History.Add(action, obs);

  if (Params.Verbose >= 3) {
    Simulator.DisplayAction(action, cout);
    Simulator.DisplayObservation(state, obs, cout);
    Simulator.DisplayReward(reward, cout);
    Simulator.DisplayState(state, cout);
  }

  // Reference to the child slot, so a freshly expanded node is stored in
  // the tree rather than in a local pointer.
  VNODE *&child = qnode.Child(obs);

  double future = 0.0;      // stays 0.0 on the terminal path
  bool backUpLeaf = false;  // true when `future` must be recorded on `child`

  if (isTerminal) {
    if (!child)
      child = ExpandNode(&state, History);
    backUpLeaf = true;
  } else if (child) {
    future = SimulateV(state, child, depth + 1);
  } else {
    child = ExpandNode(&state, History);

    // Roll out on a scratch copy so `state` is still the post-step state
    // when it is used below.
    STATE *scratch = Simulator.Copy(state);
    future = Rollout(*scratch, depth + 1);
    Simulator.FreeState(scratch);

    backUpLeaf = true;
  }

  if (backUpLeaf) {
    if (Params.ThompsonSampling)
      child->GetCumulativeReward(state).Add(future);
    else
      child->UCB.Value.Add(future);
  }

  const double total = reward + Simulator.GetDiscount() * future;

  // Only the UCB variant accumulates the return on the action node here.
  if (!Params.ThompsonSampling)
    qnode.UCB.Value.Add(total);

  return total;
}
Example #2
0
// Computes a value for the action node `qnode` from its Thompson-sampling
// statistics:
//
//   discount * sum_o weight(o) * HValue(child(o))  +  sum_r weight(r) * r
//
// The (observation, weight) and (reward, weight) pairs come from
// qnode.TS.Observation and qnode.TS.ImmediateReward respectively; `sampling`
// is forwarded to both ThompsonSampling() calls and to HValue, and `depth`
// is forwarded to HValue.
double FlatMCTS::QValue(QNODE &qnode, bool sampling, int depth) const
{
  double value = 0;

  // Weighted sum of child values over the observation pairs.
  const std::vector<std::pair<int, double>> &obsWeights =
      qnode.TS.Observation.ThompsonSampling(sampling);
  for (const std::pair<int, double> &entry : obsWeights) {
    value += entry.second * HValue(qnode.Child(entry.first), sampling, depth);
  }

  // Discount the future part before the immediate reward is added on top.
  value *= Simulator.GetDiscount();

  // Weighted sum of immediate rewards; pairs are (reward, weight).
  const std::vector<std::pair<double, double>> &rewardWeights =
      qnode.TS.ImmediateReward.ThompsonSampling(sampling);
  for (const std::pair<double, double> &entry : rewardWeights) {
    value += entry.second * entry.first;
  }

  return value;
}
Example #3
0
File: bauct.cpp Project: caomw/BBRL
// Simulates taking `action` in `state` through the action node `qnode`.
//
// The next state is drawn from a transition sampler built over the counts
// starting at pcounts + state*SA + action*S; the count for the sampled
// transition is then incremented. The sampled next state is used as the
// observation: it is added to History and passed on to SimulateV / Rollout.
//
// Returns the immediate reward plus the discounted delayed reward, which is
// also added to qnode.Value.
double BAUCT::SimulateQ(uint state, QNODE& qnode, uint action)
{
    // Offset handed to the sampler factory for this (state, action) pair.
    uint countsOffset = state*SA+action*S;

    Sampler* transitionSampler =
        SampFact.getTransitionSampler(pcounts+countsOffset, state, action, S);
    uint observation = transitionSampler->getNextStateSample();
    double immediateReward = getReward(state, action, observation);
    delete transitionSampler;

    // Record the sampled transition in the counts.
    pcounts[state*SA+action*S+observation] += 1;

    bool terminal = false; //FIXME Assumes non-episodic tasks...

    History.Add(action, observation);

    VNODE*& child = qnode.Child(observation);
    double delayedReward = 0;

    if (!terminal)
    {
        // Expand the child only once the action node has at least
        // Params.ExpandCount recorded values.
        if (!child && qnode.Value.GetCount() >= Params.ExpandCount)
            child = ExpandNode(); //&state);

        TreeDepth++;
        delayedReward = child ? SimulateV(observation, child)
                              : Rollout(observation);
        TreeDepth--;
    }

    double totalReward = immediateReward + Simulator.GetDiscount() * delayedReward;
    qnode.Value.Add(totalReward);
    return totalReward;
}
Example #4
0
// Simulates taking `action` from `state` through the action node `qnode`.
//
// Steps the simulator, appends (action, observation) to History, and then,
// unless the step was terminal, either continues the tree search in the
// child value node or falls back to a rollout when no child exists.
//
// state  - simulator state; advanced in place by Simulator.Step.
// qnode  - action node; its Value receives the total reward.
// action - the action being simulated.
//
// Returns the immediate reward plus the discounted delayed reward.
double MCTS::SimulateQ(STATE &state, QNODE &qnode, int action) {
    if (Simulator.HasAlpha()) {
        Simulator.UpdateAlpha(qnode, state);
    }

    int observation;
    double immediateReward;
    const bool terminal = Simulator.Step(state, action, observation, immediateReward);
    assert(observation >= 0 && observation < Simulator.GetNumObservations());
    History.Add(action, observation);

    if (Params.Verbose >= 3) {
        Simulator.DisplayAction(action, cout);
        Simulator.DisplayObservation(state, observation, cout);
        Simulator.DisplayReward(immediateReward, cout);
        Simulator.DisplayState(state, cout);
    }

    // Reference to the child slot, so an expanded node is stored in the tree.
    VNODE *&child = qnode.Child(observation);
    double delayedReward = 0;

    if (!terminal) {
        // Expand the child only once the action node has at least
        // Params.ExpandCount recorded values.
        if (!child && qnode.Value.GetCount() >= Params.ExpandCount) {
            child = ExpandNode(&state);
        }

        TreeDepth++;
        delayedReward = child ? SimulateV(state, child) : Rollout(state);
        TreeDepth--;
    }

    const double totalReward = immediateReward + Simulator.GetDiscount() * delayedReward;
    qnode.Value.Add(totalReward);
    return totalReward;
}
Example #5
0
File: bamcp.cpp Project: tttor/babt
double BAMCP::SimulateQ(const SIMULATOR* mdp, uint state, QNODE& qnode, uint action)
{
    uint observation;
    double immediateReward, delayedReward = 0;

    bool terminal = mdp->Step(state, action, observation, immediateReward);
    //assert(observation >= 0 && observation < mdp.GetNumObservations());
    History.Add(action, observation);
        
    if (Params.Verbose >= 3)
    {
        mdp->DisplayAction(action, cout);
        mdp->DisplayObservation(state, observation, cout);
        mdp->DisplayReward(immediateReward, cout);
        mdp->DisplayState(state, cout);
    }
    
    VNODE*& vnode = qnode.Child(observation);
    if (!vnode && !terminal && qnode.Value.GetCount() >= Params.ExpandCount)
        vnode = ExpandNode(); //&state);

    if (!terminal)
    {
        TreeDepth++;
        if (vnode)
            delayedReward = SimulateV(mdp, observation, vnode);
        else {
            if(Params.RB < 0){
                delayedReward = Rollout(mdp, observation);
            }else{
                //Warning this will fail with lazy sampling factories
                assert(mdp->T != 0);
                MDPutils::valueIterationRmax (  S,
                                                A,
                                                Simulator.rsas,
                                                mdp->T,
                                                Simulator.R,
                                                Simulator.GetDiscount(),
                                                0.0001,
                                                RLPI,
                                                V,counts,Params.RB);
                delayedReward = V[observation];
            }   
        }
        TreeDepth--;
    }

    double totalReward = immediateReward + mdp->GetDiscount() * delayedReward;
    qnode.Value.Add(totalReward);
    return totalReward;
}