double FlatMCTS::SimulateQ(STATE &state, QNODE &qnode, int action, int depth) { int observation; double immediateReward; double delayedReward = 0.0; bool terminal = Simulator.Step(state, action, observation, immediateReward); if (Params.ThompsonSampling) { qnode.Update(observation, immediateReward, 1); } History.Add(action, observation); if (Params.Verbose >= 3) { Simulator.DisplayAction(action, cout); Simulator.DisplayObservation(state, observation, cout); Simulator.DisplayReward(immediateReward, cout); Simulator.DisplayState(state, cout); } VNODE *&vnode = qnode.Child(observation); if (!terminal) { if (vnode) { delayedReward = SimulateV(state, vnode, depth + 1); } else { vnode = ExpandNode(&state, History); STATE *copy = Simulator.Copy(state); delayedReward = Rollout(*copy, depth + 1); Simulator.FreeState(copy); if (Params.ThompsonSampling) { vnode->GetCumulativeReward(state).Add(delayedReward); } else { vnode->UCB.Value.Add(delayedReward); } } } else { if (!vnode) { vnode = ExpandNode(&state, History); } if (Params.ThompsonSampling) { vnode->GetCumulativeReward(state).Add(0.0); } else { vnode->UCB.Value.Add(0.0); } } double totalReward = immediateReward + Simulator.GetDiscount() * delayedReward; if (!Params.ThompsonSampling) { qnode.UCB.Value.Add(totalReward); } return totalReward; }
// Compute the Q-value of `qnode` from its Thompson-sampled models:
//   Q = gamma * sum_o P(o) * H(child(o))  +  E[immediate reward].
double FlatMCTS::QValue(QNODE &qnode, bool sampling, int depth) const {
    double qvalue = 0;

    // Expected continuation value over sampled observation probabilities.
    const std::vector<std::pair<int, double>> &observations =
        qnode.TS.Observation.ThompsonSampling(sampling);
    for (const std::pair<int, double> &obsProb : observations)
        qvalue += obsProb.second * HValue(qnode.Child(obsProb.first), sampling, depth);
    qvalue *= Simulator.GetDiscount();

    // Add the expected immediate reward under the sampled reward model.
    const std::vector<std::pair<double, double>> &rewards =
        qnode.TS.ImmediateReward.ThompsonSampling(sampling);
    for (const std::pair<double, double> &rewProb : rewards)
        qvalue += rewProb.second * rewProb.first;

    return qvalue;
}
// Simulate taking `action` from `state`: sample the next state from the
// posterior transition counts, update the counts, and recurse/rollout.
// Returns the sampled discounted return and records it in the qnode.
double BAUCT::SimulateQ(uint state, QNODE& qnode, uint action) {
    uint observation;
    double immediateReward;
    double delayedReward = 0;

    // Row offset of (state, action) in the flattened S*A*S count table.
    const uint idx = state * SA + action * S;
    Sampler* sampler = SampFact.getTransitionSampler(pcounts + idx, state, action, S);
    observation = sampler->getNextStateSample();
    immediateReward = getReward(state, action, observation);
    delete sampler;

    // Record the sampled transition in the posterior counts.
    pcounts[idx + observation] += 1;

    bool terminal = false; // FIXME Assumes non-episodic tasks...
    History.Add(action, observation);

    VNODE*& vnode = qnode.Child(observation);
    if (!vnode && !terminal && qnode.Value.GetCount() >= Params.ExpandCount)
        vnode = ExpandNode(); //&state);

    if (!terminal) {
        TreeDepth++;
        if (vnode)
            delayedReward = SimulateV(observation, vnode);
        else
            delayedReward = Rollout(observation);
        TreeDepth--;
    }

    const double totalReward = immediateReward + Simulator.GetDiscount() * delayedReward;
    qnode.Value.Add(totalReward);
    return totalReward;
}
// One simulation step through `qnode`: apply `action`, descend into (or
// expand/rollout from) the child matching the observation, and back up the
// sampled discounted return into the qnode's value statistics.
double MCTS::SimulateQ(STATE &state, QNODE &qnode, int action) {
    int obs;
    double reward;
    double futureReward = 0;

    // Optional alpha-vector bookkeeping before stepping the simulator.
    if (Simulator.HasAlpha())
        Simulator.UpdateAlpha(qnode, state);

    const bool terminal = Simulator.Step(state, action, obs, reward);
    assert(obs >= 0 && obs < Simulator.GetNumObservations());
    History.Add(action, obs);

    if (Params.Verbose >= 3) {
        Simulator.DisplayAction(action, cout);
        Simulator.DisplayObservation(state, obs, cout);
        Simulator.DisplayReward(reward, cout);
        Simulator.DisplayState(state, cout);
    }

    // Expand the child only once the qnode has been visited often enough.
    VNODE *&child = qnode.Child(obs);
    if (!child && !terminal && qnode.Value.GetCount() >= Params.ExpandCount)
        child = ExpandNode(&state);

    if (!terminal) {
        TreeDepth++;
        futureReward = child ? SimulateV(state, child) : Rollout(state);
        TreeDepth--;
    }

    const double totalReward = reward + Simulator.GetDiscount() * futureReward;
    qnode.Value.Add(totalReward);
    return totalReward;
}
double BAMCP::SimulateQ(const SIMULATOR* mdp, uint state, QNODE& qnode, uint action) { uint observation; double immediateReward, delayedReward = 0; bool terminal = mdp->Step(state, action, observation, immediateReward); //assert(observation >= 0 && observation < mdp.GetNumObservations()); History.Add(action, observation); if (Params.Verbose >= 3) { mdp->DisplayAction(action, cout); mdp->DisplayObservation(state, observation, cout); mdp->DisplayReward(immediateReward, cout); mdp->DisplayState(state, cout); } VNODE*& vnode = qnode.Child(observation); if (!vnode && !terminal && qnode.Value.GetCount() >= Params.ExpandCount) vnode = ExpandNode(); //&state); if (!terminal) { TreeDepth++; if (vnode) delayedReward = SimulateV(mdp, observation, vnode); else { if(Params.RB < 0){ delayedReward = Rollout(mdp, observation); }else{ //Warning this will fail with lazy sampling factories assert(mdp->T != 0); MDPutils::valueIterationRmax ( S, A, Simulator.rsas, mdp->T, Simulator.R, Simulator.GetDiscount(), 0.0001, RLPI, V,counts,Params.RB); delayedReward = V[observation]; } } TreeDepth--; } double totalReward = immediateReward + mdp->GetDiscount() * delayedReward; qnode.Value.Add(totalReward); return totalReward; }