void SwingPendulumTest::testOffPACSwingPendulum2()
{
  Random<double>* random = new Random<double>;
  RLProblem<double>* problem = new SwingPendulum<double>;
  Hashing<double>* hashing = new MurmurHashing<double>(random, 1000000);
  Projector<double>* projector = new TileCoderHashing<double>(hashing, problem->dimension(), 10, 10, true);
  StateToStateAction<double>* toStateAction = new StateActionTilings<double>(projector, problem->getDiscreteActions());

  double alpha_v = 0.1 / projector->vectorNorm();
  double alpha_w = .005 / projector->vectorNorm();
  double gamma = 0.99;

  // Critic: GTD(lambda) with a length-bounded eligibility trace
  Trace<double>* critice = new AMaxTrace<double>(projector->dimension());
  Trace<double>* criticeML = new MaxLengthTrace<double>(critice, 1000);
  GTDLambda<double>* critic = new GTDLambda<double>(alpha_v, alpha_w, gamma, 0.4, criticeML);

  // Actor: off-policy actor with eligibility traces over a Boltzmann target policy
  double alpha_u = 0.5 / projector->vectorNorm();
  PolicyDistribution<double>* target = new BoltzmannDistribution<double>(random, problem->getDiscreteActions(), projector->dimension());
  Trace<double>* actore = new AMaxTrace<double>(projector->dimension());
  Trace<double>* actoreML = new MaxLengthTrace<double>(actore, 1000);
  Traces<double>* actoreTraces = new Traces<double>();
  actoreTraces->push_back(actoreML);
  ActorOffPolicy<double>* actor = new ActorLambdaOffPolicy<double>(alpha_u, gamma, 0.4, target, actoreTraces);

  // Behavior policy
  /*Policy<double>* behavior = new RandomPolicy<double>(&problem->getActions());*/
  Policy<double>* behavior = new BoltzmannDistribution<double>(random, problem->getDiscreteActions(), projector->dimension());

  OffPolicyControlLearner<double>* control = new OffPAC<double>(behavior, critic, actor, toStateAction, projector);
  RLAgent<double>* agent = new LearnerAgent<double>(control);
  RLRunner<double>* sim = new RLRunner<double>(agent, problem, 5000, 200, 1);
  sim->setTestEpisodesAfterEachRun(true);
  sim->run();

  delete random;
  delete problem;
  delete hashing;
  delete projector;
  delete toStateAction;
  delete critice;
  delete criticeML;
  delete critic;
  delete actore;
  delete actoreML;
  delete actoreTraces;
  delete actor;
  delete behavior;
  delete target;
  delete control;
  delete agent;
  delete sim;
}
void SwingPendulumTest::testOffPACOnPolicySwingPendulum()
{
  Random<double>* random = new Random<double>;
  RLProblem<double>* problem = new SwingPendulum<double>;
  Hashing<double>* hashing = new MurmurHashing<double>(random, 1000);
  Projector<double>* projector = new TileCoderHashing<double>(hashing, problem->dimension(), 10, 10, true);
  StateToStateAction<double>* toStateAction = new StateActionTilings<double>(projector, problem->getDiscreteActions());

  double alpha_v = 0.1 / projector->vectorNorm();
  double alpha_w = .0001 / projector->vectorNorm();
  double gamma = 0.99;
  double lambda = 0.4;

  Trace<double>* critice = new ATrace<double>(projector->dimension());
  GTDLambda<double>* critic = new GTDLambda<double>(alpha_v, alpha_w, gamma, lambda, critice);

  double alpha_u = 0.5 / projector->vectorNorm();
  PolicyDistribution<double>* acting = new BoltzmannDistribution<double>(random, problem->getDiscreteActions(), projector->dimension());
  Trace<double>* actore = new ATrace<double>(projector->dimension());
  Traces<double>* actoreTraces = new Traces<double>();
  actoreTraces->push_back(actore);
  ActorOffPolicy<double>* actor = new ActorLambdaOffPolicy<double>(alpha_u, gamma, lambda, acting, actoreTraces);

  OffPolicyControlLearner<double>* control = new OffPAC<double>(acting, critic, actor, toStateAction, projector);
  RLAgent<double>* agent = new LearnerAgent<double>(control);
  RLRunner<double>* sim = new RLRunner<double>(agent, problem, 5000, 10, 5);
  sim->setTestEpisodesAfterEachRun(true);
  sim->run();
  sim->computeValueFunction();

  delete random;
  delete problem;
  delete hashing;
  delete projector;
  delete toStateAction;
  delete critice;
  delete critic;
  delete actore;
  delete actoreTraces;
  delete actor;
  delete acting;
  delete control;
  delete agent;
  delete sim;
}
void NAOTest::testTrain()
{
  // OffLine: train an OffPAC agent on MountainCar and persist the learned policy
  {
    Random<float>* random = new Random<float>;
    RLProblem<float>* problem = new MountainCar<float>(random);
    Hashing<float>* hashing = new MurmurHashing<float>(random, 1000000);
    Projector<float>* projector = new TileCoderHashing<float>(hashing, problem->dimension(), 10, 10);
    StateToStateAction<float>* toStateAction = new StateActionTilings<float>(projector, problem->getDiscreteActions());

    double alpha_v = 0.05 / projector->vectorNorm();
    double alpha_w = 0.0001 / projector->vectorNorm();
    double lambda = 0.0; //0.4;
    double gamma = 0.99;

    Trace<float>* critice = new ATrace<float>(projector->dimension());
    OffPolicyTD<float>* critic = new GTDLambda<float>(alpha_v, alpha_w, gamma, lambda, critice);

    double alpha_u = 1.0 / projector->vectorNorm();
    PolicyDistribution<float>* target = new BoltzmannDistribution<float>(random, problem->getDiscreteActions(), projector->dimension());
    Trace<float>* actore = new ATrace<float>(projector->dimension());
    Traces<float>* actoreTraces = new Traces<float>();
    actoreTraces->push_back(actore);
    ActorOffPolicy<float>* actor = new ActorLambdaOffPolicy<float>(alpha_u, gamma, lambda, target, actoreTraces);

    Policy<float>* behavior = new RandomPolicy<float>(random, problem->getDiscreteActions());
    OffPolicyControlLearner<float>* control = new OffPAC<float>(behavior, critic, actor, toStateAction, projector);
    RLAgent<float>* agent = new LearnerAgent<float>(control);
    Simulator<float>* sim = new Simulator<float>(agent, problem, 5000, 100, 1);
    //sim->setVerbose(false);
    sim->run();
    control->persist("NAOTest_x32_M.bin");

    delete random;
    delete problem;
    delete hashing;
    delete projector;
    delete toStateAction;
    delete critice;
    delete critic;
    delete actore;
    delete actoreTraces;
    delete actor;
    delete behavior;
    delete target;
    delete control;
    delete agent;
    delete sim;
  }

  // OnLine: train an average-reward actor-critic on SwingPendulum and persist the policy
  {
    Random<double>* random = new Random<double>;
    RLProblem<double>* problem = new SwingPendulum<double>(random);
    Hashing<double>* hashing = new MurmurHashing<double>(random, 1000);
    Projector<double>* projector = new TileCoderHashing<double>(hashing, problem->dimension(), 10, 10, false);
    StateToStateAction<double>* toStateAction = new StateActionTilings<double>(projector, problem->getContinuousActions());

    double alpha_v = 0.1 / projector->vectorNorm();
    double alpha_u = 0.001 / projector->vectorNorm();
    double alpha_r = .0001;
    double gamma = 1.0;
    double lambda = 0.5;

    Trace<double>* critice = new ATrace<double>(projector->dimension());
    TDLambda<double>* critic = new TDLambda<double>(alpha_v, gamma, lambda, critice);

    // Continuous actions: a scaled Gaussian policy over the problem's action range
    PolicyDistribution<double>* policyDistribution = new NormalDistributionScaled<double>(random, problem->getContinuousActions(), 0, 1.0, projector->dimension());
    Range<double> policyRange(-2.0, 2.0);
    Range<double> problemRange(-2.0, 2.0);
    PolicyDistribution<double>* acting = new ScaledPolicyDistribution<double>(problem->getContinuousActions(), policyDistribution, &policyRange, &problemRange);

    Trace<double>* actore1 = new ATrace<double>(projector->dimension());
    Trace<double>* actore2 = new ATrace<double>(projector->dimension());
    Traces<double>* actoreTraces = new Traces<double>();
    actoreTraces->push_back(actore1);
    actoreTraces->push_back(actore2);
    ActorOnPolicy<double>* actor = new ActorLambda<double>(alpha_u, gamma, lambda, acting, actoreTraces);

    OnPolicyControlLearner<double>* control = new AverageRewardActorCritic<double>(critic, actor, projector, toStateAction, alpha_r);
    RLAgent<double>* agent = new LearnerAgent<double>(control);
    Simulator<double>* sim = new Simulator<double>(agent, problem, 5000, 100, 1);
    sim->run();
    control->persist("NAOTest_x32_S.bin");

    delete random;
    delete problem;
    delete hashing;
    delete projector;
    delete toStateAction;
    delete critice;
    delete critic;
    delete actore1;
    delete actore2;
    delete actoreTraces;
    delete actor;
    delete policyDistribution;
    delete acting;
    delete control;
    delete agent;
    delete sim;
  }
}
void NAOTest::testEvaluate()
{
  // OffLine: reload the persisted MountainCar policy and evaluate it
  {
    Random<float>* random = new Random<float>;
    RLProblem<float>* problem = new MountainCar<float>(random);
    Hashing<float>* hashing = new MurmurHashing<float>(random, 1000000);
    Projector<float>* projector = new TileCoderHashing<float>(hashing, problem->dimension(), 10, 10, true);
    StateToStateAction<float>* toStateAction = new StateActionTilings<float>(projector, problem->getDiscreteActions());

    // All step sizes are zero: the agent is only evaluated, not trained
    Trace<float>* critice = new ATrace<float>(projector->dimension());
    OffPolicyTD<float>* critic = new GTDLambda<float>(0, 0, 0, 0, critice);
    PolicyDistribution<float>* target = new BoltzmannDistribution<float>(random, problem->getDiscreteActions(), projector->dimension());
    Trace<float>* actore = new ATrace<float>(projector->dimension());
    Traces<float>* actoreTraces = new Traces<float>();
    actoreTraces->push_back(actore);
    ActorOffPolicy<float>* actor = new ActorLambdaOffPolicy<float>(0, 0, 0, target, actoreTraces);
    Policy<float>* behavior = new RandomPolicy<float>(random, problem->getDiscreteActions());

    OffPolicyControlLearner<float>* control = new OffPAC<float>(behavior, critic, actor, toStateAction, projector);
    RLAgent<float>* agent = new ControlAgent<float>(control);
    Simulator<float>* sim = new Simulator<float>(agent, problem, 5000, 10, 10);
    control->reset();
    control->resurrect("NAOTest_x32_M.bin");
    sim->runEvaluate(10, 10);

    delete random;
    delete problem;
    delete hashing;
    delete projector;
    delete toStateAction;
    delete critice;
    delete critic;
    delete actore;
    delete actoreTraces;
    delete actor;
    delete behavior;
    delete target;
    delete control;
    delete agent;
    delete sim;
  }

  // OnLine: reload the persisted SwingPendulum policy and evaluate it
  {
    Random<double>* random = new Random<double>;
    RLProblem<double>* problem = new SwingPendulum<double>(random);
    Hashing<double>* hashing = new MurmurHashing<double>(random, 1000);
    Projector<double>* projector = new TileCoderHashing<double>(hashing, problem->dimension(), 10, 10, false);
    StateToStateAction<double>* toStateAction = new StateActionTilings<double>(projector, problem->getContinuousActions());

    Trace<double>* critice = new ATrace<double>(projector->dimension());
    TDLambda<double>* critic = new TDLambda<double>(0, 0, 0, critice);

    PolicyDistribution<double>* policyDistribution = new NormalDistributionScaled<double>(random, problem->getContinuousActions(), 0, 1.0, projector->dimension());
    Range<double> policyRange(-2.0, 2.0);
    Range<double> problemRange(-2.0, 2.0);
    PolicyDistribution<double>* acting = new ScaledPolicyDistribution<double>(problem->getContinuousActions(), policyDistribution, &policyRange, &problemRange);

    Trace<double>* actore1 = new ATrace<double>(projector->dimension());
    Trace<double>* actore2 = new ATrace<double>(projector->dimension());
    Traces<double>* actoreTraces = new Traces<double>();
    actoreTraces->push_back(actore1);
    actoreTraces->push_back(actore2);
    ActorOnPolicy<double>* actor = new ActorLambda<double>(0, 0, 0, acting, actoreTraces);

    OnPolicyControlLearner<double>* control = new AverageRewardActorCritic<double>(critic, actor, projector, toStateAction, 0);
    RLAgent<double>* agent = new ControlAgent<double>(control);
    Simulator<double>* sim = new Simulator<double>(agent, problem, 5000, 10, 10);
    control->reset();
    control->resurrect("NAOTest_x32_S.bin");
    sim->run();

    delete random;
    delete problem;
    delete hashing;
    delete projector;
    delete toStateAction;
    delete critice;
    delete critic;
    delete actore1;
    delete actore2;
    delete actoreTraces;
    delete actor;
    delete policyDistribution;
    delete acting;
    delete control;
    delete agent;
    delete sim;
  }
}
void SwingPendulumTest::testOnPolicySwingPendulum()
{
  Random<double>* random = new Random<double>;
  RLProblem<double>* problem = new SwingPendulum<double>;
  Hashing<double>* hashing = new MurmurHashing<double>(random, 1000);
  Projector<double>* projector = new TileCoderHashing<double>(hashing, problem->dimension(), 10, 10, false);
  StateToStateAction<double>* toStateAction = new StateActionTilings<double>(projector, problem->getContinuousActions());

  double alpha_v = 0.1 / projector->vectorNorm();
  double alpha_u = 0.001 / projector->vectorNorm();
  double alpha_r = .0001;
  double gamma = 1.0;
  double lambda = 0.5;

  Trace<double>* critice = new ATrace<double>(projector->dimension());
  TDLambda<double>* critic = new TDLambda<double>(alpha_v, gamma, lambda, critice);

  PolicyDistribution<double>* policyDistribution = new NormalDistributionScaled<double>(random, problem->getContinuousActions(), 0, 1.0, projector->dimension());
  Range<double> policyRange(-2.0, 2.0);
  Range<double> problemRange(-2.0, 2.0);
  PolicyDistribution<double>* acting = new ScaledPolicyDistribution<double>(problem->getContinuousActions(), policyDistribution, &policyRange, &problemRange);

  Trace<double>* actore1 = new ATrace<double>(projector->dimension());
  Trace<double>* actore2 = new ATrace<double>(projector->dimension());
  Traces<double>* actoreTraces = new Traces<double>();
  actoreTraces->push_back(actore1);
  actoreTraces->push_back(actore2);
  ActorOnPolicy<double>* actor = new ActorLambda<double>(alpha_u, gamma, lambda, acting, actoreTraces);

  OnPolicyControlLearner<double>* control = new AverageRewardActorCritic<double>(critic, actor, projector, toStateAction, alpha_r);
  RLAgent<double>* agent = new LearnerAgent<double>(control);
  RLRunner<double>* sim = new RLRunner<double>(agent, problem, 5000, 100, 10);
  sim->setVerbose(true);
  sim->run();
  sim->runEvaluate(100);
  sim->computeValueFunction();

  delete random;
  delete problem;
  delete hashing;
  delete projector;
  delete toStateAction;
  delete critice;
  delete critic;
  delete actore1;
  delete actore2;
  delete actoreTraces;
  delete actor;
  delete policyDistribution;
  delete acting;
  delete control;
  delete agent;
  delete sim;
}
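// A minimal sketch, not part of the original test suite: the same OffPAC wiring as in
// testOffPACSwingPendulum2 above, but with std::unique_ptr owning each component so the long
// manual delete chains are unnecessary. All RLLib types, constructors, and constants are copied
// from the tests above; the helper name, the use of smart pointers, and the simplification of the
// traces to plain ATrace (the original test wraps an AMaxTrace in a MaxLengthTrace) are
// assumptions, not RLLib conventions.
#include <memory>

static void offPACSwingPendulumSketch()
{
  std::unique_ptr<Random<double>> random(new Random<double>);
  std::unique_ptr<RLProblem<double>> problem(new SwingPendulum<double>);
  std::unique_ptr<Hashing<double>> hashing(new MurmurHashing<double>(random.get(), 1000000));
  std::unique_ptr<Projector<double>> projector(
      new TileCoderHashing<double>(hashing.get(), problem->dimension(), 10, 10, true));
  std::unique_ptr<StateToStateAction<double>> toStateAction(
      new StateActionTilings<double>(projector.get(), problem->getDiscreteActions()));

  // Critic: GTD(lambda), with the step sizes used in the original test
  std::unique_ptr<Trace<double>> critice(new ATrace<double>(projector->dimension()));
  std::unique_ptr<GTDLambda<double>> critic(new GTDLambda<double>(
      0.1 / projector->vectorNorm(), 0.005 / projector->vectorNorm(), 0.99, 0.4, critice.get()));

  // Actor: Boltzmann target policy with a single eligibility trace
  std::unique_ptr<PolicyDistribution<double>> target(
      new BoltzmannDistribution<double>(random.get(), problem->getDiscreteActions(), projector->dimension()));
  std::unique_ptr<Trace<double>> actore(new ATrace<double>(projector->dimension()));
  std::unique_ptr<Traces<double>> actoreTraces(new Traces<double>());
  actoreTraces->push_back(actore.get());
  std::unique_ptr<ActorOffPolicy<double>> actor(new ActorLambdaOffPolicy<double>(
      0.5 / projector->vectorNorm(), 0.99, 0.4, target.get(), actoreTraces.get()));

  // Behavior policy, learner, and runner
  std::unique_ptr<Policy<double>> behavior(
      new BoltzmannDistribution<double>(random.get(), problem->getDiscreteActions(), projector->dimension()));
  std::unique_ptr<OffPolicyControlLearner<double>> control(
      new OffPAC<double>(behavior.get(), critic.get(), actor.get(), toStateAction.get(), projector.get()));
  std::unique_ptr<RLAgent<double>> agent(new LearnerAgent<double>(control.get()));
  std::unique_ptr<RLRunner<double>> sim(new RLRunner<double>(agent.get(), problem.get(), 5000, 200, 1));
  sim->run();
  // Components are destroyed in reverse declaration order; no explicit deletes are needed.
}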