// Simulate a sequence of random actions, returning the accumulated reward.
static reward_t playout(Agent &agent, unsigned int playout_len) {
    reward_t r = 0;
    for (unsigned int i = 0; i < playout_len; ++i) {
        // Pick a random action
        action_t a = agent.genRandomAction();
        agent.modelUpdate(a);

        // Generate a random percept distributed according to the agent's
        // internal model of the environment.
        percept_t rew;
        percept_t obs;
        agent.genPerceptAndUpdate(obs, rew);

        r = r + rew;
    }
    return r;
}
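
// Sketch only (not part of the original source): one way playout() could be
// used to form a crude Monte Carlo estimate of the expected return over a
// fixed horizon. The names `horizon` and `num_rollouts` are illustrative.
// Note that playout() advances the agent's internal history; in a real search
// each rollout would normally be bracketed by saving and reverting that
// history, which this sketch deliberately omits.
static double averagePlayoutReward(Agent &agent, unsigned int horizon,
                                   unsigned int num_rollouts) {
    double total = 0.0;
    for (unsigned int i = 0; i < num_rollouts; ++i) {
        total += playout(agent, horizon); // mutates the agent's history
    }
    return num_rollouts > 0 ? total / num_rollouts : 0.0;
}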
// The main agent/environment interaction loop
void mainLoop(Agent &ai, Environment &env, options_t &options) {

    // Determine exploration options
    bool explore = options.count("exploration") > 0;
    double explore_rate, explore_decay;
    if (explore) {
        strExtract(options["exploration"], explore_rate);
        strExtract(options["explore-decay"], explore_decay);
        assert(0.0 <= explore_rate && explore_rate <= 1.0);
        assert(0.0 <= explore_decay && explore_decay <= 1.0);
    }

    // Determine termination age
    bool terminate_check = options.count("terminate-age") > 0;
    age_t terminate_age;
    if (terminate_check) {
        strExtract(options["terminate-age"], terminate_age);
        assert(0 <= terminate_age);
    }

    // Determine mc-timelimit
    timelimit_t mc_timelimit;
    strExtract(options["mc-timelimit"], mc_timelimit);
    // If we assume that mc_timelimit > agent.numActions(), we can be sure
    // that every action is selected at least once.
    if (mc_timelimit < ai.numActions()) {
        std::cerr << "WARNING: mc-timelimit not large enough to sample all actions"
                  << std::endl;
    }

    // Determine whether to write context trees during the process, or only at the end
    bool intermediate_ct = true;
    if (options.count("intermediate-ct") > 0) {
        intermediate_ct = !(options["intermediate-ct"] == "0");
    }

    std::cout << "starting agent/environment interaction loop...\n";

    // Agent/environment interaction loop
    for (unsigned int cycle = 1; !env.isFinished(); cycle++) {

        // Check for agent termination
        if (terminate_check && ai.age() >= terminate_age) {
            verboseLog << "info: terminating agent" << std::endl;
            break;
        }

        // Get a percept from the environment
        percept_t observation = env.getObservation();
        percept_t reward = env.getReward();

        // Update agent's environment model with the new percept
        ai.modelUpdate(observation, reward);

        // Determine best exploitive action, or explore
        action_t action;
        bool explored = false;
        if (explore && rand01() < explore_rate) {
            explored = true;
            action = ai.genRandomAction();
        } else {
            action = search(ai, mc_timelimit);
        }

        // Send an action to the environment
        env.performAction(action);

        // Update agent's environment model with the chosen action
        ai.modelUpdate(action);

        // Log this turn
        verboseLog << "cycle: " << cycle << std::endl;
        verboseLog << "observation: " << observation << std::endl;
        verboseLog << "reward: " << reward << std::endl;
        verboseLog << "action: " << action << std::endl;
        verboseLog << "explored: " << (explored ? "yes" : "no") << std::endl;
        verboseLog << "explore rate: " << explore_rate << std::endl;
        verboseLog << "total reward: " << ai.reward() << std::endl;
        verboseLog << "average reward: " << ai.averageReward() << std::endl;

        // Log the data in a more compact form
        compactLog << cycle << ", " << observation << ", " << reward << ", "
                   << action << ", " << explored << ", " << explore_rate << ", "
                   << ai.reward() << ", " << ai.averageReward() << std::endl;

        // Print to standard output when cycle == 2^n
        if ((cycle & (cycle - 1)) == 0) {
            std::cout << "cycle: " << cycle << std::endl;
            std::cout << "average reward: " << ai.averageReward() << std::endl;
            if (explore) {
                std::cout << "explore rate: " << explore_rate << std::endl;
            }

            // Write a context tree file for each 2^n cycle
            if (options["write-ct"] != "" && intermediate_ct) {
                char cycle_string[256];
                sprintf(cycle_string, "%u", cycle);
                std::ofstream ct((options["write-ct"] + std::string(cycle_string) + ".ct").c_str());
                ai.writeCT(ct);
                ct.close();
            }
        }

        // Update exploration rate
        if (explore) explore_rate *= explore_decay;
    }

    // Print summary to standard output
    std::cout << std::endl << std::endl << "SUMMARY" << std::endl;
    std::cout << "agent age: " << ai.age() << std::endl;
    std::cout << "average reward: " << ai.averageReward() << std::endl;

    // Write a context tree file for the final cycle too
    if (options["write-ct"] != "") {
        char cycle_string[256];
        sprintf(cycle_string, "%lld", ai.age());
        std::ofstream ct((options["write-ct"] + std::string(cycle_string) + ".ct").c_str());
        ai.writeCT(ct);
        ct.close();
    }
}
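
// A possible sketch of the two small utility helpers the loops above rely on,
// in case they are not defined elsewhere: rand01() draws a pseudo-random
// double in [0, 1), and strExtract() parses an option string into a typed
// value via a string stream. These are assumptions for illustration; the
// project's own utility header may implement them differently.
#include <cstdlib>
#include <sstream>
#include <string>

inline double rand01(void) {
    return rand() / (double(RAND_MAX) + 1.0);
}

template <typename T>
inline void strExtract(const std::string &str, T &val) {
    std::istringstream iss(str);
    iss >> val;
}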
// The main agent/environment interaction loop
void mainLoop(Agent &ai, Environment &env, options_t &options) {

    // Determine exploration options
    bool explore = options.count("exploration") > 0;
    double explore_rate, explore_decay;
    if (explore) {
        strExtract(options["exploration"], explore_rate);
        strExtract(options["explore-decay"], explore_decay);
        assert(0.0 <= explore_rate && explore_rate <= 1.0);
        assert(0.0 <= explore_decay && explore_decay <= 1.0);
    }

    // Determine termination lifetime
    bool terminate_check = options.count("terminate-lifetime") > 0;
    lifetime_t terminate_lifetime;
    if (terminate_check) {
        strExtract(options["terminate-lifetime"], terminate_lifetime);
        assert(0 <= terminate_lifetime);
    }

    // Agent/environment interaction loop
    for (unsigned int cycle = 1; !env.isFinished(); cycle++) {

        // Check for agent termination
        if (terminate_check && ai.lifetime() > terminate_lifetime) {
            log << "info: terminating agent" << std::endl;
            break;
        }

        // Get a percept from the environment
        percept_t observation = env.getObservation();
        percept_t reward = env.getReward();

        // Update agent's environment model with the new percept
        ai.modelUpdate(observation, reward); // TODO: implement in agent.cpp

        // Determine best exploitive action, or explore
        action_t action;
        bool explored = false;
        if (explore && rand01() < explore_rate) {
            explored = true;
            action = ai.genRandomAction();
        } else {
            action = search(ai); // TODO: implement in search.cpp
        }

        // Send an action to the environment
        env.performAction(action); // TODO: implement for each environment

        // Update agent's environment model with the chosen action
        ai.modelUpdate(action); // TODO: implement in agent.cpp

        // Log this turn
        log << "cycle: " << cycle << std::endl;
        log << "observation: " << observation << std::endl;
        log << "reward: " << reward << std::endl;
        log << "action: " << action << std::endl;
        log << "explored: " << (explored ? "yes" : "no") << std::endl;
        log << "explore rate: " << explore_rate << std::endl;
        log << "total reward: " << ai.reward() << std::endl;
        log << "average reward: " << ai.averageReward() << std::endl;

        // Log the data in a more compact form
        compactLog << cycle << ", " << observation << ", " << reward << ", "
                   << action << ", " << explored << ", " << explore_rate << ", "
                   << ai.reward() << ", " << ai.averageReward() << std::endl;

        // Print to standard output when cycle == 2^n
        if ((cycle & (cycle - 1)) == 0) {
            std::cout << "cycle: " << cycle << std::endl;
            std::cout << "average reward: " << ai.averageReward() << std::endl;
            if (explore) {
                std::cout << "explore rate: " << explore_rate << std::endl;
            }
        }

        // Update exploration rate
        if (explore) explore_rate *= explore_decay;
    }

    // Print summary to standard output
    std::cout << std::endl << std::endl << "SUMMARY" << std::endl;
    std::cout << "agent lifetime: " << ai.lifetime() << std::endl;
    std::cout << "average reward: " << ai.averageReward() << std::endl;
}
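
// For orientation, a sketch of the environment interface these loops rely on,
// inferred from the calls they make (getObservation, getReward, performAction,
// isFinished). This is an assumption for illustration only; the real
// Environment base class may declare additional members (e.g. the print()
// method used by the verbose logging further below), so the sketch is given a
// distinct name to avoid clashing with it.
class EnvironmentSketch {
public:
    virtual ~EnvironmentSketch() {}

    // Apply the agent's action and update the stored observation/reward.
    virtual void performAction(action_t action) = 0;

    // The most recent percept presented to the agent.
    virtual percept_t getObservation(void) const { return m_observation; }
    virtual percept_t getReward(void) const { return m_reward; }

    // Whether the environment has reached a terminal state.
    virtual bool isFinished(void) const { return false; }

protected:
    percept_t m_observation;
    percept_t m_reward;
};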
/** The main agent/environment interaction loop. Each interaction cycle begins
 * with the agent receiving an observation and reward from the environment.
 * Subsequently, the agent selects an action and informs the environment. The
 * interactions that took place are logged to the ::logger and ::compactLogger
 * streams. When the cycle equals a power of two, a summary of the interactions
 * is printed to the standard output.
 * \param ai The agent.
 * \param env The environment.
 * \param options The configuration options. */
void mainLoop(Agent &ai, Environment &env, options_t &options) {

    // Apply random seed (Default: 0)
    srand(getOption<unsigned int>(options, "random-seed", 0));

    // Verbose output (Default: false)
    bool verbose = getOption<bool>(options, "verbose", false);

    // Determine exploration options (Default: don't explore, don't decay)
    bool explore = options.count("exploration") > 0;
    double explore_rate = getOption<double>(options, "exploration", 0.0);
    double explore_decay = getOption<double>(options, "explore-decay", 1.0);
    assert(0.0 <= explore_rate);
    assert(0.0 <= explore_decay && explore_decay <= 1.0);

    // Determine termination age (Default: don't terminate)
    bool terminate_check = options.count("terminate-age") > 0;
    age_t terminate_age = getOption<age_t>(options, "terminate-age", 0);
    assert(0 <= terminate_age);

    // Determine the cycle after which the agent stops learning (if ever)
    int learning_period = getOption<int>(options, "learning-period", 0);
    assert(0 <= learning_period);

    // Agent/environment interaction loop
    for (int cycle = 1; !env.isFinished(); cycle++) {

        // Check for agent termination
        if (terminate_check && ai.age() > terminate_age) {
            break;
        }

        // Save the current clock cycle (to compute how long this cycle took)
        clock_t cycle_start = clock();

        // Get a percept from the environment
        percept_t observation = env.getObservation();
        percept_t reward = env.getReward();

        // Disable exploration once the learning period has passed
        if (learning_period > 0 && cycle > learning_period) explore = false;

        // Update agent's environment model with the new percept
        ai.modelUpdate(observation, reward);

        // Determine best exploitive action, or explore
        action_t action;
        bool explored = false;
        if (explore && (rand01() < explore_rate)) {
            // Explore
            explored = true;
            action = ai.genRandomAction();
        } else {
            // Exploit
            action = ai.search();
        }

        // Send an action to the environment
        env.performAction(action);

        // Update agent's environment model with the chosen action
        ai.modelUpdate(action);

        // Calculate how long this cycle took
        double time = double(clock() - cycle_start) / double(CLOCKS_PER_SEC);

        // Log this turn
        logger << cycle << ", " << observation << ", " << reward << ", "
               << action << ", " << explored << ", " << explore_rate << ", "
               << ai.totalReward() << ", " << ai.averageReward() << ", "
               << time << ", " << ai.modelSize() << std::endl;

        // Print to standard output when cycle == 2^n or on verbose option
        if (verbose || (cycle & (cycle - 1)) == 0) {
            std::cout << "cycle: " << cycle << std::endl;
            std::cout << "average reward: " << ai.averageReward() << std::endl;
            if (explore) {
                std::cout << "explore rate: " << explore_rate << std::endl;
            }
        }

        // Print environment state if verbose option is true
        if (verbose) {
            std::cout << env.print();
        }

        // Update exploration rate
        if (explore) explore_rate *= explore_decay;
    }

    // Print summary to standard output
    std::cout << std::endl << std::endl << "SUMMARY" << std::endl;
    std::cout << "agent age: " << ai.age() << std::endl;
    std::cout << "average reward: " << ai.averageReward() << std::endl;
}
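
// A minimal sketch (an assumption, not the original implementation) of the
// getOption<T>() helper used in the loop above: it returns the parsed option
// value when the key is present, and the supplied default otherwise. Here
// options_t is assumed to behave like std::map<std::string, std::string>;
// the real helper may parse booleans and other types more leniently.
#include <sstream>
#include <string>

template <typename T>
T getOption(options_t &options, const std::string &key, T default_value) {
    if (options.count(key) == 0) return default_value;
    T value;
    std::istringstream iss(options[key]);
    iss >> value;
    return value;
}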