// One simulated game from node `node_n` down the UCT tree for `color`.
// Selects the UCB-best child, plays its move, then either runs a random
// playout (unvisited child / depth cap / two consecutive passes) or
// recurses into the child node. Returns the result from `color`'s point
// of view and updates the child's statistics and the RAVE tables.
int search_uct(int color, int node_n)
{
  NODE *pN = &node[node_n];
  CHILD *c = NULL;
  int select, z, err, win, current_depth;

  // Keep picking the UCB-best child until one is actually playable.
  // Moves rejected by put_stone() are marked ILLEGAL_Z so later
  // selections skip them.
  for (;;) {
    select = select_best_ucb(node_n, color);
    c = &pN->child[select];
    z = c->z;
    err = put_stone(z, color, FILL_EYE_ERR);
    if ( err == 0 ) break;
    c->z = ILLEGAL_Z; // select other move
  }

  current_depth = depth;
  path[depth++] = c->z;  // record the move in the global search path

  // playout in first time. <= 10 can reduce node.
  // Leaf conditions: child never visited, depth cap reached, or this
  // pass (z==0) follows the opponent's pass (game over).
  if ( c->games <= 0 || depth == D_MAX || (c->z == 0 && depth>=2 && path[depth-2]==0) ) {
    win = -playout(flip_color(color));  // opponent's result, negated
  } else {
    // Expand lazily: create the child node on first descent.
    if ( c->next == NODE_EMPTY ) c->next = create_node(c->z);
    win = -search_uct(flip_color(color), c->next);
  }

  update_rave(pN, color, current_depth, win);

  // update winrate (incremental mean over games+1 samples)
  c->rate = (c->rate * c->games + win) / (c->games + 1);
  c->games++;
  pN->child_games_sum++;
  return win;
}
// Debug helper: run a single playout with the trace flag set, then dump
// the final board and the game record as SGF.
void test_playout()
{
  flag_test_playout = 1;  // presumably makes playout() log its moves — confirm in playout()
  playout(1);             // color 1 (black) moves first
  print_board();
  print_sgf();
}
// Sample one possible sequence of future events, up to 'dfr' cycles. reward_t SearchNode::sample(Agent &agent, unsigned int dfr) { double newReward; if (dfr == 0) { return 0; } else if (m_chance_node) { // Generate whole observation-reward percept, // according to the agent's model of the environment. percept_t obs; percept_t rew; agent.genPerceptAndUpdate(obs, rew); // Calculate the index of whole percept percept_t percept = (rew << agent.numObsBits()) | obs; if (m_child.count(percept) == 0) { m_child[percept] = new SearchNode(false, agent.numActions()); } newReward = rew + m_child[percept]->sample(agent, dfr - 1); } else if (m_visits == 0) { newReward = playout(agent, dfr); } else { // Select an action to sample. action_t action = selectAction(agent, dfr); agent.modelUpdate(action); newReward = m_child[action]->sample(agent, dfr); } // Update our estimate of the future reward. m_mean = (1.0 / (double) (m_visits + 1)) * (newReward + m_visits * m_mean); ++m_visits; return newReward; }
/* Run one random playout from position `s` (taken by value, so the
 * caller's state is untouched), remove dead groups, and return the
 * Chinese liberty score, negated when white is the player to move. */
int clean_playout(state s, const state_info si, jkiss *jk)
{
    playout(&s, si, jk);
    kill_groups(&s);
    const int raw_score = chinese_liberty_score(&s);
    return s.white_to_play ? -raw_score : raw_score;
}
// One root-level UCT iteration for a parallel searcher: picks a root
// child by UCB, plays it, descends into this thread's private copy of
// the subtree, and folds the playout result back into the shared root
// statistics with atomic (Interlocked) operations.
void UCTParallel::search_uct_root(Board& board, const Color color, UCTNode* node, UCTNode* copychild)
{
    // Select the move to play out via UCB.
    // The root node is updated atomically, so the UCB computation takes no lock.
    UCTNode* selected_node = select_node_with_ucb(node);
    // Every root move is legal, so no error check is needed.
    board.move_legal(selected_node->xy, color);

    // Map the shared child to this thread's copy (same offset from the child-array base).
    UCTNode* selected_node_copy = copychild + (selected_node - node->child);

    int win;
    // Below the expansion threshold (playout count summed over all threads): just play out.
    if (selected_node->playout_num < THR) {
        win = 1 - playout(board, opponent(color));
    } else {
        if (selected_node_copy->child_num == 0) {
            // Expand the node in the thread-local tree.
            if (selected_node_copy->expand_node(board)) {
                win = 1 - search_uct(board, opponent(color), selected_node_copy);
            } else {
                // Node pool exhausted: fall back to a plain playout.
                win = 1 - playout(board, opponent(color));
            }
        } else {
            win = 1 - search_uct(board, opponent(color), selected_node_copy);
        }
    }

    // Update the shared win statistics atomically.
    _InterlockedExchangeAdd(&selected_node->win_num, win);
    _InterlockedIncrement(&selected_node->playout_num);
    _InterlockedIncrement(&node->playout_num_sum);
}
// Random playout: from `grid`, repeatedly apply a uniformly random move
// (via moveAndBirth) until the position is no longer alive or `depth`
// moves have been made. Returns the resulting grid.
Board::Grid UCT::playout(Board::Grid grid, int depth)
{
    while (depth > 0 && Board::alive(grid)) {
        const Dir dir = allDirs[mt() % 4];
        grid = Board::moveAndBirth(grid, dir).second;
        --depth;
    }
    return grid;
}
// Flat Monte-Carlo move selection: for every legal point, play the move
// and run `try_num` random playouts, restoring the global board and ko
// state after each trial. Returns the point with the best observed win
// rate (0 if no legal move was found). Note: name keeps the historical
// "calro" spelling — callers depend on it.
int primitive_monte_calro(int color)
{
  int try_num = 30; // number of playout
  int best_z = 0;   // best point so far; 0 means none/pass
  double best_value;
  double win_rate;
  int x,y,err,i,win_sum,win;
  int ko_z_copy;
  int board_copy[BOARD_MAX]; // keep current board

  // Snapshot the global position so every candidate starts from it.
  ko_z_copy = ko_z;
  memcpy(board_copy, board, sizeof(board));

  best_value = -100; // below any achievable win rate in [-1, 1]

  // try all empty point
  for (y=0;y<B_SIZE;y++) for (x=0;x<B_SIZE;x++) {
    int z = get_z(x+1,y+1);
    if ( board[z] != 0 ) continue;
    err = put_stone(z, color, FILL_EYE_ERR);
    if ( err != 0 ) continue; // illegal move (ko/suicide/eye fill): skip

    win_sum = 0;
    for (i=0;i<try_num;i++) {
      // playout() mutates the global board, so save/restore around each trial.
      int board_copy2[BOARD_MAX];
      int ko_z_copy2 = ko_z;
      memcpy(board_copy2, board, sizeof(board));

      win = -playout(flip_color(color)); // opponent's result, negated

      win_sum += win;
      ko_z = ko_z_copy2;
      memcpy(board, board_copy2, sizeof(board));
    }
    win_rate = (double)win_sum / try_num;
//  print_board();
//  prt("z=%d,win=%5.3f\n",get81(z),win_rate);

    if ( win_rate > best_value ) {
      best_value = win_rate;
      best_z = z;
//    prt("best_z=%d,color=%d,v=%5.3f,try_num=%d\n",get81(best_z),color,best_value,try_num);
    }

    // Restore the original position before trying the next candidate.
    ko_z = ko_z_copy;
    memcpy(board, board_copy, sizeof(board)); // resume board
  }
  return best_z;
}
// Choose a move direction with UCB1-driven Monte-Carlo sampling.
// Phase 1 (boot): sample every direction ITERATION_BOOT times.
// Phase 2 (UCB1): for ITERATION rounds, sample the direction with the
// highest upper confidence bound. Finally return the direction with the
// best empirical mean evaluation (also printed to stdout).
Dir UCT::decideDir()
{
    std::array<int, 4> counts;
    std::array<double, 4> sums;
    int count(4 * ITERATION_BOOT);
    sums.fill(0);
    // BUG FIX: counts was left uninitialized and then incremented —
    // reading an indeterminate value (undefined behavior).
    counts.fill(0);

    // Bootstrap phase: give every direction the same number of samples.
    for (auto dir : allDirs) {
        for (int i(0); i < ITERATION_BOOT; ++i) {
            sums[dirToInt(dir)] += staticEval(playout(Board::moved(grid, dir), PLAYOUT_DEPTH));
            ++counts[dirToInt(dir)];
        }
    }

    // UCB1 phase: exploit the best-looking direction while still exploring.
    for (int i(0); i < ITERATION; ++i) {
        std::array<double, 4> ucb1s;
        for (auto dir : allDirs) {
            // mean + exploration bonus sqrt(2 * log(total) / visits)
            ucb1s[dirToInt(dir)] = (sums[dirToInt(dir)] / counts[dirToInt(dir)])
                + std::sqrt(2 * std::log2(count) / counts[dirToInt(dir)]);
        }
        Dir bestDir = Dir::Up;
        double maxUcb1 = 0.0;
        for (auto dir : allDirs) {
            if (maxUcb1 < ucb1s[dirToInt(dir)]) {
                bestDir = dir;
                maxUcb1 = ucb1s[dirToInt(dir)];
            }
        }
        sums[dirToInt(bestDir)] += staticEval(playout(Board::moved(grid, bestDir), PLAYOUT_DEPTH));
        // BUG FIX: the visit count of the sampled direction and the total
        // sample count were never updated, so the means kept growing
        // without bound and the exploration term never decayed.
        ++counts[dirToInt(bestDir)];
        ++count;
    }

    // Final decision: pick the direction with the best empirical mean.
    Dir bestDir = Dir::Up;
    double maxAve = 0.0;
    for (auto dir : allDirs) {
        if (maxAve < sums[dirToInt(dir)] / counts[dirToInt(dir)]) {
            bestDir = dir;
            maxAve = sums[dirToInt(dir)] / counts[dirToInt(dir)];
        }
    }
    std::cout << maxAve << std::endl;
    return bestDir;
}
// single iteration of monte-carlo tree search. void mcIteration(Node *root) { vector<Node *> pathFromRoot; Node *cur = root; while (!cur->IsLeaf()) { pathFromRoot.push_back(cur); cur = cur->Select(P_RANDOM); } pathFromRoot.push_back(cur); Node *playoutNode = cur->Expand(); if (playoutNode == nullptr) { playoutNode = cur; } else { pathFromRoot.push_back(playoutNode); } double utility = playout(playoutNode); for (int i = pathFromRoot.size() - 1; i >= 0; i--) { pathFromRoot[i]->AddUtility(utility); utility = -utility; } }
// Ad-hoc driver: traces one playout on a 7x7 board (with a stone placed
// at (3, 3)), then histograms scores from a large number of playouts,
// first single-threaded and then via monte_carlo().
int main() {
    size_t i;
    size_t num_iter_single = 1000000;
    size_t num_iter_parallel = 10000000;

    // 7x7 rectangular board; remaining state fields zeroed.
    state base_state = (state) {rectangle(7, 7), 0, 0, 0, 0, 0, 0};
    int asdf;  // out-parameter for make_move; value unused here
    make_move(&base_state, one(3, 3), &asdf);
    state_info si;
    init_state(&base_state, &si);

    state s = base_state;
    jkiss jk;  // PRNG state
    jkiss_init(&jk);

    // One playout with state dumps before and after dead-group removal.
    printf("%d\n", playout(&s, si, &jk));
    print_state(&s);
    kill_groups(&s);
    print_state(&s);
    printf("%d\n", chinese_liberty_score(&s));

    // Single-threaded benchmark: histogram of clean_playout scores.
    score_bins sb = score_bins_new(base_state);
    for (i = 0; i < num_iter_single; i++) {
        score_bins_add(sb, clean_playout(base_state, si, &jk));
    }
    print_score_bins(sb, 50);

    printf("-----------------------------\n");

    // Parallel benchmark over the same position.
    score_bins_reset(sb);
    monte_carlo(base_state, si, sb, num_iter_parallel);
    print_score_bins(sb, 50);

    // Old hand-rolled OpenMP version, superseded by monte_carlo() above;
    // kept for reference.
    /*
    int num_threads = omp_get_max_threads();
    jkiss *jks = malloc(num_threads * sizeof(jkiss));
    bin_t **binss = malloc(num_threads * sizeof(bin_t*));
    for (int i = 0; i < num_threads; i++) {
        binss[i] = calloc(num_bins, sizeof(bin_t));
        jkiss_init(jks + i);
    }
    int tid;
    int finished = 0;
    #pragma omp parallel private(i, s, tid)
    {
        tid = omp_get_thread_num();
        for (i = 0; i < num_iter_parallel; i++) {
            s = base_state;
            playout(&s, si, jks + tid);
            kill_groups(&s);
            int score = chinese_liberty_score(&s);
            if (s.white_to_play) {
                score = -score;
            }
            binss[tid][si.size + score]++;
            // Bailout once one thread finishes.
            if (finished) {
                break;
            }
        }
        finished = 1;
    }
    for (i = 0; i < num_bins; i++) {
        bins[i] = 0;
    }
    for (i = 0; i < num_threads; i++) {
        for (int j = 0; j < num_bins; j++) {
            bins[j] += binss[i][j];
        }
    }
    print_bins(bins, num_bins, 128);
    */
    return 0;
}