/**
 * Offer help to a randomly picked host
 */
void PROCESSOR::offer_help() {
    if(!help_messages && n_idle_processors == n_processors && !use_abdada_cluster) {
        register int i, count = 0, dest;
        l_lock(lock_smp);
        for(i = 0; i < n_processors; i++) {
            if(processors[i]->state == WAIT)
                count++;
        }
        l_unlock(lock_smp);
        if(count == n_processors) {
            /*pick a random destination other than ourselves and the last host we asked*/
            while(true) {
                dest = (rand() % n_hosts);
                if((dest != host_id) && (dest != prev_dest))
                    break;
            }
            ISend(dest, HELP);
            help_messages++;
            prev_dest = dest;
        }
    }
}
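/*
 * Note: zero-payload notification messages (HELP, CANCEL, QUIT, PING, ...) are
 * assumed to travel as empty MPI messages whose tag carries the message id,
 * consistent with the MPI_Recv(MPI_BOTTOM, 0, MPI_INT, ...) call in the manager
 * loop below and the zero-size Recv(source, message_id) in handle_message().
 * A minimal sketch of such a wrapper follows; the function name, the lock name
 * and the fire-and-forget request handling are assumptions, not the engine's
 * actual implementation.
 */
#if 0
void PROCESSOR::ISend_sketch(int dest, int message_id) {
    MPI_Request rq;
    l_lock(lock_mpi);                     /* assumed: MPI calls serialized behind a lock */
    MPI_Isend(MPI_BOTTOM, 0, MPI_INT,     /* empty payload: the tag is the message id */
              dest, message_id, MPI_COMM_WORLD, &rq);
    MPI_Request_free(&rq);                /* fire-and-forget; real code may track/wait on it */
    l_unlock(lock_mpi);
}
#endif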
void manager_t::manager_idle_loop() {
    int flag, i;
    int n_message = 0;
    split_point_t *sp_pointer;
    host_should_stop = false;

    while (!host_should_stop) {
        do {
            // Polling. MPI_Iprobe <-> MPI_Recv is not thread safe.
            mutex_lock(&lock_mpi);
            MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, &mpi_status);
            // Message received?
            if (flag) {
                int message_id = mpi_status.MPI_TAG;
                int source = mpi_status.MPI_SOURCE;
                // write this to log file
                logf << "Receive: " << get_time() << " " << MSG_NAME[message_id] << endl;

                if (message_id == SUBMIT_SPLIT) {
                    sp_msg_t sp_msg;
                    MPI_Recv(&sp_msg, sizeof(sp_msg_t), MPI_BYTE, source, message_id,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    mutex_unlock(&lock_mpi);

                    // set up split point
                    if (true) {
                        int master = source;
                        int stck_top = sp_stack_top[source];
                        sp_pointer = &(sp_stack[source][stck_top]); //SplitPointStack[master] + ActiveSplitPoints[master];
                        sp_stack_top[source]++;
                        if (sp_stack_top[source] > max_stck[source]) {
                            max_stck[source] = sp_stack_top[source];
                        }

                        // Initialize the split point object:
                        //copy_position(&(sp_pointer->parent_pos), p);
                        memcpy(sp_pointer->fen, sp_msg.fen, (sizeof(char)) * 256);
                        set_position(&(sp_pointer->parent_pos), sp_msg.fen);
                        sp_pointer->sp_id = host_id * 1000000 + global_sp_id;
                        sp_pointer->ply = sp_msg.ply;
                        sp_pointer->depth = sp_msg.depth;
                        //split_point->alpha = *alpha; split_point->beta = *beta;
                        //split_point->pvnode = pvnode;
                        //split_point->bestvalue = *bestvalue;
                        sp_pointer->master = sp_msg.master_id; //source;
                        memcpy(sp_pointer->mstack2, sp_msg.mstack, (sizeof(move_stack_t)) * 256);
                        sp_pointer->current = sp_pointer->mstack2 + sp_msg.current;
                        sp_pointer->end = sp_pointer->mstack2 + sp_msg.end;
                        sp_pointer->cpus = 0;
                        sp_pointer->nodes = sp_msg.nodes;
                        logf << "spid(" << sp_pointer->sp_id << ") init nodes " << sp_msg.nodes << endl;

                        // clear all hosts' slave status
                        for (i = 0; i < n_host; i++) {
                            sp_pointer->slaves[i] = 0;
                        }
                        // Mark the master and every idle host reserved for it as slaves of this split point:
                        for (i = 0; i < n_host; i++) {
                            //if (host_is_avaliable(i) || i == master) {
                            logf << "submit_split at [" << i << "]: " << all_status[i] << " " << all_master[i] << endl;
                            if ((i == master) || (all_status[i] == HOST_IDLE && all_master[i] == master)) {
                                logf << "set slave " << i << endl;
                                sp_pointer->slaves[i] = 1;
                                sp_pointer->cpus++;
                            }
                        }

                        // Tell the hosts that they have work to do. This will make them leave
                        // their idle loop.
                        for (i = 0; i < n_host; i++) {
                            //if(i == master || split_point->slaves[i]) {
                            if (sp_pointer->slaves[i]) {
                                // send init message
                                init_message_t init_msg;
                                //copy_position(&(init_msg.init_pos), p);
                                memcpy(init_msg.fen, sp_pointer->fen, (sizeof(char)) * 256);
                                init_msg.master_id = master;
                                init_msg.stack_top = stck_top;
                                init_msg.sp_id = (sp_pointer->sp_id);
                                // set the slave host to "RUNNING" status
                                all_status[i] = HOST_RUNNING; // set it running before it is really running
                                ISend(i, INIT, (void*)(&init_msg), sizeof(init_message_t));
                            }
                        }

                        // =========== log =================
                        logf << "Split { " << endl;
                        logf << " CPUs: " << sp_pointer->cpus << endl;
                        logf << " Master: " << sp_pointer->master << endl;
                        logf << " Stack_top: " << stck_top << endl;
                        logf << " Split_depth: " << sp_pointer->depth << endl;
                        logf << " Slaves: ";
                        for (i = 0; i < n_host; i++) {
                            if (sp_pointer->slaves[i]) {
                                logf << i << " ";
                            }
                        }
                        logf << endl;
                        logf << "}" << endl;
                        // =================================
                    }
                } else if (message_id == TRY_SPLIT) {
                    MPI_Recv(MPI_BOTTOM, 0, MPI_INT, source, message_id, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    mutex_unlock(&lock_mpi);

                    // is this split ok?
                    //ISend(source, DECLINE);
                    int available_helpers[64];
                    if (sp_stack_top[source] < 4 && idle_host_exist(source, available_helpers)) {
                        // make a reservation so that other "TRY_SPLIT" requests cannot claim these hosts
                        int this_master = source; // host that applied for the split
                        for (i = 0; i < n_host; i++) {
                            if ((available_helpers[i] > 0) || (i == this_master)) {
                                all_master[i] = this_master;
                            }
                        }
                        // ==========================
                        logf << "[" << host_id << "] approve split application with ";
                        for (i = 0; i < n_host; i++) {
                            if (all_master[i] == this_master) {
                                logf << i << " ";
                            }
                        }
                        logf << endl;
                        // =========================
                        ISend(source, ACCHELP);
                    } else {
                        logf << "[" << host_id << "] split application got rejected!" << endl;
                        ISend(source, DECLINE);
                    }
                } else if (message_id == MERGE) {
                    // merge
                    merge_message_t merge_msg;
                    MPI_Recv(&merge_msg, sizeof(merge_message_t), MPI_BYTE, source, message_id,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    uint64_t nodes = merge_msg.nodes;
                    int master = merge_msg.master_id;
                    int stack_top = merge_msg.stack_top;
                    cout << " get merge msg from " << source << " for split stack " << stack_top
                         << " and master " << master << " with value " << nodes << endl;
                    sp_pointer = &(sp_stack[master][stack_top]);
                    // record the number of nodes!
                    sp_pointer->nodes += nodes;

                    // next task at split point?
                    search_task_t next_task;
                    pop_next_task(*sp_pointer, next_task);
                    if (next_task.move_to_search != 0) { // a legal move
                        split_message_t next_split_msg;
                        next_split_msg.task = next_task;    // send next task
                        next_split_msg.master_id = master;  // from my host_id
                        next_split_msg.stack_top = stack_top;
                        cout << "send split move " << next_task.move_to_search << endl;
                        // send new task
                        ISend(source, SPLIT, (void*)(&next_split_msg), sizeof(split_message_t));
                    } else {
                        // send CANCEL to let this host go idle
                        /*
                        sp_pointer->cpus--;
                        sp_pointer->slaves[source] = 0;
                        if (source == (sp_pointer->master)) {
                            if (sp_pointer->cpus == 0) {
                                sp_stack_top[master]--;
                                // write back the result!
                                writeback_msg_t write_back;
                                write_back.master_id = master;
                                write_back.host_id = source;
                                write_back.nodes = sp_pointer->nodes;
                                ISend(source, WRITEBACK_SPLIT, (void*)(&write_back), sizeof(writeback_msg_t));
                                usleep(1000);
                                // let master quit
                                ISend(source, QUIT);
                            } else {
                                ISend(source, CANCEL); // idle master
                            }
                        } else {
                            ISend(source, CANCEL);
                        }
                        */
                        sp_pointer->cpus--;
                        sp_pointer->slaves[source] = 0;
                        if (sp_pointer->cpus == 0) {
                            sp_stack_top[master]--;
                            // write back the result!
                            writeback_msg_t write_back;
                            write_back.master_id = master;
                            write_back.host_id = source;
                            write_back.nodes = sp_pointer->nodes;
                            //ISend(source, WRITEBACK_SPLIT, (void*)(&write_back), sizeof(writeback_msg_t));
                            ISend(master, WRITEBACK_SPLIT, (void*)(&write_back), sizeof(writeback_msg_t));
                            usleep(1000);
                            // let master quit
                            if (source == master) {
                                ISend(source, QUIT);
                                // ISend(master, QUIT);
                            } else {
                                ISend(source, CANCEL);
                                usleep(1000);
                                ISend(master, QUIT);
                            }
                        } else {
                            // not done yet, other CPUs are still working
                            if (source == master) {
                                // ISend(source, CANCEL); // idle master, might be used as helpful master
                            } else {
                                ISend(source, CANCEL); // slave is free, and is available to help others
                            }
                        }
                    }
                    mutex_unlock(&lock_mpi);
                } else if (message_id == STATUS) {
                    update_message_t update_msg;
                    MPI_Recv(&update_msg, sizeof(update_message_t), MPI_BYTE, source, message_id,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    all_status[source] = update_msg.new_host_status;
                    //
                    if (all_status[source] == HOST_IDLE) {
                        all_master[source] = -1; // source has no master now
                        // tell others that there is an idle host, so a split might be allowed
                        for (i = 0; i < n_host; i++) {
                            if ((i != source) && (all_status[i] == HOST_RUNNING)) {
                                ISend(i, SPLIT_OPPORTU);
                            }
                        }
                    } else if (all_status[source] == HOST_RUNNING) {
                        //if (all_master[source] == host_id) {
                        //    assert(all_master[source] >= 0);
                        //
                        //}
                    }
                    print_status_table(all_status);
                    mutex_unlock(&lock_mpi);
                } else if (message_id == QUIT) {
                    mutex_unlock(&lock_mpi);
                    usleep(5000 * host_id);
                    for (i = 0; i < n_host; i++) {
                        cout << "[" << host_id << "] max_sp_stack_top = " << max_stck[i] << endl;
                    }
                    host_should_stop = true;
                    break;
                } else {
                    mutex_unlock(&lock_mpi);
                }
                // number of messages handled
                n_message++;
            } else {
                mutex_unlock(&lock_mpi);
            }
        } while (flag);

        if (host_should_stop) {
            break;
        }
    }
    cout << "Manager host exit!" << endl;
}
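/*
 * idle_host_exist() is used by the TRY_SPLIT handler above but is not defined in
 * this file. Below is a minimal sketch of the behaviour it is assumed to have,
 * inferred from how its output array is consumed (hosts marked > 0 get reserved
 * for the requesting master). The member names mirror those used above; the body
 * itself is an assumption, not the project's actual implementation.
 */
#if 0
bool manager_t::idle_host_exist(int requester, int* available_helpers) {
    bool found = false;
    for (int i = 0; i < n_host; i++) {
        // a host can help if it reports HOST_IDLE and is not already reserved
        if (i != requester && all_status[i] == HOST_IDLE && all_master[i] == -1) {
            available_helpers[i] = 1;
            found = true;
        } else {
            available_helpers[i] = 0;
        }
    }
    return found;
}
#endif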
/**
 * Handle messages
 */
void PROCESSOR::handle_message(int source, int message_id) {
    const PSEARCHER psb = processors[0]->searcher;
    /**************************************************
     * SPLIT - Search from received position
     **************************************************/
    if(message_id == SPLIT) {
        SPLIT_MESSAGE split;
        Recv(source, message_id, &split, sizeof(SPLIT_MESSAGE));
        message_available = 0;

        /*setup board by undoing old moves and making new ones*/
        register int i, score, move, using_pvs;
        if(split.pv_length) {
            for(i = 0; i < split.pv_length && i < psb->ply; i++) {
                if(split.pv[i] != psb->hstack[psb->hply - psb->ply + i].move)
                    break;
            }
            while(psb->ply > i) {
                if(psb->hstack[psb->hply - 1].move)
                    psb->POP_MOVE();
                else
                    psb->POP_NULL();
            }
            for(; i < split.pv_length; i++) {
                if(split.pv[i])
                    psb->PUSH_MOVE(split.pv[i]);
                else
                    psb->PUSH_NULL();
            }
        } else {
            psb->PUSH_MOVE(split.pv[0]);
        }

        /*reset*/
        SEARCHER::abort_search = 0;
        psb->clear_block();

        /**************************************
         * PVS-search on root node
         *************************************/
        processors[0]->state = GO;
        using_pvs = false;
        psb->pstack->extension = split.extension;
        psb->pstack->reduction = split.reduction;
        psb->pstack->depth = split.depth;
        psb->pstack->alpha = split.alpha;
        psb->pstack->beta = split.beta;
        psb->pstack->node_type = split.node_type;
        psb->pstack->search_state = split.search_state;
        if(psb->pstack->beta != psb->pstack->alpha + 1) {
            psb->pstack->node_type = CUT_NODE;
            psb->pstack->beta = psb->pstack->alpha + 1;
            using_pvs = true;
        }

        /*Search move and re-search if necessary*/
        move = psb->hstack[psb->hply - 1].move;
        while(true) {
            psb->search();
            if(psb->stop_searcher || SEARCHER::abort_search) {
                move = 0;
                score = 0;
                break;
            }
            score = -psb->pstack->best_score;
            /*re-search with full depth*/
            if(psb->pstack->reduction && score >= -split.alpha) {
                psb->pstack->depth += psb->pstack->reduction;
                psb->pstack->reduction = 0;
                psb->pstack->alpha = split.alpha;
                psb->pstack->beta = split.alpha + 1;
                psb->pstack->node_type = CUT_NODE;
                psb->pstack->search_state = NULL_MOVE;
                continue;
            }
            /*re-search with full window*/
            if(using_pvs && score > -split.beta && score < -split.alpha) {
                using_pvs = false;
                psb->pstack->alpha = split.alpha;
                psb->pstack->beta = split.beta;
                psb->pstack->node_type = split.node_type;
                psb->pstack->search_state = NULL_MOVE;
                continue;
            }
            break;
        }

        /*undo moves : Go to previous ply even if search was interrupted*/
        while(psb->ply > psb->stop_ply - 1) {
            if(psb->hstack[psb->hply - 1].move)
                psb->POP_MOVE();
            else
                psb->POP_NULL();
        }
        processors[0]->state = WAIT;

        /***********************************************************
         * Send result back and release all helper nodes we acquired
         ***********************************************************/
        PROCESSOR::cancel_idle_hosts();

        MERGE_MESSAGE merge;
        merge.nodes = psb->nodes;
        merge.qnodes = psb->qnodes;
        merge.time_check = psb->time_check;
        merge.splits = psb->splits;
        merge.bad_splits = psb->bad_splits;
        merge.egbb_probes = psb->egbb_probes;
        /*pv*/
        merge.master = split.master;
        merge.best_move = move;
        merge.best_score = score;
        merge.pv_length = 0;
        if(move && score > -split.beta && score < -split.alpha) {
            merge.pv[0] = move;
            memcpy(&merge.pv[1], &(psb->pstack + 1)->pv[psb->ply + 1],
                ((psb->pstack + 1)->pv_length - psb->ply) * sizeof(MOVE));
            merge.pv_length = (psb->pstack + 1)->pv_length - psb->ply;
        }
        /*send it*/
        MPI_Request rq;
        ISend(source, PROCESSOR::MERGE, &merge, MERGE_MESSAGE_SIZE(merge), &rq);
        Wait(&rq);

    } else if(message_id == MERGE) {
        /**************************************************
         * MERGE - Merge result of move at split point
         **************************************************/
        MERGE_MESSAGE merge;
        Recv(source, message_id, &merge, sizeof(MERGE_MESSAGE));

        /*update master*/
        PSEARCHER master = (PSEARCHER)merge.master;
        l_lock(master->lock);
        if(merge.best_move && merge.best_score > master->pstack->best_score) {
            master->pstack->best_score = merge.best_score;
            master->pstack->best_move = merge.best_move;
            if(merge.best_score > master->pstack->alpha) {
                if(merge.best_score > master->pstack->beta) {
                    master->pstack->flag = LOWER;
                    l_unlock(master->lock);
                    master->handle_fail_high();
                    l_lock(master->lock);
                } else {
                    master->pstack->flag = EXACT;
                    master->pstack->alpha = merge.best_score;
                    memcpy(&master->pstack->pv[master->ply], &merge.pv[0],
                        merge.pv_length * sizeof(MOVE));
                    master->pstack->pv_length = merge.pv_length + master->ply;
                }
            }
        }
        /*update counts*/
        master->nodes += merge.nodes;
        master->qnodes += merge.qnodes;
        master->time_check += merge.time_check;
        master->splits += merge.splits;
        master->bad_splits += merge.bad_splits;
        master->egbb_probes += merge.egbb_probes;
        l_unlock(master->lock);

        /*
         * We finished searching one move from the current split.
         * Check for more moves there and keep on searching.
         * Otherwise remove the node from the split's helper list,
         * and add it to the list of idle helpers.
         */
        l_lock(lock_smp);
        SPLIT_MESSAGE& split = global_split[source];
        if(!master->stop_searcher && master->get_cluster_move(&split, true)) {
            l_unlock(lock_smp);
            ISend(source, PROCESSOR::SPLIT, &split, RESPLIT_MESSAGE_SIZE(split));
        } else {
            if(n_hosts > 2)
                ISend(source, CANCEL);
            else
                available_host_workers.push_back(source);
            l_unlock(lock_smp);
            /*remove from current split*/
            l_lock(master->lock);
            master->host_workers.remove(source);
            master->n_host_workers--;
            l_unlock(master->lock);
        }

    /******************************************************************
     * INIT - Set up position from FEN and prepare threaded search
     ******************************************************************/
    } else if(message_id == INIT) {
        INIT_MESSAGE init;
        Recv(source, message_id, &init, sizeof(INIT_MESSAGE));

        /*setup board*/
        psb->set_board((char*)init.fen);

        /*make moves*/
        register int i;
        for(i = 0; i < init.pv_length; i++) {
            if(init.pv[i])
                psb->do_move(init.pv[i]);
            else
                psb->do_null();
        }
#ifdef PARALLEL
        /*wakeup processors*/
        for(i = 0; i < n_processors; i++)
            processors[i]->state = WAIT;
#endif
    /***********************************
     * Distributed transposition table
     ************************************/
#if DST_TT_TYPE == 1
    } else if(message_id == RECORD_TT) {
        TT_MESSAGE ttmsg;
        Recv(source, message_id, &ttmsg, sizeof(TT_MESSAGE));
        /*record*/
        psb->record_hash(ttmsg.col, ttmsg.hash_key, ttmsg.depth, ttmsg.ply,
            ttmsg.flags, ttmsg.score, ttmsg.move,
            ttmsg.mate_threat, ttmsg.singular);
    } else if(message_id == PROBE_TT) {
        TT_MESSAGE ttmsg;
        Recv(source, message_id, &ttmsg, sizeof(TT_MESSAGE));
        /*probe*/
        int proc_id = ttmsg.flags;
        int h_depth, score, mate_threat, singular;
        ttmsg.flags = psb->probe_hash(ttmsg.col, ttmsg.hash_key, ttmsg.depth, ttmsg.ply,
            score, ttmsg.move, ttmsg.alpha, ttmsg.beta,
            mate_threat, singular, h_depth, false);
        ttmsg.depth = h_depth;
        ttmsg.score = (BMP16)score;
        ttmsg.mate_threat = (UBMP8)mate_threat;
        ttmsg.singular = (UBMP8)singular;
        ttmsg.ply = proc_id;    //embed processor_id in message
        /*send*/
        MPI_Request rq;
        ISend(source, PROCESSOR::PROBE_TT_RESULT, &ttmsg, sizeof(TT_MESSAGE), &rq);
        Wait(&rq);
    } else if(message_id == PROBE_TT_RESULT) {
        TT_MESSAGE ttmsg;
        Recv(source, message_id, &ttmsg, sizeof(TT_MESSAGE));
        /*copy tt entry to processor*/
        int proc_id = ttmsg.ply;
        PPROCESSOR proc = processors[proc_id];
        proc->ttmsg = ttmsg;
        proc->ttmsg_recieved = true;
#endif
    /******************************************
     * Handle notification (zero-size) messages
     *******************************************/
    } else {
        Recv(source, message_id);
        if(message_id == HELP) {
            l_lock(lock_smp);
            if(n_idle_processors == n_processors)
                ISend(source, CANCEL);
            else
                available_host_workers.push_back(source);
            l_unlock(lock_smp);
        } else if(message_id == CANCEL) {
            help_messages--;
        } else if(message_id == QUIT) {
            SEARCHER::abort_search = 1;
        } else if(message_id == GOROOT) {
            message_available = 0;
            SEARCHER::chess_clock.infinite_mode = true;
            int save = processors[0]->state;
            processors[0]->state = GO;
            psb->find_best();
            processors[0]->state = save;
        } else if(message_id == PING) {
            ISend(source, PONG);
        }
    }
}
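/*
 * For reference, the requesting side of the PROBE_TT round trip is assumed to look
 * roughly like the sketch below: the processor id travels out in ttmsg.flags, is
 * echoed back in ttmsg.ply by the handler above, and the requester polls
 * ttmsg_recieved until handle_message() fills in its copy. This is an illustration
 * only; names not appearing above (probe_remote_hash_sketch, t_yield) are
 * hypothetical.
 */
#if 0
int PROCESSOR::probe_remote_hash_sketch(int dest_host, TT_MESSAGE& ttmsg, int proc_id) {
    ttmsg.flags = proc_id;                          /* embed processor id in the request */
    processors[proc_id]->ttmsg_recieved = false;
    MPI_Request rq;
    ISend(dest_host, PROCESSOR::PROBE_TT, &ttmsg, sizeof(TT_MESSAGE), &rq);
    Wait(&rq);
    while(!processors[proc_id]->ttmsg_recieved)     /* wait for PROBE_TT_RESULT */
        t_yield();
    ttmsg = processors[proc_id]->ttmsg;
    return ttmsg.flags;                             /* probe_hash() result comes back in flags */
}
#endif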