/*
 * Send a msg to the next msg aggregation collector node. If primary
 * collector is unavailable or returns error, try backup collector.
 * If backup collector is unavailable or returns error, send msg
 * directly to controller.
 */
static int _send_to_next_collector(slurm_msg_t *msg)
{
    slurm_addr_t *next_dest = NULL;
    bool i_am_collector;
    int rc = SLURM_SUCCESS;

    if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
        info("msg aggr: send_to_next_collector: getting primary next "
             "collector");

    if ((next_dest = route_g_next_collector(&i_am_collector))) {
        if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE) {
            char addrbuf[100];
            slurm_print_slurm_addr(next_dest, addrbuf, sizeof(addrbuf));
            info("msg aggr: send_to_next_collector: *next_dest is %s",
                 addrbuf);
        }
        memcpy(&msg->address, next_dest, sizeof(slurm_addr_t));
        rc = slurm_send_only_node_msg(msg);
    }

    if (!next_dest || (rc != SLURM_SUCCESS))
        rc = _send_to_backup_collector(msg, rc);

    return rc;
}
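/*
 * Send a msg to the backup msg aggregation collector node. Called when
 * the primary collector could not be found or did not accept the msg
 * (rc holds the result of that attempt). If the backup collector is also
 * unavailable or returns an error, send the msg directly to the
 * controller.
 */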
static int _send_to_backup_collector(slurm_msg_t *msg, int rc)
{
    slurm_addr_t *next_dest = NULL;

    if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE) {
        info("_send_to_backup_collector: primary %s, getting backup",
             rc ? "can't be reached" : "is null");
    }

    if ((next_dest = route_g_next_collector_backup())) {
        if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE) {
            char addrbuf[100];
            slurm_print_slurm_addr(next_dest, addrbuf, sizeof(addrbuf));
            info("_send_to_backup_collector: *next_dest is %s", addrbuf);
        }
        memcpy(&msg->address, next_dest, sizeof(slurm_addr_t));
        rc = slurm_send_only_node_msg(msg);
    }

    if (!next_dest || (rc != SLURM_SUCCESS)) {
        if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
            info("_send_to_backup_collector: backup %s, "
                 "sending msg to controller",
                 rc ? "can't be reached" : "is null");
        rc = slurm_send_only_controller_msg(msg, working_cluster_rec);
    }

    return rc;
}
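/*
 * Process a composite response returned from the collector tree.
 * For each RESPONSE_SLURM_RC in the msg_list, wake the sending thread
 * waiting on the matching msg_index; for each nested
 * RESPONSE_MESSAGE_COMPOSITE, forward the msg to the node recorded as
 * its sender.
 */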
extern void msg_aggr_resp(slurm_msg_t *msg)
{
    slurm_msg_t *next_msg;
    composite_msg_t *comp_msg;
    msg_aggr_t *msg_aggr;
    ListIterator itr;

    comp_msg = (composite_msg_t *)msg->data;
    itr = list_iterator_create(comp_msg->msg_list);
    if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
        info("msg_aggr_resp: processing composite msg_list...");

    while ((next_msg = list_next(itr))) {
        switch (next_msg->msg_type) {
        case RESPONSE_SLURM_RC:
            /* signal sending thread that slurmctld received this
             * epilog complete msg */
            if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
                info("msg_aggr_resp: rc message found for index %u, "
                     "signaling sending thread",
                     next_msg->msg_index);
            slurm_mutex_lock(&msg_collection.aggr_mutex);
            if (!(msg_aggr = _handle_msg_aggr_ret(
                      next_msg->msg_index, 1))) {
                debug2("msg_aggr_resp: error: unable to locate aggr "
                       "message struct for job %u",
                       next_msg->msg_index);
                slurm_mutex_unlock(&msg_collection.aggr_mutex);
                continue;
            }
            pthread_cond_signal(&msg_aggr->wait_cond);
            slurm_mutex_unlock(&msg_collection.aggr_mutex);
            break;
        case RESPONSE_MESSAGE_COMPOSITE:
            comp_msg = (composite_msg_t *)next_msg->data;
            /* set up the address here for the next node */
            memcpy(&next_msg->address, &comp_msg->sender,
                   sizeof(slurm_addr_t));
            if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE) {
                char addrbuf[100];
                slurm_print_slurm_addr(&next_msg->address, addrbuf,
                                       sizeof(addrbuf));
                info("msg_aggr_resp: composite response msg found "
                     "for %s", addrbuf);
            }
            slurm_send_only_node_msg(next_msg);
            break;
        default:
            error("msg_aggr_resp: invalid msg type in composite "
                  "msg_list");
            break;
        }
    }
    list_iterator_destroy(itr);

    if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
        info("msg aggr: msg_aggr_resp: finished processing composite "
             "msg_list...");
}
/*
 * _thread_per_group_rpc - thread to issue an RPC for a group of nodes
 *                         sending message out to one and forwarding it to
 *                         others if necessary.
 * IN/OUT args - pointer to task_info_t, xfree'd on completion
 */
static void *_thread_per_group_rpc(void *args)
{
    int rc = SLURM_SUCCESS;
    slurm_msg_t msg;
    task_info_t *task_ptr = (task_info_t *) args;
    /* We cache some pointers from task_info_t because args must be
     * xfree'd before we are finished using them. Freeing early is
     * required for timely termination of this pthread, since an xfree
     * at the end could block on a lock and prevent a timely
     * thread exit. */
    pthread_mutex_t *thread_mutex_ptr = task_ptr->thread_mutex_ptr;
    pthread_cond_t *thread_cond_ptr = task_ptr->thread_cond_ptr;
    uint32_t *threads_active_ptr = task_ptr->threads_active_ptr;
    thd_t *thread_ptr = task_ptr->thread_struct_ptr;
    state_t thread_state = DSH_NO_RESP;
    slurm_msg_type_t msg_type = task_ptr->msg_type;
    bool is_kill_msg, srun_agent;
    List ret_list = NULL;
    ListIterator itr;
    ret_data_info_t *ret_data_info = NULL;
    int found = 0;
    int sig_array[2] = {SIGUSR1, 0};
    /* Locks: Write job, write node */
    slurmctld_lock_t job_write_lock = {
        NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };

    xassert(args != NULL);
    xsignal(SIGUSR1, _sig_handler);
    xsignal_unblock(sig_array);

    is_kill_msg = ((msg_type == REQUEST_KILL_TIMELIMIT) ||
                   (msg_type == REQUEST_TERMINATE_JOB));
    srun_agent = ((msg_type == SRUN_PING) ||
                  (msg_type == SRUN_EXEC) ||
                  (msg_type == SRUN_JOB_COMPLETE) ||
                  (msg_type == SRUN_STEP_MISSING) ||
                  (msg_type == SRUN_TIMEOUT) ||
                  (msg_type == SRUN_USER_MSG) ||
                  (msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
                  (msg_type == SRUN_NODE_FAIL));

    thread_ptr->start_time = time(NULL);

    slurm_mutex_lock(thread_mutex_ptr);
    thread_ptr->state = DSH_ACTIVE;
    thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT;
    slurm_mutex_unlock(thread_mutex_ptr);

    /* send request message */
    slurm_msg_t_init(&msg);
    msg.msg_type = msg_type;
    msg.data = task_ptr->msg_args_ptr;
#if 0
    info("sending message type %u to %s", msg_type, thread_ptr->nodelist);
#endif
    if (task_ptr->get_reply) {
        if (thread_ptr->addr) {
            msg.address = *thread_ptr->addr;
            if (!(ret_list = slurm_send_addr_recv_msgs(
                      &msg, thread_ptr->nodelist, 0))) {
                error("_thread_per_group_rpc: no ret_list given");
                goto cleanup;
            }
        } else {
            if (!(ret_list = slurm_send_recv_msgs(
                      thread_ptr->nodelist, &msg, 0, true))) {
                error("_thread_per_group_rpc: no ret_list given");
                goto cleanup;
            }
        }
    } else {
        if (thread_ptr->addr) {
            //info("got the address");
            msg.address = *thread_ptr->addr;
        } else {
            //info("no address given");
            if (slurm_conf_get_addr(thread_ptr->nodelist,
                                    &msg.address) == SLURM_ERROR) {
                error("_thread_per_group_rpc: can't find address "
                      "for host %s, check slurm.conf",
                      thread_ptr->nodelist);
                goto cleanup;
            }
        }
        //info("sending %u to %s", msg_type, thread_ptr->nodelist);
        if (slurm_send_only_node_msg(&msg) == SLURM_SUCCESS) {
            thread_state = DSH_DONE;
        } else {
            if (!srun_agent)
                _comm_err(thread_ptr->nodelist, msg_type);
        }
        goto cleanup;
    }

    //info("got %d messages back", list_count(ret_list));
    found = 0;
    itr = list_iterator_create(ret_list);
    while ((ret_data_info = list_next(itr)) != NULL) {
        rc = slurm_get_return_code(ret_data_info->type,
                                   ret_data_info->data);
        /* SPECIAL CASE: Mark node as IDLE if job already complete */
        if (is_kill_msg &&
            (rc == ESLURMD_KILL_JOB_ALREADY_COMPLETE)) {
            kill_job_msg_t *kill_job;
            kill_job = (kill_job_msg_t *) task_ptr->msg_args_ptr;
            rc = SLURM_SUCCESS;
            lock_slurmctld(job_write_lock);
            if (job_epilog_complete(kill_job->job_id,
                                    ret_data_info->node_name,
                                    rc))
                run_scheduler = true;
            unlock_slurmctld(job_write_lock);
        }

        /* SPECIAL CASE: Kill non-startable batch job,
         * Requeue the job on ESLURMD_PROLOG_FAILED */
        if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) &&
            (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) &&
            (ret_data_info->type != RESPONSE_FORWARD_FAILED)) {
            batch_job_launch_msg_t *launch_msg_ptr =
                task_ptr->msg_args_ptr;
            uint32_t job_id = launch_msg_ptr->job_id;
            info("Killing non-startable batch job %u: %s",
                 job_id, slurm_strerror(rc));
            thread_state = DSH_DONE;
            ret_data_info->err = thread_state;
            lock_slurmctld(job_write_lock);
            job_complete(job_id, 0, false, false, _wif_status());
            unlock_slurmctld(job_write_lock);
            continue;
        }

        if (((msg_type == REQUEST_SIGNAL_TASKS) ||
             (msg_type == REQUEST_TERMINATE_TASKS)) &&
            (rc == ESRCH)) {
            /* process is already dead, not a real error */
            rc = SLURM_SUCCESS;
        }

        switch (rc) {
        case SLURM_SUCCESS:
            /* debug("agent processed RPC to node %s",
             *       ret_data_info->node_name); */
            thread_state = DSH_DONE;
            break;
        case SLURM_UNKNOWN_FORWARD_ADDR:
            error("We were unable to forward message to '%s'. "
                  "Make sure the slurm.conf for each slurmd "
                  "contains all other nodes in your system.",
                  ret_data_info->node_name);
            thread_state = DSH_NO_RESP;
            break;
        case ESLURMD_EPILOG_FAILED:
            error("Epilog failure on host %s, setting DOWN",
                  ret_data_info->node_name);
            thread_state = DSH_FAILED;
            break;
        case ESLURMD_PROLOG_FAILED:
            thread_state = DSH_FAILED;
            break;
        case ESLURM_INVALID_JOB_ID:
            /* Not indicative of a real error */
        case ESLURMD_JOB_NOTRUNNING:
            /* Not indicative of a real error */
            debug2("agent processed RPC to node %s: %s",
                   ret_data_info->node_name, slurm_strerror(rc));
            thread_state = DSH_DONE;
            break;
        default:
            if (!srun_agent) {
                if (ret_data_info->err)
                    errno = ret_data_info->err;
                else
                    errno = rc;
                rc = _comm_err(ret_data_info->node_name, msg_type);
            }
            if (srun_agent)
                thread_state = DSH_FAILED;
            else if (ret_data_info->type == RESPONSE_FORWARD_FAILED)
                /* check if a forward failed */
                thread_state = DSH_NO_RESP;
            else {
                /* Some requests fail without anything actually going
                 * wrong, e.g. a job termination request for a job
                 * that has already finished; just treat those as
                 * done. */
                thread_state = DSH_DONE;
            }
        }
        ret_data_info->err = thread_state;
    }
    list_iterator_destroy(itr);

cleanup:
    xfree(args);

    /* handled at end of thread just in case resend is needed */
    destroy_forward(&msg.forward);

    slurm_mutex_lock(thread_mutex_ptr);
    thread_ptr->ret_list = ret_list;
    thread_ptr->state = thread_state;
    thread_ptr->end_time = (time_t) difftime(time(NULL),
                                             thread_ptr->start_time);
    /* Signal completion so another thread can replace us */
    (*threads_active_ptr)--;
    pthread_cond_signal(thread_cond_ptr);
    slurm_mutex_unlock(thread_mutex_ptr);
    return (void *) NULL;
}