/*
 * _thread_ipmi_run is the thread calling ipmi and launching
 * _thread_ipmi_write.
 *
 * Runs until flag_energy_accounting_shutdown is set.  Initialization is
 * performed under ipmi_mutex; launch_cond is signalled exactly once so the
 * spawning thread can stop waiting, whether init succeeded or failed.
 */
static void *_thread_ipmi_run(void *no_data)
{
// need input (attr)
	struct timeval tvnow;
	struct timespec abs;	/* absolute wakeup time for the poll loop */

	flag_energy_accounting_shutdown = false;
	if (debug_flags & DEBUG_FLAG_ENERGY)
		info("ipmi-thread: launched");

	/* Allow async cancellation only during (possibly slow) init. */
	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	slurm_mutex_lock(&ipmi_mutex);
	if (_thread_init() != SLURM_SUCCESS) {
		if (debug_flags & DEBUG_FLAG_ENERGY)
			info("ipmi-thread: aborted");
		slurm_mutex_unlock(&ipmi_mutex);
		/* Wake the launcher even on failure so it does not hang. */
		slurm_cond_signal(&launch_cond);
		return NULL;
	}

	/* Back to deferred cancellation for the steady-state loop. */
	(void) pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);

	slurm_mutex_unlock(&ipmi_mutex);
	flag_thread_started = true;

	slurm_cond_signal(&launch_cond);

	/* setup timer */
	gettimeofday(&tvnow, NULL);
	abs.tv_sec = tvnow.tv_sec;
	abs.tv_nsec = tvnow.tv_usec * 1000;

	//loop until slurm stop
	while (!flag_energy_accounting_shutdown) {
		slurm_mutex_lock(&ipmi_mutex);

		_thread_update_node_energy();

		/* Sleep until the next time.  The timedwait doubles as the
		 * shutdown wakeup: ipmi_cond is signalled to end early. */
		abs.tv_sec += slurm_ipmi_conf.freq;
		slurm_cond_timedwait(&ipmi_cond, &ipmi_mutex, &abs);

		slurm_mutex_unlock(&ipmi_mutex);
	}

	if (debug_flags & DEBUG_FLAG_ENERGY)
		info("ipmi-thread: ended");

	return NULL;
}
/* Terminate the gang scheduling thread and free its data structures.
 * No-op unless gang preemption is configured.  Signals the timeslicer
 * thread to shut down, then frees the preemption and partition lists. */
extern void gs_fini(void)
{
	if (!(slurmctld_conf.preempt_mode & PREEMPT_MODE_GANG))
		return;

	/* terminate the timeslicer thread */
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering gs_fini");
	slurm_mutex_lock(&thread_flag_mutex);
	if (thread_running) {
		/* Wake the thread via term_cond while holding term_lock so
		 * the shutdown flag cannot be missed. */
		slurm_mutex_lock(&term_lock);
		thread_shutdown = true;
		slurm_cond_signal(&term_cond);
		slurm_mutex_unlock(&term_lock);
		slurm_mutex_unlock(&thread_flag_mutex);
		/* Give the thread a moment to exit; it clears
		 * timeslicer_thread_id on its way out. */
		usleep(120000);
		if (timeslicer_thread_id)
			error("gang: timeslicer pthread still running");
	} else {
		slurm_mutex_unlock(&thread_flag_mutex);
	}
	FREE_NULL_LIST(preempt_job_list);

	/* Free shared scheduling data under its own lock. */
	slurm_mutex_lock(&data_mutex);
	FREE_NULL_LIST(gs_part_list);
	gs_part_list = NULL;
	xfree(gs_bits_per_node);
	slurm_mutex_unlock(&data_mutex);
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving gs_fini");
}
/* Terminate builtin_agent: set the stop flag under term_lock and wake the
 * agent thread waiting on term_cond so it notices the flag promptly. */
extern void stop_builtin_agent(void)
{
	slurm_mutex_lock(&term_lock);
	stop_builtin = true;
	slurm_cond_signal(&term_cond);
	slurm_mutex_unlock(&term_lock);
}
/*
 * Thread body that creates the job container and then parks itself.
 *
 * The thread must stay alive (as a member of the container) until told to
 * exit, so after signalling creation it blocks on the same condition
 * variable until _end_container_thread signals it again.
 */
static void *_create_container_thread(void *args)
{
	stepd_step_rec_t *job = (stepd_step_rec_t *)args;

	/* job_create returns (jid_t)-1 on failure; stored widened. */
	job->cont_id = (uint64_t)job_create(0, job->uid, 0);

	/* Signal the container_create we are done */
	slurm_mutex_lock(&notify_mutex);

	/* We need to signal failure or not */
	slurm_cond_signal(&notify);

	/* Don't unlock the notify_mutex here, wait, it is not needed
	 * and can cause deadlock if done. */

	if (job->cont_id == (jid_t)-1)
		error("Failed to create job container: %m");
	else
		/* Wait around for something else to be added and then exit
		   when that takes place. */
		slurm_cond_wait(&notify, &notify_mutex);

	slurm_mutex_unlock(&notify_mutex);

	return NULL;
}
/* Stop the step-terminate monitor thread.  Idempotent: returns quietly if
 * the monitor never started, and logs an error on a duplicate stop.  Joins
 * the monitor thread before freeing its resources. */
void step_terminate_monitor_stop(void)
{
	slurm_mutex_lock(&lock);

	if (!running_flag) {
		slurm_mutex_unlock(&lock);
		return;
	}
	if (stop_flag) {
		error("step_terminate_monitor_stop: already stopped");
		slurm_mutex_unlock(&lock);
		return;
	}

	stop_flag = 1;
	debug("step_terminate_monitor_stop signalling condition");
	slurm_cond_signal(&cond);
	slurm_mutex_unlock(&lock);

	/* Wait for the monitor thread to exit before freeing its state. */
	if (pthread_join(tid, NULL) != 0) {
		error("step_terminate_monitor_stop: pthread_join: %m");
	}

	xfree(program_name);
	return;
}
extern void acct_gather_profile_endpoll(void) { int i; if (!acct_gather_profile_running) { debug2("acct_gather_profile_startpoll: poll already ended!"); return; } acct_gather_profile_running = false; for (i=0; i < PROFILE_CNT; i++) { /* end remote threads */ slurm_mutex_lock(&acct_gather_profile_timer[i].notify_mutex); slurm_cond_signal(&acct_gather_profile_timer[i].notify); slurm_mutex_unlock(&acct_gather_profile_timer[i].notify_mutex); slurm_cond_destroy(&acct_gather_profile_timer[i].notify); acct_gather_profile_timer[i].freq = 0; switch (i) { case PROFILE_ENERGY: break; case PROFILE_TASK: jobacct_gather_endpoll(); break; case PROFILE_FILESYSTEM: break; case PROFILE_NETWORK: break; default: fatal("Unhandled profile option %d please update " "slurm_acct_gather_profile.c " "(acct_gather_profile_endpoll)", i); } } }
/*
 * Remove entries from msg_collection.msg_aggr_list.
 *
 * msg_index == 0 means "flush everything": every waiter is signalled and
 * every record removed (NULL is returned).  A non-zero index removes and
 * returns the matching record, or NULL if none matches.  The aggr_mutex
 * is taken here unless the caller says it already holds it.
 */
static msg_aggr_t *_handle_msg_aggr_ret(uint32_t msg_index, bool locked)
{
	msg_aggr_t *entry;
	ListIterator iter;
	bool flush_all = (msg_index == 0);

	if (!locked)
		slurm_mutex_lock(&msg_collection.aggr_mutex);

	iter = list_iterator_create(msg_collection.msg_aggr_list);
	while ((entry = list_next(iter))) {
		if (flush_all) {
			/* make sure no sender keeps waiting */
			slurm_cond_signal(&entry->wait_cond);
			list_remove(iter);
			continue;
		}
		if (entry->msg_index == msg_index) {
			list_remove(iter);
			break;
		}
	}
	list_iterator_destroy(iter);

	if (!locked)
		slurm_mutex_unlock(&msg_collection.aggr_mutex);

	return entry;
}
/* Stop the heartbeat thread: clear its run flag under heartbeat_mutex and
 * signal heartbeat_cond so a sleeping thread wakes and exits.  No-op if
 * the heartbeat is not running. */
void heartbeat_stop(void)
{
	slurm_mutex_lock(&heartbeat_mutex);
	if (heart_beating) {
		heart_beating = false;
		slurm_cond_signal(&heartbeat_cond);
	}
	slurm_mutex_unlock(&heartbeat_mutex);
}
/*
 * config_power_mgr - Read power management configuration.
 *
 * Loads the power-save configuration exactly once (guarded by
 * power_save_config) and signals power_cond so any thread waiting for the
 * configuration to be available can proceed.
 */
extern void config_power_mgr(void)
{
	slurm_mutex_lock(&power_mutex);
	if (!power_save_config) {
		/* _init_power_config() returns 0 on success. */
		if (_init_power_config() == 0)
			power_save_enabled = true;
		power_save_config = true;
	}
	slurm_cond_signal(&power_cond);
	slurm_mutex_unlock(&power_mutex);
}
/* Wake the parked container thread (see _create_container_thread, which
 * blocks on notify) and join it.  thread_mutex serializes concurrent
 * enders; the notify signal is sent under notify_mutex so it cannot be
 * lost. */
static void _end_container_thread(void)
{
	if (threadid) {
		/* This will end the thread and remove it from the container */
		slurm_mutex_lock(&thread_mutex);
		slurm_mutex_lock(&notify_mutex);
		slurm_cond_signal(&notify);
		slurm_mutex_unlock(&notify_mutex);

		pthread_join(threadid, NULL);
		threadid = 0;
		slurm_mutex_unlock(&thread_mutex);
	}
}
/* Free a fwd_tree_t and account for the exiting forwarding thread.
 * NULL-safe.  Decrements the shared thread counter and signals the
 * waiter so start_msg_tree can notice the count reached zero. */
void _destroy_tree_fwd(fwd_tree_t *fwd_tree)
{
	if (fwd_tree) {
		if (fwd_tree->tree_hl)
			hostlist_destroy(fwd_tree->tree_hl);

		/*
		 * Lock and decrease thread counter, start_msg_tree is waiting
		 * for a null thread count to exit its main loop
		 */
		slurm_mutex_lock(fwd_tree->tree_mutex);
		(*(fwd_tree->p_thr_count))--;
		slurm_cond_signal(fwd_tree->notify);
		slurm_mutex_unlock(fwd_tree->tree_mutex);

		xfree(fwd_tree);
	}
}
/*
 * Shut down the message-aggregation sender.
 *
 * Wakes the sender thread, joins it, flushes any senders still waiting
 * for responses, then tears down the lists and synchronization objects.
 * No-op if the sender was never started.
 *
 * Fix: msg_collection.running is now cleared while holding
 * msg_collection.mutex.  It was previously written before the lock, so a
 * sender blocked in slurm_cond_wait() on msg_collection.cond could race
 * with the unsynchronized flag write (a data race on a cross-thread
 * flag); clearing it under the mutex pairs the write with the condition
 * signal.
 */
extern void msg_aggr_sender_fini(void)
{
	if (!msg_collection.running)
		return;

	slurm_mutex_lock(&msg_collection.mutex);
	msg_collection.running = 0;	/* cleared under lock; see above */
	slurm_cond_signal(&msg_collection.cond);
	slurm_mutex_unlock(&msg_collection.mutex);

	pthread_join(msg_collection.thread_id, NULL);
	msg_collection.thread_id = (pthread_t) 0;

	slurm_cond_destroy(&msg_collection.cond);
	/* signal and clear the waiting list */
	slurm_mutex_lock(&msg_collection.aggr_mutex);
	_handle_msg_aggr_ret(0, 1);
	FREE_NULL_LIST(msg_collection.msg_aggr_list);
	slurm_mutex_unlock(&msg_collection.aggr_mutex);
	FREE_NULL_LIST(msg_collection.msg_list);
	slurm_mutex_destroy(&msg_collection.mutex);
}
/* Tear down the interconnect accounting plugins.
 *
 * Wakes and joins the watch-node thread first (g_context_lock is dropped
 * around the join to avoid deadlocking with the thread itself), then
 * destroys every loaded plugin context.  Returns SLURM_SUCCESS, or
 * SLURM_ERROR if any plugin context failed to destroy. */
extern int acct_gather_interconnect_fini(void)
{
	int rc2, rc = SLURM_SUCCESS;
	int i;

	slurm_mutex_lock(&g_context_lock);

	init_run = false;

	if (watch_node_thread_id) {
		/* Release g_context_lock while signalling/joining the
		 * watcher thread, then re-acquire it for plugin teardown. */
		slurm_mutex_unlock(&g_context_lock);
		slurm_mutex_lock(&profile_timer->notify_mutex);
		slurm_cond_signal(&profile_timer->notify);
		slurm_mutex_unlock(&profile_timer->notify_mutex);
		pthread_join(watch_node_thread_id, NULL);
		slurm_mutex_lock(&g_context_lock);
	}

	for (i = 0; i < g_context_num; i++) {
		if (!g_context[i])
			continue;

		rc2 = plugin_context_destroy(g_context[i]);
		if (rc2 != SLURM_SUCCESS) {
			debug("%s: %s: %s", __func__, g_context[i]->type,
			      slurm_strerror(rc2));
			rc = SLURM_ERROR;
		}
	}
	xfree(ops);
	xfree(g_context);
	g_context_num = -1;

	slurm_mutex_unlock(&g_context_lock);

	return rc;
}
/*
 * Thread body servicing one accepted connection on the stepd message
 * socket.
 *
 * Protocol: the peer must first send REQUEST_CONNECT followed by a packed
 * auth credential, which is verified to obtain uid/gid.  On success the
 * protocol version is written back and requests are processed until
 * _handle_request returns non-success.  The message_connections counter
 * is decremented (and message_cond signalled) on the normal exit path.
 *
 * NOTE: safe_read/safe_write jump to the rwfail label on I/O failure.
 */
static void * _handle_accept(void *arg)
{
	/*struct request_params *param = (struct request_params *)arg;*/
	int fd = ((struct request_params *)arg)->fd;
	stepd_step_rec_t *job = ((struct request_params *)arg)->job;
	int req;
	int len;
	Buf buffer;
	void *auth_cred;
	int rc;
	uid_t uid;
	gid_t gid;
	char *auth_info;

	debug3("Entering _handle_accept (new thread)");
	xfree(arg);	/* params were heap-allocated by the spawner */

	safe_read(fd, &req, sizeof(int));
	if (req != REQUEST_CONNECT) {
		error("First message must be REQUEST_CONNECT");
		goto fail;
	}

	safe_read(fd, &len, sizeof(int));
	buffer = init_buf(len);
	safe_read(fd, get_buf_data(buffer), len);

	/* Unpack and verify the auth credential */
	auth_cred = g_slurm_auth_unpack(buffer);
	if (auth_cred == NULL) {
		error("Unpacking authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
		free_buf(buffer);
		goto fail;
	}
	auth_info = slurm_get_auth_info();
	rc = g_slurm_auth_verify(auth_cred, NULL, 2, auth_info);
	if (rc != SLURM_SUCCESS) {
		error("Verifying authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
		xfree(auth_info);
		(void) g_slurm_auth_destroy(auth_cred);
		free_buf(buffer);
		goto fail;
	}

	/* Get the uid & gid from the credential, then destroy it. */
	uid = g_slurm_auth_get_uid(auth_cred, auth_info);
	gid = g_slurm_auth_get_gid(auth_cred, auth_info);
	xfree(auth_info);
	debug3("  Identity: uid=%d, gid=%d", uid, gid);
	g_slurm_auth_destroy(auth_cred);
	free_buf(buffer);

	/* Tell the client our protocol version before serving requests. */
	rc = SLURM_PROTOCOL_VERSION;
	safe_write(fd, &rc, sizeof(int));

	while (1) {
		rc = _handle_request(fd, job, uid, gid);
		if (rc != SLURM_SUCCESS)
			break;
	}

	if (close(fd) == -1)
		error("Closing accepted fd: %m");

	/* Let any waiter know one fewer connection is active. */
	slurm_mutex_lock(&message_lock);
	message_connections--;
	slurm_cond_signal(&message_cond);
	slurm_mutex_unlock(&message_lock);

	debug3("Leaving  _handle_accept");
	return NULL;

fail:
	rc = SLURM_FAILURE;
	safe_write(fd, &rc, sizeof(int));
rwfail:
	if (close(fd) == -1)
		error("Closing accepted fd after error: %m");
	debug("Leaving  _handle_accept on an error");
	return NULL;
}
/*
 * Process a composite response message.
 *
 * Walks the embedded msg_list: for launch/RC responses it locates the
 * matching aggregation record and wakes the thread waiting in
 * msg_aggr_add_msg (invoking its callback first where applicable); for
 * nested composite responses it forwards them to the sender's address.
 *
 * Fixes: slurm_print_slurm_addr was given 32 for a 100-byte buffer
 * (now sizeof(addrbuf)), and two log messages named the wrong function
 * ("_rpc_composite_resp") — corrected to msg_aggr_resp.
 */
extern void msg_aggr_resp(slurm_msg_t *msg)
{
	slurm_msg_t *next_msg;
	composite_msg_t *comp_msg;
	msg_aggr_t *msg_aggr;
	ListIterator itr;

	comp_msg = (composite_msg_t *)msg->data;
	itr = list_iterator_create(comp_msg->msg_list);
	if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
		info("msg_aggr_resp: processing composite msg_list...");
	while ((next_msg = list_next(itr))) {
		switch (next_msg->msg_type) {
		case REQUEST_BATCH_JOB_LAUNCH:
		case RESPONSE_SLURM_RC:
			/* signal sending thread that slurmctld received this
			 * msg */
			if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
				info("msg_aggr_resp: response found for "
				     "index %u signaling sending thread",
				     next_msg->msg_index);
			slurm_mutex_lock(&msg_collection.aggr_mutex);
			if (!(msg_aggr = _handle_msg_aggr_ret(
				      next_msg->msg_index, 1))) {
				debug2("msg_aggr_resp: error: unable to "
				       "locate aggr message struct for job %u",
				       next_msg->msg_index);
				slurm_mutex_unlock(&msg_collection.aggr_mutex);
				continue;
			}
			if (msg_aggr->resp_callback &&
			    (next_msg->msg_type != RESPONSE_SLURM_RC))
				(*(msg_aggr->resp_callback))(next_msg);
			slurm_cond_signal(&msg_aggr->wait_cond);
			slurm_mutex_unlock(&msg_collection.aggr_mutex);
			break;
		case RESPONSE_MESSAGE_COMPOSITE:
			comp_msg = (composite_msg_t *)next_msg->data;
			/* set up the address here for the next node */
			memcpy(&next_msg->address, &comp_msg->sender,
			       sizeof(slurm_addr_t));
			if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE) {
				char addrbuf[100];
				slurm_print_slurm_addr(&next_msg->address,
						       addrbuf,
						       sizeof(addrbuf));
				info("msg_aggr_resp: composite response msg "
				     "found for %s", addrbuf);
			}
			slurm_send_only_node_msg(next_msg);
			break;
		default:
			error("msg_aggr_resp: invalid msg type in "
			      "composite msg_list");
			break;
		}
	}
	list_iterator_destroy(itr);
	if (msg_collection.debug_flags & DEBUG_FLAG_ROUTE)
		info("msg_aggr_resp: finished processing "
		     "composite msg_list...");
}
/*
 * Add a message to the aggregation window.
 *
 * Appends msg to the collection (waking the window thread for the first
 * message, and marking the window full at max_msg_cnt).  If wait is true,
 * registers an aggregation record and blocks until the response arrives
 * or the message timeout expires; resp_callback (may be NULL) is invoked
 * by msg_aggr_resp on non-RC responses.
 *
 * Fixes: (1) the 16-bit msg_index counter could wrap to 0, and index 0 is
 * the "flush everything" sentinel understood by _handle_msg_aggr_ret —
 * the counter now skips 0 on wrap; (2) removed a stray empty statement.
 */
extern void msg_aggr_add_msg(slurm_msg_t *msg, bool wait,
			     void (*resp_callback) (slurm_msg_t *msg))
{
	int count;
	static uint16_t msg_index = 1;
	static uint32_t wait_count = 0;

	if (!msg_collection.running)
		return;

	slurm_mutex_lock(&msg_collection.mutex);
	if (msg_collection.max_msgs == true) {
		slurm_cond_wait(&msg_collection.cond, &msg_collection.mutex);
	}

	/* Index 0 is reserved as the flush-all sentinel; skip it on wrap. */
	if (!msg_index)
		msg_index = 1;
	msg->msg_index = msg_index++;

	/* Add msg to message collection */
	list_append(msg_collection.msg_list, msg);

	count = list_count(msg_collection.msg_list);

	/* First msg in collection; initiate new window */
	if (count == 1)
		slurm_cond_signal(&msg_collection.cond);

	/* Max msgs reached; terminate window */
	if (count >= msg_collection.max_msg_cnt) {
		msg_collection.max_msgs = true;
		slurm_cond_signal(&msg_collection.cond);
	}
	slurm_mutex_unlock(&msg_collection.mutex);

	if (wait) {
		msg_aggr_t *msg_aggr = xmalloc(sizeof(msg_aggr_t));
		uint16_t msg_timeout;
		struct timeval now;
		struct timespec timeout;

		msg_aggr->msg_index = msg->msg_index;
		msg_aggr->resp_callback = resp_callback;
		slurm_cond_init(&msg_aggr->wait_cond, NULL);

		slurm_mutex_lock(&msg_collection.aggr_mutex);
		list_append(msg_collection.msg_aggr_list, msg_aggr);

		/* Absolute deadline: now + configured message timeout. */
		msg_timeout = slurm_get_msg_timeout();
		gettimeofday(&now, NULL);
		timeout.tv_sec = now.tv_sec + msg_timeout;
		timeout.tv_nsec = now.tv_usec * 1000;

		wait_count++;

		/* pthread_cond_timedwait used directly: the ETIMEDOUT
		 * return value is needed to reap the stale record. */
		if (pthread_cond_timedwait(&msg_aggr->wait_cond,
					   &msg_collection.aggr_mutex,
					   &timeout) == ETIMEDOUT)
			_handle_msg_aggr_ret(msg_aggr->msg_index, 1);

		wait_count--;
		slurm_mutex_unlock(&msg_collection.aggr_mutex);

		if (!msg_collection.running && !wait_count)
			slurm_mutex_destroy(&msg_collection.aggr_mutex);

		_msg_aggr_free(msg_aggr);
	}
}
/*
 * Thread body that forwards a packed message to the next reachable node
 * in its branch and collects the responses.
 *
 * Walks the branch hostlist: on address-lookup, connect, or send failure
 * the current node is marked failed and (for connect/send failures) the
 * rest of the branch is re-forwarded via _forward_msg_internal so a dead
 * head node does not serialize timeouts for the whole branch.  Responses
 * are transferred into fwd_struct->ret_list under forward_mutex, and
 * fwd_struct->notify is signalled on exit so the originator can stop
 * waiting.
 *
 * NOTE: every path into cleanup must hold forward_mutex, since cleanup
 * signals notify and then unlocks it.
 */
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* Rebuild the forward list for the nodes that remain. */
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		/* Scale the receive timeout by how many forwarding hops
		 * ("steps") remain below this node in the tree. */
		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
				fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0 &&
				  list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1) !=
			   list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr =
				hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			/* Mark every node we never heard from as failed. */
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						     ret_data_info->
						     node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}

	/* Transfer whatever responses we collected to the shared list. */
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	/* Wake the originator waiting on fwd_struct->notify. */
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
/*
 * Thread body that sends the original message to the head of a tree
 * branch, forwarding the rest of the branch along with it.
 *
 * On failure of the branch head the remaining hosts are either retried
 * (connection errors) or re-spawned as new subtrees via
 * _start_msg_tree_internal so one dead node does not serialize timeouts.
 * All results land in fwd_tree->ret_list under tree_mutex, with
 * fwd_tree->notify signalled each time results are added.  The fwd_tree
 * struct is destroyed (and the shared thread count decremented) on exit.
 */
void *_fwd_tree_thread(void *arg)
{
	fwd_tree_t *fwd_tree = (fwd_tree_t *)arg;
	List ret_list = NULL;
	char *name = NULL;
	char *buf = NULL;
	slurm_msg_t send_msg;

	slurm_msg_t_init(&send_msg);
	send_msg.msg_type = fwd_tree->orig_msg->msg_type;
	send_msg.data = fwd_tree->orig_msg->data;
	send_msg.protocol_version = fwd_tree->orig_msg->protocol_version;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(fwd_tree->tree_hl))) {
		if (slurm_conf_get_addr(name, &send_msg.address)
		    == SLURM_ERROR) {
			error("fwd_tree_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(fwd_tree->tree_mutex);
			mark_as_failed_forward(&fwd_tree->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			free(name);

			continue;
		}

		send_msg.forward.timeout = fwd_tree->timeout;
		/* Remaining hosts ride along as the forward list. */
		if ((send_msg.forward.cnt = hostlist_count(fwd_tree->tree_hl))){
			buf = hostlist_ranged_string_xmalloc(
					fwd_tree->tree_hl);
			send_msg.forward.nodelist = buf;
		} else
			send_msg.forward.nodelist = NULL;

		if (send_msg.forward.nodelist && send_msg.forward.nodelist[0]) {
			debug3("Tree sending to %s along with %s",
			       name, send_msg.forward.nodelist);
		} else
			debug3("Tree sending to %s", name);

		ret_list = slurm_send_addr_recv_msgs(&send_msg, name,
						     fwd_tree->timeout);

		xfree(send_msg.forward.nodelist);

		if (ret_list) {
			int ret_cnt = list_count(ret_list);
			/* This is most common if a slurmd is running
			   an older version of Slurm than the
			   originator of the message.
			*/
			if ((ret_cnt <= send_msg.forward.cnt) &&
			    (errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR)) {
				error("fwd_tree_thread: %s failed to forward "
				      "the message, expecting %d ret got only "
				      "%d", name, send_msg.forward.cnt + 1,
				      ret_cnt);
				if (ret_cnt > 1) { /* not likely */
					/* Drop hosts that did answer so we
					 * only retry the missing ones. */
					ret_data_info_t *ret_data_info = NULL;
					ListIterator itr =
						list_iterator_create(ret_list);
					while ((ret_data_info =
						list_next(itr))) {
						if (xstrcmp(ret_data_info->
							    node_name, name))
							hostlist_delete_host(
								fwd_tree->
								tree_hl,
								ret_data_info->
								node_name);
					}
					list_iterator_destroy(itr);
				}
			}

			slurm_mutex_lock(fwd_tree->tree_mutex);
			list_transfer(fwd_tree->ret_list, ret_list);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			FREE_NULL_LIST(ret_list);
			/* try next node */
			if (ret_cnt <= send_msg.forward.cnt) {
				free(name);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_start_msg_tree_internal(
					fwd_tree->tree_hl, NULL,
					fwd_tree,
					hostlist_count(fwd_tree->tree_hl));
				continue;
			}
		} else {
			/* This should never happen (when this was
			 * written slurm_send_addr_recv_msgs always
			 * returned a list */
			error("fwd_tree_thread: no return list given from "
			      "slurm_send_addr_recv_msgs spawned for %s",
			      name);
			slurm_mutex_lock(fwd_tree->tree_mutex);
			mark_as_failed_forward(
				&fwd_tree->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			free(name);

			continue;
		}

		free(name);

		/* check for error and try again */
		if (errno == SLURM_COMMUNICATIONS_CONNECTION_ERROR)
			continue;

		break;
	}

	_destroy_tree_fwd(fwd_tree);

	return NULL;
}
static void * _cancel_job_id (void *ci) { int error_code = SLURM_SUCCESS, i; job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci; bool sig_set = true; uint16_t flags = 0; char *job_type = ""; DEF_TIMERS; if (cancel_info->sig == (uint16_t) NO_VAL) { cancel_info->sig = SIGKILL; sig_set = false; } if (opt.batch) { flags |= KILL_JOB_BATCH; job_type = "batch "; } if (opt.full) { flags |= KILL_FULL_JOB; job_type = "full "; } if (cancel_info->array_flag) flags |= KILL_JOB_ARRAY; if (!cancel_info->job_id_str) { if (cancel_info->array_job_id && (cancel_info->array_task_id == INFINITE)) { xstrfmtcat(cancel_info->job_id_str, "%u_*", cancel_info->array_job_id); } else if (cancel_info->array_job_id) { xstrfmtcat(cancel_info->job_id_str, "%u_%u", cancel_info->array_job_id, cancel_info->array_task_id); } else { xstrfmtcat(cancel_info->job_id_str, "%u", cancel_info->job_id); } } if (!sig_set) { verbose("Terminating %sjob %s", job_type, cancel_info->job_id_str); } else { verbose("Signal %u to %sjob %s", cancel_info->sig, job_type, cancel_info->job_id_str); } for (i = 0; i < MAX_CANCEL_RETRY; i++) { _add_delay(); START_TIMER; error_code = slurm_kill_job2(cancel_info->job_id_str, cancel_info->sig, flags); END_TIMER; slurm_mutex_lock(&max_delay_lock); max_resp_time = MAX(max_resp_time, DELTA_TIMER); slurm_mutex_unlock(&max_delay_lock); if ((error_code == 0) || (errno != ESLURM_TRANSITION_STATE_NO_UPDATE)) break; verbose("Job is in transistional state, retrying"); sleep(5 + i); } if (error_code) { error_code = slurm_get_errno(); if ((opt.verbose > 0) || ((error_code != ESLURM_ALREADY_DONE) && (error_code != ESLURM_INVALID_JOB_ID))) { error("Kill job error on job id %s: %s", cancel_info->job_id_str, slurm_strerror(slurm_get_errno())); } if (((error_code == ESLURM_ALREADY_DONE) || (error_code == ESLURM_INVALID_JOB_ID)) && (cancel_info->sig == SIGKILL)) { error_code = 0; /* Ignore error if job done */ } } /* Purposely free the struct passed in here, so the caller doesn't have * to 
keep track of it, but don't destroy the mutex and condition * variables contained. */ slurm_mutex_lock(cancel_info->num_active_threads_lock); *(cancel_info->rc) = MAX(*(cancel_info->rc), error_code); (*(cancel_info->num_active_threads))--; slurm_cond_signal(cancel_info->num_active_threads_cond); slurm_mutex_unlock(cancel_info->num_active_threads_lock); xfree(cancel_info->job_id_str); xfree(cancel_info); return NULL; }
static void * _cancel_step_id (void *ci) { int error_code = SLURM_SUCCESS, i; job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci; uint32_t job_id = cancel_info->job_id; uint32_t step_id = cancel_info->step_id; bool sig_set = true; DEF_TIMERS; if (cancel_info->sig == (uint16_t) NO_VAL) { cancel_info->sig = SIGKILL; sig_set = false; } if (!cancel_info->job_id_str) { if (cancel_info->array_job_id && (cancel_info->array_task_id == INFINITE)) { xstrfmtcat(cancel_info->job_id_str, "%u_*", cancel_info->array_job_id); } else if (cancel_info->array_job_id) { xstrfmtcat(cancel_info->job_id_str, "%u_%u", cancel_info->array_job_id, cancel_info->array_task_id); } else { xstrfmtcat(cancel_info->job_id_str, "%u", cancel_info->job_id); } } for (i = 0; i < MAX_CANCEL_RETRY; i++) { if (cancel_info->sig == SIGKILL) { verbose("Terminating step %s.%u", cancel_info->job_id_str, step_id); } else { verbose("Signal %u to step %s.%u", cancel_info->sig, cancel_info->job_id_str, step_id); } _add_delay(); START_TIMER; if ((!sig_set) || opt.ctld) error_code = slurm_kill_job_step(job_id, step_id, cancel_info->sig); else if (cancel_info->sig == SIGKILL) error_code = slurm_terminate_job_step(job_id, step_id); else error_code = slurm_signal_job_step(job_id, step_id, cancel_info->sig); END_TIMER; slurm_mutex_lock(&max_delay_lock); max_resp_time = MAX(max_resp_time, DELTA_TIMER); slurm_mutex_unlock(&max_delay_lock); if ((error_code == 0) || ((errno != ESLURM_TRANSITION_STATE_NO_UPDATE) && (errno != ESLURM_JOB_PENDING))) break; verbose("Job is in transistional state, retrying"); sleep(5 + i); } if (error_code) { error_code = slurm_get_errno(); if ((opt.verbose > 0) || (error_code != ESLURM_ALREADY_DONE)) error("Kill job error on job step id %s: %s", cancel_info->job_id_str, slurm_strerror(slurm_get_errno())); if ((error_code == ESLURM_ALREADY_DONE) && (cancel_info->sig == SIGKILL)) { error_code = 0; /* Ignore error if job done */ } } /* Purposely free the struct passed in here, so the caller 
doesn't have * to keep track of it, but don't destroy the mutex and condition * variables contained. */ slurm_mutex_lock(cancel_info->num_active_threads_lock); *(cancel_info->rc) = MAX(*(cancel_info->rc), error_code); (*(cancel_info->num_active_threads))--; slurm_cond_signal(cancel_info->num_active_threads_cond); slurm_mutex_unlock(cancel_info->num_active_threads_lock); xfree(cancel_info->job_id_str); xfree(cancel_info); return NULL; }
/*
 * Periodic timer thread for accounting-profile polling.
 *
 * Every SLEEP_TIME-ish interval, checks each profile type and signals its
 * poller (under the poller's notify_mutex) when that type's frequency has
 * elapsed.  While gathering is suspended, last_notify timestamps are
 * shifted forward so suspended time does not count toward the interval.
 * Exits when init_run or acct_gather_profile_running is cleared.
 */
static void *_timer_thread(void *args)
{
	int i, now, diff;
#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "acctg_prof", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m",
		      __func__, "acctg_prof");
	}
#endif

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	DEF_TIMERS;
	while (init_run && acct_gather_profile_running) {
		slurm_mutex_lock(&g_context_lock);
		START_TIMER;
		now = time(NULL);

		for (i=0; i<PROFILE_CNT; i++) {
			if (acct_gather_suspended) {
				/* Handle suspended time as if it
				 * didn't happen */
				if (!acct_gather_profile_timer[i].freq)
					continue;
				if (acct_gather_profile_timer[i].last_notify)
					acct_gather_profile_timer[i].
						last_notify += SLEEP_TIME;
				else
					acct_gather_profile_timer[i].
						last_notify = now;
				continue;
			}

			diff = now - acct_gather_profile_timer[i].last_notify;
			/* info ("%d is %d and %d", i, */
			/*       acct_gather_profile_timer[i].freq, */
			/*       diff); */
			if (!acct_gather_profile_timer[i].freq
			    || (diff < acct_gather_profile_timer[i].freq))
				continue;
			debug2("profile signalling type %s",
			       acct_gather_profile_type_t_name(i));

			/* signal poller to start */
			slurm_mutex_lock(&acct_gather_profile_timer[i].
					 notify_mutex);
			slurm_cond_signal(
				&acct_gather_profile_timer[i].notify);
			slurm_mutex_unlock(&acct_gather_profile_timer[i].
					   notify_mutex);
			acct_gather_profile_timer[i].last_notify = now;
		}
		END_TIMER;
		slurm_mutex_unlock(&g_context_lock);

		/* NOTE(review): if DELTA_TIMER ever exceeds USLEEP_TIME the
		 * subtraction could go negative/wrap when converted to
		 * usleep()'s unsigned argument — presumably USLEEP_TIME is
		 * chosen large enough; confirm against the macro values. */
		usleep(USLEEP_TIME - DELTA_TIMER);
	}

	return NULL;
}
/*
 * init_power_save - Initialize the power save module. Started as a
 *	pthread. Terminates automatically at slurmctld shutdown time.
 *	Input and output are unused.
 */
static void *_init_power_save(void *arg)
{
        /* Locks: Read nodes */
	slurmctld_lock_t node_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
        /* Locks: Write nodes */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	time_t now, boot_time = 0, last_power_scan = 0;

	/* Config already read but power saving not enabled: nothing to do. */
	if (power_save_config && !power_save_enabled) {
		debug("power_save mode not enabled");
		return NULL;
	}

	suspend_node_bitmap = bit_alloc(node_record_count);
	resume_node_bitmap  = bit_alloc(node_record_count);

	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);

		/* Throttle if too many suspend/resume children remain. */
		if (_reap_procs() < 2) {
			debug("power_save programs getting backlogged");
			continue;
		}

		/* Re-read config on change; a failure disables the mode. */
		if ((last_config != slurmctld_conf.last_update) &&
		    (_init_power_config())) {
			info("power_save mode has been disabled due to "
			     "configuration changes");
			goto fini;
		}

		now = time(NULL);
		if (boot_time == 0)
			boot_time = now;

		/* Only run every 60 seconds or after a node state change,
		 * whichever happens first */
		if ((last_node_update >= last_power_scan) ||
		    (now >= (last_power_scan + 60))) {
			lock_slurmctld(node_write_lock);
			_do_power_work(now);
			unlock_slurmctld(node_write_lock);
			last_power_scan = now;
		}

		/* One-time re-wake of nodes after half a slurmd timeout. */
		if (slurmd_timeout &&
		    (now > (boot_time + (slurmd_timeout / 2)))) {
			lock_slurmctld(node_read_lock);
			_re_wake();
			unlock_slurmctld(node_read_lock);
			/* prevent additional executions */
			boot_time += (365 * 24 * 60 * 60);
			slurmd_timeout = 0;
		}
	}

fini:	_clear_power_config();
	FREE_NULL_BITMAP(suspend_node_bitmap);
	FREE_NULL_BITMAP(resume_node_bitmap);
	_shutdown_power();
	/* Mark disabled and wake anyone waiting on power_cond. */
	slurm_mutex_lock(&power_mutex);
	power_save_enabled = false;
	slurm_cond_signal(&power_cond);
	slurm_mutex_unlock(&power_mutex);
	pthread_exit(NULL);
	return NULL;
}
/*
 * _handle_completion - process a step-completion message arriving on the
 * slurmstepd message socket.
 *
 * Reads the completed node range [first, last], the step return code, and
 * a packed jobacctinfo blob from fd; merges them into the global
 * step_complete state and wakes any thread waiting on step_complete.cond.
 * Only SlurmUser/root may send this message.
 *
 * IN fd  - connected file descriptor to read the request from / reply to
 * IN job - this step's record (used for logging only)
 * IN uid - requesting uid, must pass _slurm_authorized_user()
 * RET SLURM_SUCCESS if the protocol exchange completed (even when the
 *     request itself was rejected); SLURM_FAILURE on read/write failure
 */
static int
_handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	char *buf = NULL;	/* owned here until handed to "buffer" */
	int len;
	Buf buffer;
	bool lock_set = false;

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);

	debug3("  uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));

	/*
	 * We must not use getinfo over a pipe with slurmd here
	 * Indeed, slurmstepd does a large use of setinfo over a pipe
	 * with slurmd and doing the reverse can result in a deadlock
	 * scenario with slurmd :
	 * slurmd(lockforread,write)/slurmstepd(write,lockforread)
	 * Do pack/unpack instead to be sure of independances of
	 * slurmd and slurmstepd
	 */
	safe_read(fd, &len, sizeof(int));
	buf = xmalloc(len);
	safe_read(fd, buf, len);
	buffer = create_buf(buf, len);
	/* "buffer" now owns the memory; clear buf so the rwfail path
	 * below cannot double-free it */
	buf = NULL;
	jobacctinfo_unpack(&jobacct, SLURM_PROTOCOL_VERSION,
			   PROTOCOL_TYPE_SLURM, buffer, 1);
	free_buf(buffer);

	/*
	 * Record the completed nodes
	 */
	slurm_mutex_lock(&step_complete.lock);
	lock_set = true;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 * without the hostlist from the credential. */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2("  before bits: %s", bits_string);
#endif
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2("  after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacctinfo_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacctinfo_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));

	slurm_cond_signal(&step_complete.cond);
	slurm_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;

rwfail:
	/* Fix: free buf if safe_read() failed after the allocation but
	 * before ownership passed to "buffer" (xfree is NULL-safe) */
	xfree(buf);
	if (lock_set) {
		slurm_cond_signal(&step_complete.cond);
		slurm_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
/* Send an RPC to the SlurmDBD and wait for an arbitrary reply message.
 * The RPC will not be queued if an error occurs.
 * The "resp" message must be freed by the caller.
 * Returns SLURM_SUCCESS or an error code */
extern int send_recv_slurmdbd_msg(uint16_t rpc_version,
				  slurmdbd_msg_t *req,
				  slurmdbd_msg_t *resp)
{
	int rc = SLURM_SUCCESS;
	Buf buffer;

	xassert(req);
	xassert(resp);

	/* To make sure we can get this to send instead of the agent
	   sending stuff that can happen anytime we set halt_agent and
	   then after we get into the mutex we unset.
	*/
	halt_agent = 1;
	slurm_mutex_lock(&slurmdbd_lock);
	halt_agent = 0;
	if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
		/* Either slurm_open_slurmdbd_conn() was not executed or
		 * the connection to Slurm DBD has been closed */
		if (req->msg_type == DBD_GET_CONFIG)
			_open_slurmdbd_conn(0);
		else
			_open_slurmdbd_conn(1);
		if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
			rc = SLURM_ERROR;
			goto end_it;
		}
	}

	if (!(buffer = pack_slurmdbd_msg(req, rpc_version))) {
		rc = SLURM_ERROR;
		goto end_it;
	}

	rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
	free_buf(buffer);
	if (rc != SLURM_SUCCESS) {
		error("slurmdbd: Sending message type %s: %d: %m",
		      rpc_num2string(req->msg_type), rc);
		goto end_it;
	}

	buffer = slurm_persist_recv_msg(slurmdbd_conn);
	if (buffer == NULL) {
		/* Fix: log the symbolic RPC name (was raw %u) for
		 * consistency with the send-failure message above */
		error("slurmdbd: Getting response to message type %s",
		      rpc_num2string(req->msg_type));
		rc = SLURM_ERROR;
		goto end_it;
	}

	rc = unpack_slurmdbd_msg(resp, rpc_version, buffer);
	/* check for the rc of the start job message */
	if (rc == SLURM_SUCCESS && resp->msg_type == DBD_ID_RC)
		rc = ((dbd_id_rc_msg_t *)resp->data)->return_code;

	free_buf(buffer);
end_it:
	/* Wake the agent thread we may have stalled via halt_agent */
	slurm_cond_signal(&slurmdbd_cond);
	slurm_mutex_unlock(&slurmdbd_lock);

	return rc;
}
/*
 * _agent - background thread that drains the queued-RPC list (agent_list)
 * to the SlurmDBD connection.
 *
 * Each pass: (re)opens the connection if it has been down >= 10s, then
 * batches up to 1000 queued buffers into one DBD_SEND_MULT_MSG (or sends
 * a single buffer when only one is queued), sends it, and dequeues/frees
 * on success.  Items stay on the queue until their send is confirmed so
 * nothing is lost across a failure.  On shutdown the remaining queue is
 * saved to state via _save_dbd_state().
 *
 * Lock order used throughout: slurmdbd_lock -> agent_lock; the
 * assoc_cache_mutex is only taken with both of those released.
 *
 * IN x - unused
 * RET NULL (pthread entry point)
 */
static void *_agent(void *x)
{
	int cnt, rc;
	Buf buffer;
	struct timespec abs_time;
	static time_t fail_time = 0;	/* time of last failed send/connect */
	int sigarray[] = {SIGUSR1, 0};
	slurmdbd_msg_t list_req;
	dbd_list_msg_t list_msg;

	/* Template request reused for every multi-message batch */
	list_req.msg_type = DBD_SEND_MULT_MSG;
	list_req.data = &list_msg;
	memset(&list_msg, 0, sizeof(dbd_list_msg_t));
	/* DEF_TIMERS; */

	/* Prepare to catch SIGUSR1 to interrupt pending
	 * I/O and terminate in a timely fashion. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	while (*slurmdbd_conn->shutdown == 0) {
		/* START_TIMER; */
		slurm_mutex_lock(&slurmdbd_lock);
		/* send_recv_slurmdbd_msg() sets halt_agent to preempt us;
		 * it signals slurmdbd_cond when done */
		if (halt_agent)
			slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock);

		if ((slurmdbd_conn->fd < 0) &&
		    (difftime(time(NULL), fail_time) >= 10)) {
			/* The connection to Slurm DBD is not open */
			_open_slurmdbd_conn(1);
			if (slurmdbd_conn->fd < 0)
				fail_time = time(NULL);
		}

		slurm_mutex_lock(&agent_lock);
		/* NOTE(review): this tests fd for truthiness while every
		 * other check uses (fd < 0); fd == 0 would be treated as
		 * closed here — presumably fd 0 never occurs; confirm */
		if (agent_list && slurmdbd_conn->fd)
			cnt = list_count(agent_list);
		else
			cnt = 0;
		/* Nothing to send, connection down, or still inside the
		 * 10s back-off window: wait (up to 10s) for new work */
		if ((cnt == 0) || (slurmdbd_conn->fd < 0) ||
		    (fail_time && (difftime(time(NULL), fail_time) < 10))) {
			slurm_mutex_unlock(&slurmdbd_lock);
			abs_time.tv_sec = time(NULL) + 10;
			abs_time.tv_nsec = 0;
			slurm_cond_timedwait(&agent_cond, &agent_lock,
					     &abs_time);
			slurm_mutex_unlock(&agent_lock);
			continue;
		} else if ((cnt > 0) && ((cnt % 100) == 0))
			info("slurmdbd: agent queue size %u", cnt);
		/* Leave item on the queue until processing complete */
		if (agent_list) {
			int handle_agent_count = 1000;
			if (cnt > handle_agent_count) {
				/* Batch only the first 1000 buffers into a
				 * private list for one mult_msg */
				int agent_count = 0;
				ListIterator agent_itr =
					list_iterator_create(agent_list);
				list_msg.my_list = list_create(NULL);
				while ((buffer = list_next(agent_itr))) {
					list_enqueue(list_msg.my_list, buffer);
					agent_count++;
					if (agent_count > handle_agent_count)
						break;
				}
				list_iterator_destroy(agent_itr);
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else if (cnt > 1) {
				/* Small batch: send the whole queue as one
				 * mult_msg (my_list aliases agent_list) */
				list_msg.my_list = agent_list;
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else
				/* Single message: send it as-is */
				buffer = (Buf) list_peek(agent_list);
		} else
			buffer = NULL;
		slurm_mutex_unlock(&agent_lock);

		if (buffer == NULL) {
			slurm_mutex_unlock(&slurmdbd_lock);

			/* Wake anyone refreshing the association cache
			 * now that the connection is (possibly) up */
			slurm_mutex_lock(&assoc_cache_mutex);
			if (slurmdbd_conn->fd >= 0 && running_cache)
				slurm_cond_signal(&assoc_cache_cond);
			slurm_mutex_unlock(&assoc_cache_mutex);
			continue;
		}

		/* NOTE: agent_lock is clear here, so we can add more
		 * requests to the queue while waiting for this RPC to
		 * complete. */
		rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
		if (rc != SLURM_SUCCESS) {
			if (*slurmdbd_conn->shutdown) {
				slurm_mutex_unlock(&slurmdbd_lock);
				break;
			}
			error("slurmdbd: Failure sending message: %d: %m",
			      rc);
		} else if (list_msg.my_list) {
			/* mult_msg sent: collect the per-message codes */
			rc = _handle_mult_rc_ret();
		} else {
			rc = _get_return_code();
			if (rc == EAGAIN) {
				if (*slurmdbd_conn->shutdown) {
					slurm_mutex_unlock(&slurmdbd_lock);
					break;
				}
				error("slurmdbd: Failure with "
				      "message need to resend: %d: %m",
				      rc);
			}
		}
		slurm_mutex_unlock(&slurmdbd_lock);
		slurm_mutex_lock(&assoc_cache_mutex);
		if (slurmdbd_conn->fd >= 0 && running_cache)
			slurm_cond_signal(&assoc_cache_cond);
		slurm_mutex_unlock(&assoc_cache_mutex);

		slurm_mutex_lock(&agent_lock);
		if (agent_list && (rc == SLURM_SUCCESS)) {
			/*
			 * If we sent a mult_msg we just need to free buffer,
			 * we don't need to requeue, just mark list_msg.my_list
			 * as NULL as that is the sign we sent a mult_msg.
			 */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
			} else
				/* Single message confirmed: now dequeue it */
				buffer = (Buf) list_dequeue(agent_list);

			free_buf(buffer);
			fail_time = 0;
		} else {
			/* We need to free a mult_msg even on failure */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
				free_buf(buffer);
			}

			/* Start the 10s back-off window */
			fail_time = time(NULL);
		}
		slurm_mutex_unlock(&agent_lock);
		/* END_TIMER; */
		/* info("at the end with %s", TIME_STR); */
	}

	/* Shutdown: persist whatever is still queued */
	slurm_mutex_lock(&agent_lock);
	_save_dbd_state();
	FREE_NULL_LIST(agent_list);
	slurm_mutex_unlock(&agent_lock);
	return NULL;
}