/* The timeslicer thread */ static void *_timeslicer_thread(void *arg) { /* Write locks on job and read lock on nodes */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; ListIterator part_iterator; struct gs_part *p_ptr; if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: starting timeslicer loop"); while (!thread_shutdown) { _slice_sleep(); if (thread_shutdown) break; lock_slurmctld(job_write_lock); pthread_mutex_lock(&data_mutex); list_sort(gs_part_list, _sort_partitions); /* scan each partition... */ if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: _timeslicer_thread: scanning partitions"); part_iterator = list_iterator_create(gs_part_list); while ((p_ptr = (struct gs_part *) list_next(part_iterator))) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _timeslicer_thread: part %s: " "run %u total %u", p_ptr->part_name, p_ptr->jobs_active, p_ptr->num_jobs); } if (p_ptr->jobs_active < (p_ptr->num_jobs + p_ptr->num_shadows)) { _cycle_job_list(p_ptr); } } list_iterator_destroy(part_iterator); pthread_mutex_unlock(&data_mutex); /* Preempt jobs that were formerly only suspended */ _preempt_job_dequeue(); /* MUST BE OUTSIDE data_mutex lock */ unlock_slurmctld(job_write_lock); } timeslicer_thread_id = (pthread_t) 0; pthread_exit((void *) 0); return NULL; }
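/*
 * Hedged sketch, not SLURM code: the timeslicer above follows a fixed lock
 * order -- take the coarse slurmctld job/node locks first, then the plugin's
 * own data_mutex -- and defers work that acquires further locks on its own
 * (_preempt_job_dequeue()) until data_mutex has been released.  The
 * stand-alone example below mimics that discipline with plain pthread
 * mutexes; the names coarse_lock, fine_lock, scan_partitions() and
 * do_preemption() are illustrative, not part of SLURM.
 */
#include <pthread.h>

static pthread_mutex_t coarse_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t fine_lock = PTHREAD_MUTEX_INITIALIZER;

static void scan_partitions(void) { /* needs only fine_lock */ }
static void do_preemption(void) { /* takes other locks internally */ }

static void timeslice_once(void)
{
	pthread_mutex_lock(&coarse_lock);	/* outer lock first */
	pthread_mutex_lock(&fine_lock);		/* inner lock second */
	scan_partitions();
	pthread_mutex_unlock(&fine_lock);	/* drop the inner lock ... */
	do_preemption();	/* ... before work that locks on its own */
	pthread_mutex_unlock(&coarse_lock);
}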
static void _join_federation(slurmdb_federation_rec_t *fed, slurmdb_cluster_rec_t *cluster, bool update) { slurmctld_lock_t fed_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; _fed_mgr_ptr_init(fed, cluster); /* We must open the connections after we get out of the * write_lock or we will end up in deadlock. */ if (!update) { lock_slurmctld(fed_read_lock); _open_persist_sends(); unlock_slurmctld(fed_read_lock); } _create_ping_thread(); }
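/*
 * Hedged sketch, not SLURM code: _join_federation() above deliberately opens
 * its persistent connections only after the federation write lock has been
 * dropped (holding at most a read lock), because connection setup can block
 * on a peer that is itself waiting for our locks.  The same shape with a
 * POSIX rwlock; fed_lock, update_local_state() and open_remote_connections()
 * are illustrative names.
 */
#include <pthread.h>

static pthread_rwlock_t fed_lock = PTHREAD_RWLOCK_INITIALIZER;

static void update_local_state(void) { /* fast, lock-protected bookkeeping */ }
static void open_remote_connections(void) { /* may block on the network */ }

static void join_federation_sketch(void)
{
	pthread_rwlock_wrlock(&fed_lock);
	update_local_state();		/* only quick work under the write lock */
	pthread_rwlock_unlock(&fed_lock);

	pthread_rwlock_rdlock(&fed_lock);
	open_remote_connections();	/* blocking I/O under a read lock only */
	pthread_rwlock_unlock(&fed_lock);
}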
/* RET 0 on success, -1 on failure */ extern int suspend_job(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *tmp_char; int slurm_rc; suspend_msg_t msg; uint32_t jobid; static char reply_msg[128]; /* Locks: write job and node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "SUSPENDJOB lacks ARG"; error("wiki: SUSPENDJOB lacks ARG"); return -1; } jobid = strtoul(arg_ptr+4, &tmp_char, 10); if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: SUSPENDJOB has invalid jobid"); return -1; } msg.job_id = jobid; msg.op = SUSPEND_JOB; lock_slurmctld(job_write_lock); slurm_rc = job_suspend(&msg, 0, -1, false, (uint16_t)NO_VAL); unlock_slurmctld(job_write_lock); if (slurm_rc != SLURM_SUCCESS) { *err_code = -700; *err_msg = slurm_strerror(slurm_rc); error("wiki: Failed to suspend job %u (%m)", jobid); return -1; } snprintf(reply_msg, sizeof(reply_msg), "job %u suspended successfully", jobid); *err_msg = reply_msg; return 0; }
/* Notify a job via arbitrary message: * CMD=NOTIFYJOB ARG=<jobid> MSG=<string> * RET 0 on success, -1 on failure */ extern int job_notify_wiki(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *msg_ptr; int slurm_rc; uint32_t jobid; static char reply_msg[128]; /* Locks: read job */ slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "NOTIFYJOB lacks ARG="; error("wiki: NOTIFYJOB lacks ARG="); return -1; } arg_ptr += 4; jobid = atol(arg_ptr); msg_ptr = strstr(cmd_ptr, "MSG="); if (msg_ptr == NULL) { *err_code = -300; *err_msg = "NOTIFYJOB lacks MSG="; error("wiki: NOTIFYJOB lacks MSG="); return -1; } msg_ptr += 4; lock_slurmctld(job_read_lock); slurm_rc = _job_notify(jobid, msg_ptr); unlock_slurmctld(job_read_lock); if (slurm_rc != SLURM_SUCCESS) { *err_code = -700; *err_msg = slurm_strerror(slurm_rc); error("wiki: Failed to notify job %u (%m)", jobid); return -1; } snprintf(reply_msg, sizeof(reply_msg), "job %u notified successfully", jobid); *err_msg = reply_msg; return 0; }
extern int fed_mgr_fini() { slurmctld_lock_t fed_write_lock = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK }; slurm_mutex_lock(&init_mutex); inited = false; slurm_mutex_unlock(&init_mutex); lock_slurmctld(fed_write_lock); slurm_persist_conn_recv_server_fini(); _leave_federation(); unlock_slurmctld(fed_write_lock); return SLURM_SUCCESS; }
/* Perform periodic background activities */ static void *_bb_agent(void *args) { /* Locks: write job */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; while (!bb_state.term_flag) { bb_sleep(&bb_state, AGENT_INTERVAL); if (bb_state.term_flag) break; lock_slurmctld(job_write_lock); pthread_mutex_lock(&bb_state.bb_mutex); _load_state(0); _timeout_bb_rec(); pthread_mutex_unlock(&bb_state.bb_mutex); unlock_slurmctld(job_write_lock); } return NULL; }
/* block_state_mutex must be unlocked before calling this. */ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start) { int rc; slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; /* Wait for the slurmd to begin the batch script; slurm_fail_job() is a no-op if issued prior to the script initiation, so do the clean up here just in case the fail job isn't run. */ if (wait_for_start) sleep(2); lock_slurmctld(job_write_lock); if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false))) { error("Couldn't requeue job %u, failing it: %s", job_id, slurm_strerror(rc)); job_fail(job_id); } unlock_slurmctld(job_write_lock); }
static void _notify_slurmctld_jobs(agent_info_t *agent_ptr) { /* Locks: Write job */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; uint32_t job_id = 0, step_id = 0; thd_t *thread_ptr = agent_ptr->thread_struct; if (agent_ptr->msg_type == SRUN_PING) { srun_ping_msg_t *msg = *agent_ptr->msg_args_pptr; job_id = msg->job_id; step_id = msg->step_id; } else if (agent_ptr->msg_type == SRUN_TIMEOUT) { srun_timeout_msg_t *msg = *agent_ptr->msg_args_pptr; job_id = msg->job_id; step_id = msg->step_id; } else if (agent_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) { resource_allocation_response_msg_t *msg = *agent_ptr->msg_args_pptr; job_id = msg->job_id; step_id = NO_VAL; } else if ((agent_ptr->msg_type == SRUN_JOB_COMPLETE) || (agent_ptr->msg_type == SRUN_STEP_MISSING) || (agent_ptr->msg_type == SRUN_STEP_SIGNAL) || (agent_ptr->msg_type == SRUN_EXEC) || (agent_ptr->msg_type == SRUN_USER_MSG)) { return; /* no need to note srun response */ } else if (agent_ptr->msg_type == SRUN_NODE_FAIL) { return; /* no need to note srun response */ } else { error("_notify_slurmctld_jobs invalid msg_type %u", agent_ptr->msg_type); return; } lock_slurmctld(job_write_lock); if (thread_ptr[0].state == DSH_DONE) { srun_response(job_id, step_id); } unlock_slurmctld(job_write_lock); }
/* backfill_agent - detached thread periodically attempts to backfill jobs */ extern void *backfill_agent(void *args) { struct timeval tv1, tv2; char tv_str[20]; time_t now; double wait_time; static time_t last_backfill_time = 0; /* Read config and partitions; Write jobs and nodes */ slurmctld_lock_t all_locks = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; _load_config(); last_backfill_time = time(NULL); while (!stop_backfill) { _my_sleep(backfill_interval); if (stop_backfill) break; if (config_flag) { config_flag = false; _load_config(); } now = time(NULL); wait_time = difftime(now, last_backfill_time); if ((wait_time < backfill_interval) || _job_is_completing() || _many_pending_rpcs() || !avail_front_end() || !_more_work(last_backfill_time)) continue; gettimeofday(&tv1, NULL); lock_slurmctld(all_locks); while (_attempt_backfill()) ; last_backfill_time = time(NULL); unlock_slurmctld(all_locks); gettimeofday(&tv2, NULL); _diff_tv_str(&tv1, &tv2, tv_str, 20); if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: completed, %s", tv_str); } return NULL; }
/* Return non-zero to break the backfill loop if change in job, node or * partition state or the backfill scheduler needs to be stopped. */ static int _yield_locks(void) { slurmctld_lock_t all_locks = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; time_t job_update, node_update, part_update; job_update = last_job_update; node_update = last_node_update; part_update = last_part_update; unlock_slurmctld(all_locks); _my_sleep(backfill_interval); lock_slurmctld(all_locks); if ((last_job_update == job_update) && (last_node_update == node_update) && (last_part_update == part_update) && (! stop_backfill) && (! config_flag)) return 0; else return 1; }
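/*
 * Hedged sketch, not SLURM code: _yield_locks() above snapshots the
 * last_*_update timestamps, releases the scheduler's locks, sleeps, then
 * re-acquires the locks and reports whether anything changed underneath it.
 * A generic version of that idiom; state_lock, last_state_update and
 * yield_seconds are illustrative names.
 */
#include <pthread.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static time_t last_state_update;	/* bumped by writers under state_lock */
static const unsigned yield_seconds = 2;

/* Caller holds state_lock.  Returns nonzero if state changed while asleep. */
static int yield_lock_sketch(void)
{
	time_t snapshot = last_state_update;

	pthread_mutex_unlock(&state_lock);
	sleep(yield_seconds);		/* let other threads make progress */
	pthread_mutex_lock(&state_lock);

	return (last_state_update != snapshot);
}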
/* backfill_agent - detached thread periodically attempts to backfill jobs */ extern void *backfill_agent(void *args) { time_t now; double wait_time; static time_t last_backfill_time = 0; /* Read config and partitions; Write jobs and nodes */ slurmctld_lock_t all_locks = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; #if HAVE_SYS_PRCTL_H if (prctl(PR_SET_NAME, "slurmctld_bckfl", NULL, NULL, NULL) < 0) { error("%s: cannot set my name to %s %m", __func__, "slurm_backfill"); } #endif _load_config(); last_backfill_time = time(NULL); while (!stop_backfill) { _my_sleep(backfill_interval * 1000000); if (stop_backfill) break; if (config_flag) { config_flag = false; _load_config(); } now = time(NULL); wait_time = difftime(now, last_backfill_time); if ((wait_time < backfill_interval) || _job_is_completing() || _many_pending_rpcs() || !avail_front_end(NULL) || !_more_work(last_backfill_time)) continue; lock_slurmctld(all_locks); (void) _attempt_backfill(); last_backfill_time = time(NULL); unlock_slurmctld(all_locks); } return NULL; }
/*****************************************************************************\ * spawn message handler thread \*****************************************************************************/ extern int spawn_msg_thread(void) { pthread_attr_t thread_attr_msg; slurm_ctl_conf_t *conf; /* Locks: Read configuration */ slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; lock_slurmctld(config_read_lock); conf = slurm_conf_lock(); sched_port = conf->dynalloc_port; slurm_conf_unlock(); unlock_slurmctld(config_read_lock); if (sched_port == 0) { error("DynAllocPort == 0, not spawning communication thread"); return SLURM_ERROR; } slurm_mutex_lock( &thread_flag_mutex ); if (thread_running) { error("dynalloc thread already running, not starting another"); slurm_mutex_unlock(&thread_flag_mutex); return SLURM_ERROR; } slurm_attr_init(&thread_attr_msg); if (pthread_create(&msg_thread_id, &thread_attr_msg, _msg_thread, NULL)) fatal("pthread_create %m"); else info("dynalloc: msg thread created successfully"); slurm_attr_destroy(&thread_attr_msg); thread_running = true; slurm_mutex_unlock(&thread_flag_mutex); return SLURM_SUCCESS; }
static void *_ping_thread(void *arg) { slurmctld_lock_t fed_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; #if HAVE_SYS_PRCTL_H if (prctl(PR_SET_NAME, "fed_ping", NULL, NULL, NULL) < 0) { error("%s: cannot set my name to %s %m", __func__, "fed_ping"); } #endif while (!stop_pinging && !slurmctld_config.shutdown_time) { ListIterator itr; slurmdb_cluster_rec_t *cluster; lock_slurmctld(fed_read_lock); if (!fed_mgr_fed_rec || !fed_mgr_fed_rec->cluster_list) goto next; itr = list_iterator_create(fed_mgr_fed_rec->cluster_list); while ((cluster = list_next(itr))) { if (cluster == fed_mgr_cluster_rec) continue; _ping_controller(cluster); } list_iterator_destroy(itr); next: unlock_slurmctld(fed_read_lock); sleep(5); } if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) info("Exiting ping thread"); return NULL; }
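/*
 * Hedged sketch, not SLURM code: _ping_thread() above uses "goto next" so
 * that both the normal path and the early-exit path (no federation record
 * yet) funnel through a single unlock before sleeping, keeping the read lock
 * hold time bounded on every iteration.  The same shape with a POSIX rwlock;
 * ping_lock, fed_record and ping_peers() are illustrative names.
 */
#include <pthread.h>
#include <unistd.h>

static pthread_rwlock_t ping_lock = PTHREAD_RWLOCK_INITIALIZER;
static void *fed_record;		/* NULL until a federation is joined */

static void ping_peers(void) { /* walk the cluster list and ping each peer */ }

static void ping_loop_iteration(void)
{
	pthread_rwlock_rdlock(&ping_lock);
	if (!fed_record)
		goto next;		/* nothing to ping yet */
	ping_peers();
next:
	pthread_rwlock_unlock(&ping_lock);	/* single unlock point */
	sleep(5);
}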
/* We can not invoke update_job_dependency() until the new job record has * been created, hence this sleeping thread modifies the dependent job * later. */ static void *_dep_agent(void *args) { struct job_record *job_ptr = (struct job_record *) args; slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK}; char *end_ptr = NULL, *tok; int cnt = 0; usleep(100000); lock_slurmctld(job_write_lock); if (job_ptr && job_ptr->details && (job_ptr->magic == JOB_MAGIC) && job_ptr->comment && strstr(job_ptr->comment, "on:")) { char *new_depend = job_ptr->details->dependency; job_ptr->details->dependency = NULL; update_job_dependency(job_ptr, new_depend); xfree(new_depend); tok = strstr(job_ptr->comment, "on:"); cnt = strtol(tok + 3, &end_ptr, 10); } if (cnt == 0) set_job_prio(job_ptr); unlock_slurmctld(job_write_lock); return NULL; }
/* run_backup - this is the backup controller, it should run in standby * mode, assuming control when the primary controller stops responding */ void run_backup(slurm_trigger_callbacks_t *callbacks) { int i; uint32_t trigger_type; time_t last_ping = 0; pthread_attr_t thread_attr_sig, thread_attr_rpc; slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK }; info("slurmctld running in background mode"); takeover = false; last_controller_response = time(NULL); /* default: don't resume if shutdown */ slurmctld_config.resume_backup = false; if (xsignal_block(backup_sigarray) < 0) error("Unable to block signals"); /* * create attached thread to process RPCs */ slurm_attr_init(&thread_attr_rpc); while (pthread_create(&slurmctld_config.thread_id_rpc, &thread_attr_rpc, _background_rpc_mgr, NULL)) { error("pthread_create error %m"); sleep(1); } slurm_attr_destroy(&thread_attr_rpc); /* * create attached thread for signal handling */ slurm_attr_init(&thread_attr_sig); while (pthread_create(&slurmctld_config.thread_id_sig, &thread_attr_sig, _background_signal_hand, NULL)) { error("pthread_create %m"); sleep(1); } slurm_attr_destroy(&thread_attr_sig); trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP; _trigger_slurmctld_event(trigger_type); for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) { sleep(1); /* Give the primary slurmctld set-up time */ } /* repeatedly ping ControlMachine */ while (slurmctld_config.shutdown_time == 0) { sleep(1); /* Lock of slurmctld_conf below not important */ if (slurmctld_conf.slurmctld_timeout && (takeover == false) && (difftime(time(NULL), last_ping) < (slurmctld_conf.slurmctld_timeout / 3))) continue; last_ping = time(NULL); if (_ping_controller() == 0) last_controller_response = time(NULL); else if (takeover) { /* in takeover mode, take control as soon as */ /* primary no longer respond */ break; } else { uint32_t timeout; lock_slurmctld(config_read_lock); timeout = slurmctld_conf.slurmctld_timeout; unlock_slurmctld(config_read_lock); if (difftime(time(NULL), last_controller_response) > timeout) { break; } } } if (slurmctld_config.shutdown_time != 0) { /* Since pidfile is created as user root (its owner is * changed to SlurmUser) SlurmUser may not be able to * remove it, so this is not necessarily an error. * No longer need slurmctld_conf lock after above join. */ if (unlink(slurmctld_conf.slurmctld_pidfile) < 0) verbose("Unable to remove pidfile '%s': %m", slurmctld_conf.slurmctld_pidfile); info("BackupController terminating"); pthread_join(slurmctld_config.thread_id_sig, NULL); log_fini(); if (dump_core) abort(); else exit(0); } lock_slurmctld(config_read_lock); error("ControlMachine %s not responding, " "BackupController %s taking over", slurmctld_conf.control_machine, slurmctld_conf.backup_controller); unlock_slurmctld(config_read_lock); backup_slurmctld_restart(); trigger_primary_ctld_fail(); trigger_backup_ctld_as_ctrl(); pthread_kill(slurmctld_config.thread_id_sig, SIGTERM); pthread_join(slurmctld_config.thread_id_sig, NULL); pthread_join(slurmctld_config.thread_id_rpc, NULL); /* The job list needs to be freed before we run * ctld_assoc_mgr_init, it should be empty here in the first place. 
*/ lock_slurmctld(config_write_lock); job_fini(); init_job_conf(); unlock_slurmctld(config_write_lock); ctld_assoc_mgr_init(callbacks); /* clear old state and read new state */ lock_slurmctld(config_write_lock); if (switch_g_restore(slurmctld_conf.state_save_location, true)) { error("failed to restore switch state"); abort(); } if (read_slurm_conf(2, false)) { /* Recover all state */ error("Unable to recover slurm state"); abort(); } slurmctld_config.shutdown_time = (time_t) 0; unlock_slurmctld(config_write_lock); select_g_select_nodeinfo_set_all(); return; }
/* Checkpoint processing pthread * Never returns, but is cancelled on plugin termination */ static void *_ckpt_agent_thr(void *arg) { struct ckpt_req *req = (struct ckpt_req *)arg; int rc; /* Locks: write job */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; struct job_record *job_ptr; struct step_record *step_ptr; struct check_job_info *check_ptr; /* only perform ckpt operation on ONE JOB at a time */ slurm_mutex_lock(&ckpt_agent_mutex); while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) { pthread_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex); } ckpt_agent_jobid = req->job_id; ckpt_agent_count ++; slurm_mutex_unlock(&ckpt_agent_mutex); debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u", req->op, req->job_id, req->step_id); rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time, req->image_dir, req->wait, req->nodelist); if (rc != SLURM_SUCCESS) { error("checkpoint/blcr: error on checkpoint request %u to " "%u.%u: %s", req->op, req->job_id, req->step_id, slurm_strerror(rc)); } if (req->op == CHECK_REQUEUE) _requeue_when_finished(req->job_id); lock_slurmctld(job_write_lock); job_ptr = find_job_record(req->job_id); if (!job_ptr) { error("_ckpt_agent_thr: job finished"); goto out; } if (req->step_id == SLURM_BATCH_SCRIPT) { /* batch job */ check_ptr = (struct check_job_info *)job_ptr->check_job; } else { step_ptr = find_step_record(job_ptr, req->step_id); if (! step_ptr) { error("_ckpt_agent_thr: step finished"); goto out; } check_ptr = (struct check_job_info *)step_ptr->check_job; } check_ptr->time_stamp = 0; check_ptr->error_code = rc; if (check_ptr->error_code != SLURM_SUCCESS) check_ptr->error_msg = xstrdup(slurm_strerror(rc)); out: unlock_slurmctld(job_write_lock); if (req->sig_done) { _send_sig(req->job_id, req->step_id, req->sig_done, req->nodelist); } _on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id, req->image_dir, rc); slurm_mutex_lock(&ckpt_agent_mutex); ckpt_agent_count --; if (ckpt_agent_count == 0) { ckpt_agent_jobid = 0; pthread_cond_broadcast(&ckpt_agent_cond); } slurm_mutex_unlock(&ckpt_agent_mutex); _ckpt_req_free(req); return NULL; }
/* Modify a job: * CMD=MODIFYJOB ARG=<jobid> * [BANK=<name>;] * [COMMENT=<whatever>;] * [DEPEND=afterany:<jobid>;] * [JOBNAME=<name>;] * [MINSTARTTIME=<uts>;] * [NODES=<number>;] * [PARTITION=<name>;] * [RFEATURES=<features>;] * [TIMELIMT=<seconds>;] * [VARIABLELIST=<env_vars>;] * [GRES=<name:value>;] * [WCKEY=<name>;] * * RET 0 on success, -1 on failure */ extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *bank_ptr, *depend_ptr, *nodes_ptr, *start_ptr; char *host_ptr, *name_ptr, *part_ptr, *time_ptr, *tmp_char; char *comment_ptr, *feature_ptr, *env_ptr, *gres_ptr, *wckey_ptr; int i, slurm_rc; uint32_t jobid, new_node_cnt = 0, new_time_limit = 0; static char reply_msg[128]; /* Locks: write job, read node and partition info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "MODIFYJOB lacks ARG="; error("wiki: MODIFYJOB lacks ARG="); return -1; } /* Change all parsed "=" to ":" then search for remaining "=" * and report results as unrecognized options */ arg_ptr[3] = ':'; arg_ptr += 4; jobid = strtoul(arg_ptr, &tmp_char, 10); if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: MODIFYJOB has invalid jobid"); return -1; } bank_ptr = strstr(cmd_ptr, "BANK="); comment_ptr = strstr(cmd_ptr, "COMMENT="); depend_ptr = strstr(cmd_ptr, "DEPEND="); gres_ptr = strstr(cmd_ptr, "GRES="); host_ptr = strstr(cmd_ptr, "HOSTLIST="); name_ptr = strstr(cmd_ptr, "JOBNAME="); start_ptr = strstr(cmd_ptr, "MINSTARTTIME="); nodes_ptr = strstr(cmd_ptr, "NODES="); part_ptr = strstr(cmd_ptr, "PARTITION="); feature_ptr = strstr(cmd_ptr, "RFEATURES="); time_ptr = strstr(cmd_ptr, "TIMELIMIT="); env_ptr = strstr(cmd_ptr, "VARIABLELIST="); wckey_ptr = strstr(cmd_ptr, "WCKEY="); if (bank_ptr) { bank_ptr[4] = ':'; bank_ptr += 5; null_term(bank_ptr); } if (comment_ptr) { comment_ptr[7] = ':'; comment_ptr += 8; if (comment_ptr[0] == '\"') { comment_ptr++; for (i=0; ; i++) { if (comment_ptr[i] == '\0') break; if (comment_ptr[i] == '\"') { comment_ptr[i] = '\0'; break; } } } else if (comment_ptr[0] == '\'') { comment_ptr++; for (i=0; ; i++) { if (comment_ptr[i] == '\0') break; if (comment_ptr[i] == '\'') { comment_ptr[i] = '\0'; break; } } } else null_term(comment_ptr); } if (depend_ptr) { depend_ptr[6] = ':'; depend_ptr += 7; null_term(depend_ptr); } if (feature_ptr) { feature_ptr[9] = ':'; feature_ptr += 10; null_term(feature_ptr); } if (gres_ptr) { gres_ptr[4] = ':'; gres_ptr += 5; null_term(gres_ptr); } if (host_ptr) { host_ptr[8] = ':'; host_ptr += 9; null_term(host_ptr); } if (name_ptr) { name_ptr[7] = ':'; name_ptr += 8; if (name_ptr[0] == '\"') { name_ptr++; for (i=0; ; i++) { if (name_ptr[i] == '\0') break; if (name_ptr[i] == '\"') { name_ptr[i] = '\0'; break; } } } else if (name_ptr[0] == '\'') { name_ptr++; for (i=0; ; i++) { if (name_ptr[i] == '\0') break; if (name_ptr[i] == '\'') { name_ptr[i] = '\0'; break; } } } else null_term(name_ptr); } if (start_ptr) { start_ptr[12] = ':'; start_ptr += 13; null_term(start_ptr); } if (nodes_ptr) { nodes_ptr[5] = ':'; nodes_ptr += 6; new_node_cnt = strtoul(nodes_ptr, NULL, 10); } if (part_ptr) { part_ptr[9] = ':'; part_ptr += 10; null_term(part_ptr); } if (time_ptr) { time_ptr[9] = ':'; time_ptr += 10; new_time_limit = strtoul(time_ptr, NULL, 10); } if (env_ptr) { env_ptr[12] = ':'; env_ptr += 13; null_term(env_ptr); } if (wckey_ptr) { wckey_ptr[5] = ':'; 
wckey_ptr += 6; null_term(wckey_ptr); } /* Look for any un-parsed "=" ignoring anything after VARIABLELIST * which is expected to contain "=" in its value*/ tmp_char = strchr(cmd_ptr, '='); if (tmp_char && (!env_ptr || (env_ptr > tmp_char))) { tmp_char[0] = '\0'; while (tmp_char[-1] && (!isspace(tmp_char[-1]))) tmp_char--; error("wiki: Invalid MODIFYJOB option %s", tmp_char); } lock_slurmctld(job_write_lock); slurm_rc = _job_modify(jobid, bank_ptr, depend_ptr, host_ptr, new_node_cnt, part_ptr, new_time_limit, name_ptr, start_ptr, feature_ptr, env_ptr, comment_ptr, gres_ptr, wckey_ptr); unlock_slurmctld(job_write_lock); if (slurm_rc != SLURM_SUCCESS) { *err_code = -700; *err_msg = slurm_strerror(slurm_rc); error("wiki: Failed to modify job %u (%m)", jobid); return -1; } snprintf(reply_msg, sizeof(reply_msg), "job %u modified successfully", jobid); *err_msg = reply_msg; return 0; }
/* Modify a job: * CMD=MODIFYJOB ARG=<jobid> PARTITION=<name> NODES=<number> * DEPEND=afterany:<jobid> TIMELIMIT=<seconds> BANK=<name> * RET 0 on success, -1 on failure */ extern int job_modify_wiki(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *bank_ptr, *depend_ptr, *nodes_ptr; char *host_ptr, *part_ptr, *time_ptr, *tmp_char; int slurm_rc; uint32_t jobid, new_node_cnt = 0, new_time_limit = 0; static char reply_msg[128]; /* Locks: write job, read node and partition info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "MODIFYJOB lacks ARG="; error("wiki: MODIFYJOB lacks ARG="); return -1; } /* Change all parsed "=" to ":" then search for remaining "=" * and report results as unrecognized options */ arg_ptr[3] = ':'; arg_ptr += 4; jobid = strtoul(arg_ptr, &tmp_char, 10); if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: MODIFYJOB has invalid jobid"); return -1; } bank_ptr = strstr(cmd_ptr, "BANK="); depend_ptr = strstr(cmd_ptr, "DEPEND="); host_ptr = strstr(cmd_ptr, "HOSTLIST="); nodes_ptr = strstr(cmd_ptr, "NODES="); part_ptr = strstr(cmd_ptr, "PARTITION="); time_ptr = strstr(cmd_ptr, "TIMELIMIT="); if (bank_ptr) { bank_ptr[4] = ':'; bank_ptr += 5; null_term(bank_ptr); } if (depend_ptr) { depend_ptr[6] = ':'; depend_ptr += 7; null_term(depend_ptr); } if (host_ptr) { host_ptr[8] = ':'; host_ptr += 9; null_term(host_ptr); } if (nodes_ptr) { nodes_ptr[5] = ':'; nodes_ptr += 6; new_node_cnt = strtoul(nodes_ptr, NULL, 10); } if (part_ptr) { part_ptr[9] = ':'; part_ptr += 10; null_term(part_ptr); } if (time_ptr) { time_ptr[9] = ':'; time_ptr += 10; new_time_limit = strtoul(time_ptr, NULL, 10); } /* Look for any un-parsed "=" */ tmp_char = strchr(cmd_ptr, '='); if (tmp_char) { tmp_char[0] = '\0'; while (tmp_char[-1] && (!isspace(tmp_char[-1]))) tmp_char--; error("wiki: Invalid MODIFYJOB option %s", tmp_char); } lock_slurmctld(job_write_lock); slurm_rc = _job_modify(jobid, bank_ptr, depend_ptr, host_ptr, new_node_cnt, part_ptr, new_time_limit); unlock_slurmctld(job_write_lock); if (slurm_rc != SLURM_SUCCESS) { *err_code = -700; *err_msg = slurm_strerror(slurm_rc); error("wiki: Failed to modify job %u (%m)", jobid); return -1; } snprintf(reply_msg, sizeof(reply_msg), "job %u modified successfully", jobid); *err_msg = reply_msg; return 0; }
/* * init_power_save - Initialize the power save module. Started as a * pthread. Terminates automatically at slurmctld shutdown time. * Input and output are unused. */ static void *_init_power_save(void *arg) { /* Locks: Read nodes */ slurmctld_lock_t node_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; /* Locks: Write nodes */ slurmctld_lock_t node_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; time_t now, boot_time = 0, last_power_scan = 0; if (power_save_config && !power_save_enabled) { debug("power_save mode not enabled"); return NULL; } suspend_node_bitmap = bit_alloc(node_record_count); resume_node_bitmap = bit_alloc(node_record_count); while (slurmctld_config.shutdown_time == 0) { sleep(1); if (_reap_procs() < 2) { debug("power_save programs getting backlogged"); continue; } if ((last_config != slurmctld_conf.last_update) && (_init_power_config())) { info("power_save mode has been disabled due to " "configuration changes"); goto fini; } now = time(NULL); if (boot_time == 0) boot_time = now; /* Only run every 60 seconds or after a node state change, * whichever happens first */ if ((last_node_update >= last_power_scan) || (now >= (last_power_scan + 60))) { lock_slurmctld(node_write_lock); _do_power_work(now); unlock_slurmctld(node_write_lock); last_power_scan = now; } if (slurmd_timeout && (now > (boot_time + (slurmd_timeout / 2)))) { lock_slurmctld(node_read_lock); _re_wake(); unlock_slurmctld(node_read_lock); /* prevent additional executions */ boot_time += (365 * 24 * 60 * 60); slurmd_timeout = 0; } } fini: _clear_power_config(); FREE_NULL_BITMAP(suspend_node_bitmap); FREE_NULL_BITMAP(resume_node_bitmap); _shutdown_power(); slurm_mutex_lock(&power_mutex); power_save_enabled = false; pthread_cond_signal(&power_cond); slurm_mutex_unlock(&power_mutex); pthread_exit(NULL); return NULL; }
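/*
 * Hedged sketch, not SLURM code: the power-save loop above wakes every second
 * but only takes the node write lock and runs _do_power_work() when node
 * state changed since the last scan or 60 seconds have elapsed, whichever
 * comes first.  The gating test in isolation; last_state_change is an
 * illustrative stand-in for last_node_update.
 */
#include <stdbool.h>
#include <time.h>

static time_t last_state_change;	/* updated elsewhere on state changes */

static bool power_scan_due(time_t now, time_t last_scan)
{
	return (last_state_change >= last_scan) || (now >= last_scan + 60);
}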
/* * Attempt to start a job * jobid (IN) - job id * task_cnt (IN) - total count of tasks to start * hostlist (IN) - SLURM hostlist expression with no repeated hostnames * tasklist (IN/OUT) - comma separated list of hosts with tasks to be started, * list hostname once per task to start * comment_ptr (IN) - new comment field for the job or NULL for no change * err_code (OUT) - Moab error code * err_msg (OUT) - Moab error message */ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, char *tasklist, char *comment_ptr, int *err_code, char **err_msg) { int rc = 0, old_task_cnt = 1; struct job_record *job_ptr; /* Write lock on job info, read lock on node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; char *new_node_list = NULL; static char tmp_msg[128]; bitstr_t *new_bitmap = (bitstr_t *) NULL; bitstr_t *save_req_bitmap = (bitstr_t *) NULL; bitoff_t i, bsize; int ll; /* layout info index */ char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL; size_t node_name_len; static uint32_t cr_test = 0, cr_enabled = 0; if (cr_test == 0) { select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL, &cr_enabled); cr_test = 1; } lock_slurmctld(job_write_lock); job_ptr = find_job_record(jobid); if (job_ptr == NULL) { *err_code = -700; *err_msg = "No such job"; error("wiki: Failed to find job %u", jobid); rc = -1; goto fini; } if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) { *err_code = -700; *err_msg = "Job not pending, can't start"; error("wiki: Attempt to start job %u in state %s", jobid, job_state_string(job_ptr->job_state)); rc = -1; goto fini; } if (comment_ptr) { char *reserved = strstr(comment_ptr, "RESERVED:"); if (reserved) { reserved += 9; job_ptr->details->reserved_resources = strtol(reserved, NULL, 10); } xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); } if (task_cnt) { new_node_list = xstrdup(hostlist); if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) { *err_code = -700; *err_msg = "Invalid TASKLIST"; error("wiki: Attempt to set invalid node list for " "job %u, %s", jobid, hostlist); xfree(new_node_list); rc = -1; goto fini; } if (!bit_super_set(new_bitmap, avail_node_bitmap)) { /* Selected node is UP and not responding * or it just went DOWN */ *err_code = -700; *err_msg = "TASKLIST includes non-responsive node"; error("wiki: Attempt to use non-responsive nodes for " "job %u, %s", jobid, hostlist); xfree(new_node_list); FREE_NULL_BITMAP(new_bitmap); rc = -1; goto fini; } /* User excluded node list incompatible with Wiki * Exclude all nodes not explicitly requested */ FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Build layout information from tasklist (assuming that Moab * sends a non-bracketed list of nodes, repeated as many times * as cpus should be used per node); at this point, node names * are comma-separated. This is _not_ a fast algorithm as it * performs many string compares. 
*/ xfree(job_ptr->details->req_node_layout); if (task_cnt && cr_enabled) { uint16_t cpus_per_task = MAX(1, job_ptr->details->cpus_per_task); job_ptr->details->req_node_layout = (uint16_t *) xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t)); bsize = bit_size(new_bitmap); for (i = 0, ll = -1; i < bsize; i++) { if (!bit_test(new_bitmap, i)) continue; ll++; node_name = node_record_table_ptr[i].name; node_name_len = strlen(node_name); if (node_name_len == 0) continue; node_cur = tasklist; while (*node_cur) { if ((node_idx = strstr(node_cur, node_name))) { if ((node_idx[node_name_len] == ',') || (node_idx[node_name_len] == '\0')) { job_ptr->details-> req_node_layout[ll] += cpus_per_task; } node_cur = strchr(node_idx, ','); if (node_cur) continue; } break; } } } /* save and update job state to start now */ save_req_nodes = job_ptr->details->req_nodes; job_ptr->details->req_nodes = new_node_list; save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; old_task_cnt = job_ptr->details->min_cpus; job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); if (rc) return rc; /* No errors so far */ (void) schedule(INFINITE); /* provides own locking */ /* Check to insure the job was actually started */ lock_slurmctld(job_write_lock); if (job_ptr->job_id != jobid) job_ptr = find_job_record(jobid); if (job_ptr && (job_ptr->job_id == jobid) && (!IS_JOB_RUNNING(job_ptr))) { uint16_t wait_reason = 0; char *wait_string; if (IS_JOB_FAILED(job_ptr)) wait_string = "Invalid request, job aborted"; else { wait_reason = job_ptr->state_reason; if (wait_reason == WAIT_HELD) { /* some job is completing, slurmctld did * not even try to schedule this job */ wait_reason = WAIT_RESOURCES; } wait_string = job_reason_string(wait_reason); job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); } *err_code = -910 - wait_reason; snprintf(tmp_msg, sizeof(tmp_msg), "Could not start job %u(%s): %s", jobid, new_node_list, wait_string); *err_msg = tmp_msg; error("wiki: %s", tmp_msg); /* restore some of job state */ job_ptr->priority = 0; job_ptr->details->min_cpus = old_task_cnt; rc = -1; } if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) { /* Restore required node list in case job requeued */ xfree(job_ptr->details->req_nodes); job_ptr->details->req_nodes = save_req_nodes; FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); job_ptr->details->req_node_bitmap = save_req_bitmap; FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); xfree(job_ptr->details->req_node_layout); } else { error("wiki: start_job(%u) job missing", jobid); xfree(save_req_nodes); FREE_NULL_BITMAP(save_req_bitmap); } unlock_slurmctld(job_write_lock); schedule_node_save(); /* provides own locking */ schedule_job_save(); /* provides own locking */ return rc; }
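/*
 * Hedged sketch, not SLURM code: _start_job() above releases the job write
 * lock before calling schedule(), which takes its own locks, and then
 * re-locks and re-looks-up the job to verify that it actually started.  The
 * shape of that unlock/call/re-lock idiom; job_lock, prepare_job(),
 * run_scheduler() and job_started() are illustrative names.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t job_lock = PTHREAD_MUTEX_INITIALIZER;

static void prepare_job(int job_id) { (void) job_id; /* mutate under job_lock */ }
static void run_scheduler(void) { /* acquires job_lock internally */ }
static bool job_started(int job_id) { (void) job_id; return true; /* placeholder */ }

static int start_job_sketch(int job_id)
{
	bool ok;

	pthread_mutex_lock(&job_lock);
	prepare_job(job_id);
	pthread_mutex_unlock(&job_lock);	/* must not hold the lock ... */

	run_scheduler();		/* ... across a call that locks itself */

	pthread_mutex_lock(&job_lock);		/* re-lock and verify outcome */
	ok = job_started(job_id);
	pthread_mutex_unlock(&job_lock);
	return ok ? 0 : -1;
}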
/* dump_all_front_end_state - save the state of all front_end nodes to file */ extern int dump_all_front_end_state(void) { #ifdef HAVE_FRONT_END /* Save high-water mark to avoid buffer growth with copies */ static int high_buffer_size = (1024 * 1024); int error_code = 0, i, log_fd; char *old_file, *new_file, *reg_file; front_end_record_t *front_end_ptr; /* Locks: Read config and node */ slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK }; Buf buffer = init_buf(high_buffer_size); DEF_TIMERS; START_TIMER; /* write header: version, time */ packstr(FRONT_END_STATE_VERSION, buffer); pack_time(time(NULL), buffer); /* write node records to buffer */ lock_slurmctld (node_read_lock); for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { xassert(front_end_ptr->magic == FRONT_END_MAGIC); _dump_front_end_state(front_end_ptr, buffer); } old_file = xstrdup (slurmctld_conf.state_save_location); xstrcat (old_file, "/front_end_state.old"); reg_file = xstrdup (slurmctld_conf.state_save_location); xstrcat (reg_file, "/front_end_state"); new_file = xstrdup (slurmctld_conf.state_save_location); xstrcat (new_file, "/front_end_state.new"); unlock_slurmctld (node_read_lock); /* write the buffer to file */ lock_state_files(); log_fd = creat (new_file, 0600); if (log_fd < 0) { error ("Can't save state, error creating file %s %m", new_file); error_code = errno; } else { int pos = 0, nwrite = get_buf_offset(buffer), amount, rc; char *data = (char *)get_buf_data(buffer); high_buffer_size = MAX(nwrite, high_buffer_size); while (nwrite > 0) { amount = write(log_fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", new_file); error_code = errno; break; } nwrite -= amount; pos += amount; } rc = fsync_and_close(log_fd, "front_end"); if (rc && !error_code) error_code = rc; } if (error_code) (void) unlink (new_file); else { /* file shuffle */ (void) unlink (old_file); if (link(reg_file, old_file)) debug4("unable to create link for %s -> %s: %m", reg_file, old_file); (void) unlink (reg_file); if (link(new_file, reg_file)) debug4("unable to create link for %s -> %s: %m", new_file, reg_file); (void) unlink (new_file); } xfree (old_file); xfree (reg_file); xfree (new_file); unlock_state_files (); free_buf (buffer); END_TIMER2("dump_all_front_end_state"); return error_code; #else return SLURM_SUCCESS; #endif }
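/*
 * Hedged sketch, not SLURM code: dump_all_front_end_state() above (and
 * dump_all_part_state() later in this section) writes the packed state to
 * "<name>.new", fsyncs it, and then rotates <name> -> <name>.old and
 * <name>.new -> <name> with unlink()/link(), so a crash mid-save still leaves
 * a complete old or new copy on disk.  The rotation step in isolation,
 * assuming the caller already wrote and closed path_new successfully.
 */
#include <stdio.h>
#include <unistd.h>

static void rotate_state_files(const char *path_old, const char *path_reg,
			       const char *path_new)
{
	(void) unlink(path_old);		/* drop the previous .old copy */
	if (link(path_reg, path_old))		/* keep the current state as .old */
		perror("link reg -> old");
	(void) unlink(path_reg);
	if (link(path_new, path_reg))		/* promote the freshly written file */
		perror("link new -> reg");
	(void) unlink(path_new);
}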
/* _background_rpc_mgr - Read and process incoming RPCs to the background * controller (that's us) */ static void *_background_rpc_mgr(void *no_data) { slurm_fd_t newsockfd; slurm_fd_t sockfd; slurm_addr_t cli_addr; slurm_msg_t *msg = NULL; int error_code; char* node_addr = NULL; /* Read configuration only */ slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; int sigarray[] = {SIGUSR1, 0}; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid()); /* initialize port for RPCs */ lock_slurmctld(config_read_lock); /* set node_addr to bind to (NULL means any) */ if ((strcmp(slurmctld_conf.backup_controller, slurmctld_conf.backup_addr) != 0)) { node_addr = slurmctld_conf.backup_addr ; } if ((sockfd = slurm_init_msg_engine_addrname_port(node_addr, slurmctld_conf. slurmctld_port)) == SLURM_SOCKET_ERROR) fatal("slurm_init_msg_engine_addrname_port error %m"); unlock_slurmctld(config_read_lock); /* Prepare to catch SIGUSR1 to interrupt accept(). * This signal is generated by the slurmctld signal * handler thread upon receipt of SIGABRT, SIGINT, * or SIGTERM. That thread does all processing of * all signals. */ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); /* * Process incoming RPCs indefinitely */ while (slurmctld_config.shutdown_time == 0) { /* accept needed for stream implementation * is a no-op in message implementation that just passes * sockfd to newsockfd */ if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { if (errno != EINTR) error("slurm_accept_msg_conn: %m"); continue; } msg = xmalloc(sizeof(slurm_msg_t)); slurm_msg_t_init(msg); if (slurm_receive_msg(newsockfd, msg, 0) != 0) error("slurm_receive_msg: %m"); error_code = _background_process_msg(msg); if ((error_code == SLURM_SUCCESS) && (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) && (slurmctld_config.shutdown_time == 0)) slurmctld_config.shutdown_time = time(NULL); slurm_free_msg_data(msg->msg_type, msg->data); slurm_free_msg(msg); slurm_close(newsockfd); /* close new socket */ } debug3("_background_rpc_mgr shutting down"); slurm_close(sockfd); /* close the main socket */ pthread_exit((void *) 0); return NULL; }
/* Perform job initiation work */ static void _start_agent(bg_action_t *bg_action_ptr) { int rc, set_user_rc = SLURM_SUCCESS; bg_record_t *bg_record = NULL; bg_record_t *found_record = NULL; ListIterator itr; List delete_list = NULL; int requeue_job = 0; uint32_t req_job_id = bg_action_ptr->job_ptr->job_id; bool block_inited = 0; bool delete_it = 0; slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_action_ptr->bg_block_id); if (!bg_record) { slurm_mutex_unlock(&block_state_mutex); error("block %s not found in bg_lists->main", bg_action_ptr->bg_block_id); bg_requeue_job(req_job_id, 1, 0, JOB_BOOT_FAIL, false); return; } if ((bg_record->job_running <= NO_JOB_RUNNING) && !find_job_in_bg_record(bg_record, req_job_id)) { bg_record->modifying = 0; // bg_reset_block(bg_record); should have already happened slurm_mutex_unlock(&block_state_mutex); debug("job %u finished during the queueing job " "(everything is ok)", req_job_id); return; } if ((bg_record->state == BG_BLOCK_TERM) || bg_record->free_cnt) { /* It doesn't appear the state of a small block (conn_type) is held on a BGP system, so if we need to reset it, just set the reboot flag and handle it later in that code. */ bg_action_ptr->reboot = 1; } delete_list = list_create(NULL); itr = list_iterator_create(bg_lists->main); while ((found_record = list_next(itr))) { if (bg_record == found_record) continue; if (!blocks_overlap(bg_record, found_record)) { debug2("block %s isn't part of %s", found_record->bg_block_id, bg_record->bg_block_id); continue; } if (found_record->job_ptr || (found_record->job_list && list_count(found_record->job_list))) { struct job_record *job_ptr = found_record->job_ptr; if (!found_record->job_ptr) job_ptr = find_job_in_bg_record( found_record, NO_VAL); error("Trying to start job %u on block %s, " "but there is a job %u running on an overlapping " "block %s it will not end until %ld. " "This should never happen.", req_job_id, bg_record->bg_block_id, job_ptr->job_id, found_record->bg_block_id, job_ptr->end_time); requeue_job = 1; break; } debug2("need to make sure %s is free, it's part of %s", found_record->bg_block_id, bg_record->bg_block_id); list_push(delete_list, found_record); } list_iterator_destroy(itr); if (requeue_job) { FREE_NULL_LIST(delete_list); bg_reset_block(bg_record, bg_action_ptr->job_ptr); bg_record->modifying = 0; slurm_mutex_unlock(&block_state_mutex); bg_requeue_job(req_job_id, 0, 0, JOB_BOOT_FAIL, false); return; } slurm_mutex_unlock(&block_state_mutex); if (bg_conf->layout_mode == LAYOUT_DYNAMIC) delete_it = 1; free_block_list(req_job_id, delete_list, delete_it, 1); FREE_NULL_LIST(delete_list); while (1) { slurm_mutex_lock(&block_state_mutex); /* Failure will unlock block_state_mutex so no need to unlock before return. No need to reset modifying here if the block doesn't exist. */ if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) { error("Problem with deallocating blocks to run job %u " "on block %s", req_job_id, bg_action_ptr->bg_block_id); return; } /* If another thread is freeing this block we need to wait until it is done or we will get into a state where this job will be killed. */ if (!bg_record->free_cnt) break; debug("Waiting for block %s to free for job %u. 
" "%d thread(s) trying to free it", bg_record->bg_block_id, req_job_id, bg_record->free_cnt); slurm_mutex_unlock(&block_state_mutex); sleep(1); } /* This was set in the start_job function to close the above window where a job could be mistakenly requeued if another thread is trying to free this block as we are trying to run on it, which is fine since we will reboot it later. */ bg_record->modifying = 0; if ((bg_record->job_running <= NO_JOB_RUNNING) && !find_job_in_bg_record(bg_record, req_job_id)) { // bg_reset_block(bg_record); should already happened slurm_mutex_unlock(&block_state_mutex); debug("job %u already finished before boot", req_job_id); return; } if (bg_record->job_list && (bg_action_ptr->job_ptr->total_cpus != bg_record->cpu_cnt) && (list_count(bg_record->job_list) != 1)) { /* We don't allow modification of a block or reboot of a block if we are running multiple jobs on the block. */ debug2("no reboot"); goto no_reboot; } rc = 0; #ifdef HAVE_BGL if (bg_action_ptr->blrtsimage && xstrcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) { debug3("changing BlrtsImage from %s to %s", bg_record->blrtsimage, bg_action_ptr->blrtsimage); xfree(bg_record->blrtsimage); bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage); rc = 1; } #elif defined HAVE_BGP if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL) && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) { if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) { char *req_conn_type = conn_type_string_full(bg_action_ptr->conn_type); char *conn_type = conn_type_string_full(bg_record->conn_type); debug3("changing small block mode from %s to %s", conn_type, req_conn_type); xfree(req_conn_type); xfree(conn_type); } rc = 1; # ifndef HAVE_BG_FILES /* since we don't check state on an emulated system we * have to change it here */ bg_record->conn_type[0] = bg_action_ptr->conn_type[0]; # endif } #endif #ifdef HAVE_BG_L_P if (bg_action_ptr->linuximage && xstrcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) { # ifdef HAVE_BGL debug3("changing LinuxImage from %s to %s", bg_record->linuximage, bg_action_ptr->linuximage); # else debug3("changing CnloadImage from %s to %s", bg_record->linuximage, bg_action_ptr->linuximage); # endif xfree(bg_record->linuximage); bg_record->linuximage = xstrdup(bg_action_ptr->linuximage); rc = 1; } if (bg_action_ptr->ramdiskimage && xstrcasecmp(bg_action_ptr->ramdiskimage, bg_record->ramdiskimage)) { # ifdef HAVE_BGL debug3("changing RamDiskImage from %s to %s", bg_record->ramdiskimage, bg_action_ptr->ramdiskimage); # else debug3("changing IoloadImage from %s to %s", bg_record->ramdiskimage, bg_action_ptr->ramdiskimage); # endif xfree(bg_record->ramdiskimage); bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage); rc = 1; } #endif if (bg_action_ptr->mloaderimage && xstrcasecmp(bg_action_ptr->mloaderimage, bg_record->mloaderimage)) { debug3("changing MloaderImage from %s to %s", bg_record->mloaderimage, bg_action_ptr->mloaderimage); xfree(bg_record->mloaderimage); bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage); rc = 1; } if (rc || bg_action_ptr->reboot) { bg_record->modifying = 1; /* Increment free_cnt to make sure we don't loose this * block since bg_free_block will unlock block_state_mutex. 
*/ bg_record->free_cnt++; bg_free_block(bg_record, 1, 1); bg_record->free_cnt--; #if defined HAVE_BG_FILES && defined HAVE_BG_L_P #ifdef HAVE_BGL if ((rc = bridge_block_modify(bg_record->bg_block_id, RM_MODIFY_BlrtsImg, bg_record->blrtsimage)) != SLURM_SUCCESS) error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s", bg_err_str(rc)); if ((rc = bridge_block_modify(bg_record->bg_block_id, RM_MODIFY_LinuxImg, bg_record->linuximage)) != SLURM_SUCCESS) error("bridge_block_modify(RM_MODIFY_LinuxImg): %s", bg_err_str(rc)); if ((rc = bridge_block_modify(bg_record->bg_block_id, RM_MODIFY_RamdiskImg, bg_record->ramdiskimage)) != SLURM_SUCCESS) error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s", bg_err_str(rc)); #elif defined HAVE_BGP if ((rc = bridge_block_modify(bg_record->bg_block_id, RM_MODIFY_CnloadImg, bg_record->linuximage)) != SLURM_SUCCESS) error("bridge_block_modify(RM_MODIFY_CnloadImg): %s", bg_err_str(rc)); if ((rc = bridge_block_modify(bg_record->bg_block_id, RM_MODIFY_IoloadImg, bg_record->ramdiskimage)) != SLURM_SUCCESS) error("bridge_block_modify(RM_MODIFY_IoloadImg): %s", bg_err_str(rc)); if (bg_action_ptr->conn_type[0] > SELECT_SMALL) { char *conn_type = NULL; switch(bg_action_ptr->conn_type[0]) { case SELECT_HTC_S: conn_type = "s"; break; case SELECT_HTC_D: conn_type = "d"; break; case SELECT_HTC_V: conn_type = "v"; break; case SELECT_HTC_L: conn_type = "l"; break; default: break; } /* the option has to be set before the pool can be set */ if ((rc = bridge_block_modify( bg_record->bg_block_id, RM_MODIFY_Options, conn_type)) != SLURM_SUCCESS) error("bridge_set_data(RM_MODIFY_Options): %s", bg_err_str(rc)); } #endif if ((rc = bridge_block_modify(bg_record->bg_block_id, RM_MODIFY_MloaderImg, bg_record->mloaderimage)) != SLURM_SUCCESS) error("bridge_block_modify(RM_MODIFY_MloaderImg): %s", bg_err_str(rc)); #endif bg_record->modifying = 0; } no_reboot: if (bg_record->state == BG_BLOCK_FREE) { if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) { char reason[200]; bg_record->boot_state = 0; bg_record->boot_count = 0; if (rc == BG_ERROR_INVALID_STATE) snprintf(reason, sizeof(reason), "Block %s is in an incompatible " "state. This usually means " "hardware is allocated " "by another block (maybe outside " "of SLURM).", bg_record->bg_block_id); else snprintf(reason, sizeof(reason), "Couldn't boot block %s: %s", bg_record->bg_block_id, bg_err_str(rc)); slurm_mutex_unlock(&block_state_mutex); requeue_and_error(bg_record, reason); return; } } else if (bg_record->state == BG_BLOCK_BOOTING) { #ifdef HAVE_BG_FILES bg_record->boot_state = 1; #else if (!block_ptr_exist_in_list(bg_lists->booted, bg_record)) list_push(bg_lists->booted, bg_record); bg_record->state = BG_BLOCK_INITED; last_bg_update = time(NULL); #endif } if ((bg_record->job_running <= NO_JOB_RUNNING) && !find_job_in_bg_record(bg_record, req_job_id)) { slurm_mutex_unlock(&block_state_mutex); debug("job %u finished during the start of the boot " "(everything is ok)", req_job_id); return; } /* Don't reset boot_count, it will be reset when state changes, and needs to outlast a job allocation. */ /* bg_record->boot_count = 0; */ if (bg_record->state == BG_BLOCK_INITED) { debug("block %s is already ready.", bg_record->bg_block_id); /* Just in case reset the boot flags */ bg_record->boot_state = 0; bg_record->boot_count = 0; set_user_rc = bridge_block_sync_users(bg_record); block_inited = 1; } slurm_mutex_unlock(&block_state_mutex); /* This lock needs to happen after the block_state_mutex to avoid deadlock. 
*/ if (block_inited && bg_action_ptr->job_ptr) { slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; lock_slurmctld(job_write_lock); bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING); last_job_update = time(NULL); unlock_slurmctld(job_write_lock); } if (set_user_rc == SLURM_ERROR) { sleep(2); /* wait for the slurmd to begin the batch script, slurm_fail_job() is a no-op if issued prior to the script initiation do clean up just in case the fail job isn't ran */ (void) slurm_fail_job(req_job_id, JOB_BOOT_FAIL); } }
static void _notify_slurmctld_nodes(agent_info_t *agent_ptr, int no_resp_cnt, int retry_cnt) { ListIterator itr = NULL; ret_data_info_t *ret_data_info = NULL; state_t state; int is_ret_list = 1; /* Locks: Read config, write job, write node */ slurmctld_lock_t node_write_lock = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; thd_t *thread_ptr = agent_ptr->thread_struct; int i; /* Notify slurmctld of non-responding nodes */ if (no_resp_cnt) { /* Update node table data for non-responding nodes */ lock_slurmctld(node_write_lock); if (agent_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) { /* Requeue the request */ batch_job_launch_msg_t *launch_msg_ptr = *agent_ptr->msg_args_pptr; uint32_t job_id = launch_msg_ptr->job_id; job_complete(job_id, 0, true, false, 0); } unlock_slurmctld(node_write_lock); } if (retry_cnt && agent_ptr->retry) _queue_agent_retry(agent_ptr, retry_cnt); /* Update last_response on responding nodes */ lock_slurmctld(node_write_lock); for (i = 0; i < agent_ptr->thread_count; i++) { char *down_msg, *node_names; if (!thread_ptr[i].ret_list) { state = thread_ptr[i].state; is_ret_list = 0; goto switch_on_state; } is_ret_list = 1; itr = list_iterator_create(thread_ptr[i].ret_list); while ((ret_data_info = list_next(itr))) { state = ret_data_info->err; switch_on_state: switch(state) { case DSH_NO_RESP: if (!is_ret_list) { node_not_resp(thread_ptr[i].nodelist, thread_ptr[i]. start_time); } else { node_not_resp(ret_data_info->node_name, thread_ptr[i].start_time); } break; case DSH_FAILED: if (is_ret_list) node_names = ret_data_info->node_name; else node_names = thread_ptr[i].nodelist; #ifdef HAVE_FRONT_END down_msg = ""; #else set_node_down(node_names, "Prolog/Epilog failure"); down_msg = ", set to state DOWN"; #endif error("Prolog/Epilog failure on nodes %s%s", node_names, down_msg); break; case DSH_DONE: if (!is_ret_list) node_did_resp(thread_ptr[i].nodelist); else node_did_resp(ret_data_info->node_name); break; default: if (!is_ret_list) { error("unknown state returned for %s", thread_ptr[i].nodelist); } else { error("unknown state returned for %s", ret_data_info->node_name); } break; } if (!is_ret_list) goto finished; } list_iterator_destroy(itr); finished: ; } unlock_slurmctld(node_write_lock); if (run_scheduler) { run_scheduler = false; /* below functions all have their own locking */ if (schedule(0)) { schedule_job_save(); schedule_node_save(); } } if ((agent_ptr->msg_type == REQUEST_PING) || (agent_ptr->msg_type == REQUEST_HEALTH_CHECK) || (agent_ptr->msg_type == REQUEST_NODE_REGISTRATION_STATUS)) ping_end(); }
/* * get_nodes - get information on specific node(s) changed since some time * cmd_ptr IN - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...] * [<UPDATETIME>:ALL] * err_code OUT - 0 or an error code * err_msg OUT - response message * NOTE: xfree() err_msg if err_code is zero * RET 0 on success, -1 on failure * * Response format * ARG=<cnt>#<NODEID>: * STATE=<state>; Moab equivalent node state * [CAT=<reason>]; Reason for a node being down or drained * colon separator * CCLASS=<[part:cpus]>; SLURM partition with CPU count of node, * make have more than one partition * [CPULOAD=<load_ave>;] One minute BSD load average * [ARCH=<architecture>;] Computer architecture * [OS=<operating_system>;] Operating system * CMEMORY=<MB>; MB of memory on node * CDISK=<MB>; MB of disk space on node * CPROC=<cpus>; CPU count on node * [FEATURE=<feature>;] Features associated with node, if any * [GRES=<name>[:<count>],...;] generic resources on the node * [#<NODEID>:...]; */ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr = NULL, *tmp_char = NULL, *tmp_buf = NULL, *buf = NULL; time_t update_time; /* Locks: read node, read partition */ slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK }; int node_rec_cnt = 0, buf_size = 0; #ifdef HAVE_ALPS_CRAY /* Locks: write node */ slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }; /* * Run a Basil Inventory immediately before scheduling, to avoid * race conditions caused by ALPS node state change (caused e.g. * by the node health checker). * This relies on the above write lock for the node state. */ lock_slurmctld(node_write_lock); if (select_g_reconfigure()) { unlock_slurmctld(node_write_lock); *err_code = -720; *err_msg = "Unable to run ALPS inventory"; error("wiki: Unable to run ALPS inventory"); return -1; } unlock_slurmctld(node_write_lock); #endif arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "GETNODES lacks ARG"; error("wiki: GETNODES lacks ARG"); return -1; } update_time = (time_t) strtoul(arg_ptr+4, &tmp_char, 10); if (tmp_char[0] != ':') { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: GETNODES has invalid ARG value"); return -1; } tmp_char++; lock_slurmctld(node_read_lock); if (strncmp(tmp_char, "ALL", 3) == 0) { /* report all nodes */ buf = _dump_all_nodes(&node_rec_cnt, update_time); } else { struct node_record *node_ptr = NULL; char *node_name, *slurm_hosts; int node_cnt; hostset_t slurm_hostset; slurm_hosts = moab2slurm_task_list(tmp_char, &node_cnt); if ((slurm_hostset = hostset_create(slurm_hosts))) { while ((node_name = hostset_shift(slurm_hostset))) { node_ptr = find_node_record(node_name); if (node_ptr == NULL) { error("sched/wiki2: bad hostname %s", node_name); continue; } if (_hidden_node(node_ptr)) continue; tmp_buf = _dump_node(node_ptr, NULL, update_time); if (node_rec_cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); xfree(tmp_buf); node_rec_cnt++; } hostset_destroy(slurm_hostset); } else { error("hostset_create(%s): %m", slurm_hosts); } xfree(slurm_hosts); } unlock_slurmctld(node_read_lock); /* Prepend ("ARG=%d", node_rec_cnt) to reply message */ if (buf) buf_size = strlen(buf); tmp_buf = xmalloc(buf_size + 32); if (node_rec_cnt) sprintf(tmp_buf, "SC=0 ARG=%d#%s", node_rec_cnt, buf); else sprintf(tmp_buf, "SC=0 ARG=0#"); xfree(buf); *err_code = 0; *err_msg = tmp_buf; return 0; }
/* * _thread_per_group_rpc - thread to issue an RPC for a group of nodes * sending message out to one and forwarding it to * others if necessary. * IN/OUT args - pointer to task_info_t, xfree'd on completion */ static void *_thread_per_group_rpc(void *args) { int rc = SLURM_SUCCESS; slurm_msg_t msg; task_info_t *task_ptr = (task_info_t *) args; /* we cache some pointers from task_info_t because we need * to xfree args before being finished with their use. xfree * is required for timely termination of this pthread because * xfree could lock it at the end, preventing a timely * thread_exit */ pthread_mutex_t *thread_mutex_ptr = task_ptr->thread_mutex_ptr; pthread_cond_t *thread_cond_ptr = task_ptr->thread_cond_ptr; uint32_t *threads_active_ptr = task_ptr->threads_active_ptr; thd_t *thread_ptr = task_ptr->thread_struct_ptr; state_t thread_state = DSH_NO_RESP; slurm_msg_type_t msg_type = task_ptr->msg_type; bool is_kill_msg, srun_agent; List ret_list = NULL; ListIterator itr; ret_data_info_t *ret_data_info = NULL; int found = 0; int sig_array[2] = {SIGUSR1, 0}; /* Locks: Write job, write node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; xassert(args != NULL); xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sig_array); is_kill_msg = ( (msg_type == REQUEST_KILL_TIMELIMIT) || (msg_type == REQUEST_TERMINATE_JOB) ); srun_agent = ( (msg_type == SRUN_PING) || (msg_type == SRUN_EXEC) || (msg_type == SRUN_JOB_COMPLETE) || (msg_type == SRUN_STEP_MISSING) || (msg_type == SRUN_TIMEOUT) || (msg_type == SRUN_USER_MSG) || (msg_type == RESPONSE_RESOURCE_ALLOCATION) || (msg_type == SRUN_NODE_FAIL) ); thread_ptr->start_time = time(NULL); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->state = DSH_ACTIVE; thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT; slurm_mutex_unlock(thread_mutex_ptr); /* send request message */ slurm_msg_t_init(&msg); msg.msg_type = msg_type; msg.data = task_ptr->msg_args_ptr; #if 0 info("sending message type %u to %s", msg_type, thread_ptr->nodelist); #endif if (task_ptr->get_reply) { if(thread_ptr->addr) { msg.address = *thread_ptr->addr; if(!(ret_list = slurm_send_addr_recv_msgs( &msg, thread_ptr->nodelist, 0))) { error("_thread_per_group_rpc: " "no ret_list given"); goto cleanup; } } else { if(!(ret_list = slurm_send_recv_msgs( thread_ptr->nodelist, &msg, 0, true))) { error("_thread_per_group_rpc: " "no ret_list given"); goto cleanup; } } } else { if(thread_ptr->addr) { //info("got the address"); msg.address = *thread_ptr->addr; } else { //info("no address given"); if(slurm_conf_get_addr(thread_ptr->nodelist, &msg.address) == SLURM_ERROR) { error("_thread_per_group_rpc: " "can't find address for host %s, " "check slurm.conf", thread_ptr->nodelist); goto cleanup; } } //info("sending %u to %s", msg_type, thread_ptr->nodelist); if (slurm_send_only_node_msg(&msg) == SLURM_SUCCESS) { thread_state = DSH_DONE; } else { if (!srun_agent) _comm_err(thread_ptr->nodelist, msg_type); } goto cleanup; } //info("got %d messages back", list_count(ret_list)); found = 0; itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr)) != NULL) { rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); /* SPECIAL CASE: Mark node as IDLE if job already complete */ if (is_kill_msg && (rc == ESLURMD_KILL_JOB_ALREADY_COMPLETE)) { kill_job_msg_t *kill_job; kill_job = (kill_job_msg_t *) task_ptr->msg_args_ptr; rc = SLURM_SUCCESS; lock_slurmctld(job_write_lock); if (job_epilog_complete(kill_job->job_id, ret_data_info-> node_name, 
rc)) run_scheduler = true; unlock_slurmctld(job_write_lock); } /* SPECIAL CASE: Kill non-startable batch job, * Requeue the job on ESLURMD_PROLOG_FAILED */ if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) && (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) && (ret_data_info->type != RESPONSE_FORWARD_FAILED)) { batch_job_launch_msg_t *launch_msg_ptr = task_ptr->msg_args_ptr; uint32_t job_id = launch_msg_ptr->job_id; info("Killing non-startable batch job %u: %s", job_id, slurm_strerror(rc)); thread_state = DSH_DONE; ret_data_info->err = thread_state; lock_slurmctld(job_write_lock); job_complete(job_id, 0, false, false, _wif_status()); unlock_slurmctld(job_write_lock); continue; } if (((msg_type == REQUEST_SIGNAL_TASKS) || (msg_type == REQUEST_TERMINATE_TASKS)) && (rc == ESRCH)) { /* process is already dead, not a real error */ rc = SLURM_SUCCESS; } switch (rc) { case SLURM_SUCCESS: /* debug("agent processed RPC to node %s", */ /* ret_data_info->node_name); */ thread_state = DSH_DONE; break; case SLURM_UNKNOWN_FORWARD_ADDR: error("We were unable to forward message to '%s'. " "Make sure the slurm.conf for each slurmd " "contain all other nodes in your system.", ret_data_info->node_name); thread_state = DSH_NO_RESP; break; case ESLURMD_EPILOG_FAILED: error("Epilog failure on host %s, " "setting DOWN", ret_data_info->node_name); thread_state = DSH_FAILED; break; case ESLURMD_PROLOG_FAILED: thread_state = DSH_FAILED; break; case ESLURM_INVALID_JOB_ID: /* Not indicative of a real error */ case ESLURMD_JOB_NOTRUNNING: /* Not indicative of a real error */ debug2("agent processed RPC to node %s: %s", ret_data_info->node_name, slurm_strerror(rc)); thread_state = DSH_DONE; break; default: if (!srun_agent) { if (ret_data_info->err) errno = ret_data_info->err; else errno = rc; rc = _comm_err(ret_data_info->node_name, msg_type); } if (srun_agent) thread_state = DSH_FAILED; else if(ret_data_info->type == RESPONSE_FORWARD_FAILED) /* check if a forward failed */ thread_state = DSH_NO_RESP; else { /* some will fail that don't mean anything went * bad like a job term request on a job that is * already finished, we will just exit on those * cases */ thread_state = DSH_DONE; } } ret_data_info->err = thread_state; } list_iterator_destroy(itr); cleanup: xfree(args); /* handled at end of thread just in case resend is needed */ destroy_forward(&msg.forward); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->ret_list = ret_list; thread_ptr->state = thread_state; thread_ptr->end_time = (time_t) difftime(time(NULL), thread_ptr->start_time); /* Signal completion so another thread can replace us */ (*threads_active_ptr)--; pthread_cond_signal(thread_cond_ptr); slurm_mutex_unlock(thread_mutex_ptr); return (void *) NULL; }
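/*
 * Standalone sketch of the completion-signaling idiom used at the end of
 * _thread_per_group_rpc(): each worker decrements a shared active count
 * and signals a condition variable so the spawning thread can launch a
 * replacement. All names and the MAX_ACTIVE limit are hypothetical; this
 * is not agent code, only an illustration of the pattern.
 */
#include <pthread.h>

#define MAX_ACTIVE 10

static pthread_mutex_t agent_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  agent_cond  = PTHREAD_COND_INITIALIZER;
static unsigned int    threads_active = 0;

/* Worker: do the work, then signal completion exactly as above */
static void *_worker(void *arg)
{
	(void) arg;
	/* ... issue the RPC here ... */
	pthread_mutex_lock(&agent_mutex);
	threads_active--;
	pthread_cond_signal(&agent_cond);
	pthread_mutex_unlock(&agent_mutex);
	return NULL;
}

/* Spawner: block while the pool is full, then account for a new worker */
static void _spawn_throttled(pthread_t *tid)
{
	pthread_mutex_lock(&agent_mutex);
	while (threads_active >= MAX_ACTIVE)
		pthread_cond_wait(&agent_cond, &agent_mutex);
	threads_active++;
	pthread_mutex_unlock(&agent_mutex);
	pthread_create(tid, NULL, _worker, NULL);
}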
/* dump_all_part_state - save the state of all partitions to file */
int dump_all_part_state(void)
{
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = BUF_SIZE;
	ListIterator part_iterator;
	struct part_record *part_ptr;
	int error_code = 0, log_fd;
	char *old_file, *new_file, *reg_file;
	/* Locks: Read config and partition */
	slurmctld_lock_t part_read_lock =
	    { READ_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
	Buf buffer = init_buf(high_buffer_size);
	DEF_TIMERS;

	START_TIMER;
	/* write header: time */
	packstr(PART_STATE_VERSION, buffer);
	pack_time(time(NULL), buffer);

	/* write partition records to buffer */
	lock_slurmctld(part_read_lock);
	part_iterator = list_iterator_create(part_list);
	while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
		xassert (part_ptr->magic == PART_MAGIC);
		_dump_part_state(part_ptr, buffer);
	}
	list_iterator_destroy(part_iterator);

	old_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(old_file, "/part_state.old");
	reg_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(reg_file, "/part_state");
	new_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(new_file, "/part_state.new");
	unlock_slurmctld(part_read_lock);

	/* write the buffer to file */
	lock_state_files();
	log_fd = creat(new_file, 0600);
	if (log_fd < 0) {
		error("Can't save state, error creating file %s, %m",
		      new_file);
		error_code = errno;
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if (amount < 0) {
				if (errno == EINTR)
					continue;  /* retry interrupted write */
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}
		rc = fsync_and_close(log_fd, "partition");
		if (rc && !error_code)
			error_code = rc;
	}
	if (error_code)
		(void) unlink(new_file);
	else {	/* file shuffle */
		(void) unlink(old_file);
		if (link(reg_file, old_file)) {
			debug4("unable to create link for %s -> %s: %m",
			       reg_file, old_file);
		}
		(void) unlink(reg_file);
		if (link(new_file, reg_file)) {
			debug4("unable to create link for %s -> %s: %m",
			       new_file, reg_file);
		}
		(void) unlink(new_file);
	}
	xfree(old_file);
	xfree(reg_file);
	xfree(new_file);
	unlock_state_files();

	free_buf(buffer);
	END_TIMER2("dump_all_part_state");
	return 0;
}
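/*
 * Standalone sketch of the state-file shuffle used above: write the new
 * image to "<file>.new", rotate the current file to "<file>.old" via
 * link(2), then promote ".new" to the canonical name. The helper name,
 * path handling, and buffer sizes are assumptions of this sketch, not
 * slurmctld code; it only illustrates the rotation and the EINTR-safe
 * write loop.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int _save_state(const char *base, const char *data, size_t len)
{
	char reg_file[256], old_file[256], new_file[256];
	ssize_t n;
	int fd;

	snprintf(reg_file, sizeof(reg_file), "%s",     base);
	snprintf(old_file, sizeof(old_file), "%s.old", base);
	snprintf(new_file, sizeof(new_file), "%s.new", base);

	fd = creat(new_file, 0600);
	if (fd < 0)
		return -1;
	while (len > 0) {			/* EINTR-safe write loop */
		n = write(fd, data, len);
		if (n < 0) {
			if (errno == EINTR)
				continue;
			close(fd);
			return -1;
		}
		data += n;
		len  -= n;
	}
	if (fsync(fd) || close(fd))
		return -1;

	(void) unlink(old_file);		/* rotate: reg -> old */
	(void) link(reg_file, old_file);
	(void) unlink(reg_file);		/* promote: new -> reg */
	if (link(new_file, reg_file))
		return -1;
	(void) unlink(new_file);
	return 0;
}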
/* * job_will_run2 - Determine if, when and where a pending job can be * initiated with the currently running jobs either preempted * or left running as on other resources * cmd_ptr IN - CMD=JOBWILLRUN ARG=<JOBID> [STARTTIME=<TIME>] * NODES=<AVAIL_NODES> [PREEMPT=<JOBID1>[,<JOBID2> ..]] * err_code OUT - 0 on success or some error code * err_msg OUT - error message if any of the specified jobs can not be started * at the specified time (if given) on the available nodes. * Otherwise information on when and where the pending jobs * will be initiated * ARG=<JOBID> TASKS=<CPU_COUNT> STARTTIME=<TIME> * NODES=<USED_NODES> [PREEMPT=<JOBID1>[,<JOBID2> ..]] * NOTE: xfree() err_msg if err_code is zero * RET 0 on success, -1 on failure */ extern int job_will_run2(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *buf, *tmp_buf, *tmp_char; int preemptee_cnt = 0; uint32_t jobid, *preemptee = NULL, tmp_id; time_t start_time; char *avail_nodes = NULL; /* Locks: write job, read node and partition info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "JOBWILLRUN lacks ARG"; error("wiki: JOBWILLRUN lacks ARG"); return -1; } arg_ptr += 4; jobid = strtoul(arg_ptr, &tmp_char, 10); if ((tmp_char[0] != ' ') && (tmp_char[0] != '\0')) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: JOBWILLRUN has invalid ARG value"); return -1; } arg_ptr = strstr(cmd_ptr, "STARTTIME="); if (arg_ptr) { arg_ptr += 10; start_time = strtoul(arg_ptr, &tmp_char, 10); if ((tmp_char[0] != ' ') && (tmp_char[0] != '\0')) { *err_code = -300; *err_msg = "Invalid STARTTIME value"; error("wiki: JOBWILLRUN has invalid STARTTIME value"); return -1; } } else { start_time = time(NULL); } arg_ptr = strstr(cmd_ptr, "PREEMPT="); if (arg_ptr) { arg_ptr += 8; preemptee = xmalloc(sizeof(uint32_t) * strlen(arg_ptr)); while (1) { tmp_id = strtoul(arg_ptr, &tmp_char, 10); if ((tmp_char[0] != ' ') && (tmp_char[0] != '\0') && (tmp_char[0] != ',')) { *err_code = -300; *err_msg = "Invalid PREEMPT value"; error("wiki: JOBWILLRUN has invalid PREEMPT " "value"); xfree(preemptee); xfree(avail_nodes); return -1; } preemptee[preemptee_cnt++] = tmp_id; if (tmp_char[0] != ',') break; arg_ptr = tmp_char + 1; } } /* Keep this last, since we modify the input string */ arg_ptr = strstr(cmd_ptr, "NODES="); if (arg_ptr) { arg_ptr += 6; avail_nodes = xstrdup(arg_ptr); arg_ptr = strchr(avail_nodes, ' '); if (arg_ptr) arg_ptr[0] = '\0'; } else { *err_code = -300; *err_msg = "Missing NODES value"; error("wiki: JOBWILLRUN lacks NODES value"); xfree(preemptee); return -1; } lock_slurmctld(job_write_lock); buf = _will_run_test2(jobid, start_time, avail_nodes, preemptee, preemptee_cnt, err_code, err_msg); unlock_slurmctld(job_write_lock); xfree(preemptee); xfree(avail_nodes); if (!buf) return -1; tmp_buf = xmalloc(strlen(buf) + 32); sprintf(tmp_buf, "SC=0 ARG=%s", buf); xfree(buf); *err_code = 0; *err_msg = tmp_buf; return 0; }
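/*
 * Illustrative sketch only (assumes the wiki2 plugin context for
 * job_will_run2(), xfree(), info() and error()): invoking the function
 * with the command layout documented above. The job id, start time,
 * node expression, preemptee list, and function name are made-up
 * example values.
 */
static void _example_job_will_run2_call(void)
{
	char cmd[] = "CMD=JOBWILLRUN ARG=123 STARTTIME=1700000000 "
		     "NODES=tux[0-3] PREEMPT=118,119";
	int err_code = 0;
	char *err_msg = NULL;

	if (job_will_run2(cmd, &err_code, &err_msg) == 0) {
		/* Per the header comment above, the reply looks like
		 * "SC=0 ARG=<JOBID> TASKS=... STARTTIME=... NODES=..."
		 * and the caller must xfree() it */
		info("JOBWILLRUN reply: %s", err_msg);
		xfree(err_msg);
	} else {
		/* Failure: err_msg is set by the callee; do not free */
		error("JOBWILLRUN failed (%d): %s", err_code, err_msg);
	}
}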
static void _proc_msg(int new_fd, char *msg, slurm_addr_t cli_addr)
{
	/* Locks: Read job and node data */
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write job, write node, read partition, read federation */
	slurmctld_lock_t job_write_lock2 = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	/* Locks: Write node data, read federation */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
	char *cmd_ptr, *resp = NULL, *msg_decrypted = NULL;
	uid_t cmd_uid;
	uint32_t protocol_version = 0;

	if (!msg) {
		info("slurmctld/nonstop: NULL message received");
		resp = xstrdup("Error:\"NULL message received\"");
		goto send_resp;
	}

	msg_decrypted = _decrypt(msg, &cmd_uid);
	if (!msg_decrypted) {
		info("slurmctld/nonstop: Message decrypt failure");
		resp = xstrdup("Error:\"Message decrypt failure\"");
		goto send_resp;
	}
	if (nonstop_debug > 0)
		info("slurmctld/nonstop: msg decrypted:%s", msg_decrypted);
	cmd_ptr = msg_decrypted;

	/* 123456789012345678901234567890 */
	if (xstrncmp(cmd_ptr, version_string, 13) == 0) {
		cmd_ptr = strchr(cmd_ptr + 13, ':');
		if (cmd_ptr) {
			cmd_ptr++;
			protocol_version = SLURM_PROTOCOL_VERSION;
		}
	}
	if (protocol_version == 0) {
		info("slurmctld/nonstop: Message version invalid");
		resp = xstrdup("Error:\"Message version invalid\"");
		goto send_resp;
	}
	if (xstrncmp(cmd_ptr, "CALLBACK:JOBID:", 15) == 0) {
		resp = register_callback(cmd_ptr, cmd_uid, cli_addr,
					 protocol_version);
	} else if (xstrncmp(cmd_ptr, "DRAIN:NODES:", 12) == 0) {
		lock_slurmctld(node_write_lock);
		resp = drain_nodes_user(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(node_write_lock);
	} else if (xstrncmp(cmd_ptr, "DROP_NODE:JOBID:", 16) == 0) {
		lock_slurmctld(job_write_lock2);
		resp = drop_node(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock2);
	} else if (xstrncmp(cmd_ptr, "GET_FAIL_NODES:JOBID:", 21) == 0) {
		lock_slurmctld(job_read_lock);
		resp = fail_nodes(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_read_lock);
	} else if (xstrncmp(cmd_ptr, "REPLACE_NODE:JOBID:", 19) == 0) {
		lock_slurmctld(job_write_lock2);
		resp = replace_node(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock2);
	} else if (xstrncmp(cmd_ptr, "SHOW_CONFIG", 11) == 0) {
		resp = show_config(cmd_ptr, cmd_uid, protocol_version);
	} else if (xstrncmp(cmd_ptr, "SHOW_JOB:JOBID:", 15) == 0) {
		resp = show_job(cmd_ptr, cmd_uid, protocol_version);
	} else if (xstrncmp(cmd_ptr, "TIME_INCR:JOBID:", 16) == 0) {
		lock_slurmctld(job_write_lock);
		resp = time_incr(cmd_ptr, cmd_uid, protocol_version);
		unlock_slurmctld(job_write_lock);
	} else {
		info("slurmctld/nonstop: Invalid command: %s", cmd_ptr);
		xstrfmtcat(resp, "%s ECMD", SLURM_VERSION_STRING);
	}

send_resp:
	if (nonstop_debug > 0)
		info("slurmctld/nonstop: msg send:%s", resp);
	_send_reply(new_fd, resp);
	xfree(resp);
	if (msg_decrypted)
		free(msg_decrypted);
	return;
}
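/*
 * Standalone sketch of the parsing done above: skip a version prefix up
 * to the next ':' and then dispatch on a fixed-length command prefix.
 * "NONSTOP/1.0" is a hypothetical stand-in for the plugin's version
 * string; the two command prefixes mirror real ones above. The point is
 * that the strncmp() length should cover the full prefix, trailing ':'
 * included (e.g. strlen("DROP_NODE:JOBID:") is 16).
 */
#include <stdio.h>
#include <string.h>

static void _example_dispatch(char *msg)
{
	char *cmd;

	/* Version check: accept "NONSTOP/1.0:<anything>" only */
	if (strncmp(msg, "NONSTOP/1.0", 11) != 0)
		return;
	cmd = strchr(msg + 11, ':');
	if (!cmd)
		return;
	cmd++;

	if (strncmp(cmd, "DROP_NODE:JOBID:", 16) == 0)
		printf("drop node for job %s\n", cmd + 16);
	else if (strncmp(cmd, "SHOW_JOB:JOBID:", 15) == 0)
		printf("show job %s\n", cmd + 15);
	else
		printf("unknown command\n");
}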
/*****************************************************************************\
 * message handler thread
\*****************************************************************************/
static void *_msg_thread(void *no_data)
{
	slurm_fd_t sock_fd = -1, new_fd;
	slurm_addr_t cli_addr;
	char *msg;
	slurm_ctl_conf_t *conf;
	int i;
	/* Locks: Write configuration, job, node, and partition */
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	conf = slurm_conf_lock();
	sched_port = conf->schedport;
	slurm_conf_unlock();

	/* Wait until configuration is completely loaded */
	lock_slurmctld(config_write_lock);
	unlock_slurmctld(config_write_lock);

	/* If SchedulerPort is already taken, keep trying to open it
	 * once per minute. Slurmctld will continue to function
	 * during this interval even if nothing can be scheduled. */
	for (i = 0; (!thread_shutdown); i++) {
		if (i > 0)
			sleep(60);
		sock_fd = slurm_init_msg_engine_port(sched_port);
		if (sock_fd != SLURM_SOCKET_ERROR)
			break;
		error("wiki: slurm_init_msg_engine_port %u %m", sched_port);
		error("wiki: Unable to communicate with Moab");
	}

	/* Process incoming RPCs until told to shutdown */
	while (!thread_shutdown) {
		if ((new_fd = slurm_accept_msg_conn(sock_fd, &cli_addr))
		    == SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("wiki: slurm_accept_msg_conn %m");
			continue;
		}
		if (thread_shutdown) {
			close(new_fd);
			break;
		}
		/* It would be nice to create a pthread for each new
		 * RPC, but that leaks memory on some systems when
		 * done from a plugin.
		 * FIXME: Maintain a pool of pthreads and reuse them. */
		err_code = 0;
		err_msg = "";
		msg = _recv_msg(new_fd);
		if (msg) {
			_proc_msg(new_fd, msg);
			xfree(msg);
		}
		slurm_close_accepted_conn(new_fd);
	}
	if (sock_fd > 0)
		(void) slurm_shutdown_msg_engine(sock_fd);
	pthread_exit((void *) 0);
	return NULL;
}
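/*
 * Standalone sketch of the bind-retry idiom above, using plain BSD
 * sockets instead of the slurm_* messaging wrappers: if the port is
 * busy, retry once per minute until a shutdown flag is set. All names,
 * the backlog, and the IPv4-only setup are assumptions of this sketch.
 */
#include <netinet/in.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static volatile bool shutdown_flag = false;

static int _listen_with_retry(uint16_t port)
{
	struct sockaddr_in addr;
	int fd = -1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family      = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port        = htons(port);

	while (!shutdown_flag) {
		fd = socket(AF_INET, SOCK_STREAM, 0);
		if (fd >= 0 &&
		    bind(fd, (struct sockaddr *) &addr, sizeof(addr)) == 0 &&
		    listen(fd, 16) == 0)
			return fd;	/* listening socket ready */
		if (fd >= 0)
			close(fd);
		sleep(60);		/* port busy: retry in a minute */
	}
	return -1;
}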