/* Start a job: * CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list> [COMMENT=<whatever>] * RET 0 on success, -1 on failure */ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *comment_ptr, *task_ptr, *tasklist, *tmp_char; int i, rc, task_cnt; uint32_t jobid; hostlist_t hl = (hostlist_t) NULL; char *host_string; static char reply_msg[128]; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "STARTJOB lacks ARG"; error("wiki: STARTJOB lacks ARG"); return -1; } jobid = strtoul(arg_ptr+4, &tmp_char, 10); if (!isspace(tmp_char[0])) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: STARTJOB has invalid jobid"); return -1; } comment_ptr = strstr(cmd_ptr, "COMMENT="); task_ptr = strstr(cmd_ptr, "TASKLIST="); if (comment_ptr) { comment_ptr[7] = ':'; comment_ptr += 8; if (comment_ptr[0] == '\"') { comment_ptr++; for (i=0; i<MAX_COMMENT_LEN; i++) { if (comment_ptr[i] == '\0') break; if (comment_ptr[i] == '\"') { comment_ptr[i] = '\0'; break; } } if (i == MAX_COMMENT_LEN) comment_ptr[i-1] = '\0'; } else if (comment_ptr[0] == '\'') { comment_ptr++; for (i=0; i<MAX_COMMENT_LEN; i++) { if (comment_ptr[i] == '\0') break; if (comment_ptr[i] == '\'') { comment_ptr[i] = '\0'; break; } } if (i == MAX_COMMENT_LEN) comment_ptr[i-1] = '\0'; } else null_term(comment_ptr); } if (task_ptr == NULL) { *err_code = -300; *err_msg = "STARTJOB lacks TASKLIST"; error("wiki: STARTJOB lacks TASKLIST"); return -1; } task_ptr += 9; /* skip over "TASKLIST=" */ if ((task_ptr[0] == '\0') || isspace(task_ptr[0])) { /* No TASKLIST specification, useful for testing */ host_string = xstrdup(""); task_cnt = 0; tasklist = NULL; } else { null_term(task_ptr); tasklist = moab2slurm_task_list(task_ptr, &task_cnt); if (tasklist) hl = hostlist_create(tasklist); if ((tasklist == NULL) || (hl == NULL)) { *err_code = -300; *err_msg = "STARTJOB TASKLIST is invalid"; error("wiki: STARTJOB TASKLIST is invalid: %s", task_ptr); xfree(tasklist); return -1; } hostlist_uniq(hl); hostlist_sort(hl); host_string = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (host_string == NULL) { *err_code = -300; *err_msg = "STARTJOB has invalid TASKLIST"; error("wiki: STARTJOB has invalid TASKLIST: %s", tasklist); xfree(tasklist); return -1; } } rc = _start_job(jobid, task_cnt, host_string, tasklist, comment_ptr, err_code, err_msg); xfree(host_string); xfree(tasklist); if (rc == 0) { snprintf(reply_msg, sizeof(reply_msg), "job %u started successfully", jobid); *err_msg = reply_msg; } return rc; }
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit, char *name_ptr, char *start_ptr, char *feature_ptr, char *env_ptr, char *comment_ptr, char *gres_ptr, char *wckey_ptr) { struct job_record *job_ptr; time_t now = time(NULL); bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) { info("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (comment_ptr) { info("wiki: change job %u comment %s", jobid, comment_ptr); xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); last_job_update = now; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (env_ptr) { bool have_equal = false; char old_sep[1]; int begin = 0, i; if (job_ptr->batch_flag == 0) { error("wiki: attempt to set environment variables " "for non-batch job %u", jobid); return ESLURM_DISABLED; } for (i=0; ; i++) { if (env_ptr[i] == '=') { if (have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } have_equal = true; if (env_ptr[i+1] == '\"') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\"') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } else if (env_ptr[i+1] == '\'') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\'') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } } if (isspace(env_ptr[i]) || (env_ptr[i] == ',')) { if (!have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } old_sep[0] = env_ptr[i]; env_ptr[i] = '\0'; xrealloc(job_ptr->details->env_sup, sizeof(char *) * (job_ptr->details->env_cnt+1)); job_ptr->details->env_sup [job_ptr->details->env_cnt++] = xstrdup(&env_ptr[begin]); info("wiki: for job %u add env: %s", jobid, &env_ptr[begin]); env_ptr[i] = old_sep[0]; if (isspace(old_sep[0])) break; begin = i + 1; have_equal = false; } } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = now; } if (bank_ptr && (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) { return EINVAL; } if (feature_ptr) { if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u features to %s", jobid, feature_ptr); job_ptr->details->features = xstrdup(feature_ptr); last_job_update = now; } else { error("wiki: MODIFYJOB features of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (start_ptr) { char *end_ptr; uint32_t begin_time = strtol(start_ptr, &end_ptr, 10); if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u begin time to %u", jobid, begin_time); job_ptr->details->begin_time = begin_time; last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB begin_time of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (name_ptr) { if (IS_JOB_PENDING(job_ptr)) { info("wiki: change job %u name %s", jobid, name_ptr); xfree(job_ptr->name); job_ptr->name = xstrdup(name_ptr); last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB name of non-pending job %u", jobid); return ESLURM_DISABLED; } } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB hostlist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) { rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = now; update_accounting = true; } if (new_node_cnt) { job_desc_msg_t job_desc; #ifdef HAVE_BG uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL}; static uint16_t cpus_per_node = 0; if (!cpus_per_node) { select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT, &cpus_per_node); } #endif if(!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } memset(&job_desc, 0, sizeof(job_desc_msg_t)); job_desc.min_nodes = new_node_cnt; job_desc.max_nodes = NO_VAL; job_desc.select_jobinfo = select_g_select_jobinfo_alloc(); select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc); select_g_select_jobinfo_free(job_desc.select_jobinfo); job_ptr->details->min_nodes = job_desc.min_nodes; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < job_desc.min_nodes)) job_ptr->details->max_nodes = job_desc.min_nodes; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); #ifdef HAVE_BG job_ptr->details->min_cpus = job_desc.min_cpus; job_ptr->details->max_cpus = job_desc.max_cpus; job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus; new_node_cnt = job_ptr->details->min_cpus; if (cpus_per_node) new_node_cnt /= cpus_per_node; /* This is only set up so accounting is set up correctly */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &new_node_cnt); /* reset geo since changing this makes any geo potentially invalid */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_GEOMETRY, geometry); #endif last_job_update = now; update_accounting = true; } if (gres_ptr) { char *orig_gres; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB GRES of non-pending job %u", jobid); return ESLURM_DISABLED; } orig_gres = job_ptr->gres; job_ptr->gres = NULL; if (gres_ptr[0]) job_ptr->gres = xstrdup(gres_ptr); if (gres_plugin_job_state_validate(job_ptr->gres, &job_ptr->gres_list)) { error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr); xfree(job_ptr->gres); job_ptr->gres = orig_gres; return ESLURM_INVALID_GRES; } xfree(orig_gres); } if (wckey_ptr) { int rc = update_job_wckey("update_job", job_ptr, wckey_ptr); if (rc != SLURM_SUCCESS) { error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr); return rc; } } if (update_accounting) { if (job_ptr->details && job_ptr->details->begin_time) { /* Update job record in accounting to reflect * the changes */ jobacct_storage_g_job_start(acct_db_conn, job_ptr); } } return SLURM_SUCCESS; }
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit) { struct job_record *job_ptr; bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr)) { error("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = time(NULL); } if (bank_ptr) { if (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS) return EINVAL; else update_accounting = true; } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB tasklist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) { rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = time(NULL); update_accounting = true; } if (new_node_cnt) { if (IS_JOB_PENDING(job_ptr) && job_ptr->details) { job_ptr->details->min_nodes = new_node_cnt; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < new_node_cnt)) job_ptr->details->max_nodes = new_node_cnt; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); last_job_update = time(NULL); update_accounting = true; } else { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (update_accounting) { /* Update job record in accounting to reflect changes */ jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } return SLURM_SUCCESS; }
/* Start a job: * CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list> * RET 0 on success, -1 on failure */ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *task_ptr, *tasklist, *tmp_char; int rc, task_cnt; uint32_t jobid; hostlist_t hl = (hostlist_t) NULL; char *host_string; static char reply_msg[128]; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "STARTJOB lacks ARG"; error("wiki: STARTJOB lacks ARG"); return -1; } jobid = strtoul(arg_ptr+4, &tmp_char, 10); if (!isspace(tmp_char[0])) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: STARTJOB has invalid jobid"); return -1; } task_ptr = strstr(cmd_ptr, "TASKLIST="); if (task_ptr == NULL) { *err_code = -300; *err_msg = "STARTJOB lacks TASKLIST"; error("wiki: STARTJOB lacks TASKLIST"); return -1; } task_ptr += 9; /* skip over "TASKLIST=" */ null_term(task_ptr); tasklist = moab2slurm_task_list(task_ptr, &task_cnt); if (tasklist) hl = hostlist_create(tasklist); if ((tasklist == NULL) || (hl == NULL)) { *err_code = -300; *err_msg = "STARTJOB TASKLIST is invalid"; error("wiki: STARTJOB TASKLIST is invalid: %s", task_ptr); xfree(tasklist); return -1; } hostlist_uniq(hl); hostlist_sort(hl); host_string = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (host_string == NULL) { *err_code = -300; *err_msg = "STARTJOB has invalid TASKLIST"; error("wiki: STARTJOB has invalid TASKLIST: %s", tasklist); xfree(tasklist); return -1; } rc = _start_job(jobid, task_cnt, host_string, tasklist, err_code, err_msg); xfree(host_string); xfree(tasklist); if (rc == 0) { snprintf(reply_msg, sizeof(reply_msg), "job %u started successfully", jobid); *err_msg = reply_msg; } return rc; }
/* * get_nodes - get information on specific node(s) changed since some time * cmd_ptr IN - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...] * [<UPDATETIME>:ALL] * err_code OUT - 0 or an error code * err_msg OUT - response message * NOTE: xfree() err_msg if err_code is zero * RET 0 on success, -1 on failure * * Response format * ARG=<cnt>#<NODEID>: * STATE=<state>; Moab equivalent node state * [CAT=<reason>]; Reason for a node being down or drained * colon separator * CCLASS=<[part:cpus]>; SLURM partition with CPU count of node, * make have more than one partition * [CPULOAD=<load_ave>;] One minute BSD load average * [ARCH=<architecture>;] Computer architecture * [OS=<operating_system>;] Operating system * CMEMORY=<MB>; MB of memory on node * CDISK=<MB>; MB of disk space on node * CPROC=<cpus>; CPU count on node * [FEATURE=<feature>;] Features associated with node, if any * [GRES=<name>[:<count>],...;] generic resources on the node * [#<NODEID>:...]; */ extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr = NULL, *tmp_char = NULL, *tmp_buf = NULL, *buf = NULL; time_t update_time; /* Locks: read node, read partition */ slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK }; int node_rec_cnt = 0, buf_size = 0; #ifdef HAVE_ALPS_CRAY /* Locks: write node */ slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK }; /* * Run a Basil Inventory immediately before scheduling, to avoid * race conditions caused by ALPS node state change (caused e.g. * by the node health checker). * This relies on the above write lock for the node state. */ lock_slurmctld(node_write_lock); if (select_g_reconfigure()) { unlock_slurmctld(node_write_lock); *err_code = -720; *err_msg = "Unable to run ALPS inventory"; error("wiki: Unable to run ALPS inventory"); return -1; } unlock_slurmctld(node_write_lock); #endif arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "GETNODES lacks ARG"; error("wiki: GETNODES lacks ARG"); return -1; } update_time = (time_t) strtoul(arg_ptr+4, &tmp_char, 10); if (tmp_char[0] != ':') { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: GETNODES has invalid ARG value"); return -1; } tmp_char++; lock_slurmctld(node_read_lock); if (strncmp(tmp_char, "ALL", 3) == 0) { /* report all nodes */ buf = _dump_all_nodes(&node_rec_cnt, update_time); } else { struct node_record *node_ptr = NULL; char *node_name, *slurm_hosts; int node_cnt; hostset_t slurm_hostset; slurm_hosts = moab2slurm_task_list(tmp_char, &node_cnt); if ((slurm_hostset = hostset_create(slurm_hosts))) { while ((node_name = hostset_shift(slurm_hostset))) { node_ptr = find_node_record(node_name); if (node_ptr == NULL) { error("sched/wiki2: bad hostname %s", node_name); continue; } if (_hidden_node(node_ptr)) continue; tmp_buf = _dump_node(node_ptr, NULL, update_time); if (node_rec_cnt > 0) xstrcat(buf, "#"); xstrcat(buf, tmp_buf); xfree(tmp_buf); node_rec_cnt++; } hostset_destroy(slurm_hostset); } else { error("hostset_create(%s): %m", slurm_hosts); } xfree(slurm_hosts); } unlock_slurmctld(node_read_lock); /* Prepend ("ARG=%d", node_rec_cnt) to reply message */ if (buf) buf_size = strlen(buf); tmp_buf = xmalloc(buf_size + 32); if (node_rec_cnt) sprintf(tmp_buf, "SC=0 ARG=%d#%s", node_rec_cnt, buf); else sprintf(tmp_buf, "SC=0 ARG=0#"); xfree(buf); *err_code = 0; *err_msg = tmp_buf; return 0; }