/* For each burst buffer record, set the use_time to the time at which its
 * use is expected to begin (i.e. each job's expected start time) */
extern void bb_set_use_time(bb_state_t *state_ptr)
{
	struct job_record *job_ptr;
	bb_alloc_t *bb_alloc = NULL;
	time_t now = time(NULL);
	int i;

	state_ptr->next_end_time = now + 60 * 60; /* Start estimate now+1hour */
	for (i = 0; i < BB_HASH_SIZE; i++) {
		bb_alloc = state_ptr->bb_ahash[i];
		while (bb_alloc) {
			if (bb_alloc->job_id &&
			    ((bb_alloc->state == BB_STATE_STAGING_IN) ||
			     (bb_alloc->state == BB_STATE_STAGED_IN))) {
				job_ptr = find_job_record(bb_alloc->job_id);
				if (!job_ptr && !bb_alloc->orphaned) {
					bb_alloc->orphaned = true;
					error("%s: Job %u not found for allocated burst buffer",
					      __func__, bb_alloc->job_id);
					bb_alloc->use_time = now + 24 * 60 * 60;
				} else if (!job_ptr) {
					bb_alloc->use_time = now + 24 * 60 * 60;
				} else if (job_ptr->start_time) {
					bb_alloc->end_time = job_ptr->end_time;
					bb_alloc->use_time = job_ptr->start_time;
				} else {
					/* Unknown start time */
					bb_alloc->use_time = now + 60 * 60;
				}
			} else if (bb_alloc->job_id) {
				job_ptr = find_job_record(bb_alloc->job_id);
				if (job_ptr)
					bb_alloc->end_time = job_ptr->end_time;
			} else {
				bb_alloc->use_time = now;
			}
			if (bb_alloc->end_time && bb_alloc->size) {
				if (bb_alloc->end_time <= now)
					state_ptr->next_end_time = now;
				else if (state_ptr->next_end_time >
					 bb_alloc->end_time) {
					state_ptr->next_end_time =
						bb_alloc->end_time;
				}
			}
			bb_alloc = bb_alloc->next;
		}
	}
}
/* Purge per-job burst buffer records when the stage-out has completed and
 * the job has been purged from Slurm */
static void _purge_bb_rec(void)
{
	static time_t time_last_purge = 0;
	time_t now = time(NULL);
	bb_alloc_t **bb_pptr, *bb_ptr = NULL;
	int i;

	if (difftime(now, time_last_purge) > 60) {	/* Once per minute */
		time_last_purge = now;
		for (i = 0; i < BB_HASH_SIZE; i++) {
			bb_pptr = &bb_hash[i];
			bb_ptr = bb_hash[i];
			while (bb_ptr) {
				if ((bb_ptr->job_id != 0) &&
				    (bb_ptr->state >= BB_STATE_STAGED_OUT) &&
				    !find_job_record(bb_ptr->job_id)) {
					*bb_pptr = bb_ptr->next;
					xfree(bb_ptr);
					break;
				}
				bb_pptr = &bb_ptr->next;
				bb_ptr = bb_ptr->next;
			}
		}
	}
}
static void _requeue_when_finished(uint32_t job_id)
{
	/* Locks: write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;

	while (1) {
		lock_slurmctld(job_write_lock);
		job_ptr = find_job_record(job_id);
		if (IS_JOB_FINISHED(job_ptr)) {
			job_ptr->job_state = JOB_PENDING;
			job_ptr->details->submit_time = time(NULL);
			job_ptr->restart_cnt++;
			/* Since the job completion logger removes the
			 * submit we need to add it again. */
			acct_policy_add_job_submit(job_ptr);
			unlock_slurmctld(job_write_lock);
			break;
		} else {
			unlock_slurmctld(job_write_lock);
			sleep(1);
		}
	}
}
/*
 * srun_node_fail - notify srun of a node's failure
 * IN job_id - id of job to notify
 * IN node_name - name of failed node
 */
extern void srun_node_fail(uint32_t job_id, char *node_name)
{
#ifndef HAVE_FRONT_END
	struct node_record *node_ptr;
#endif
	struct job_record *job_ptr = find_job_record(job_id);
	int bit_position = -1;
	slurm_addr_t *addr;
	srun_node_fail_msg_t *msg_arg;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	xassert(job_ptr);
	xassert(node_name);
	if (!job_ptr || !IS_JOB_RUNNING(job_ptr))
		return;

#ifdef HAVE_FRONT_END
	/* Purge all jobs steps in front end mode */
#else
	if (!node_name || (node_ptr = find_node_record(node_name)) == NULL)
		return;
	bit_position = node_ptr - node_record_table_ptr;
#endif

	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator))) {
		if ((bit_position >= 0) &&
		    (!bit_test(step_ptr->step_node_bitmap, bit_position)))
			continue;	/* job step not on this node */
		if ((step_ptr->port == 0) || (step_ptr->host == NULL) ||
		    (step_ptr->batch_step) || (step_ptr->host[0] == '\0'))
			continue;
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_node_fail_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->step_id = step_ptr->step_id;
		msg_arg->nodelist = xstrdup(node_name);
		_srun_agent_launch(addr, step_ptr->host, SRUN_NODE_FAIL,
				   msg_arg);
	}
	list_iterator_destroy(step_iterator);

	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_node_fail_msg_t));
		msg_arg->job_id = job_id;
		msg_arg->step_id = NO_VAL;
		msg_arg->nodelist = xstrdup(node_name);
		_srun_agent_launch(addr, job_ptr->alloc_node, SRUN_NODE_FAIL,
				   msg_arg);
	}
}
/*
 * srun_response - note that srun has responded
 * IN job_id - id of job responding
 * IN step_id - id of step responding or NO_VAL if not a step
 */
extern void srun_response(uint32_t job_id, uint32_t step_id)
{
	struct job_record *job_ptr = find_job_record(job_id);
	time_t now = time(NULL);

	if (job_ptr == NULL)
		return;
	job_ptr->time_last_active = now;
}
/*
 * srun_allocate - notify srun of a resource allocation
 * IN job_ptr - job allocated resources
 */
extern void srun_allocate(struct job_record *job_ptr)
{
	struct job_record *pack_job, *pack_leader;
	resource_allocation_response_msg_t *msg_arg = NULL;
	slurm_addr_t *addr;
	ListIterator iter;
	List job_resp_list = NULL;

	xassert(job_ptr);
	if (!job_ptr || !job_ptr->alloc_resp_port || !job_ptr->alloc_node ||
	    !job_ptr->resp_host || !job_ptr->job_resrcs ||
	    !job_ptr->job_resrcs->cpu_array_cnt)
		return;

	if (job_ptr->pack_job_id == 0) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->alloc_resp_port,
			       job_ptr->resp_host);
		msg_arg = build_alloc_msg(job_ptr, SLURM_SUCCESS, NULL);
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   RESPONSE_RESOURCE_ALLOCATION, msg_arg,
				   job_ptr->start_protocol_ver);
	} else if (_pending_pack_jobs(job_ptr)) {
		return;
	} else if ((pack_leader = find_job_record(job_ptr->pack_job_id))) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, pack_leader->alloc_resp_port,
			       pack_leader->resp_host);
		job_resp_list = list_create(_free_srun_alloc);
		iter = list_iterator_create(pack_leader->pack_job_list);
		while ((pack_job = (struct job_record *) list_next(iter))) {
			if (pack_leader->pack_job_id !=
			    pack_job->pack_job_id) {
				error("%s: Bad pack_job_list for %pJ",
				      __func__, pack_leader);
				continue;
			}
			msg_arg = build_alloc_msg(pack_job, SLURM_SUCCESS,
						  NULL);
			list_append(job_resp_list, msg_arg);
			msg_arg = NULL;
		}
		list_iterator_destroy(iter);
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   RESPONSE_JOB_PACK_ALLOCATION,
				   job_resp_list,
				   job_ptr->start_protocol_ver);
	} else {
		error("%s: Can not find pack job leader %pJ",
		      __func__, job_ptr);
	}
}
/* RET 0 on success, -1 on failure */
extern int job_requeue_wiki(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr, *tmp_char;
	uint32_t jobid;
	struct job_record *job_ptr;
	static char reply_msg[128];
	int slurm_rc;
	/* Write lock on job and node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "REQUEUEJOB lacks ARG";
		error("wiki: REQUEUEJOB lacks ARG");
		return -1;
	}
	jobid = strtoul(arg_ptr + 4, &tmp_char, 10);
	if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: REQUEUEJOB has invalid jobid");
		return -1;
	}

	lock_slurmctld(job_write_lock);
	slurm_rc = job_requeue(0, jobid, NULL, false, 0);
	if (slurm_rc != SLURM_SUCCESS) {
		unlock_slurmctld(job_write_lock);
		*err_code = -700;
		*err_msg = slurm_strerror(slurm_rc);
		error("wiki: Failed to requeue job %u (%m)", jobid);
		return -1;
	}

	/* We need to clear the required node list here.
	 * If the job was submitted with srun and a
	 * required node list, it gets lost here. */
	job_ptr = find_job_record(jobid);
	if (job_ptr && job_ptr->details) {
		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
	}

	info("wiki: requeued job %u", jobid);
	unlock_slurmctld(job_write_lock);
	snprintf(reply_msg, sizeof(reply_msg),
		 "job %u requeued successfully", jobid);
	*err_msg = reply_msg;
	return 0;
}
static int _job_notify(uint32_t jobid, char *msg_ptr)
{
	struct job_record *job_ptr;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: NOTIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		error("wiki: NOTIFYJOB jobid %u is finished", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	srun_user_message(job_ptr, msg_ptr);
	return SLURM_SUCCESS;
}
static int _job_signal(uint32_t jobid, uint16_t sig_num)
{
	struct job_record *job_ptr;
	int rc = SLURM_SUCCESS;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL)
		return ESLURM_INVALID_JOB_ID;
	if (IS_JOB_FINISHED(job_ptr))
		return ESLURM_ALREADY_DONE;

	/* Signal the batch script first, then the job's steps */
	if (job_ptr->batch_flag)
		rc = job_signal(jobid, sig_num, 1, 0, false);
	if (rc == SLURM_SUCCESS)
		rc = job_signal(jobid, sig_num, 0, 0, false);
	return rc;
}
/*
 * srun_allocate - notify srun of a resource allocation
 * IN job_id - id of the job allocated resources
 */
extern void srun_allocate(uint32_t job_id)
{
	struct job_record *job_ptr = find_job_record(job_id);

	xassert(job_ptr);
	if (job_ptr && job_ptr->alloc_resp_port && job_ptr->alloc_node &&
	    job_ptr->resp_host && job_ptr->job_resrcs &&
	    job_ptr->job_resrcs->cpu_array_cnt) {
		slurm_addr_t *addr;
		resource_allocation_response_msg_t *msg_arg;
		job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;

		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->alloc_resp_port,
			       job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(resource_allocation_response_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->node_list = xstrdup(job_ptr->nodes);
		msg_arg->alias_list = xstrdup(job_ptr->alias_list);
		msg_arg->num_cpu_groups = job_resrcs_ptr->cpu_array_cnt;
		msg_arg->cpus_per_node = xmalloc(sizeof(uint16_t) *
						 job_resrcs_ptr->cpu_array_cnt);
		if (job_ptr->details) {
			msg_arg->pn_min_memory =
				job_ptr->details->pn_min_memory;
		}
		memcpy(msg_arg->cpus_per_node,
		       job_resrcs_ptr->cpu_array_value,
		       (sizeof(uint16_t) * job_resrcs_ptr->cpu_array_cnt));
		msg_arg->cpu_count_reps =
			xmalloc(sizeof(uint32_t) *
				job_resrcs_ptr->cpu_array_cnt);
		memcpy(msg_arg->cpu_count_reps,
		       job_resrcs_ptr->cpu_array_reps,
		       (sizeof(uint32_t) * job_resrcs_ptr->cpu_array_cnt));
		msg_arg->node_cnt = job_ptr->node_cnt;
		msg_arg->select_jobinfo = select_g_select_jobinfo_copy(
			job_ptr->select_jobinfo);
		msg_arg->error_code = SLURM_SUCCESS;
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   RESPONSE_RESOURCE_ALLOCATION, msg_arg);
	}
}
static bool _pending_pack_jobs(struct job_record *job_ptr)
{
	struct job_record *pack_leader, *pack_job;
	ListIterator iter;
	bool pending_job = false;

	if (job_ptr->pack_job_id == 0)
		return false;

	pack_leader = find_job_record(job_ptr->pack_job_id);
	if (!pack_leader) {
		error("Job pack leader %pJ not found", job_ptr);
		return false;
	}
	if (!pack_leader->pack_job_list) {
		error("Job pack leader %pJ lacks pack_job_list", job_ptr);
		return false;
	}

	iter = list_iterator_create(pack_leader->pack_job_list);
	while ((pack_job = (struct job_record *) list_next(iter))) {
		if (pack_leader->pack_job_id != pack_job->pack_job_id) {
			error("%s: Bad pack_job_list for %pJ",
			      __func__, pack_leader);
			continue;
		}
		if (IS_JOB_PENDING(pack_job)) {
			pending_job = true;
			break;
		}
	}
	list_iterator_destroy(iter);

	return pending_job;
}
static char *_will_run_test2(uint32_t jobid, time_t start_time,
			     char *node_list,
			     uint32_t *preemptee, int preemptee_cnt,
			     int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL, *pre_ptr;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t start_res;
	uint32_t min_nodes, max_nodes, req_nodes;
	List preemptee_candidates = NULL, preempted_jobs = NULL;
	time_t orig_start_time;
	char *reply_msg = NULL;
	int i, rc;
	bool resv_overlap = false;

	xassert(node_list);
	debug2("wiki2: will_run2 job_id=%u start_time=%u node_list=%s",
	       jobid, (uint32_t) start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node list for job %u, %s",
		      jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap, &resv_overlap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
		      part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000);	/* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partition's max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (preemptee_cnt) {
		preemptee_candidates = list_create(NULL);
		for (i = 0; i < preemptee_cnt; i++) {
			if ((pre_ptr = find_job_record(preemptee[i])))
				list_append(preemptee_candidates, pre_ptr);
		}
	}

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes,
			       req_nodes, SELECT_MODE_WILL_RUN,
			       preemptee_candidates, &preempted_jobs,
			       exc_core_bitmap);
	FREE_NULL_LIST(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char *hostlist, *sep, tmp_str[128];
		uint32_t pre_cnt = 0, proc_cnt = 0;

#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &proc_cnt);
#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str),
			 "STARTINFO=%u TASKS=%u STARTTIME=%u NODES=",
			 job_ptr->job_id, proc_cnt,
			 (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);
		if (preempted_jobs) {
			while ((pre_ptr = list_pop(preempted_jobs))) {
				if (pre_cnt++)
					sep = ",";
				else
					sep = " PREEMPT=";
				snprintf(tmp_str, sizeof(tmp_str), "%s%u",
					 sep, pre_ptr->job_id);
				xstrcat(reply_msg, tmp_str);
			}
			FREE_NULL_LIST(preempted_jobs);
		}
	} else {
		xstrcat(reply_msg, "Jobs not runnable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	return reply_msg;
}
static int _job_modify(uint32_t jobid, char *bank_ptr,
		       char *depend_ptr, char *new_hostlist,
		       uint32_t new_node_cnt, char *part_name_ptr,
		       uint32_t new_time_limit, char *name_ptr,
		       char *start_ptr, char *feature_ptr, char *env_ptr,
		       char *comment_ptr, char *gres_ptr, char *wckey_ptr)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) {
		info("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (comment_ptr) {
		info("wiki: change job %u comment %s", jobid, comment_ptr);
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
		last_job_update = now;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
			     jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
			      jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (env_ptr) {
		bool have_equal = false;
		char old_sep[1];
		int begin = 0, i;

		if (job_ptr->batch_flag == 0) {
			error("wiki: attempt to set environment variables for non-batch job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
		for (i = 0; ; i++) {
			if (env_ptr[i] == '=') {
				if (have_equal) {
					error("wiki: setting job %u invalid environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				have_equal = true;
				if (env_ptr[i+1] == '\"') {
					for (i += 2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u invalid environment variables: %s",
							      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\"') {
							i++;
							break;
						}
						if (env_ptr[i] == '\\')
							i++;
					}
				} else if (env_ptr[i+1] == '\'') {
					for (i += 2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u invalid environment variables: %s",
							      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\'') {
							i++;
							break;
						}
						if (env_ptr[i] == '\\')
							i++;
					}
				}
			}
			if ((env_ptr[i] == '\0') || isspace(env_ptr[i]) ||
			    (env_ptr[i] == ',')) {
				if (!have_equal) {
					error("wiki: setting job %u invalid environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				old_sep[0] = env_ptr[i];
				env_ptr[i] = '\0';
				xrealloc(job_ptr->details->env_sup,
					 sizeof(char *) *
					 (job_ptr->details->env_cnt + 1));
				job_ptr->details->env_sup
					[job_ptr->details->env_cnt++] =
					xstrdup(&env_ptr[begin]);
				info("wiki: for job %u add env: %s",
				     jobid, &env_ptr[begin]);
				env_ptr[i] = old_sep[0];
				/* end of string or whitespace ends the list */
				if ((old_sep[0] == '\0') ||
				    isspace(old_sep[0]))
					break;
				begin = i + 1;
				have_equal = false;
			}
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
		     jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				    ((job_ptr->time_limit - old_time) * 60);
		last_job_update = now;
	}

	if (bank_ptr &&
	    (update_job_account("wiki", job_ptr, bank_ptr) !=
	     SLURM_SUCCESS)) {
		return EINVAL;
	}

	if (feature_ptr) {
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u features to %s",
			     jobid, feature_ptr);
			xfree(job_ptr->details->features);
			job_ptr->details->features = xstrdup(feature_ptr);
			last_job_update = now;
		} else {
			error("wiki: MODIFYJOB features of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (start_ptr) {
		char *end_ptr;
		uint32_t begin_time = strtol(start_ptr, &end_ptr, 10);
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u begin time to %u",
			     jobid, begin_time);
			job_ptr->details->begin_time = begin_time;
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB begin_time of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (name_ptr) {
		if (IS_JOB_PENDING(job_ptr)) {
			info("wiki: change job %u name %s", jobid, name_ptr);
			xfree(job_ptr->name);
			job_ptr->name = xstrdup(name_ptr);
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB name of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB hostlist of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
				     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
			     jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
			     jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
			      part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		info("wiki: change job %u partition %s",
		     jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = now;
		update_accounting = true;
	}

	if (new_node_cnt) {
		job_desc_msg_t job_desc;
#ifdef HAVE_BG
		uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
		static uint16_t cpus_per_node = 0;
		if (!cpus_per_node) {
			select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
						&cpus_per_node);
		}
#endif
		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			error("wiki: MODIFYJOB node count of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
		memset(&job_desc, 0, sizeof(job_desc_msg_t));
		job_desc.min_nodes = new_node_cnt;
		job_desc.max_nodes = NO_VAL;
		job_desc.select_jobinfo = select_g_select_jobinfo_alloc();

		select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc);

		select_g_select_jobinfo_free(job_desc.select_jobinfo);

		job_ptr->details->min_nodes = job_desc.min_nodes;
		if (job_ptr->details->max_nodes &&
		    (job_ptr->details->max_nodes < job_desc.min_nodes))
			job_ptr->details->max_nodes = job_desc.min_nodes;
		info("wiki: change job %u min_nodes to %u",
		     jobid, new_node_cnt);
#ifdef HAVE_BG
		job_ptr->details->min_cpus = job_desc.min_cpus;
		job_ptr->details->max_cpus = job_desc.max_cpus;
		job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus;

		new_node_cnt = job_ptr->details->min_cpus;
		if (cpus_per_node)
			new_node_cnt /= cpus_per_node;

		/* This is only set up so accounting is set up correctly */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &new_node_cnt);
		/* reset geo since changing this makes any geo
		 * potentially invalid */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_GEOMETRY,
					    geometry);
#endif
		last_job_update = now;
		update_accounting = true;
	}

	if (gres_ptr) {
		char *orig_gres;

		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB GRES of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
		orig_gres = job_ptr->gres;
		job_ptr->gres = NULL;
		if (gres_ptr[0])
			job_ptr->gres = xstrdup(gres_ptr);
		if (gres_plugin_job_state_validate(job_ptr->gres,
						   &job_ptr->gres_list)) {
			error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr);
			xfree(job_ptr->gres);
			job_ptr->gres = orig_gres;
			return ESLURM_INVALID_GRES;
		}
		xfree(orig_gres);
	}

	if (wckey_ptr) {
		int rc = update_job_wckey("update_job", job_ptr, wckey_ptr);
		if (rc != SLURM_SUCCESS) {
			error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr);
			return rc;
		}
	}

	if (update_accounting) {
		if (job_ptr->details && job_ptr->details->begin_time) {
			/* Update job record in accounting to reflect
			 * the changes */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
	}

	return SLURM_SUCCESS;
}
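/*
 * Illustration (not from the Slurm sources; names and sample input are
 * invented): a standalone sketch of the environment-string grammar the
 * MODIFYJOB handler above accepts. Entries are NAME=VALUE pairs separated
 * by commas, with whitespace or end of string terminating the list; a
 * VALUE may be wrapped in single or double quotes to protect separators,
 * and a backslash escapes the next character inside quotes.
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

static void print_env_entries(char *env_ptr)
{
	bool have_equal = false;
	char old_sep;
	int begin = 0, i;

	for (i = 0; ; i++) {
		if (env_ptr[i] == '=') {
			have_equal = true;
			if ((env_ptr[i+1] == '\"') || (env_ptr[i+1] == '\'')) {
				char quote = env_ptr[i+1];
				for (i += 2; env_ptr[i] != quote; i++) {
					if (env_ptr[i] == '\0')
						return;	/* unterminated quote */
					if ((env_ptr[i] == '\\') &&
					    env_ptr[i+1])
						i++;	/* skip escaped char */
				}
				i++;	/* past closing quote */
			}
		}
		if ((env_ptr[i] == '\0') || isspace((int) env_ptr[i]) ||
		    (env_ptr[i] == ',')) {
			if (!have_equal)
				return;	/* malformed entry */
			old_sep = env_ptr[i];
			env_ptr[i] = '\0';
			printf("env entry: %s\n", &env_ptr[begin]);
			env_ptr[i] = old_sep;
			if ((old_sep == '\0') || isspace((int) old_sep))
				break;	/* end of list */
			begin = i + 1;
			have_equal = false;
		}
	}
}

int main(void)
{
	char buf[] = "PATH=/tmp,MSG=\"a, b\",X=1";
	print_env_entries(buf);	/* prints three entries */
	return 0;
}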
/*
 * The remainder of this file implements the standard SLURM checkpoint API.
 */
extern int slurm_ckpt_op(uint32_t job_id, uint32_t step_id,
			 struct step_record *step_ptr, uint16_t op,
			 uint16_t data, char *image_dir, time_t *event_time,
			 uint32_t *error_code, char **error_msg)
{
	int rc = SLURM_SUCCESS;
	struct check_job_info *check_ptr;
	uint16_t done_sig = 0;
	struct job_record *job_ptr;
	struct node_record *node_ptr;
	pthread_attr_t attr;
	pthread_t ckpt_agent_tid = 0;
	char *nodelist;
	struct ckpt_req *req_ptr;

	/* job/step checked already */
	job_ptr = find_job_record(job_id);
	if (!job_ptr)
		return ESLURM_INVALID_JOB_ID;
	if (step_id == SLURM_BATCH_SCRIPT) {
		check_ptr = (struct check_job_info *) job_ptr->check_job;
		node_ptr = find_first_node_record(job_ptr->node_bitmap);
		nodelist = node_ptr->name;
	} else {
		step_ptr = find_step_record(job_ptr, step_id);
		if (!step_ptr)
			return ESLURM_INVALID_JOB_ID;
		check_ptr = (struct check_job_info *) step_ptr->check_job;
		nodelist = step_ptr->step_layout->node_list;
	}
	xassert(check_ptr);

	switch (op) {
	case CHECK_ABLE:
		if (check_ptr->disabled)
			rc = ESLURM_DISABLED;
		else {
			*event_time = check_ptr->time_stamp;
			rc = SLURM_SUCCESS;
		}
		break;
	case CHECK_DISABLE:
		check_ptr->disabled++;
		break;
	case CHECK_ENABLE:
		check_ptr->disabled--;
		break;
	case CHECK_REQUEUE:
		if (step_id != SLURM_BATCH_SCRIPT) {
			rc = ESLURM_NOT_SUPPORTED;
			break;
		}
		/* no break */
	case CHECK_VACATE:
		done_sig = SIGTERM;
		/* no break */
	case CHECK_CREATE:
		if (check_ptr->disabled) {
			rc = ESLURM_DISABLED;
			break;
		}
		if (check_ptr->time_stamp != 0) {
			rc = EALREADY;
			break;
		}
		check_ptr->time_stamp = time(NULL);
		check_ptr->error_code = 0;
		xfree(check_ptr->error_msg);

		req_ptr = xmalloc(sizeof(struct ckpt_req));
		if (!req_ptr) {
			rc = ENOMEM;
			break;
		}
		req_ptr->gid = job_ptr->group_id;
		req_ptr->uid = job_ptr->user_id;
		req_ptr->job_id = job_id;
		req_ptr->step_id = step_id;
		req_ptr->begin_time = check_ptr->time_stamp;
		req_ptr->wait = data;
		req_ptr->image_dir = xstrdup(image_dir);
		req_ptr->nodelist = xstrdup(nodelist);
		req_ptr->sig_done = done_sig;
		req_ptr->op = op;

		slurm_attr_init(&attr);
		if (pthread_attr_setdetachstate(&attr,
						PTHREAD_CREATE_DETACHED)) {
			error("pthread_attr_setdetachstate: %m");
			rc = errno;
			break;
		}
		if (pthread_create(&ckpt_agent_tid, &attr, _ckpt_agent_thr,
				   req_ptr)) {
			error("pthread_create: %m");
			rc = errno;
			break;
		}
		slurm_attr_destroy(&attr);
		break;

	case CHECK_RESTART:
		if (step_id != SLURM_BATCH_SCRIPT) {
			rc = ESLURM_NOT_SUPPORTED;
			break;
		}
		/* create a batch job from saved desc */
		rc = ESLURM_NOT_SUPPORTED;	/* TODO: save job script */
		break;

	case CHECK_ERROR:
		xassert(error_code);
		xassert(error_msg);
		*error_code = check_ptr->error_code;
		xfree(*error_msg);
		*error_msg = xstrdup(check_ptr->error_msg);
		break;

	default:
		error("Invalid checkpoint operation: %d", op);
		rc = EINVAL;
	}

	return rc;
}
static void _preempt_job_dequeue(void)
{
	struct job_record *job_ptr;
	uint32_t job_id, *tmp_id;
	uint16_t preempt_mode;

	xassert(preempt_job_list);
	while ((tmp_id = list_pop(preempt_job_list))) {
		int rc = SLURM_ERROR;
		job_id = *tmp_id;
		xfree(tmp_id);

		if ((job_ptr = find_job_record(job_id)) == NULL) {
			error("_preempt_job_dequeue could not find job %u",
			      job_id);
			continue;
		}
		preempt_mode = slurm_job_preempt_mode(job_ptr);

		if (preempt_mode == PREEMPT_MODE_SUSPEND) {
			if ((rc = _suspend_job(job_id)) == ESLURM_DISABLED)
				rc = SLURM_SUCCESS;
		} else if (preempt_mode == PREEMPT_MODE_CANCEL) {
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been killed",
				     job_ptr->job_id);
			}
		} else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) {
			checkpoint_msg_t ckpt_msg;
			memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
			ckpt_msg.op = CHECK_REQUEUE;
			ckpt_msg.job_id = job_ptr->job_id;
			rc = job_checkpoint(&ckpt_msg, 0, -1,
					    (uint16_t) NO_VAL);
			if (rc == ESLURM_NOT_SUPPORTED) {
				memset(&ckpt_msg, 0,
				       sizeof(checkpoint_msg_t));
				ckpt_msg.op = CHECK_VACATE;
				ckpt_msg.job_id = job_ptr->job_id;
				rc = job_checkpoint(&ckpt_msg, 0, -1,
						    (uint16_t) NO_VAL);
			}
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been checkpointed",
				     job_ptr->job_id);
			} else {
				error("preempted job %u could not be checkpointed: %s",
				      job_ptr->job_id, slurm_strerror(rc));
			}
		} else if ((preempt_mode == PREEMPT_MODE_REQUEUE) &&
			   job_ptr->batch_flag && job_ptr->details &&
			   (job_ptr->details->requeue > 0)) {
			rc = job_requeue(0, job_ptr->job_id, -1,
					 (uint16_t) NO_VAL, true, 0);
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been requeued",
				     job_ptr->job_id);
			} else {
				error("preempted job %u could not be requeued: %s",
				      job_ptr->job_id, slurm_strerror(rc));
			}
		}

		if (rc != SLURM_SUCCESS) {
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS)
				info("preempted job %u had to be killed",
				     job_ptr->job_id);
			else {
				info("preempted job %u kill failure %s",
				     job_ptr->job_id, slurm_strerror(rc));
			}
		}
	}
	return;
}
/*
 * get_jobs - get information on specific job(s) changed since some time
 * cmd_ptr IN   - CMD=GETJOBS ARG=[<UPDATETIME>:<JOBID>[:<JOBID>]...]
 *                               [<UPDATETIME>:ALL]
 * err_code OUT - 0 or an error code
 * err_msg OUT  - response message
 * NOTE: xfree() err_msg if err_code is zero
 * RET 0 on success, -1 on failure
 *
 * Response format
 * ARG=<cnt>#<JOBID>;
 *	STATE=<state>;			Moab equivalent job state
 *	[EXITCODE=<number>;]		Job exit code, if completed
 *	[RFEATURES=<features>;]		required features, if any,
 *					NOTE: OR operator not supported
 *	[HOSTLIST=<node1:node2>;]	list of required nodes, if any
 *	[EXCLUDE_HOSTLIST=<node1:node2>;] list of excluded nodes, if any
 *	[STARTDATE=<uts>;]		earliest start time, if any
 *	[MAXNODES=<nodes>;]		maximum number of nodes, 0 if no limit
 *	[TASKLIST=<node1:node2>;]	nodes in use, if running or completing
 *	[REJMESSAGE=<str>;]		reason job is not running, if any
 *	[IWD=<directory>;]		Initial Working Directory
 *	[FLAGS=INTERACTIVE;]		set if interactive (not batch) job
 *	[GRES=<name>[:<count>[*cpus]],...;] generic resources required by
 *					the job on a per node basis
 *	[WCKEY=<key>;]			workload characterization key for job
 *	UPDATETIME=<uts>;		time last active
 *	WCLIMIT=<secs>;			wall clock time limit, seconds
 *	TASKS=<cpus>;			CPUs required
 *	NODES=<nodes>;			count of nodes required or allocated
 *	DPROCS=<cpus_per_task>;		count of CPUs required per task
 *	QUEUETIME=<uts>;		submission time
 *	STARTTIME=<uts>;		time execution started
 *	RCLASS=<partition>;		SLURM partition name
 *	RMEM=<MB>;			MB of memory required
 *	RDISK=<MB>;			MB of disk space required
 *	[COMMENT=<whatever>;]		job dependency or account number
 *	[COMPLETETIME=<uts>;]		termination time
 *	[SUSPENDTIME=<secs>;]		seconds that job has been suspended
 *	UNAME=<user_name>;		user name
 *	GNAME=<group_name>;		group name
 *	NAME=<job_name>;		job name
 * [#<JOBID>;...];			additional jobs, if any
 */
extern int get_jobs(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr = NULL, *tmp_char = NULL, *tmp_buf = NULL, *buf = NULL;
	time_t update_time;
	/* Locks: read job, partition */
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
	int job_rec_cnt = 0, buf_size = 0;

	if (cr_test == 0) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
					      &cr_enabled);
		cr_test = 1;
	}

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "GETJOBS lacks ARG";
		error("wiki: GETJOBS lacks ARG");
		return -1;
	}
	update_time = (time_t) strtoul(arg_ptr + 4, &tmp_char, 10);
	if (tmp_char[0] != ':') {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: GETJOBS has invalid ARG value");
		return -1;
	}
	if (job_list == NULL) {
		*err_code = -140;
		*err_msg = "Still performing initialization";
		error("wiki: job_list not yet initialized");
		return -1;
	}
	tmp_char++;

	lock_slurmctld(job_read_lock);
	if (xstrncmp(tmp_char, "ALL", 3) == 0) {
		/* report all jobs */
		buf = _dump_all_jobs(&job_rec_cnt, update_time);
	} else {
		struct job_record *job_ptr = NULL;
		char *job_name = NULL, *tmp2_char = NULL;
		uint32_t job_id;

		job_name = strtok_r(tmp_char, ":", &tmp2_char);
		while (job_name) {
			job_id = (uint32_t) strtoul(job_name, NULL, 10);
			job_ptr = find_job_record(job_id);
			tmp_buf = _dump_job(job_ptr, update_time);
			if (job_rec_cnt > 0)
				xstrcat(buf, "#");
			xstrcat(buf, tmp_buf);
			xfree(tmp_buf);
			job_rec_cnt++;
			job_name = strtok_r(NULL, ":", &tmp2_char);
		}
	}
	unlock_slurmctld(job_read_lock);

	/* Prepend ("ARG=%d", job_rec_cnt) to reply message */
	if (buf)
		buf_size = strlen(buf);
	tmp_buf = xmalloc(buf_size + 32);
	if (job_rec_cnt)
		sprintf(tmp_buf, "SC=0 ARG=%d#%s", job_rec_cnt, buf);
	else
		sprintf(tmp_buf, "SC=0 ARG=0#");
	xfree(buf);
	*err_code = 0;
	*err_msg = tmp_buf;
	return 0;
}
static int _job_modify(uint32_t jobid, char *bank_ptr,
		       char *depend_ptr, char *new_hostlist,
		       uint32_t new_node_cnt, char *part_name_ptr,
		       uint32_t new_time_limit)
{
	struct job_record *job_ptr;
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		error("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
			     jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
			      jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
		     jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				    ((job_ptr->time_limit - old_time) * 60);
		last_job_update = time(NULL);
	}

	if (bank_ptr) {
		if (update_job_account("wiki", job_ptr, bank_ptr) !=
		    SLURM_SUCCESS)
			return EINVAL;
		else
			update_accounting = true;
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB tasklist of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
				     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
			     jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
			     jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
			      part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		info("wiki: change job %u partition %s",
		     jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = time(NULL);
		update_accounting = true;
	}

	if (new_node_cnt) {
		if (IS_JOB_PENDING(job_ptr) && job_ptr->details) {
			job_ptr->details->min_nodes = new_node_cnt;
			if (job_ptr->details->max_nodes &&
			    (job_ptr->details->max_nodes < new_node_cnt))
				job_ptr->details->max_nodes = new_node_cnt;
			info("wiki: change job %u min_nodes to %u",
			     jobid, new_node_cnt);
			last_job_update = time(NULL);
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB node count of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (update_accounting) {
		/* Update job record in accounting to reflect changes */
		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
	}

	return SLURM_SUCCESS;
}
/* rebuild data structures from scratch
 *
 * A reconfigure can affect this plugin in these ways:
 * - partitions can be added or removed
 *   - this affects the gs_part_list
 * - nodes can be removed from a partition, or added to a partition
 *   - this affects the size of the active resmap
 *
 * Here's the plan:
 * 1. save a copy of the global structures, and then construct
 *    new ones.
 * 2. load the new partition structures with existing jobs,
 *    confirming the job exists and resizing their resmaps
 *    (if necessary).
 * 3. make sure all partitions are accounted for. If a partition
 *    was removed, make sure any jobs that were in the queue and
 *    that were suspended are resumed. Conversely, if a partition
 *    was added, check for existing jobs that may be contending
 *    for resources that we could begin timeslicing.
 * 4. delete the old global structures and return.
 */
extern int gs_reconfig(void)
{
	int i;
	ListIterator part_iterator;
	struct gs_part *p_ptr, *newp_ptr;
	List old_part_list;
	struct job_record *job_ptr;
	struct gs_job *j_ptr;

	if (!timeslicer_thread_id) {
		/* gs_init() will be called later from read_slurm_conf()
		 * if we are enabling gang scheduling via reconfiguration */
		return SLURM_SUCCESS;
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: entering gs_reconfig");
	pthread_mutex_lock(&data_mutex);
	old_part_list = gs_part_list;
	gs_part_list = NULL;

	/* reset global data */
	gs_fast_schedule = slurm_get_fast_schedule();
	gr_type = _get_gr_type();
	_load_phys_res_cnt();
	_build_parts();

	/* scan the old part list and add existing jobs to the new list */
	part_iterator = list_iterator_create(old_part_list);
	while ((p_ptr = (struct gs_part *) list_next(part_iterator))) {
		newp_ptr = (struct gs_part *) list_find_first(
						gs_part_list, _find_gs_part,
						p_ptr->part_name);
		if (!newp_ptr) {
			/* this partition was removed, so resume
			 * any jobs suspended by gang and continue */
			for (i = 0; i < p_ptr->num_jobs; i++) {
				j_ptr = p_ptr->job_list[i];
				if ((j_ptr->sig_state == GS_SUSPEND) &&
				    (j_ptr->job_ptr->priority != 0)) {
					info("resuming job in missing part %s",
					     p_ptr->part_name);
					_resume_job(j_ptr->job_id);
					j_ptr->sig_state = GS_RESUME;
				}
			}
			continue;
		}
		if (p_ptr->num_jobs == 0)	/* no jobs to transfer */
			continue;

		/* we need to transfer the jobs from p_ptr to new_ptr and
		 * adjust their resmaps (if necessary). then we need to create
		 * the active resmap and adjust the state of each job (if
		 * necessary). NOTE: there could be jobs that only overlap
		 * on nodes that are no longer in the partition, but we're
		 * not going to worry about those cases.
		 *
		 * add the jobs from p_ptr into new_ptr in their current order
		 * to preserve the state of timeslicing. */
		for (i = 0; i < p_ptr->num_jobs; i++) {
			job_ptr = find_job_record(p_ptr->job_list[i]->job_id);
			if (job_ptr == NULL) {
				/* job no longer exists in SLURM,
				 * so drop it */
				continue;
			}
			/* resume any job that is suspended by us */
			if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) {
				if (slurmctld_conf.debug_flags &
				    DEBUG_FLAG_GANG) {
					info("resuming job %u apparently suspended by gang",
					     job_ptr->job_id);
				}
				_resume_job(job_ptr->job_id);
			}

			/* transfer the job as long as it is still active */
			if (IS_JOB_SUSPENDED(job_ptr) ||
			    IS_JOB_RUNNING(job_ptr)) {
				_add_job_to_part(newp_ptr, job_ptr);
			}
		}
	}
	list_iterator_destroy(part_iterator);

	/* confirm all jobs. Scan the master job_list and confirm that we
	 * are tracking all jobs */
	_scan_slurm_job_list();
	FREE_NULL_LIST(old_part_list);
	pthread_mutex_unlock(&data_mutex);

	_preempt_job_dequeue();	/* MUST BE OUTSIDE OF data_mutex lock */

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: leaving gs_reconfig");

	return SLURM_SUCCESS;
}
static int _parse_job_info(void **dest, slurm_parser_enum_t type,
			   const char *key, const char *value,
			   const char *line, char **leftover)
{
	s_p_hashtbl_t *job_tbl;
	char *name = NULL, *tmp = NULL, tmp_name[64];
	uint32_t job_id = 0, size = 0, user_id = 0;
	uint16_t state = 0;
	bb_alloc_t *bb_ptr;
	struct job_record *job_ptr = NULL;
	static s_p_options_t _job_options[] = {
		{"JobID", S_P_STRING},
		{"Name", S_P_STRING},
		{"Size", S_P_STRING},
		{"State", S_P_STRING},
		{NULL}
	};

	*dest = NULL;
	user_id = atoi(value);
	job_tbl = s_p_hashtbl_create(_job_options);
	s_p_parse_line(job_tbl, *leftover, leftover);
	if (s_p_get_string(&tmp, "JobID", job_tbl))
		job_id = atoi(tmp);
	s_p_get_string(&name, "Name", job_tbl);
	if (s_p_get_string(&tmp, "Size", job_tbl))
		size = _get_size_num(tmp);
	if (s_p_get_string(&tmp, "State", job_tbl))
		state = bb_state_num(tmp);
#if 1
	info("%s: JobID:%u Name:%s Size:%u State:%u UserID:%u",
	     __func__, job_id, name, size, state, user_id);
#endif
	if (job_id) {
		job_ptr = find_job_record(job_id);
		if (!job_ptr) {
			error("%s: Vestigial buffer for job ID %u. Clear manually",
			      plugin_type, job_id);
		}
		snprintf(tmp_name, sizeof(tmp_name), "VestigialJob%u",
			 job_id);
		job_id = 0;
		name = tmp_name;
	}
	if (job_ptr) {
		if ((bb_ptr = _find_bb_job_rec(job_ptr)) == NULL) {
			bb_ptr = _alloc_bb_job_rec(job_ptr);
			bb_ptr->state = state;
		}
	} else {
		if ((bb_ptr = _find_bb_name_rec(name, user_id)) == NULL) {
			bb_ptr = _alloc_bb_name_rec(name, user_id);
			bb_ptr->size = size;
			bb_ptr->state = state;
			return SLURM_SUCCESS;
		}
	}

	if (bb_ptr->user_id != user_id) {
		error("%s: User ID mismatch (%u != %u). BB UserID=%u JobID=%u Name=%s",
		      plugin_type, bb_ptr->user_id, user_id,
		      bb_ptr->user_id, bb_ptr->job_id, bb_ptr->name);
	}
	if (bb_ptr->size != size) {
		error("%s: Size mismatch (%u != %u). BB UserID=%u JobID=%u Name=%s",
		      plugin_type, bb_ptr->size, size,
		      bb_ptr->user_id, bb_ptr->job_id, bb_ptr->name);
		bb_ptr->size = MAX(bb_ptr->size, size);
	}
	if (bb_ptr->state != state) {
		/* State is subject to real-time changes */
		debug("%s: State mismatch (%s != %s). BB UserID=%u JobID=%u Name=%s",
		      plugin_type, bb_state_string(bb_ptr->state),
		      bb_state_string(state),
		      bb_ptr->user_id, bb_ptr->job_id, bb_ptr->name);
	}
	return SLURM_SUCCESS;
}
/* Checkpoint processing pthread
 * Never returns, but is cancelled on plugin termination */
static void *_ckpt_agent_thr(void *arg)
{
	struct ckpt_req *req = (struct ckpt_req *) arg;
	int rc;
	/* Locks: write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	struct check_job_info *check_ptr;

	/* only perform ckpt operation of ONE JOB */
	slurm_mutex_lock(&ckpt_agent_mutex);
	while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) {
		pthread_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex);
	}
	ckpt_agent_jobid = req->job_id;
	ckpt_agent_count++;
	slurm_mutex_unlock(&ckpt_agent_mutex);

	debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u",
	       req->op, req->job_id, req->step_id);

	rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time,
			      req->image_dir, req->wait, req->nodelist);
	if (rc != SLURM_SUCCESS) {
		error("checkpoint/blcr: error on checkpoint request %u to %u.%u: %s",
		      req->op, req->job_id, req->step_id,
		      slurm_strerror(rc));
	}
	if (req->op == CHECK_REQUEUE)
		_requeue_when_finished(req->job_id);

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(req->job_id);
	if (!job_ptr) {
		error("_ckpt_agent_thr: job finished");
		goto out;
	}
	if (req->step_id == SLURM_BATCH_SCRIPT) {	/* batch job */
		check_ptr = (struct check_job_info *) job_ptr->check_job;
	} else {
		step_ptr = find_step_record(job_ptr, req->step_id);
		if (!step_ptr) {
			error("_ckpt_agent_thr: step finished");
			goto out;
		}
		check_ptr = (struct check_job_info *) step_ptr->check_job;
	}
	check_ptr->time_stamp = 0;
	check_ptr->error_code = rc;
	if (check_ptr->error_code != SLURM_SUCCESS)
		check_ptr->error_msg = xstrdup(slurm_strerror(rc));

out:
	unlock_slurmctld(job_write_lock);

	if (req->sig_done) {
		_send_sig(req->job_id, req->step_id, req->sig_done,
			  req->nodelist);
	}

	_on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id,
			  req->image_dir, rc);

	slurm_mutex_lock(&ckpt_agent_mutex);
	ckpt_agent_count--;
	if (ckpt_agent_count == 0) {
		ckpt_agent_jobid = 0;
		pthread_cond_broadcast(&ckpt_agent_cond);
	}
	slurm_mutex_unlock(&ckpt_agent_mutex);
	_ckpt_req_free(req);
	return NULL;
}
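/*
 * Minimal self-contained sketch (plain pthreads, identifiers invented) of
 * the serialization used by _ckpt_agent_thr above: threads checkpointing
 * the same job id may run concurrently, while threads for any other job
 * block until the active job's thread count drains to zero.
 */
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t gate_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gate_cond  = PTHREAD_COND_INITIALIZER;
static uint32_t active_job_id = 0;
static int active_count = 0;

static void ckpt_gate_enter(uint32_t job_id)
{
	pthread_mutex_lock(&gate_mutex);
	while (active_job_id && (active_job_id != job_id))
		pthread_cond_wait(&gate_cond, &gate_mutex);
	active_job_id = job_id;
	active_count++;
	pthread_mutex_unlock(&gate_mutex);
}

static void ckpt_gate_leave(void)
{
	pthread_mutex_lock(&gate_mutex);
	if (--active_count == 0) {
		active_job_id = 0;
		pthread_cond_broadcast(&gate_cond);
	}
	pthread_mutex_unlock(&gate_mutex);
}

int main(void)
{
	ckpt_gate_enter(1234);	/* first caller claims job 1234 */
	ckpt_gate_leave();	/* last one out reopens the gate */
	return 0;
}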
/* Test if a batch launch request should be deferred
 * RET -1: abort the request, pending job cancelled
 *	0: execute the request now
 *	1: defer the request
 */
static int _batch_launch_defer(queued_request_t *queued_req_ptr)
{
	agent_arg_t *agent_arg_ptr;
	batch_job_launch_msg_t *launch_msg_ptr;
	time_t now = time(NULL);
	struct job_record *job_ptr;
	int delay_time, nodes_ready = 0;

	agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
	if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
		return 0;

	if (difftime(now, queued_req_ptr->last_attempt) < 10) {
		/* Reduce overhead by only testing once every 10 secs */
		return 1;
	}

	launch_msg_ptr = (batch_job_launch_msg_t *) agent_arg_ptr->msg_args;
	job_ptr = find_job_record(launch_msg_ptr->job_id);
	if ((job_ptr == NULL) ||
	    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
		info("agent(batch_launch): removed pending request for cancelled job %u",
		     launch_msg_ptr->job_id);
		return -1;	/* job cancelled while waiting */
	}

	if (job_ptr->wait_all_nodes) {
		(void) job_node_ready(launch_msg_ptr->job_id, &nodes_ready);
	} else {
#ifdef HAVE_FRONT_END
		nodes_ready = 1;
#else
		struct node_record *node_ptr;
		char *hostname;

		hostname = hostlist_deranged_string_xmalloc(
			agent_arg_ptr->hostlist);
		node_ptr = find_node_record(hostname);
		if (node_ptr == NULL) {
			error("agent(batch_launch) removed pending request for job %u, missing node %s",
			      launch_msg_ptr->job_id, hostname);
			xfree(hostname);
			return -1;	/* invalid request?? */
		}
		xfree(hostname);
		if (!IS_NODE_POWER_SAVE(node_ptr) &&
		    !IS_NODE_NO_RESPOND(node_ptr)) {
			nodes_ready = 1;
		}
#endif
	}

	delay_time = difftime(now, job_ptr->start_time);
	if (nodes_ready) {
		/* ready to launch, adjust time limit for boot time */
		if (delay_time && (job_ptr->time_limit != INFINITE) &&
		    (!wiki2_sched)) {
			info("Job %u launch delayed by %d secs, updating end_time",
			     launch_msg_ptr->job_id, delay_time);
			job_ptr->end_time += delay_time;
		}
		queued_req_ptr->last_attempt = (time_t) 0;
		return 0;
	}

	if (queued_req_ptr->last_attempt == 0) {
		queued_req_ptr->first_attempt = now;
		queued_req_ptr->last_attempt = now;
	} else if (difftime(now, queued_req_ptr->first_attempt) >=
		   slurm_get_resume_timeout()) {
		error("agent waited too long for nodes to respond, sending batch request anyway...");
		if (delay_time && (job_ptr->time_limit != INFINITE) &&
		    (!wiki2_sched)) {
			info("Job %u launch delayed by %d secs, updating end_time",
			     launch_msg_ptr->job_id, delay_time);
			job_ptr->end_time += delay_time;
		}
		queued_req_ptr->last_attempt = (time_t) 0;
		return 0;
	}

	queued_req_ptr->last_attempt = now;
	return 1;
}
/* Handle timeout of burst buffer events:
 * 1. Purge per-job burst buffer records when the stage-out has completed
 *    and the job has been purged from Slurm
 * 2. Test for StageInTimeout events
 * 3. Test for StageOutTimeout events
 */
static void _timeout_bb_rec(void)
{
	struct job_record *job_ptr;
	bb_alloc_t **bb_pptr, *bb_ptr = NULL;
	uint32_t age;
	time_t now = time(NULL);
	int i;

	for (i = 0; i < BB_HASH_SIZE; i++) {
		bb_pptr = &bb_state.bb_ahash[i];
		bb_ptr = bb_state.bb_ahash[i];
		while (bb_ptr) {
			if (bb_ptr->seen_time < bb_state.last_load_time) {
				if (bb_ptr->job_id == 0) {
					info("%s: Persistent burst buffer %s purged",
					     __func__, bb_ptr->name);
				} else if (bb_state.bb_config.debug_flag) {
					info("%s: burst buffer for job %u purged",
					     __func__, bb_ptr->job_id);
				}
				//FIXME: VESTIGIAL: Use bb_limit_rem
				// bb_remove_user_load(bb_ptr, &bb_state);
				*bb_pptr = bb_ptr->next;
				bb_free_alloc_buf(bb_ptr);
				break;
			}
			if ((bb_ptr->job_id != 0) &&
			    (bb_ptr->state >= BB_STATE_STAGED_OUT) &&
			    !find_job_record(bb_ptr->job_id)) {
				_stop_stage_out(bb_ptr->job_id);
				bb_ptr->cancelled = true;
				bb_ptr->end_time = 0;
				*bb_pptr = bb_ptr->next;
				bb_free_alloc_buf(bb_ptr);
				break;
			}
			age = difftime(now, bb_ptr->state_time);
			if ((bb_ptr->job_id != 0) &&
			    bb_state.bb_config.stop_stage_in &&
			    (bb_ptr->state == BB_STATE_STAGING_IN) &&
			    (bb_state.bb_config.stage_in_timeout != 0) &&
			    (!bb_ptr->cancelled) &&
			    (age >= bb_state.bb_config.stage_in_timeout)) {
				_stop_stage_in(bb_ptr->job_id);
				bb_ptr->cancelled = true;
				bb_ptr->end_time = 0;
				job_ptr = find_job_record(bb_ptr->job_id);
				if (job_ptr) {
					error("%s: StageIn timed out, holding job %u",
					      __func__, bb_ptr->job_id);
					job_ptr->priority = 0;
					job_ptr->direct_set_prio = 1;
					job_ptr->state_reason = WAIT_HELD;
					xfree(job_ptr->state_desc);
					job_ptr->state_desc = xstrdup(
						"Burst buffer stage-in timeout");
					last_job_update = now;
				} else {
					error("%s: StageIn timed out for vestigial job %u",
					      __func__, bb_ptr->job_id);
				}
			}
			if ((bb_ptr->job_id != 0) &&
			    bb_state.bb_config.stop_stage_out &&
			    (bb_ptr->state == BB_STATE_STAGING_OUT) &&
			    (bb_state.bb_config.stage_out_timeout != 0) &&
			    (!bb_ptr->cancelled) &&
			    (age >= bb_state.bb_config.stage_out_timeout)) {
				error("%s: StageOut for job %u timed out",
				      __func__, bb_ptr->job_id);
				_stop_stage_out(bb_ptr->job_id);
				bb_ptr->cancelled = true;
				bb_ptr->end_time = 0;
			}
			bb_pptr = &bb_ptr->next;
			bb_ptr = bb_ptr->next;
		}
	}
}
static char *_will_run_test(uint32_t jobid, time_t start_time,
			    char *node_list, int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	char *hostlist, *reply_msg = NULL;
	uint32_t min_nodes, max_nodes, req_nodes;
	int rc;
	time_t start_res, orig_start_time;
	List preemptee_candidates;

	debug2("wiki2: will_run job_id=%u start_time=%u node_list=%s",
	       jobid, (uint32_t) start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if ((node_list == NULL) || (node_list[0] == '\0')) {
		/* assume all nodes available to job for testing */
		avail_bitmap = bit_copy(avail_node_bitmap);
	} else if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node list for job %u, %s",
		      jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	start_time = MAX(start_time, start_res);
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
		      part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000);	/* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partition's max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes,
			       req_nodes, SELECT_MODE_WILL_RUN,
			       preemptee_candidates, NULL, exc_core_bitmap);
	if (preemptee_candidates)
		list_destroy(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char tmp_str[128];
		uint32_t proc_cnt = 0;

		*err_code = 0;
		xstrcat(reply_msg, "STARTINFO=");
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &proc_cnt);
#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,",
			 jobid, proc_cnt, (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);
	} else {
		xstrcat(reply_msg, "Jobs not runnable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	return reply_msg;
}
/* * Attempt to start a job * jobid (IN) - job id * task_cnt (IN) - total count of tasks to start * hostlist (IN) - SLURM hostlist expression with no repeated hostnames * tasklist (IN/OUT) - comma separated list of hosts with tasks to be started, * list hostname once per task to start * comment_ptr (IN) - new comment field for the job or NULL for no change * err_code (OUT) - Moab error code * err_msg (OUT) - Moab error message */ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, char *tasklist, char *comment_ptr, int *err_code, char **err_msg) { int rc = 0, old_task_cnt = 1; struct job_record *job_ptr; /* Write lock on job info, read lock on node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; char *new_node_list = NULL; static char tmp_msg[128]; bitstr_t *new_bitmap = (bitstr_t *) NULL; bitstr_t *save_req_bitmap = (bitstr_t *) NULL; bitoff_t i, bsize; int ll; /* layout info index */ char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL; size_t node_name_len; static uint32_t cr_test = 0, cr_enabled = 0; if (cr_test == 0) { select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL, &cr_enabled); cr_test = 1; } lock_slurmctld(job_write_lock); job_ptr = find_job_record(jobid); if (job_ptr == NULL) { *err_code = -700; *err_msg = "No such job"; error("wiki: Failed to find job %u", jobid); rc = -1; goto fini; } if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) { *err_code = -700; *err_msg = "Job not pending, can't start"; error("wiki: Attempt to start job %u in state %s", jobid, job_state_string(job_ptr->job_state)); rc = -1; goto fini; } if (comment_ptr) { char *reserved = strstr(comment_ptr, "RESERVED:"); if (reserved) { reserved += 9; job_ptr->details->reserved_resources = strtol(reserved, NULL, 10); } xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); } if (task_cnt) { new_node_list = xstrdup(hostlist); if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) { *err_code = -700; *err_msg = "Invalid TASKLIST"; error("wiki: Attempt to set invalid node list for " "job %u, %s", jobid, hostlist); xfree(new_node_list); rc = -1; goto fini; } if (!bit_super_set(new_bitmap, avail_node_bitmap)) { /* Selected node is UP and not responding * or it just went DOWN */ *err_code = -700; *err_msg = "TASKLIST includes non-responsive node"; error("wiki: Attempt to use non-responsive nodes for " "job %u, %s", jobid, hostlist); xfree(new_node_list); FREE_NULL_BITMAP(new_bitmap); rc = -1; goto fini; } /* User excluded node list incompatible with Wiki * Exclude all nodes not explicitly requested */ FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Build layout information from tasklist (assuming that Moab * sends a non-bracketed list of nodes, repeated as many times * as cpus should be used per node); at this point, node names * are comma-separated. This is _not_ a fast algorithm as it * performs many string compares. 
*/ xfree(job_ptr->details->req_node_layout); if (task_cnt && cr_enabled) { uint16_t cpus_per_task = MAX(1, job_ptr->details->cpus_per_task); job_ptr->details->req_node_layout = (uint16_t *) xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t)); bsize = bit_size(new_bitmap); for (i = 0, ll = -1; i < bsize; i++) { if (!bit_test(new_bitmap, i)) continue; ll++; node_name = node_record_table_ptr[i].name; node_name_len = strlen(node_name); if (node_name_len == 0) continue; node_cur = tasklist; while (*node_cur) { if ((node_idx = strstr(node_cur, node_name))) { if ((node_idx[node_name_len] == ',') || (node_idx[node_name_len] == '\0')) { job_ptr->details->req_node_layout[ll] += cpus_per_task; } node_cur = strchr(node_idx, ','); if (node_cur) continue; } break; } } } /* save and update job state to start now */ save_req_nodes = job_ptr->details->req_nodes; job_ptr->details->req_nodes = new_node_list; save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; old_task_cnt = job_ptr->details->min_cpus; job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); if (rc) return rc; /* No errors so far */ (void) schedule(INFINITE); /* provides own locking */ /* Check to ensure the job was actually started */ lock_slurmctld(job_write_lock); if (job_ptr->job_id != jobid) job_ptr = find_job_record(jobid); if (job_ptr && (job_ptr->job_id == jobid) && (!IS_JOB_RUNNING(job_ptr))) { uint16_t wait_reason = 0; char *wait_string; if (IS_JOB_FAILED(job_ptr)) wait_string = "Invalid request, job aborted"; else { wait_reason = job_ptr->state_reason; if (wait_reason == WAIT_HELD) { /* some job is completing, slurmctld did * not even try to schedule this job */ wait_reason = WAIT_RESOURCES; } wait_string = job_reason_string(wait_reason); job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); } *err_code = -910 - wait_reason; snprintf(tmp_msg, sizeof(tmp_msg), "Could not start job %u(%s): %s", jobid, new_node_list, wait_string); *err_msg = tmp_msg; error("wiki: %s", tmp_msg); /* restore some of job state */ job_ptr->priority = 0; job_ptr->details->min_cpus = old_task_cnt; rc = -1; } if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) { /* Restore required node list in case job requeued */ xfree(job_ptr->details->req_nodes); job_ptr->details->req_nodes = save_req_nodes; FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); job_ptr->details->req_node_bitmap = save_req_bitmap; FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); xfree(job_ptr->details->req_node_layout); } else { error("wiki: start_job(%u) job missing", jobid); xfree(save_req_nodes); FREE_NULL_BITMAP(save_req_bitmap); } unlock_slurmctld(job_write_lock); schedule_node_save(); /* provides own locking */ schedule_job_save(); /* provides own locking */ return rc; }
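/*
 * Illustrative sketch, an assumption rather than plugin code: given the
 * tasklist convention documented above (each hostname listed once per
 * task, e.g. "tux1,tux1,tux2" for two tasks on tux1 and one on tux2),
 * count the tasks placed on one node. With cpus_per_task = 2, each match
 * would add two CPUs to that node's req_node_layout entry. Like the loop
 * above, only the trailing boundary is checked. Assumes <string.h>.
 */
static int _count_node_tasks(const char *tasklist, const char *node_name)
{
	size_t name_len = strlen(node_name);
	const char *cur = tasklist;
	int task_cnt = 0;

	if (name_len == 0)
		return 0;	/* Mirror the node_name_len guard above */
	while ((cur = strstr(cur, node_name))) {
		/* Count a hit only at a name boundary so that "tux1"
		 * does not match inside "tux10" */
		if ((cur[name_len] == ',') || (cur[name_len] == '\0'))
			task_cnt++;
		cur += name_len;
	}
	return task_cnt;
}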
static void _xlate_before(char *depend, uint32_t submit_uid, uint32_t my_job_id) { uint32_t job_id; char *last_ptr = NULL, *new_dep = NULL, *tok, *type; struct job_record *job_ptr; pthread_attr_t attr; pthread_t dep_thread; tok = strtok_r(depend, ":", &last_ptr); if (!xstrcmp(tok, "before")) type = "after"; else if (!xstrcmp(tok, "beforeany")) type = "afterany"; else if (!xstrcmp(tok, "beforenotok")) type = "afternotok"; else if (!xstrcmp(tok, "beforeok")) type = "afterok"; else { info("%s: discarding invalid job dependency option %s", plugin_type, tok); return; } /* NOTE: We are updating a job record here in order to implement * the depend=before option. We are doing so without the write lock * on the job record, but using a local mutex to prevent multiple * updates on the same job when multiple jobs satisfying the dependency * are being processed at the same time (all with read locks). The * job read lock will prevent anyone else from getting a job write * lock, and using a job write lock causes serious performance problems * for slow job_submit plugins. Not an ideal solution, but the best * option that we see. */ slurm_mutex_lock(&depend_mutex); tok = strtok_r(NULL, ":", &last_ptr); while (tok) { job_id = atoi(tok); job_ptr = find_job_record(job_id); if (!job_ptr) { info("%s: discarding invalid job dependency before %s", plugin_type, tok); } else if ((submit_uid != job_ptr->user_id) && !validate_super_user(submit_uid)) { error("%s: Security violation: uid %u trying to alter " "job %u belonging to uid %u", plugin_type, submit_uid, job_ptr->job_id, job_ptr->user_id); } else if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) { info("%s: discarding job before dependency on " "non-pending job %u", plugin_type, job_ptr->job_id); } else { if (job_ptr->details->dependency) { xstrcat(new_dep, job_ptr->details->dependency); xstrcat(new_dep, ","); } xstrfmtcat(new_dep, "%s:%u", type, my_job_id); xfree(job_ptr->details->dependency); job_ptr->details->dependency = new_dep; new_dep = NULL; _decr_depend_cnt(job_ptr); slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); pthread_create(&dep_thread, &attr, _dep_agent, job_ptr); slurm_attr_destroy(&attr); } tok = strtok_r(NULL, ":", &last_ptr); } slurm_mutex_unlock(&depend_mutex); }
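/*
 * For reference, a minimal sketch of the inversion applied above: a job
 * submitted with "before*:<job_id>" adds the inverse "after*:<my_job_id>"
 * dependency to each listed job. For example, if job 102 is submitted with
 * "beforeok:100", job 100 gains "afterok:102" and may only start once job
 * 102 completes successfully. The helper name is hypothetical; the mapping
 * itself is taken from the code above. Assumes <string.h>.
 */
static const char *_invert_before_type(const char *tok)
{
	if (!strcmp(tok, "before"))
		return "after";		/* Listed job may start after this
					 * job begins execution */
	if (!strcmp(tok, "beforeany"))
		return "afterany";	/* ... after this job ends, any state */
	if (!strcmp(tok, "beforenotok"))
		return "afternotok";	/* ... only if this job fails */
	if (!strcmp(tok, "beforeok"))
		return "afterok";	/* ... only if this job succeeds */
	return NULL;			/* Unrecognized dependency type */
}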
static int _parse_job_info(void **dest, slurm_parser_enum_t type, const char *key, const char *value, const char *line, char **leftover) { s_p_hashtbl_t *job_tbl; char *name = NULL, *tmp = NULL, local_name[64] = ""; uint64_t size = 0; uint32_t job_id = 0, user_id = 0; uint16_t state = 0; bb_alloc_t *bb_ptr; struct job_record *job_ptr = NULL; bb_job_t *bb_spec; static s_p_options_t _job_options[] = { {"JobID", S_P_STRING}, {"Name", S_P_STRING}, {"Size", S_P_STRING}, {"State", S_P_STRING}, {NULL} }; *dest = NULL; user_id = strtol(value, NULL, 10); job_tbl = s_p_hashtbl_create(_job_options); s_p_parse_line(job_tbl, *leftover, leftover); if (s_p_get_string(&tmp, "JobID", job_tbl)) { job_id = strtol(tmp, NULL, 10); xfree(tmp); } if (s_p_get_string(&name, "Name", job_tbl)) { snprintf(local_name, sizeof(local_name), "%s", name); xfree(name); } if (s_p_get_string(&tmp, "Size", job_tbl)) { size = bb_get_size_num(tmp, bb_state.bb_config.granularity); xfree(tmp); } if (s_p_get_string(&tmp, "State", job_tbl)) { state = bb_state_num(tmp); xfree(tmp); } s_p_hashtbl_destroy(job_tbl); #if 0 info("%s: JobID:%u Name:%s Size:%"PRIu64" State:%u UserID:%u", __func__, job_id, local_name, size, state, user_id); #endif if (job_id) { job_ptr = find_job_record(job_id); if (!job_ptr && (state == BB_STATE_STAGED_OUT)) { struct job_record job_rec; job_rec.job_id = job_id; job_rec.user_id = user_id; bb_ptr = bb_find_alloc_rec(&bb_state, &job_rec); _stop_stage_out(job_id); /* Purge buffer */ if (bb_ptr) { bb_ptr->cancelled = true; bb_ptr->end_time = 0; } else { /* Slurm knows nothing about this job, * may be result of slurmctld cold start */ error("%s: Vestigial buffer for purged job %u", plugin_type, job_id); } return SLURM_SUCCESS; } else if (!job_ptr && ((state == BB_STATE_STAGING_IN) || (state == BB_STATE_STAGED_IN))) { struct job_record job_rec; job_rec.job_id = job_id; job_rec.user_id = user_id; bb_ptr = bb_find_alloc_rec(&bb_state, &job_rec); _stop_stage_in(job_id); /* Purge buffer */ if (bb_ptr) { bb_ptr->cancelled = true; bb_ptr->end_time = 0; } else { /* Slurm knows nothing about this job, * may be result of slurmctld cold start */ error("%s: Vestigial buffer for purged job %u", plugin_type, job_id); } return SLURM_SUCCESS; } else if (!job_ptr) { error("%s: Vestigial buffer for job ID %u. " "Clear manually", plugin_type, job_id); } snprintf(local_name, sizeof(local_name), "VestigialJob%u", job_id); } if (job_ptr) { bb_ptr = bb_find_alloc_rec(&bb_state, job_ptr); if (bb_ptr == NULL) { bb_spec = xmalloc(sizeof(bb_job_t)); bb_spec->total_size = _get_bb_size(job_ptr); bb_ptr = bb_alloc_job_rec(&bb_state, job_ptr, bb_spec); xfree(bb_spec); bb_ptr->state = state; /* bb_ptr->state_time set in bb_alloc_job_rec() */ } } else { if ((bb_ptr = _find_bb_name_rec(local_name, user_id)) == NULL) { bb_ptr = bb_alloc_name_rec(&bb_state, local_name, user_id); bb_ptr->size = size; bb_ptr->state = state; //FIXME: VESTIGIAL: Use bb_limit_add // bb_add_user_load(bb_ptr, &bb_state); return SLURM_SUCCESS; } } bb_ptr->seen_time = time(NULL); /* used to purge defunct recs */ /* UserID set to 0 on some failure modes */ if ((bb_ptr->user_id != user_id) && (user_id != 0)) { error("%s: User ID mismatch (%u != %u). 
" "BB UserID=%u JobID=%u Name=%s", plugin_type, bb_ptr->user_id, user_id, bb_ptr->user_id, bb_ptr->job_id, bb_ptr->name); } if ((bb_ptr->state == BB_STATE_RUNNING) && (state == BB_STATE_STAGED_IN)) state = BB_STATE_RUNNING; /* More precise state info */ if (bb_ptr->state != state) { /* State is subject to real-time changes */ debug("%s: State changed (%s to %s). " "BB UserID=%u JobID=%u Name=%s", plugin_type, bb_state_string(bb_ptr->state), bb_state_string(state), bb_ptr->user_id, bb_ptr->job_id, bb_ptr->name); bb_ptr->state = state; bb_ptr->state_time = time(NULL); if (bb_ptr->state == BB_STATE_STAGED_OUT) { if (bb_ptr->size != 0) { //FIXME: VESTIGIAL: Use bb_limit_rem // bb_remove_user_load(bb_ptr, &bb_state); bb_ptr->size = 0; } } if (bb_ptr->state == BB_STATE_STAGED_IN) queue_job_scheduler(); } if ((bb_ptr->state != BB_STATE_STAGED_OUT) && (bb_ptr->size != size)) { //FIXME: VESTIGIAL: Use bb_limit_rem // bb_remove_user_load(bb_ptr, &bb_state); if (size != 0) { error("%s: Size mismatch (%"PRIu64" != %"PRIu64"). " "BB UserID=%u JobID=%u Name=%s", plugin_type, bb_ptr->size, size, bb_ptr->user_id, bb_ptr->job_id, bb_ptr->name); } bb_ptr->size = MAX(bb_ptr->size, size); //FIXME: VESTIGIAL: Use bb_limit_add // bb_add_user_load(bb_ptr, &bb_state); } return SLURM_SUCCESS; }