/* NOTE: if job has already completed, we append "EXITCODE=#" to
 * the state name */
static char *_get_job_state(struct job_record *job_ptr)
{
	char *state_str;
	static char return_msg[128];

	if (IS_JOB_COMPLETING(job_ptr)) {
		/* Give the configured KillWait+10 for the job to clear out,
		 * then consider the job done. Moab will allocate jobs to
		 * nodes that are already Idle. */
		int age = (int) difftime(time(NULL), job_ptr->end_time);
		if (age < (kill_wait + 10))
			return "Running";
	}
	if (IS_JOB_RUNNING(job_ptr))
		return "Running";
	if (IS_JOB_SUSPENDED(job_ptr))
		return "Suspended";
	if (IS_JOB_PENDING(job_ptr))
		return "Idle";

	if (IS_JOB_COMPLETE(job_ptr) || IS_JOB_FAILED(job_ptr))
		state_str = "Completed";
	else	/* JOB_CANCELLED, JOB_TIMEOUT, JOB_NODE_FAIL, etc. */
		state_str = "Removed";

	snprintf(return_msg, sizeof(return_msg), "%s;EXITCODE=%u",
		 state_str, WEXITSTATUS(job_ptr->exit_code));
	return return_msg;
}
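/*
 * Illustrative sketch, not part of the original source: one way a consumer
 * of _get_job_state() above could split the "STATE;EXITCODE=n" strings it
 * returns. The helper name is hypothetical; only standard C is used.
 */
#include <stdio.h>
#include <string.h>

static int _parse_state_string(const char *msg, char *state, size_t state_len,
			       int *exit_code)
{
	const char *sep = strchr(msg, ';');

	*exit_code = 0;
	if (sep == NULL) {
		/* No exit code suffix, e.g. "Running" or "Idle" */
		snprintf(state, state_len, "%s", msg);
		return 0;
	}
	snprintf(state, state_len, "%.*s", (int)(sep - msg), msg);
	if (sscanf(sep, ";EXITCODE=%d", exit_code) != 1)
		return -1;
	return 0;
}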
extern List find_preemptable_jobs(struct job_record *job_ptr) { ListIterator job_iterator; struct job_record *job_p; List preemptee_job_list = NULL; /* Validate the preemptor job */ if (job_ptr == NULL) { error("find_preemptable_jobs: job_ptr is NULL"); return preemptee_job_list; } if (!IS_JOB_PENDING(job_ptr)) { error("find_preemptable_jobs: job %u not pending", job_ptr->job_id); return preemptee_job_list; } if (job_ptr->part_ptr == NULL) { error("find_preemptable_jobs: job %u has NULL partition ptr", job_ptr->job_id); return preemptee_job_list; } if (job_ptr->part_ptr->node_bitmap == NULL) { error("find_preemptable_jobs: partition %s node_bitmap=NULL", job_ptr->part_ptr->name); return preemptee_job_list; } /* Build an array of pointers to preemption candidates */ job_iterator = list_iterator_create(job_list); while ((job_p = (struct job_record *) list_next(job_iterator))) { if (!IS_JOB_RUNNING(job_p) && !IS_JOB_SUSPENDED(job_p)) continue; if ((job_p->part_ptr == NULL) || (job_p->part_ptr->priority_tier >= job_ptr->part_ptr->priority_tier) || (job_p->part_ptr->preempt_mode == PREEMPT_MODE_OFF)) continue; if ((job_p->node_bitmap == NULL) || (bit_overlap(job_p->node_bitmap, job_ptr->part_ptr->node_bitmap) == 0)) continue; if (job_ptr->details && (job_ptr->details->expanding_jobid == job_p->job_id)) continue; /* This job is a preemption candidate */ if (preemptee_job_list == NULL) { preemptee_job_list = list_create(NULL); } list_append(preemptee_job_list, job_p); } list_iterator_destroy(job_iterator); if (preemptee_job_list && youngest_order) list_sort(preemptee_job_list, _sort_by_youngest); else if (preemptee_job_list) list_sort(preemptee_job_list, _sort_by_prio); return preemptee_job_list; }
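/*
 * Illustrative sketch, not part of the original source: selecting a sort
 * order for preemption candidates, as find_preemptable_jobs() above does
 * with list_sort() and _sort_by_youngest()/_sort_by_prio(). This standalone
 * version uses qsort() over a plain array; the struct fields and the exact
 * sort keys are hypothetical stand-ins.
 */
#include <stdlib.h>
#include <time.h>

struct cand {
	unsigned priority;
	time_t begin_time;
};

/* youngest (most recently started) candidate first */
static int _cmp_youngest(const void *x, const void *y)
{
	const struct cand *a = x, *b = y;
	return (a->begin_time < b->begin_time) - (a->begin_time > b->begin_time);
}

/* lowest priority candidate first */
static int _cmp_prio(const void *x, const void *y)
{
	const struct cand *a = x, *b = y;
	return (a->priority > b->priority) - (a->priority < b->priority);
}

static void _sort_candidates(struct cand *c, size_t n, int youngest_order)
{
	qsort(c, n, sizeof(struct cand),
	      youngest_order ? _cmp_youngest : _cmp_prio);
}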
/* Code taken from job_info.c: calculate the cumulative run time
 * for a job */
static time_t _get_job_runtime(struct job_record *job_ptr)
{
	time_t end_time, run_time;

	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				(difftime(end_time, job_ptr->suspend_time) +
				 job_ptr->pre_sus_time);
		} else {
			run_time = (time_t)
				difftime(end_time, job_ptr->start_time);
		}
	}
	return run_time;
}
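/*
 * Illustrative sketch, not part of the original source: the cumulative
 * run-time arithmetic used by _get_job_runtime() above, with hypothetical
 * timestamps. After a resume, suspend_time holds the resume time and
 * pre_sus_time holds the run time accumulated before the suspension.
 */
#include <assert.h>
#include <time.h>

static void _runtime_example(void)
{
	time_t start_time   = 1000;	/* job started (unused once suspended) */
	time_t pre_sus_time = 500;	/* ran 500s before being suspended */
	time_t suspend_time = 1600;	/* resumed at t=1600 */
	time_t now          = 2000;
	time_t run_time;

	(void) start_time;
	/* 500s before the suspension + 400s since the resume = 900s */
	run_time = (time_t) (difftime(now, suspend_time) + pre_sus_time);
	assert(run_time == 900);
}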
/* returns how long job has been suspended, in seconds */
static uint32_t _get_job_suspend_time(struct job_record *job_ptr)
{
	if (IS_JOB_SUSPENDED(job_ptr)) {
		time_t now = time(NULL);
		return (uint32_t) difftime(now, job_ptr->suspend_time);
	}
	return (uint32_t) 0;
}
/* Gang scheduling has been disabled by change in configuration,
 * resume any suspended jobs */
extern void gs_wake_jobs(void)
{
	struct job_record *job_ptr;
	ListIterator job_iterator;

	if (!job_list)	/* no jobs */
		return;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0)) {
			info("gang waking preempted job %u", job_ptr->job_id);
			_resume_job(job_ptr->job_id);
		}
	}
	list_iterator_destroy(job_iterator);
}
long job_time_used(job_info_t * job_ptr)
{
	time_t end_time;

	if ((job_ptr->start_time == 0) || IS_JOB_PENDING(job_ptr))
		return 0L;

	if (IS_JOB_SUSPENDED(job_ptr))
		return (long) job_ptr->pre_sus_time;

	if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
		end_time = time(NULL);
	else
		end_time = job_ptr->end_time;

	if (job_ptr->suspend_time)
		return (long) (difftime(end_time, job_ptr->suspend_time) +
			       job_ptr->pre_sus_time);
	return (long) (difftime(end_time, job_ptr->start_time));
}
/* ensure that all jobs running in SLURM are accounted for.
 * this procedure assumes that the gs data has already been
 * locked by the caller! */
static void _scan_slurm_job_list(void)
{
	struct job_record *job_ptr;
	struct gs_part *p_ptr;
	int i;
	ListIterator job_iterator;
	char *part_name;

	if (!job_list) {	/* no jobs */
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
			info("gang: _scan_slurm_job_list: job_list NULL");
		return;
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _scan_slurm_job_list: job_list exists...");
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _scan_slurm_job_list: checking job %u",
			     job_ptr->job_id);
		}
		if (IS_JOB_PENDING(job_ptr))
			continue;
		if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority == 0))
			continue;	/* not suspended by us */

		if (job_ptr->part_ptr && job_ptr->part_ptr->name)
			part_name = job_ptr->part_ptr->name;
		else
			part_name = job_ptr->partition;

		if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) {
			/* are we tracking this job already? */
			p_ptr = list_find_first(gs_part_list, _find_gs_part,
						part_name);
			if (!p_ptr)	/* no partition */
				continue;
			i = _find_job_index(p_ptr, job_ptr->job_id);
			if (i >= 0)	/* we're tracking it, so continue */
				continue;

			/* We're not tracking this job. Resume it if it's
			 * suspended, and then add it to the job list. */
			if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) {
				/* The likely scenario here is that the
				 * slurmctld failed over, and this is a job
				 * that gang had previously suspended. It's
				 * not possible to determine the previous
				 * order of jobs without preserving gang
				 * state, which is not worth the extra
				 * infrastructure. Just resume the job and
				 * then add it to the job list. */
				_resume_job(job_ptr->job_id);
			}
			_add_job_to_part(p_ptr, job_ptr);
			continue;
		}

		/* if the job is not pending, suspended, or running, then
		 * it's completing or completed. Make sure we've released
		 * this job */
		p_ptr = list_find_first(gs_part_list, _find_gs_part,
					part_name);
		if (!p_ptr)	/* no partition */
			continue;
		_remove_job_from_part(job_ptr->job_id, p_ptr, false);
	}
	list_iterator_destroy(job_iterator);

	/* now that all of the old jobs have been flushed out,
	 * update the active row of all partitions */
	_update_all_active_rows();
	return;
}
/* filter job records per input specifications, * returns >0 if job should be filter out (not printed) */ static int _filter_job(job_info_t * job) { int filter; ListIterator iterator; uint32_t *user; uint16_t *state_id; char *account, *part, *qos, *name; squeue_job_step_t *job_step_id; if (params.job_list) { filter = 1; iterator = list_iterator_create(params.job_list); while ((job_step_id = list_next(iterator))) { if (((job_step_id->array_id == (uint16_t) NO_VAL) && ((job_step_id->job_id == job->array_job_id) || (job_step_id->job_id == job->job_id))) || ((job_step_id->array_id == job->array_task_id) && (job_step_id->job_id == job->array_job_id))) { filter = 0; break; } } list_iterator_destroy(iterator); if (filter == 1) return 1; } if (params.part_list) { char *token = NULL, *last = NULL, *tmp_name = NULL; filter = 1; if (job->partition) { tmp_name = xstrdup(job->partition); token = strtok_r(tmp_name, ",", &last); } while (token && filter) { iterator = list_iterator_create(params.part_list); while ((part = list_next(iterator))) { if (strcmp(part, token) == 0) { filter = 0; break; } } list_iterator_destroy(iterator); token = strtok_r(NULL, ",", &last); } xfree(tmp_name); if (filter == 1) return 2; } if (params.account_list) { filter = 1; iterator = list_iterator_create(params.account_list); while ((account = list_next(iterator))) { if ((job->account != NULL) && (strcasecmp(account, job->account) == 0)) { filter = 0; break; } } list_iterator_destroy(iterator); if (filter == 1) return 2; } if (params.qos_list) { filter = 1; iterator = list_iterator_create(params.qos_list); while ((qos = list_next(iterator))) { if ((job->qos != NULL) && (strcasecmp(qos, job->qos) == 0)) { filter = 0; break; } } list_iterator_destroy(iterator); if (filter == 1) return 2; } if (params.state_list) { filter = 1; iterator = list_iterator_create(params.state_list); while ((state_id = list_next(iterator))) { if ((*state_id == job->job_state) || ((*state_id == JOB_COMPLETING) && (*state_id & job->job_state)) || ((*state_id == JOB_CONFIGURING) && (*state_id & job->job_state))) { filter = 0; break; } } list_iterator_destroy(iterator); if (filter == 1) return 3; } else { if (!IS_JOB_PENDING(job) && !IS_JOB_RUNNING(job) && !IS_JOB_SUSPENDED(job) && !IS_JOB_COMPLETING(job)) return 4; } if ((params.nodes) && ((job->nodes == NULL) || (!hostset_intersects(params.nodes, job->nodes)))) return 5; if (params.user_list) { filter = 1; iterator = list_iterator_create(params.user_list); while ((user = list_next(iterator))) { if (*user == job->user_id) { filter = 0; break; } } list_iterator_destroy(iterator); if (filter == 1) return 6; } if (params.reservation) { if ((job->resv_name == NULL) || (strcmp(job->resv_name, params.reservation))) { return 7; } } if (params.name_list) { filter = 1; iterator = list_iterator_create(params.name_list); while ((name = list_next(iterator))) { if ((job->name != NULL) && (strcasecmp(name, job->name) == 0)) { filter = 0; break; } } list_iterator_destroy(iterator); if (filter == 1) return 8; } return 0; }
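/*
 * Illustrative sketch, not part of the original source: the strtok_r()
 * pattern _filter_job() above uses to match a job's comma-separated
 * partition list against a set of requested partition names. The helper
 * name and the wanted[] table are hypothetical.
 */
#include <stdlib.h>
#include <string.h>

static int _partition_matches(const char *job_partitions,
			      const char **wanted, int wanted_cnt)
{
	char *tmp = strdup(job_partitions);
	char *save = NULL, *token;
	int match = 0, i;

	if (tmp == NULL)
		return 0;
	for (token = strtok_r(tmp, ",", &save); token && !match;
	     token = strtok_r(NULL, ",", &save)) {
		for (i = 0; i < wanted_cnt; i++) {
			if (strcmp(token, wanted[i]) == 0) {
				match = 1;
				break;
			}
		}
	}
	free(tmp);
	return match;
}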
extern void get_job(void) { int error_code = -1, i, recs; static int printed_jobs = 0; static int count = 0; static job_info_msg_t *job_info_ptr = NULL, *new_job_ptr = NULL; job_info_t *job_ptr = NULL; uint16_t show_flags = 0; bitstr_t *nodes_req = NULL; static uint16_t last_flags = 0; if (params.all_flag) show_flags |= SHOW_ALL; if (job_info_ptr) { if (show_flags != last_flags) job_info_ptr->last_update = 0; error_code = slurm_load_jobs(job_info_ptr->last_update, &new_job_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg(job_info_ptr); else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_job_ptr = job_info_ptr; } } else error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags); last_flags = show_flags; if (error_code) { if (quiet_flag != 1) { if (!params.commandline) { mvwprintw(text_win, main_ycord, 1, "slurm_load_jobs: %s", slurm_strerror(slurm_get_errno())); main_ycord++; } else { printf("slurm_load_jobs: %s\n", slurm_strerror(slurm_get_errno())); } } } if (!params.no_header) _print_header_job(); if (new_job_ptr) recs = new_job_ptr->record_count; else recs = 0; if (!params.commandline) if ((text_line_cnt+printed_jobs) > count) text_line_cnt--; printed_jobs = 0; count = 0; if (params.hl) nodes_req = get_requested_node_bitmap(); for (i = 0; i < recs; i++) { job_ptr = &(new_job_ptr->job_array[i]); if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_COMPLETING(job_ptr)) continue; /* job has completed */ if (nodes_req) { int overlap = 0; bitstr_t *loc_bitmap = bit_alloc(bit_size(nodes_req)); inx2bitstr(loc_bitmap, job_ptr->node_inx); overlap = bit_overlap(loc_bitmap, nodes_req); FREE_NULL_BITMAP(loc_bitmap); if (!overlap) continue; } if (job_ptr->node_inx[0] != -1) { int j = 0; job_ptr->num_nodes = 0; while (job_ptr->node_inx[j] >= 0) { job_ptr->num_nodes += (job_ptr->node_inx[j + 1] + 1) - job_ptr->node_inx[j]; set_grid_inx(job_ptr->node_inx[j], job_ptr->node_inx[j + 1], count); j += 2; } if (!params.commandline) { if ((count >= text_line_cnt) && (printed_jobs < (getmaxy(text_win) - 4))) { job_ptr->num_cpus = (int)letters[count%62]; wattron(text_win, COLOR_PAIR(colors[count%6])); _print_text_job(job_ptr); wattroff(text_win, COLOR_PAIR(colors[count%6])); printed_jobs++; } } else { job_ptr->num_cpus = (int)letters[count%62]; _print_text_job(job_ptr); } count++; } if (count == 128) count = 0; } for (i = 0; i < recs; i++) { job_ptr = &(new_job_ptr->job_array[i]); if (!IS_JOB_PENDING(job_ptr)) continue; /* job has completed */ if (!params.commandline) { if ((count>=text_line_cnt) && (printed_jobs < (getmaxy(text_win) - 4))) { xfree(job_ptr->nodes); job_ptr->nodes = xstrdup("waiting..."); job_ptr->num_cpus = (int) letters[count%62]; wattron(text_win, COLOR_PAIR(colors[count%6])); _print_text_job(job_ptr); wattroff(text_win, COLOR_PAIR(colors[count%6])); printed_jobs++; } } else { xfree(job_ptr->nodes); job_ptr->nodes = xstrdup("waiting..."); job_ptr->num_cpus = (int) letters[count%62]; _print_text_job(job_ptr); printed_jobs++; } count++; if (count == 128) count = 0; } if (params.commandline && params.iterate) printf("\n"); if (!params.commandline) main_ycord++; job_info_ptr = new_job_ptr; return; }
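/*
 * Illustrative sketch, not part of the original source: decoding the
 * node_inx convention used in get_job() above, where allocated node
 * indices are stored as [start, end] pairs terminated by -1. The sample
 * array is hypothetical.
 */
#include <assert.h>

static int _count_nodes(const int *node_inx)
{
	int j = 0, count = 0;

	while (node_inx[j] >= 0) {
		count += (node_inx[j + 1] + 1) - node_inx[j];
		j += 2;
	}
	return count;
}

static void _node_inx_example(void)
{
	/* nodes 0-3 and 10-11: six nodes in total */
	static const int inx[] = { 0, 3, 10, 11, -1 };

	assert(_count_nodes(inx) == 6);
}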
/* Add the given job to the given partition, and if it remains running
 * then "cast its shadow" over the active row of any partition with a
 * lower priority than the given partition. Return the sig state of the
 * job (GS_SUSPEND or GS_RESUME) */
static uint16_t _add_job_to_part(struct gs_part *p_ptr,
				 struct job_record *job_ptr)
{
	int i;
	struct gs_job *j_ptr;
	uint16_t preempt_mode;

	xassert(p_ptr);
	xassert(job_ptr->job_id > 0);
	xassert(job_ptr->job_resrcs);
	xassert(job_ptr->job_resrcs->node_bitmap);
	xassert(job_ptr->job_resrcs->core_bitmap);

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
		info("gang: _add_job_to_part: adding job %u to %s",
		     job_ptr->job_id, p_ptr->part_name);
	}

	/* take care of any memory needs */
	if (!p_ptr->job_list) {
		p_ptr->job_list_size = default_job_list_size;
		p_ptr->job_list = xmalloc(p_ptr->job_list_size *
					  sizeof(struct gs_job *));
		/* job_list is initialized to be NULL filled */
	}

	/* protect against duplicates */
	i = _find_job_index(p_ptr, job_ptr->job_id);
	if (i >= 0) {
		/* This job already exists, but the resource allocation
		 * may have changed. In any case, remove the existing
		 * job before adding this new one. */
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _add_job_to_part: duplicate job %u "
			     "detected", job_ptr->job_id);
		}
		_remove_job_from_part(job_ptr->job_id, p_ptr, false);
		_update_active_row(p_ptr, 0);
	}

	/* more memory management */
	if ((p_ptr->num_jobs + 1) == p_ptr->job_list_size) {
		p_ptr->job_list_size *= 2;
		xrealloc(p_ptr->job_list, p_ptr->job_list_size *
			 sizeof(struct gs_job *));
		/* enlarged job_list is initialized to be NULL filled */
	}
	j_ptr = xmalloc(sizeof(struct gs_job));

	/* gather job info */
	j_ptr->job_id = job_ptr->job_id;
	j_ptr->job_ptr = job_ptr;
	j_ptr->sig_state = GS_RESUME;	 /* all jobs are running initially */
	j_ptr->row_state = GS_NO_ACTIVE; /* job is not in the active row */

	/* append this job to the job_list */
	p_ptr->job_list[p_ptr->num_jobs++] = j_ptr;

	/* determine the immediate fate of this job (run or suspend) */
	if (!IS_JOB_SUSPENDED(job_ptr) &&
	    _job_fits_in_active_row(job_ptr, p_ptr)) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _add_job_to_part: job %u remains running",
			     job_ptr->job_id);
		}
		_add_job_to_active(job_ptr, p_ptr);
		/* note that this job is a "filler" for this row */
		j_ptr->row_state = GS_FILLER;
		/* all jobs begin in the run state, so
		 * there's no need to signal this job */

		/* since this job is running we need to "cast its shadow"
		 * over lower priority partitions */
		_cast_shadow(j_ptr, p_ptr->priority);
	} else {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
			info("gang: _add_job_to_part: suspending job %u",
			     job_ptr->job_id);
		}
		preempt_mode = slurm_job_preempt_mode(job_ptr);
		if (p_ptr->num_shadows &&
		    (preempt_mode != PREEMPT_MODE_OFF) &&
		    (preempt_mode != PREEMPT_MODE_SUSPEND)) {
			_preempt_job_queue(job_ptr->job_id);
		} else
			_suspend_job(job_ptr->job_id);
		j_ptr->sig_state = GS_SUSPEND;
	}
	_print_jobs(p_ptr);

	return j_ptr->sig_state;
}
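/*
 * Illustrative sketch, not part of the original source: the doubling
 * growth pattern _add_job_to_part() above applies to p_ptr->job_list,
 * reduced to a standalone helper. Plain malloc()/realloc() stand in for
 * the xmalloc()/xrealloc() wrappers; the struct and names are hypothetical.
 */
#include <stdlib.h>
#include <string.h>

struct ptr_array {
	void **items;
	int count;
	int size;
};

static int _array_append(struct ptr_array *a, void *item)
{
	if (a->items == NULL) {
		a->size = 64;		/* analogue of default_job_list_size */
		a->items = calloc(a->size, sizeof(void *));
		if (a->items == NULL)
			return -1;
	} else if ((a->count + 1) == a->size) {
		void **tmp = realloc(a->items, 2 * a->size * sizeof(void *));
		if (tmp == NULL)
			return -1;
		/* zero the new half so the array stays NULL filled */
		memset(tmp + a->size, 0, a->size * sizeof(void *));
		a->items = tmp;
		a->size *= 2;
	}
	a->items[a->count++] = item;
	return 0;
}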
/* Test if a batch launch request should be deferred
 * RET -1: abort the request, pending job cancelled
 *      0: execute the request now
 *      1: defer the request */
static int _batch_launch_defer(queued_request_t *queued_req_ptr)
{
	agent_arg_t *agent_arg_ptr;
	batch_job_launch_msg_t *launch_msg_ptr;
	time_t now = time(NULL);
	struct job_record *job_ptr;
	int delay_time, nodes_ready = 0;

	agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
	if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
		return 0;

	if (difftime(now, queued_req_ptr->last_attempt) < 10) {
		/* Reduce overhead by only testing once every 10 secs */
		return 1;
	}

	launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args;
	job_ptr = find_job_record(launch_msg_ptr->job_id);
	if ((job_ptr == NULL) ||
	    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
		info("agent(batch_launch): removed pending request for "
		     "cancelled job %u", launch_msg_ptr->job_id);
		return -1;	/* job cancelled while waiting */
	}

	if (job_ptr->wait_all_nodes) {
		(void) job_node_ready(launch_msg_ptr->job_id, &nodes_ready);
	} else {
#ifdef HAVE_FRONT_END
		nodes_ready = 1;
#else
		struct node_record *node_ptr;
		char *hostname;

		hostname = hostlist_deranged_string_xmalloc(
					agent_arg_ptr->hostlist);
		node_ptr = find_node_record(hostname);
		if (node_ptr == NULL) {
			error("agent(batch_launch) removed pending request for "
			      "job %u, missing node %s",
			      launch_msg_ptr->job_id, hostname);
			xfree(hostname);
			return -1;	/* invalid request?? */
		}
		xfree(hostname);
		if (!IS_NODE_POWER_SAVE(node_ptr) &&
		    !IS_NODE_NO_RESPOND(node_ptr)) {
			nodes_ready = 1;
		}
#endif
	}

	delay_time = difftime(now, job_ptr->start_time);
	if (nodes_ready) {
		/* ready to launch, adjust time limit for boot time */
		if (delay_time && (job_ptr->time_limit != INFINITE) &&
		    (!wiki2_sched)) {
			info("Job %u launch delayed by %d secs, "
			     "updating end_time",
			     launch_msg_ptr->job_id, delay_time);
			job_ptr->end_time += delay_time;
		}
		queued_req_ptr->last_attempt = (time_t) 0;
		return 0;
	}

	if (queued_req_ptr->last_attempt == 0) {
		queued_req_ptr->first_attempt = now;
		queued_req_ptr->last_attempt = now;
	} else if (difftime(now, queued_req_ptr->first_attempt) >=
		   slurm_get_resume_timeout()) {
		error("agent waited too long for nodes to respond, "
		      "sending batch request anyway...");
		if (delay_time && (job_ptr->time_limit != INFINITE) &&
		    (!wiki2_sched)) {
			info("Job %u launch delayed by %d secs, "
			     "updating end_time",
			     launch_msg_ptr->job_id, delay_time);
			job_ptr->end_time += delay_time;
		}
		queued_req_ptr->last_attempt = (time_t) 0;
		return 0;
	}
	queued_req_ptr->last_attempt = now;
	return 1;
}
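/*
 * Illustrative sketch, not part of the original source: the throttled
 * retry logic of _batch_launch_defer() above, reduced to its timing
 * decisions. RETRY_INTERVAL and GIVE_UP_AFTER are hypothetical constants
 * standing in for the hard-coded 10 seconds and the ResumeTimeout value.
 */
#include <time.h>

#define RETRY_INTERVAL	10	/* seconds between readiness checks */
#define GIVE_UP_AFTER	300	/* send the request anyway after this long */

/* Return -1 to abort, 0 to send now, 1 to defer and retry later */
static int _defer_decision(time_t now, time_t *first_attempt,
			   time_t *last_attempt, int job_exists,
			   int nodes_ready)
{
	if (!job_exists)
		return -1;
	if (*last_attempt && (difftime(now, *last_attempt) < RETRY_INTERVAL))
		return 1;			/* throttle the readiness test */
	if (nodes_ready) {
		*last_attempt = 0;
		return 0;
	}
	if (*last_attempt == 0)
		*first_attempt = now;
	else if (difftime(now, *first_attempt) >= GIVE_UP_AFTER) {
		*last_attempt = 0;
		return 0;			/* waited too long, send anyway */
	}
	*last_attempt = now;
	return 1;
}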
extern List find_preemptable_jobs(struct job_record *job_ptr) { ListIterator preemptee_candidate_iterator; struct job_record *preemptee_job_ptr; struct job_record *preemptor_job_ptr = job_ptr; List preemptee_job_list = NULL; /* Validate the preemptor job */ if (preemptor_job_ptr == NULL) { error("%s: preemptor_job_ptr is NULL", plugin_type); return preemptee_job_list; } if (!IS_JOB_PENDING(preemptor_job_ptr)) { error("%s: JobId %u not pending", plugin_type, preemptor_job_ptr->job_id); return preemptee_job_list; } if (preemptor_job_ptr->part_ptr == NULL) { error("%s: JobId %u has NULL partition ptr", plugin_type, preemptor_job_ptr->job_id); return preemptee_job_list; } if (preemptor_job_ptr->part_ptr->node_bitmap == NULL) { error("%s: partition %s node_bitmap==NULL", plugin_type, preemptor_job_ptr->part_ptr->name); return preemptee_job_list; } if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) { info("%s: Looking for jobs to preempt for JobId %u", plugin_type, preemptor_job_ptr->job_id); } /* Build an array of pointers to preemption candidates */ preemptee_candidate_iterator = list_iterator_create(job_list); while ((preemptee_job_ptr = (struct job_record *) list_next(preemptee_candidate_iterator))) { if (!IS_JOB_RUNNING(preemptee_job_ptr) && !IS_JOB_SUSPENDED(preemptee_job_ptr)) continue; if (!_job_prio_preemptable(preemptor_job_ptr,preemptee_job_ptr)) continue; if ((preemptee_job_ptr->node_bitmap == NULL) || (bit_overlap(preemptee_job_ptr->node_bitmap, preemptor_job_ptr->part_ptr->node_bitmap) == 0)) continue; if (preemptor_job_ptr->details && (preemptor_job_ptr->details->expanding_jobid == preemptee_job_ptr->job_id)) continue; if (CHECK_FOR_PREEMPTOR_OVERALLOC && !_account_preemptable(preemptor_job_ptr, preemptee_job_ptr)) continue; /* This job is a valid preemption candidate and should be added * to the list. Create the list as needed. */ if (preemptee_job_list == NULL) preemptee_job_list = list_create(NULL); list_append(preemptee_job_list, preemptee_job_ptr); } list_iterator_destroy(preemptee_candidate_iterator); if (preemptee_job_list) { list_sort(preemptee_job_list, _sort_by_job_prio); if (CHECK_FOR_ACCOUNT_UNDERALLOC) { _account_under_alloc(preemptor_job_ptr, preemptee_job_list); } } else if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) { info("%s: NULL preemptee list for job (%u) %s", plugin_type, preemptor_job_ptr->job_id, preemptor_job_ptr->name); } return preemptee_job_list; }
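/*
 * Illustrative sketch, not part of the original source: a minimal
 * bit_overlap() analogue, counting bits set in both of two fixed-size
 * word arrays, as used in the candidate loops above to test whether a
 * running job occupies any node of the preemptor's partition. The
 * helper name and word size are hypothetical.
 */
#include <stdint.h>

static int _bitmap_overlap(const uint64_t *a, const uint64_t *b, int nwords)
{
	int i, count = 0;

	for (i = 0; i < nwords; i++) {
		uint64_t both = a[i] & b[i];
		/* portable popcount of the intersection */
		while (both) {
			both &= (both - 1);
			count++;
		}
	}
	return count;
}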
extern int as_mysql_job_start(mysql_conn_t *mysql_conn, struct job_record *job_ptr) { int rc=SLURM_SUCCESS; char *nodes = NULL, *jname = NULL, *node_inx = NULL; int track_steps = 0; char *block_id = NULL, *partition = NULL, *gres_req = NULL, *gres_alloc = NULL; char *query = NULL; int reinit = 0; time_t begin_time, check_time, start_time, submit_time; uint32_t wckeyid = 0; int job_state, node_cnt = 0; uint32_t job_db_inx = job_ptr->db_index; if ((!job_ptr->details || !job_ptr->details->submit_time) && !job_ptr->resize_time) { error("as_mysql_job_start: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; } if (check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; debug2("as_mysql_slurmdb_job_start() called"); job_state = job_ptr->job_state; if (job_ptr->resize_time) { begin_time = job_ptr->resize_time; submit_time = job_ptr->resize_time; start_time = job_ptr->resize_time; } else { begin_time = job_ptr->details->begin_time; submit_time = job_ptr->details->submit_time; start_time = job_ptr->start_time; } /* Since we need a new db_inx make sure the old db_inx * removed. This is most likely the only time we are going to * be notified of the change also so make the state without * the resize. */ if (IS_JOB_RESIZING(job_ptr)) { /* If we have a db_index lets end the previous record. */ if (!job_ptr->db_index) { error("We don't have a db_index for job %u, " "this should only happen when resizing " "jobs and the database interface was down.", job_ptr->job_id); job_ptr->db_index = _get_db_index(mysql_conn, job_ptr->details-> submit_time, job_ptr->job_id, job_ptr->assoc_id); } if (job_ptr->db_index) as_mysql_job_complete(mysql_conn, job_ptr); job_state &= (~JOB_RESIZING); job_ptr->db_index = 0; } job_state &= JOB_STATE_BASE; /* See what we are hearing about here if no start time. If * this job latest time is before the last roll up we will * need to reset it to look at this job. */ if (start_time) check_time = start_time; else if (begin_time) check_time = begin_time; else check_time = submit_time; slurm_mutex_lock(&rollup_lock); if (check_time < global_last_rollup) { MYSQL_RES *result = NULL; MYSQL_ROW row; /* check to see if we are hearing about this time for the * first time. 
*/ query = xstrdup_printf("select job_db_inx " "from \"%s_%s\" where id_job=%u and " "time_submit=%ld and time_eligible=%ld " "and time_start=%ld;", mysql_conn->cluster_name, job_table, job_ptr->job_id, submit_time, begin_time, start_time); debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) { xfree(query); slurm_mutex_unlock(&rollup_lock); return SLURM_ERROR; } xfree(query); if ((row = mysql_fetch_row(result))) { mysql_free_result(result); debug4("revieved an update for a " "job (%u) already known about", job_ptr->job_id); slurm_mutex_unlock(&rollup_lock); goto no_rollup_change; } mysql_free_result(result); if (job_ptr->start_time) debug("Need to reroll usage from %sJob %u " "from %s started then and we are just " "now hearing about it.", slurm_ctime(&check_time), job_ptr->job_id, mysql_conn->cluster_name); else if (begin_time) debug("Need to reroll usage from %sJob %u " "from %s became eligible then and we are just " "now hearing about it.", slurm_ctime(&check_time), job_ptr->job_id, mysql_conn->cluster_name); else debug("Need to reroll usage from %sJob %u " "from %s was submitted then and we are just " "now hearing about it.", slurm_ctime(&check_time), job_ptr->job_id, mysql_conn->cluster_name); global_last_rollup = check_time; slurm_mutex_unlock(&rollup_lock); /* If the times here are later than the daily_rollup or monthly rollup it isn't a big deal since they are always shrunk down to the beginning of each time period. */ query = xstrdup_printf("update \"%s_%s\" set " "hourly_rollup=%ld, " "daily_rollup=%ld, monthly_rollup=%ld", mysql_conn->cluster_name, last_ran_table, check_time, check_time, check_time); debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); rc = mysql_db_query(mysql_conn, query); xfree(query); } else slurm_mutex_unlock(&rollup_lock); no_rollup_change: if (job_ptr->name && job_ptr->name[0]) jname = slurm_add_slash_to_quotes(job_ptr->name); else { jname = xstrdup("allocation"); track_steps = 1; } if (job_ptr->nodes && job_ptr->nodes[0]) nodes = job_ptr->nodes; else nodes = "None assigned"; if (job_ptr->batch_flag) track_steps = 1; if (slurmdbd_conf) { block_id = xstrdup(job_ptr->comment); node_cnt = job_ptr->total_nodes; node_inx = job_ptr->network; } else { char temp_bit[BUF_SIZE]; if (job_ptr->node_bitmap) { node_inx = bit_fmt(temp_bit, sizeof(temp_bit), job_ptr->node_bitmap); } #ifdef HAVE_BG select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_BLOCK_ID, &block_id); select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); #else node_cnt = job_ptr->total_nodes; #endif } /* If there is a start_time get the wckeyid. If the job is * cancelled before the job starts we also want to grab it. 
*/ if (job_ptr->assoc_id && (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr))) wckeyid = _get_wckeyid(mysql_conn, &job_ptr->wckey, job_ptr->user_id, mysql_conn->cluster_name, job_ptr->assoc_id); if (job_ptr->partition) partition = slurm_add_slash_to_quotes(job_ptr->partition); if (job_ptr->gres_req) gres_req = slurm_add_slash_to_quotes(job_ptr->gres_req); if (job_ptr->gres_alloc) gres_alloc = slurm_add_slash_to_quotes(job_ptr->gres_alloc); if (!job_ptr->db_index) { if (!begin_time) begin_time = submit_time; query = xstrdup_printf( "insert into \"%s_%s\" " "(id_job, id_array_job, id_array_task, " "id_assoc, id_qos, id_wckey, id_user, " "id_group, nodelist, id_resv, timelimit, " "time_eligible, time_submit, time_start, " "job_name, track_steps, state, priority, cpus_req, " "cpus_alloc, nodes_alloc, mem_req", mysql_conn->cluster_name, job_table); if (job_ptr->account) xstrcat(query, ", account"); if (partition) xstrcat(query, ", `partition`"); if (block_id) xstrcat(query, ", id_block"); if (job_ptr->wckey) xstrcat(query, ", wckey"); if (node_inx) xstrcat(query, ", node_inx"); if (gres_req) xstrcat(query, ", gres_req"); if (gres_alloc) xstrcat(query, ", gres_alloc"); xstrfmtcat(query, ") values (%u, %u, %u, %u, %u, %u, %u, %u, " "'%s', %u, %u, %ld, %ld, %ld, " "'%s', %u, %u, %u, %u, %u, %u, %u", job_ptr->job_id, job_ptr->array_job_id, job_ptr->array_task_id, job_ptr->assoc_id, job_ptr->qos_id, wckeyid, job_ptr->user_id, job_ptr->group_id, nodes, job_ptr->resv_id, job_ptr->time_limit, begin_time, submit_time, start_time, jname, track_steps, job_state, job_ptr->priority, job_ptr->details->min_cpus, job_ptr->total_cpus, node_cnt, job_ptr->details->pn_min_memory); if (job_ptr->account) xstrfmtcat(query, ", '%s'", job_ptr->account); if (partition) xstrfmtcat(query, ", '%s'", partition); if (block_id) xstrfmtcat(query, ", '%s'", block_id); if (job_ptr->wckey) xstrfmtcat(query, ", '%s'", job_ptr->wckey); if (node_inx) xstrfmtcat(query, ", '%s'", node_inx); if (gres_req) xstrfmtcat(query, ", '%s'", gres_req); if (gres_alloc) xstrfmtcat(query, ", '%s'", gres_alloc); xstrfmtcat(query, ") on duplicate key update " "job_db_inx=LAST_INSERT_ID(job_db_inx), " "id_wckey=%u, id_user=%u, id_group=%u, " "nodelist='%s', id_resv=%u, timelimit=%u, " "time_submit=%ld, time_start=%ld, " "job_name='%s', track_steps=%u, id_qos=%u, " "state=greatest(state, %u), priority=%u, " "cpus_req=%u, cpus_alloc=%u, nodes_alloc=%u, " "mem_req=%u, id_array_job=%u, id_array_task=%u", wckeyid, job_ptr->user_id, job_ptr->group_id, nodes, job_ptr->resv_id, job_ptr->time_limit, submit_time, start_time, jname, track_steps, job_ptr->qos_id, job_state, job_ptr->priority, job_ptr->details->min_cpus, job_ptr->total_cpus, node_cnt, job_ptr->details->pn_min_memory, job_ptr->array_job_id, job_ptr->array_task_id); if (job_ptr->account) xstrfmtcat(query, ", account='%s'", job_ptr->account); if (partition) xstrfmtcat(query, ", `partition`='%s'", partition); if (block_id) xstrfmtcat(query, ", id_block='%s'", block_id); if (job_ptr->wckey) xstrfmtcat(query, ", wckey='%s'", job_ptr->wckey); if (node_inx) xstrfmtcat(query, ", node_inx='%s'", node_inx); if (gres_req) xstrfmtcat(query, ", gres_req='%s'", gres_req); if (gres_alloc) xstrfmtcat(query, ", gres_alloc='%s'", gres_alloc); debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); try_again: if (!(job_ptr->db_index = mysql_db_insert_ret_id( mysql_conn, query))) { if (!reinit) { error("It looks like the storage has gone " "away trying to reconnect"); 
mysql_db_close_db_connection( mysql_conn); /* reconnect */ check_connection(mysql_conn); reinit = 1; goto try_again; } else rc = SLURM_ERROR; } } else { query = xstrdup_printf("update \"%s_%s\" set nodelist='%s', ", mysql_conn->cluster_name, job_table, nodes); if (job_ptr->account) xstrfmtcat(query, "account='%s', ", job_ptr->account); if (partition) xstrfmtcat(query, "`partition`='%s', ", partition); if (block_id) xstrfmtcat(query, "id_block='%s', ", block_id); if (job_ptr->wckey) xstrfmtcat(query, "wckey='%s', ", job_ptr->wckey); if (node_inx) xstrfmtcat(query, "node_inx='%s', ", node_inx); if (gres_req) xstrfmtcat(query, "gres_req='%s', ", gres_req); if (gres_alloc) xstrfmtcat(query, "gres_alloc='%s', ", gres_alloc); xstrfmtcat(query, "time_start=%ld, job_name='%s', state=%u, " "cpus_alloc=%u, nodes_alloc=%u, id_qos=%u, " "id_assoc=%u, id_wckey=%u, id_resv=%u, " "timelimit=%u, mem_req=%u, " "id_array_job=%u, id_array_task=%u, " "time_eligible=%ld where job_db_inx=%d", start_time, jname, job_state, job_ptr->total_cpus, node_cnt, job_ptr->qos_id, job_ptr->assoc_id, wckeyid, job_ptr->resv_id, job_ptr->time_limit, job_ptr->details->pn_min_memory, job_ptr->array_job_id, job_ptr->array_task_id, begin_time, job_ptr->db_index); debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); rc = mysql_db_query(mysql_conn, query); } xfree(block_id); xfree(partition); xfree(gres_req); xfree(gres_alloc); xfree(jname); xfree(query); /* now we will reset all the steps */ if (IS_JOB_RESIZING(job_ptr)) { /* FIXME : Verify this is still needed */ if (IS_JOB_SUSPENDED(job_ptr)) as_mysql_suspend(mysql_conn, job_db_inx, job_ptr); } return rc; }
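/*
 * Illustrative sketch, not part of the original source: roughly what the
 * slurm_add_slash_to_quotes() calls in as_mysql_job_start() above
 * accomplish, i.e. escaping quote characters before a string is embedded
 * in a SQL statement. This standalone helper uses plain malloc() and a
 * hypothetical name, and also escapes backslashes as an assumption.
 */
#include <stdlib.h>
#include <string.h>

static char *_escape_quotes(const char *in)
{
	size_t len = strlen(in);
	char *out = malloc(2 * len + 1);   /* worst case: every char escaped */
	char *p = out;

	if (out == NULL)
		return NULL;
	for (; *in; in++) {
		if ((*in == '\'') || (*in == '\\'))
			*p++ = '\\';
		*p++ = *in;
	}
	*p = '\0';
	return out;
}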
extern int as_mysql_suspend(mysql_conn_t *mysql_conn, uint32_t old_db_inx, struct job_record *job_ptr) { char *query = NULL; int rc = SLURM_SUCCESS; time_t submit_time; uint32_t job_db_inx; if (check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; if (job_ptr->resize_time) submit_time = job_ptr->resize_time; else submit_time = job_ptr->details->submit_time; if (!job_ptr->db_index) { if (!(job_ptr->db_index = _get_db_index(mysql_conn, submit_time, job_ptr->job_id, job_ptr->assoc_id))) { /* If we get an error with this just fall * through to avoid an infinite loop */ if (as_mysql_job_start( mysql_conn, job_ptr) == SLURM_ERROR) { error("couldn't suspend job %u", job_ptr->job_id); return SLURM_SUCCESS; } } } if (IS_JOB_RESIZING(job_ptr)) { if (!old_db_inx) { error("No old db inx given for job %u cluster %s, " "can't update suspend table.", job_ptr->job_id, mysql_conn->cluster_name); return SLURM_ERROR; } job_db_inx = old_db_inx; xstrfmtcat(query, "update \"%s_%s\" set time_end=%d where " "job_db_inx=%u && time_end=0;", mysql_conn->cluster_name, suspend_table, (int)job_ptr->suspend_time, job_db_inx); } else job_db_inx = job_ptr->db_index; /* use job_db_inx for this one since we want to update the supend time of the job before it was resized. */ xstrfmtcat(query, "update \"%s_%s\" set time_suspended=%d-time_suspended, " "state=%d where job_db_inx=%d;", mysql_conn->cluster_name, job_table, (int)job_ptr->suspend_time, job_ptr->job_state & JOB_STATE_BASE, job_db_inx); if (IS_JOB_SUSPENDED(job_ptr)) xstrfmtcat(query, "insert into \"%s_%s\" (job_db_inx, id_assoc, " "time_start, time_end) values (%u, %u, %d, 0);", mysql_conn->cluster_name, suspend_table, job_ptr->db_index, job_ptr->assoc_id, (int)job_ptr->suspend_time); else xstrfmtcat(query, "update \"%s_%s\" set time_end=%d where " "job_db_inx=%u && time_end=0;", mysql_conn->cluster_name, suspend_table, (int)job_ptr->suspend_time, job_ptr->db_index); debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); rc = mysql_db_query(mysql_conn, query); xfree(query); if (rc != SLURM_ERROR) { xstrfmtcat(query, "update \"%s_%s\" set " "time_suspended=%u-time_suspended, " "state=%d where job_db_inx=%u and time_end=0", mysql_conn->cluster_name, step_table, (int)job_ptr->suspend_time, job_ptr->job_state, job_ptr->db_index); rc = mysql_db_query(mysql_conn, query); xfree(query); } return rc; }
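/*
 * Illustrative sketch, not part of the original source: the arithmetic
 * behind the "time_suspended=%d-time_suspended" update in
 * as_mysql_suspend() above, assuming the same statement runs on both
 * suspend and resume with suspend_time set to the time of the state
 * change. While suspended, the column temporarily holds (suspend start -
 * accumulated); the resume update turns it back into an accumulated
 * duration. The timestamps below are hypothetical.
 */
#include <assert.h>
#include <time.h>

static void _suspend_accounting_example(void)
{
	time_t accumulated = 0;	/* total seconds spent suspended so far */

	accumulated = 1000 - accumulated;	/* suspended at t=1000 */
	accumulated = 1250 - accumulated;	/* resumed at t=1250: 250s */
	assert(accumulated == 250);

	accumulated = 2000 - accumulated;	/* suspended again at t=2000 */
	accumulated = 2100 - accumulated;	/* resumed at t=2100: +100s */
	assert(accumulated == 350);
}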
/* * slurm_sprint_job_info - output information about a specific Slurm * job based upon message as loaded using slurm_load_jobs * IN job_ptr - an individual job information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ extern char * slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) { int i, j, k; char time_str[32], *group_name, *user_name; char *gres_last = "", tmp1[128], tmp2[128]; char *tmp6_ptr; char tmp_line[1024 * 128]; char tmp_path[MAXPATHLEN]; char *ionodes = NULL; uint16_t exit_status = 0, term_sig = 0; job_resources_t *job_resrcs = job_ptr->job_resrcs; char *out = NULL; time_t run_time; uint32_t min_nodes, max_nodes = 0; char *nodelist = "NodeList"; bitstr_t *cpu_bitmap; char *host; int sock_inx, sock_reps, last; int abs_node_inx, rel_node_inx; int64_t nice; int bit_inx, bit_reps; uint64_t *last_mem_alloc_ptr = NULL; uint64_t last_mem_alloc = NO_VAL64; char *last_hosts; hostlist_t hl, hl_last; char select_buf[122]; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); uint32_t threads; char *line_end = (one_liner) ? " " : "\n "; if (cluster_flags & CLUSTER_FLAG_BG) { nodelist = "MidplaneList"; select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); } /****** Line 1 ******/ xstrfmtcat(out, "JobId=%u ", job_ptr->job_id); if (job_ptr->array_job_id) { if (job_ptr->array_task_str) { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%s ", job_ptr->array_job_id, job_ptr->array_task_str); } else { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%u ", job_ptr->array_job_id, job_ptr->array_task_id); } } xstrfmtcat(out, "JobName=%s", job_ptr->name); xstrcat(out, line_end); /****** Line 2 ******/ user_name = uid_to_string((uid_t) job_ptr->user_id); group_name = gid_to_string((gid_t) job_ptr->group_id); xstrfmtcat(out, "UserId=%s(%u) GroupId=%s(%u) MCS_label=%s", user_name, job_ptr->user_id, group_name, job_ptr->group_id, (job_ptr->mcs_label==NULL) ? 
"N/A" : job_ptr->mcs_label); xfree(user_name); xfree(group_name); xstrcat(out, line_end); /****** Line 3 ******/ nice = ((int64_t)job_ptr->nice) - NICE_OFFSET; xstrfmtcat(out, "Priority=%u Nice=%"PRIi64" Account=%s QOS=%s", job_ptr->priority, nice, job_ptr->account, job_ptr->qos); if (slurm_get_track_wckey()) xstrfmtcat(out, " WCKey=%s", job_ptr->wckey); xstrcat(out, line_end); /****** Line 4 ******/ xstrfmtcat(out, "JobState=%s ", job_state_string(job_ptr->job_state)); if (job_ptr->state_desc) { /* Replace white space with underscore for easier parsing */ for (j=0; job_ptr->state_desc[j]; j++) { if (isspace((int)job_ptr->state_desc[j])) job_ptr->state_desc[j] = '_'; } xstrfmtcat(out, "Reason=%s ", job_ptr->state_desc); } else xstrfmtcat(out, "Reason=%s ", job_reason_string(job_ptr->state_reason)); xstrfmtcat(out, "Dependency=%s", job_ptr->dependency); xstrcat(out, line_end); /****** Line 5 ******/ xstrfmtcat(out, "Requeue=%u Restarts=%u BatchFlag=%u Reboot=%u ", job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag, job_ptr->reboot); if (WIFSIGNALED(job_ptr->exit_code)) term_sig = WTERMSIG(job_ptr->exit_code); exit_status = WEXITSTATUS(job_ptr->exit_code); xstrfmtcat(out, "ExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); /****** Line 5a (optional) ******/ if (job_ptr->show_flags & SHOW_DETAIL) { if (WIFSIGNALED(job_ptr->derived_ec)) term_sig = WTERMSIG(job_ptr->derived_ec); else term_sig = 0; exit_status = WEXITSTATUS(job_ptr->derived_ec); xstrfmtcat(out, "DerivedExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); } /****** Line 6 ******/ if (IS_JOB_PENDING(job_ptr)) run_time = 0; else if (IS_JOB_SUSPENDED(job_ptr)) run_time = job_ptr->pre_sus_time; else { time_t end_time; if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0)) end_time = time(NULL); else end_time = job_ptr->end_time; if (job_ptr->suspend_time) { run_time = (time_t) (difftime(end_time, job_ptr->suspend_time) + job_ptr->pre_sus_time); } else run_time = (time_t) difftime(end_time, job_ptr->start_time); } secs2time_str(run_time, time_str, sizeof(time_str)); xstrfmtcat(out, "RunTime=%s ", time_str); if (job_ptr->time_limit == NO_VAL) xstrcat(out, "TimeLimit=Partition_Limit "); else { mins2time_str(job_ptr->time_limit, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeLimit=%s ", time_str); } if (job_ptr->time_min == 0) xstrcat(out, "TimeMin=N/A"); else { mins2time_str(job_ptr->time_min, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeMin=%s", time_str); } xstrcat(out, line_end); /****** Line 7 ******/ slurm_make_time_str(&job_ptr->submit_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SubmitTime=%s ", time_str); slurm_make_time_str(&job_ptr->eligible_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EligibleTime=%s", time_str); xstrcat(out, line_end); /****** Line 8 (optional) ******/ if (job_ptr->resize_time) { slurm_make_time_str(&job_ptr->resize_time, time_str, sizeof(time_str)); xstrfmtcat(out, "ResizeTime=%s", time_str); xstrcat(out, line_end); } /****** Line 9 ******/ slurm_make_time_str(&job_ptr->start_time, time_str, sizeof(time_str)); xstrfmtcat(out, "StartTime=%s ", time_str); if ((job_ptr->time_limit == INFINITE) && (job_ptr->end_time > time(NULL))) xstrcat(out, "EndTime=Unknown "); else { slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EndTime=%s ", time_str); } if (job_ptr->deadline) { slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str)); xstrfmtcat(out, "Deadline=%s", time_str); } else { xstrcat(out, "Deadline=N/A"); 
} xstrcat(out, line_end); /****** Line 10 ******/ if (job_ptr->preempt_time == 0) xstrcat(out, "PreemptTime=None "); else { slurm_make_time_str(&job_ptr->preempt_time, time_str, sizeof(time_str)); xstrfmtcat(out, "PreemptTime=%s ", time_str); } if (job_ptr->suspend_time) { slurm_make_time_str(&job_ptr->suspend_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SuspendTime=%s ", time_str); } else xstrcat(out, "SuspendTime=None "); xstrfmtcat(out, "SecsPreSuspend=%ld", (long int)job_ptr->pre_sus_time); xstrcat(out, line_end); /****** Line 11 ******/ xstrfmtcat(out, "Partition=%s AllocNode:Sid=%s:%u", job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid); xstrcat(out, line_end); /****** Line 12 ******/ xstrfmtcat(out, "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes); xstrcat(out, line_end); /****** Line 13 ******/ xstrfmtcat(out, "%s=%s", nodelist, job_ptr->nodes); if (job_ptr->nodes && ionodes) { xstrfmtcat(out, "[%s]", ionodes); xfree(ionodes); } if (job_ptr->sched_nodes) xstrfmtcat(out, " Sched%s=%s", nodelist, job_ptr->sched_nodes); xstrcat(out, line_end); /****** Line 14 (optional) ******/ if (job_ptr->batch_host) { xstrfmtcat(out, "BatchHost=%s", job_ptr->batch_host); xstrcat(out, line_end); } /****** Line 14a (optional) ******/ if (job_ptr->fed_siblings) { xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s", job_ptr->fed_origin_str, job_ptr->fed_siblings_str); xstrcat(out, line_end); } /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); if ((min_nodes == 0) || (min_nodes == NO_VAL)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } else if (job_ptr->max_nodes) max_nodes = min_nodes; } else if (IS_JOB_PENDING(job_ptr)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; if (max_nodes && (max_nodes < min_nodes)) min_nodes = max_nodes; } else { min_nodes = job_ptr->num_nodes; max_nodes = 0; } _sprint_range(tmp_line, sizeof(tmp_line), min_nodes, max_nodes); xstrfmtcat(out, "NumNodes=%s ", tmp_line); _sprint_range(tmp_line, sizeof(tmp_line), job_ptr->num_cpus, job_ptr->max_cpus); xstrfmtcat(out, "NumCPUs=%s ", tmp_line); xstrfmtcat(out, "NumTasks=%u ", job_ptr->num_tasks); xstrfmtcat(out, "CPUs/Task=%u ", job_ptr->cpus_per_task); if (job_ptr->boards_per_node == (uint16_t) NO_VAL) xstrcat(out, "ReqB:S:C:T=*:"); else xstrfmtcat(out, "ReqB:S:C:T=%u:", job_ptr->boards_per_node); if (job_ptr->sockets_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->sockets_per_board); if (job_ptr->cores_per_socket == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->cores_per_socket); if (job_ptr->threads_per_core == (uint16_t) NO_VAL) xstrcat(out, "*"); else xstrfmtcat(out, "%u", job_ptr->threads_per_core); xstrcat(out, line_end); /****** Line 16 ******/ /* Tres should already of been converted at this point from simple */ xstrfmtcat(out, "TRES=%s", job_ptr->tres_alloc_str ? 
job_ptr->tres_alloc_str : job_ptr->tres_req_str); xstrcat(out, line_end); /****** Line 17 ******/ if (job_ptr->sockets_per_node == (uint16_t) NO_VAL) xstrcat(out, "Socks/Node=* "); else xstrfmtcat(out, "Socks/Node=%u ", job_ptr->sockets_per_node); if (job_ptr->ntasks_per_node == (uint16_t) NO_VAL) xstrcat(out, "NtasksPerN:B:S:C=*:"); else xstrfmtcat(out, "NtasksPerN:B:S:C=%u:", job_ptr->ntasks_per_node); if (job_ptr->ntasks_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->ntasks_per_board); if ((job_ptr->ntasks_per_socket == (uint16_t) NO_VAL) || (job_ptr->ntasks_per_socket == (uint16_t) INFINITE)) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->ntasks_per_socket); if ((job_ptr->ntasks_per_core == (uint16_t) NO_VAL) || (job_ptr->ntasks_per_core == (uint16_t) INFINITE)) xstrcat(out, "* "); else xstrfmtcat(out, "%u ", job_ptr->ntasks_per_core); if (job_ptr->core_spec == (uint16_t) NO_VAL) xstrcat(out, "CoreSpec=*"); else if (job_ptr->core_spec & CORE_SPEC_THREAD) xstrfmtcat(out, "ThreadSpec=%d", (job_ptr->core_spec & (~CORE_SPEC_THREAD))); else xstrfmtcat(out, "CoreSpec=%u", job_ptr->core_spec); xstrcat(out, line_end); if (job_resrcs && cluster_flags & CLUSTER_FLAG_BG) { if ((job_resrcs->cpu_array_cnt > 0) && (job_resrcs->cpu_array_value) && (job_resrcs->cpu_array_reps)) { int length = 0; xstrcat(out, "CPUs="); for (i = 0; i < job_resrcs->cpu_array_cnt; i++) { /* only print 60 characters worth of this record */ if (length > 60) { /* skip to last CPU group entry */ if (i < job_resrcs->cpu_array_cnt - 1) { continue; } /* add ellipsis before last entry */ xstrcat(out, "...,"); } length += xstrfmtcat(out, "%d", job_resrcs->cpus[i]); if (job_resrcs->cpu_array_reps[i] > 1) { length += xstrfmtcat(out, "*%d", job_resrcs->cpu_array_reps[i]); } if (i < job_resrcs->cpu_array_cnt - 1) { xstrcat(out, ","); length++; } } xstrcat(out, line_end); } } else if (job_resrcs && job_resrcs->core_bitmap && ((last = bit_fls(job_resrcs->core_bitmap)) != -1)) { hl = hostlist_create(job_resrcs->nodes); if (!hl) { error("slurm_sprint_job_info: hostlist_create: %s", job_resrcs->nodes); return NULL; } hl_last = hostlist_create(NULL); if (!hl_last) { error("slurm_sprint_job_info: hostlist_create: NULL"); hostlist_destroy(hl); return NULL; } bit_inx = 0; i = sock_inx = sock_reps = 0; abs_node_inx = job_ptr->node_inx[i]; gres_last = ""; /* tmp1[] stores the current cpu(s) allocated */ tmp2[0] = '\0'; /* stores last cpu(s) allocated */ for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts; rel_node_inx++) { if (sock_reps >= job_resrcs->sock_core_rep_count[sock_inx]) { sock_inx++; sock_reps = 0; } sock_reps++; bit_reps = job_resrcs->sockets_per_node[sock_inx] * job_resrcs->cores_per_socket[sock_inx]; host = hostlist_shift(hl); threads = _threads_per_core(host); cpu_bitmap = bit_alloc(bit_reps * threads); for (j = 0; j < bit_reps; j++) { if (bit_test(job_resrcs->core_bitmap, bit_inx)){ for (k = 0; k < threads; k++) bit_set(cpu_bitmap, (j * threads) + k); } bit_inx++; } bit_fmt(tmp1, sizeof(tmp1), cpu_bitmap); FREE_NULL_BITMAP(cpu_bitmap); /* * If the allocation values for this host are not the * same as the last host, print the report of the last * group of hosts that had identical allocation values. 
*/ if (xstrcmp(tmp1, tmp2) || ((rel_node_inx < job_ptr->gres_detail_cnt) && xstrcmp(job_ptr->gres_detail_str[rel_node_inx], gres_last)) || (last_mem_alloc_ptr != job_resrcs->memory_allocated) || (job_resrcs->memory_allocated && (last_mem_alloc != job_resrcs->memory_allocated[rel_node_inx]))) { if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc( hl_last); xstrfmtcat(out, " Nodes=%s CPU_IDs=%s " "Mem=%"PRIu64" GRES_IDX=%s", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0, gres_last); xfree(last_hosts); xstrcat(out, line_end); hostlist_destroy(hl_last); hl_last = hostlist_create(NULL); } strcpy(tmp2, tmp1); if (rel_node_inx < job_ptr->gres_detail_cnt) { gres_last = job_ptr-> gres_detail_str[rel_node_inx]; } else { gres_last = ""; } last_mem_alloc_ptr = job_resrcs->memory_allocated; if (last_mem_alloc_ptr) last_mem_alloc = job_resrcs-> memory_allocated[rel_node_inx]; else last_mem_alloc = NO_VAL64; } hostlist_push_host(hl_last, host); free(host); if (bit_inx > last) break; if (abs_node_inx > job_ptr->node_inx[i+1]) { i += 2; abs_node_inx = job_ptr->node_inx[i]; } else { abs_node_inx++; } } if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc(hl_last); xstrfmtcat(out, " Nodes=%s CPU_IDs=%s Mem=%"PRIu64" GRES_IDX=%s", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0, gres_last); xfree(last_hosts); xstrcat(out, line_end); } hostlist_destroy(hl); hostlist_destroy(hl_last); } /****** Line 18 ******/ if (job_ptr->pn_min_memory & MEM_PER_CPU) { job_ptr->pn_min_memory &= (~MEM_PER_CPU); tmp6_ptr = "CPU"; } else tmp6_ptr = "Node"; if (cluster_flags & CLUSTER_FLAG_BG) { convert_num_unit((float)job_ptr->pn_min_cpus, tmp1, sizeof(tmp1), UNIT_NONE, NO_VAL, CONVERT_NUM_UNIT_EXACT); xstrfmtcat(out, "MinCPUsNode=%s ", tmp1); } else { xstrfmtcat(out, "MinCPUsNode=%u ", job_ptr->pn_min_cpus); } convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1), UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT); convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2), UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT); xstrfmtcat(out, "MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2); xstrcat(out, line_end); /****** Line ******/ secs2time_str((time_t)job_ptr->delay_boot, tmp1, sizeof(tmp1)); xstrfmtcat(out, "Features=%s DelayBoot=%s", job_ptr->features, tmp1); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "Gres=%s Reservation=%s", job_ptr->gres, job_ptr->resv_name); xstrcat(out, line_end); /****** Line 20 ******/ xstrfmtcat(out, "OverSubscribe=%s Contiguous=%d Licenses=%s Network=%s", job_share_string(job_ptr->shared), job_ptr->contiguous, job_ptr->licenses, job_ptr->network); xstrcat(out, line_end); /****** Line 21 ******/ xstrfmtcat(out, "Command=%s", job_ptr->command); xstrcat(out, line_end); /****** Line 22 ******/ xstrfmtcat(out, "WorkDir=%s", job_ptr->work_dir); if (cluster_flags & CLUSTER_FLAG_BG) { /****** Line 23 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BG_ID); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "Block_ID=%s", select_buf); } /****** Line 24 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MIXED_SHORT); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrcat(out, select_buf); } /****** Line 26 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_LINUX_IMAGE); if 
(select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "CnloadImage=%s", select_buf); } /****** Line 27 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MLOADER_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "MloaderImage=%s", select_buf); } /****** Line 28 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_RAMDISK_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "IoloadImage=%s", select_buf); } } /****** Line (optional) ******/ if (job_ptr->admin_comment) { xstrcat(out, line_end); xstrfmtcat(out, "AdminComment=%s ", job_ptr->admin_comment); } /****** Line (optional) ******/ if (job_ptr->comment) { xstrcat(out, line_end); xstrfmtcat(out, "Comment=%s ", job_ptr->comment); } /****** Line 30 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stderr(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdErr=%s", tmp_path); } /****** Line 31 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stdin(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdIn=%s", tmp_path); } /****** Line 32 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stdout(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdOut=%s", tmp_path); } /****** Line 33 (optional) ******/ if (job_ptr->batch_script) { xstrcat(out, line_end); xstrcat(out, "BatchScript=\n"); xstrcat(out, job_ptr->batch_script); } /****** Line 34 (optional) ******/ if (job_ptr->req_switch) { char time_buf[32]; xstrcat(out, line_end); secs2time_str((time_t) job_ptr->wait4switch, time_buf, sizeof(time_buf)); xstrfmtcat(out, "Switches=%u@%s\n", job_ptr->req_switch, time_buf); } /****** Line 35 (optional) ******/ if (job_ptr->burst_buffer) { xstrcat(out, line_end); xstrfmtcat(out, "BurstBuffer=%s", job_ptr->burst_buffer); } /****** Line (optional) ******/ if (job_ptr->burst_buffer_state) { xstrcat(out, line_end); xstrfmtcat(out, "BurstBufferState=%s", job_ptr->burst_buffer_state); } /****** Line 36 (optional) ******/ if (cpu_freq_debug(NULL, NULL, tmp1, sizeof(tmp1), job_ptr->cpu_freq_gov, job_ptr->cpu_freq_min, job_ptr->cpu_freq_max, NO_VAL) != 0) { xstrcat(out, line_end); xstrcat(out, tmp1); } /****** Line 37 ******/ xstrcat(out, line_end); xstrfmtcat(out, "Power=%s", power_flags_str(job_ptr->power_flags)); /****** Line 38 (optional) ******/ if (job_ptr->bitflags) { xstrcat(out, line_end); if (job_ptr->bitflags & GRES_ENFORCE_BIND) xstrcat(out, "GresEnforceBind=Yes"); if (job_ptr->bitflags & KILL_INV_DEP) xstrcat(out, "KillOInInvalidDependent=Yes"); if (job_ptr->bitflags & NO_KILL_INV_DEP) xstrcat(out, "KillOInInvalidDependent=No"); if (job_ptr->bitflags & SPREAD_JOB) xstrcat(out, "SpreadJob=Yes"); } /****** END OF JOB RECORD ******/ if (one_liner) xstrcat(out, "\n"); else xstrcat(out, "\n\n"); return out; }
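/*
 * Illustrative sketch, not part of the original source: the sentinel
 * formatting pattern used repeatedly in slurm_sprint_job_info() above,
 * where a 16-bit field equal to NO_VAL (or INFINITE) prints as "*" and
 * anything else prints numerically. NO_VAL16/INFINITE16 are local
 * stand-ins for the truncated NO_VAL/INFINITE values.
 */
#include <stdio.h>
#include <stdint.h>

#define NO_VAL16	0xfffe
#define INFINITE16	0xffff

static void _fmt_u16_or_star(char *buf, size_t buflen, uint16_t val)
{
	if ((val == NO_VAL16) || (val == INFINITE16))
		snprintf(buf, buflen, "*");
	else
		snprintf(buf, buflen, "%u", val);
}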
/* * sync_front_end_state - synchronize job pointers and front-end node state */ extern void sync_front_end_state(void) { #ifdef HAVE_FRONT_END ListIterator job_iterator; struct job_record *job_ptr; front_end_record_t *front_end_ptr; uint32_t state_flags; int i; for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { front_end_ptr->job_cnt_comp = 0; front_end_ptr->job_cnt_run = 0; } job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { if (job_ptr->batch_host) { job_ptr->front_end_ptr = find_front_end_record(job_ptr->batch_host); if ((job_ptr->front_end_ptr == NULL) && IS_JOB_RUNNING(job_ptr)) { error("front end node %s has vanished, " "killing job %u", job_ptr->batch_host, job_ptr->job_id); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; } else if (job_ptr->front_end_ptr == NULL) { info("front end node %s has vanished", job_ptr->batch_host); } else if (IS_JOB_COMPLETING(job_ptr)) { job_ptr->front_end_ptr->job_cnt_comp++; } else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { job_ptr->front_end_ptr->job_cnt_run++; } } else { job_ptr->front_end_ptr = NULL; } } list_iterator_destroy(job_iterator); for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { if ((IS_NODE_IDLE(front_end_ptr) || IS_NODE_UNKNOWN(front_end_ptr)) && (front_end_ptr->job_cnt_run != 0)) { state_flags = front_end_ptr->node_state & NODE_STATE_FLAGS; front_end_ptr->node_state = NODE_STATE_ALLOCATED | state_flags; } if (IS_NODE_ALLOCATED(front_end_ptr) && (front_end_ptr->job_cnt_run == 0)) { state_flags = front_end_ptr->node_state & NODE_STATE_FLAGS; front_end_ptr->node_state = NODE_STATE_IDLE | state_flags; } if (IS_NODE_COMPLETING(front_end_ptr) && (front_end_ptr->job_cnt_comp == 0)) { front_end_ptr->node_state &= (~NODE_STATE_COMPLETING); } if (!IS_NODE_COMPLETING(front_end_ptr) && (front_end_ptr->job_cnt_comp != 0)) { front_end_ptr->node_state |= NODE_STATE_COMPLETING; } } if (slurmctld_conf.debug_flags & DEBUG_FLAG_FRONT_END) log_front_end_state(); #endif }
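/*
 * Illustrative sketch, not part of the original source: the flag-preserving
 * state transition used in sync_front_end_state() above, where the low bits
 * hold the base node state and the high bits hold flags, so changing the
 * base state keeps flags such as COMPLETING intact. Constants here are
 * hypothetical stand-ins for the NODE_STATE_* values.
 */
#include <assert.h>
#include <stdint.h>

#define STATE_FLAGS	0xff00
#define STATE_IDLE	0x0001
#define STATE_ALLOCATED	0x0002
#define FLAG_COMPLETING	0x0100

static uint16_t _set_base_state(uint16_t state, uint16_t new_base)
{
	uint16_t flags = state & STATE_FLAGS;

	return new_base | flags;
}

static void _state_example(void)
{
	uint16_t state = STATE_IDLE | FLAG_COMPLETING;

	state = _set_base_state(state, STATE_ALLOCATED);
	assert(state == (STATE_ALLOCATED | FLAG_COMPLETING));
}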
/* rebuild data structures from scratch * * A reconfigure can affect this plugin in these ways: * - partitions can be added or removed * - this affects the gs_part_list * - nodes can be removed from a partition, or added to a partition * - this affects the size of the active resmap * * Here's the plan: * 1. save a copy of the global structures, and then construct * new ones. * 2. load the new partition structures with existing jobs, * confirming the job exists and resizing their resmaps * (if necessary). * 3. make sure all partitions are accounted for. If a partition * was removed, make sure any jobs that were in the queue and * that were suspended are resumed. Conversely, if a partition * was added, check for existing jobs that may be contending * for resources that we could begin timeslicing. * 4. delete the old global structures and return. */ extern int gs_reconfig(void) { int i; ListIterator part_iterator; struct gs_part *p_ptr, *newp_ptr; List old_part_list; struct job_record *job_ptr; struct gs_job *j_ptr; if (!timeslicer_thread_id) { /* gs_init() will be called later from read_slurm_conf() * if we are enabling gang scheduling via reconfiguration */ return SLURM_SUCCESS; } if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: entering gs_reconfig"); pthread_mutex_lock(&data_mutex); old_part_list = gs_part_list; gs_part_list = NULL; /* reset global data */ gs_fast_schedule = slurm_get_fast_schedule(); gr_type = _get_gr_type(); _load_phys_res_cnt(); _build_parts(); /* scan the old part list and add existing jobs to the new list */ part_iterator = list_iterator_create(old_part_list); while ((p_ptr = (struct gs_part *) list_next(part_iterator))) { newp_ptr = (struct gs_part *) list_find_first(gs_part_list, _find_gs_part, p_ptr->part_name); if (!newp_ptr) { /* this partition was removed, so resume * any jobs suspended by gang and continue */ for (i = 0; i < p_ptr->num_jobs; i++) { j_ptr = p_ptr->job_list[i]; if ((j_ptr->sig_state == GS_SUSPEND) && (j_ptr->job_ptr->priority != 0)) { info("resuming job in missing part %s", p_ptr->part_name); _resume_job(j_ptr->job_id); j_ptr->sig_state = GS_RESUME; } } continue; } if (p_ptr->num_jobs == 0) /* no jobs to transfer */ continue; /* we need to transfer the jobs from p_ptr to new_ptr and * adjust their resmaps (if necessary). then we need to create * the active resmap and adjust the state of each job (if * necessary). NOTE: there could be jobs that only overlap * on nodes that are no longer in the partition, but we're * not going to worry about those cases. * * add the jobs from p_ptr into new_ptr in their current order * to preserve the state of timeslicing. */ for (i = 0; i < p_ptr->num_jobs; i++) { job_ptr = find_job_record(p_ptr->job_list[i]->job_id); if (job_ptr == NULL) { /* job no longer exists in SLURM, so drop it */ continue; } /* resume any job that is suspended by us */ if (IS_JOB_SUSPENDED(job_ptr) && job_ptr->priority) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG){ info("resuming job %u apparently " "suspended by gang", job_ptr->job_id); } _resume_job(job_ptr->job_id); } /* transfer the job as long as it is still active */ if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) { _add_job_to_part(newp_ptr, job_ptr); } } } list_iterator_destroy(part_iterator); /* confirm all jobs. 
Scan the master job_list and confirm that we * are tracking all jobs */ _scan_slurm_job_list(); FREE_NULL_LIST(old_part_list); pthread_mutex_unlock(&data_mutex); _preempt_job_dequeue(); /* MUST BE OUTSIDE OF data_mutex lock */ if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) info("gang: leaving gs_reconfig"); return SLURM_SUCCESS; }
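/*
 * Illustrative sketch, not shown in this excerpt: gs_reconfig() matches old
 * partition records against the rebuilt gs_part_list with list_find_first(),
 * which expects a comparator of the ListFindF form. The plugin's own
 * _find_gs_part() is assumed to look roughly like this, matching a gs_part
 * record against the part_name string passed as the key above.
 */
static int _find_gs_part(void *x, void *key)
{
	struct gs_part *p_ptr = (struct gs_part *) x;
	char *name = (char *) key;

	/* a non-zero return means "found" to list_find_first() */
	return (strcmp(p_ptr->part_name, name) == 0);
}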
/* * slurm_sprint_job_info - output information about a specific Slurm * job based upon message as loaded using slurm_load_jobs * IN job_ptr - an individual job information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ extern char * slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) { int i, j; char time_str[32], *group_name, *user_name; char tmp1[128], tmp2[128], tmp3[128], tmp4[128], tmp5[128], *tmp6_ptr; char tmp_line[512]; char *ionodes = NULL; uint16_t exit_status = 0, term_sig = 0; job_resources_t *job_resrcs = job_ptr->job_resrcs; char *out = NULL; time_t run_time; uint32_t min_nodes, max_nodes = 0; char *nodelist = "NodeList"; bitstr_t *core_bitmap; char *host; int sock_inx, sock_reps, last; int abs_node_inx, rel_node_inx; int bit_inx, bit_reps; uint32_t *last_mem_alloc_ptr = NULL; uint32_t last_mem_alloc = NO_VAL; char *last_hosts; hostlist_t hl, hl_last; char select_buf[122]; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); if (cluster_flags & CLUSTER_FLAG_BG) { nodelist = "MidplaneList"; select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); } /****** Line 1 ******/ snprintf(tmp_line, sizeof(tmp_line), "JobId=%u ", job_ptr->job_id); out = xstrdup(tmp_line); if (job_ptr->array_job_id) { snprintf(tmp_line, sizeof(tmp_line), "ArrayJobId=%u ArrayTaskId=%u ", job_ptr->array_job_id, job_ptr->array_task_id); xstrcat(out, tmp_line); } snprintf(tmp_line, sizeof(tmp_line), "Name=%s", job_ptr->name); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 2 ******/ user_name = uid_to_string((uid_t) job_ptr->user_id); group_name = gid_to_string((gid_t) job_ptr->group_id); snprintf(tmp_line, sizeof(tmp_line), "UserId=%s(%u) GroupId=%s(%u)", user_name, job_ptr->user_id, group_name, job_ptr->group_id); xfree(user_name); xfree(group_name); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 3 ******/ snprintf(tmp_line, sizeof(tmp_line), "Priority=%u Account=%s QOS=%s", job_ptr->priority, job_ptr->account, job_ptr->qos); xstrcat(out, tmp_line); if (slurm_get_track_wckey()) { snprintf(tmp_line, sizeof(tmp_line), " WCKey=%s", job_ptr->wckey); xstrcat(out, tmp_line); } if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 4 ******/ if (job_ptr->state_desc) { /* Replace white space with underscore for easier parsing */ for (j=0; job_ptr->state_desc[j]; j++) { if (isspace((int)job_ptr->state_desc[j])) job_ptr->state_desc[j] = '_'; } tmp6_ptr = job_ptr->state_desc; } else tmp6_ptr = job_reason_string(job_ptr->state_reason); snprintf(tmp_line, sizeof(tmp_line), "JobState=%s Reason=%s Dependency=%s", job_state_string(job_ptr->job_state), tmp6_ptr, job_ptr->dependency); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 5 ******/ snprintf(tmp_line, sizeof(tmp_line), "Requeue=%u Restarts=%u BatchFlag=%u ", job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag); xstrcat(out, tmp_line); if (WIFSIGNALED(job_ptr->exit_code)) term_sig = WTERMSIG(job_ptr->exit_code); exit_status = WEXITSTATUS(job_ptr->exit_code); snprintf(tmp_line, sizeof(tmp_line), "ExitCode=%u:%u", exit_status, term_sig); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 5a (optional) ******/ if (!(job_ptr->show_flags & SHOW_DETAIL)) goto line6; if 
(WIFSIGNALED(job_ptr->derived_ec)) term_sig = WTERMSIG(job_ptr->derived_ec); else term_sig = 0; exit_status = WEXITSTATUS(job_ptr->derived_ec); snprintf(tmp_line, sizeof(tmp_line), "DerivedExitCode=%u:%u", exit_status, term_sig); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 6 ******/ line6: snprintf(tmp_line, sizeof(tmp_line), "RunTime="); xstrcat(out, tmp_line); if (IS_JOB_PENDING(job_ptr)) run_time = 0; else if (IS_JOB_SUSPENDED(job_ptr)) run_time = job_ptr->pre_sus_time; else { time_t end_time; if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0)) end_time = time(NULL); else end_time = job_ptr->end_time; if (job_ptr->suspend_time) { run_time = (time_t) (difftime(end_time, job_ptr->suspend_time) + job_ptr->pre_sus_time); } else run_time = (time_t) difftime(end_time, job_ptr->start_time); } secs2time_str(run_time, tmp1, sizeof(tmp1)); sprintf(tmp_line, "%s ", tmp1); xstrcat(out, tmp_line); snprintf(tmp_line, sizeof(tmp_line), "TimeLimit="); xstrcat(out, tmp_line); if (job_ptr->time_limit == NO_VAL) sprintf(tmp_line, "Partition_Limit"); else { mins2time_str(job_ptr->time_limit, tmp_line, sizeof(tmp_line)); } xstrcat(out, tmp_line); snprintf(tmp_line, sizeof(tmp_line), " TimeMin="); xstrcat(out, tmp_line); if (job_ptr->time_min == 0) sprintf(tmp_line, "N/A"); else { mins2time_str(job_ptr->time_min, tmp_line, sizeof(tmp_line)); } xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 7 ******/ slurm_make_time_str((time_t *)&job_ptr->submit_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "SubmitTime=%s ", time_str); xstrcat(out, tmp_line); slurm_make_time_str((time_t *)&job_ptr->eligible_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "EligibleTime=%s", time_str); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 8 (optional) ******/ if (job_ptr->resize_time) { slurm_make_time_str((time_t *)&job_ptr->resize_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "ResizeTime=%s", time_str); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } /****** Line 9 ******/ slurm_make_time_str((time_t *)&job_ptr->start_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "StartTime=%s ", time_str); xstrcat(out, tmp_line); snprintf(tmp_line, sizeof(tmp_line), "EndTime="); xstrcat(out, tmp_line); if ((job_ptr->time_limit == INFINITE) && (job_ptr->end_time > time(NULL))) sprintf(tmp_line, "Unknown"); else { slurm_make_time_str ((time_t *)&job_ptr->end_time, time_str, sizeof(time_str)); sprintf(tmp_line, "%s", time_str); } xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 10 ******/ if (job_ptr->preempt_time == 0) sprintf(tmp_line, "PreemptTime=None "); else { slurm_make_time_str((time_t *)&job_ptr->preempt_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "PreemptTime=%s ", time_str); } xstrcat(out, tmp_line); if (job_ptr->suspend_time) { slurm_make_time_str ((time_t *)&job_ptr->suspend_time, time_str, sizeof(time_str)); } else { strncpy(time_str, "None", sizeof(time_str)); } snprintf(tmp_line, sizeof(tmp_line), "SuspendTime=%s SecsPreSuspend=%ld", time_str, (long int)job_ptr->pre_sus_time); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 11 ******/ snprintf(tmp_line, sizeof(tmp_line), "Partition=%s AllocNode:Sid=%s:%u", 
job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 12 ******/ snprintf(tmp_line, sizeof(tmp_line), "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 13 ******/ xstrfmtcat(out, "%s=", nodelist); xstrcat(out, job_ptr->nodes); if (job_ptr->nodes && ionodes) { snprintf(tmp_line, sizeof(tmp_line), "[%s]", ionodes); xstrcat(out, tmp_line); xfree(ionodes); } if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 14 (optional) ******/ if (job_ptr->batch_host) { snprintf(tmp_line, sizeof(tmp_line), "BatchHost=%s", job_ptr->batch_host); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); if ((min_nodes == 0) || (min_nodes == NO_VAL)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } else if (job_ptr->max_nodes) max_nodes = min_nodes; } else { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } _sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus, job_ptr->max_cpus); _sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes); if (job_ptr->sockets_per_node == (uint16_t) NO_VAL) strcpy(tmp3, "*"); else snprintf(tmp3, sizeof(tmp3), "%u", job_ptr->sockets_per_node); if (job_ptr->cores_per_socket == (uint16_t) NO_VAL) strcpy(tmp4, "*"); else snprintf(tmp4, sizeof(tmp4), "%u", job_ptr->cores_per_socket); if (job_ptr->threads_per_core == (uint16_t) NO_VAL) strcpy(tmp5, "*"); else snprintf(tmp5, sizeof(tmp5), "%u", job_ptr->threads_per_core); snprintf(tmp_line, sizeof(tmp_line), "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%s:%s:%s", tmp2, tmp1, job_ptr->cpus_per_task, tmp3, tmp4, tmp5); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); if (!job_resrcs) goto line15; if (cluster_flags & CLUSTER_FLAG_BG) { if ((job_resrcs->cpu_array_cnt > 0) && (job_resrcs->cpu_array_value) && (job_resrcs->cpu_array_reps)) { int length = 0; xstrcat(out, "CPUs="); length += 10; for (i = 0; i < job_resrcs->cpu_array_cnt; i++) { if (length > 70) { /* skip to last CPU group entry */ if (i < job_resrcs->cpu_array_cnt - 1) { continue; } /* add ellipsis before last entry */ xstrcat(out, "...,"); length += 4; } snprintf(tmp_line, sizeof(tmp_line), "%d", job_resrcs->cpus[i]); xstrcat(out, tmp_line); length += strlen(tmp_line); if (job_resrcs->cpu_array_reps[i] > 1) { snprintf(tmp_line, sizeof(tmp_line), "*%d", job_resrcs->cpu_array_reps[i]); xstrcat(out, tmp_line); length += strlen(tmp_line); } if (i < job_resrcs->cpu_array_cnt - 1) { xstrcat(out, ","); length++; } } if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } } else { if (!job_resrcs->core_bitmap) goto line15; last = bit_fls(job_resrcs->core_bitmap); if (last == -1) goto line15; hl = hostlist_create(job_ptr->nodes); if (!hl) { error("slurm_sprint_job_info: hostlist_create: %s", job_ptr->nodes); return NULL; } hl_last = hostlist_create(NULL); if (!hl_last) { error("slurm_sprint_job_info: hostlist_create: NULL"); hostlist_destroy(hl); return NULL; } bit_inx = 0; i = sock_inx = sock_reps = 0; abs_node_inx = job_ptr->node_inx[i]; /* tmp1[] stores the current cpu(s) allocated */ tmp2[0] = '\0'; /* stores last cpu(s) allocated */ for (rel_node_inx=0; rel_node_inx < 
job_resrcs->nhosts; rel_node_inx++) { if (sock_reps >= job_resrcs->sock_core_rep_count[sock_inx]) { sock_inx++; sock_reps = 0; } sock_reps++; bit_reps = job_resrcs->sockets_per_node[sock_inx] * job_resrcs->cores_per_socket[sock_inx]; core_bitmap = bit_alloc(bit_reps); for (j=0; j < bit_reps; j++) { if (bit_test(job_resrcs->core_bitmap, bit_inx)) bit_set(core_bitmap, j); bit_inx++; } bit_fmt(tmp1, sizeof(tmp1), core_bitmap); FREE_NULL_BITMAP(core_bitmap); host = hostlist_shift(hl); /* * If the allocation values for this host are not the same as the * last host, print the report of the last group of hosts that had * identical allocation values. */ if (strcmp(tmp1, tmp2) || (last_mem_alloc_ptr != job_resrcs->memory_allocated) || (job_resrcs->memory_allocated && (last_mem_alloc != job_resrcs->memory_allocated[rel_node_inx]))) { if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc( hl_last); snprintf(tmp_line, sizeof(tmp_line), " Nodes=%s CPU_IDs=%s Mem=%u", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0); xfree(last_hosts); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); hostlist_destroy(hl_last); hl_last = hostlist_create(NULL); } strcpy(tmp2, tmp1); last_mem_alloc_ptr = job_resrcs->memory_allocated; if (last_mem_alloc_ptr) last_mem_alloc = job_resrcs-> memory_allocated[rel_node_inx]; else last_mem_alloc = NO_VAL; } hostlist_push_host(hl_last, host); free(host); if (bit_inx > last) break; if (abs_node_inx > job_ptr->node_inx[i+1]) { i += 2; abs_node_inx = job_ptr->node_inx[i]; } else { abs_node_inx++; } } if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc(hl_last); snprintf(tmp_line, sizeof(tmp_line), " Nodes=%s CPU_IDs=%s Mem=%u", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0); xfree(last_hosts); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } hostlist_destroy(hl); hostlist_destroy(hl_last); } /****** Line 15 ******/ line15: if (job_ptr->pn_min_memory & MEM_PER_CPU) { job_ptr->pn_min_memory &= (~MEM_PER_CPU); tmp6_ptr = "CPU"; } else tmp6_ptr = "Node"; if (cluster_flags & CLUSTER_FLAG_BG) { convert_num_unit((float)job_ptr->pn_min_cpus, tmp1, sizeof(tmp1), UNIT_NONE); snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%s", tmp1); } else { snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%u", job_ptr->pn_min_cpus); } xstrcat(out, tmp_line); convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1), UNIT_MEGA); convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2), UNIT_MEGA); snprintf(tmp_line, sizeof(tmp_line), " MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 16 ******/ snprintf(tmp_line, sizeof(tmp_line), "Features=%s Gres=%s Reservation=%s", job_ptr->features, job_ptr->gres, job_ptr->resv_name); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 17 ******/ snprintf(tmp_line, sizeof(tmp_line), "Shared=%s Contiguous=%d Licenses=%s Network=%s", (job_ptr->shared == 0 ? "0" : job_ptr->shared == 1 ? 
"1" : "OK"), job_ptr->contiguous, job_ptr->licenses, job_ptr->network); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 18 ******/ snprintf(tmp_line, sizeof(tmp_line), "Command=%s", job_ptr->command); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 19 ******/ snprintf(tmp_line, sizeof(tmp_line), "WorkDir=%s", job_ptr->work_dir); xstrcat(out, tmp_line); if (cluster_flags & CLUSTER_FLAG_BG) { /****** Line 20 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BG_ID); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "Block_ID=%s", select_buf); xstrcat(out, tmp_line); } /****** Line 21 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MIXED_SHORT); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); xstrcat(out, select_buf); } if (cluster_flags & CLUSTER_FLAG_BGL) { /****** Line 22 (optional) ******/ select_g_select_jobinfo_sprint( job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BLRTS_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "BlrtsImage=%s", select_buf); xstrcat(out, tmp_line); } } /****** Line 23 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_LINUX_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); if (cluster_flags & CLUSTER_FLAG_BGL) snprintf(tmp_line, sizeof(tmp_line), "LinuxImage=%s", select_buf); else snprintf(tmp_line, sizeof(tmp_line), "CnloadImage=%s", select_buf); xstrcat(out, tmp_line); } /****** Line 24 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MLOADER_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "MloaderImage=%s", select_buf); xstrcat(out, tmp_line); } /****** Line 25 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_RAMDISK_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); if (cluster_flags & CLUSTER_FLAG_BGL) snprintf(tmp_line, sizeof(tmp_line), "RamDiskImage=%s", select_buf); else snprintf(tmp_line, sizeof(tmp_line), "IoloadImage=%s", select_buf); xstrcat(out, tmp_line); } } /****** Line 26 (optional) ******/ if (job_ptr->comment) { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "Comment=%s ", job_ptr->comment); xstrcat(out, tmp_line); } /****** Line 27 (optional) ******/ if (job_ptr->batch_script) { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); xstrcat(out, "BatchScript=\n"); xstrcat(out, job_ptr->batch_script); } /****** Line 28 (optional) ******/ if (job_ptr->req_switch) { char time_buf[32]; if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); secs2time_str((time_t) job_ptr->wait4switch, time_buf, sizeof(time_buf)); snprintf(tmp_line, sizeof(tmp_line), "Switches=%u@%s\n", job_ptr->req_switch, time_buf); xstrcat(out, tmp_line); } /****** Line 29 (optional) ******/ if (one_liner) xstrcat(out, "\n"); else xstrcat(out, "\n\n"); return out; }