/* _print_job_step - print the specified job step's information */ static int _print_job_steps( bool clear_old ) { int error_code; static job_step_info_response_msg_t * old_step_ptr = NULL; static job_step_info_response_msg_t * new_step_ptr; uint16_t show_flags = 0; if (params.all_flag) show_flags |= SHOW_ALL; if (old_step_ptr) { if (clear_old) old_step_ptr->last_update = 0; /* Use a last_update time of 0 so that we can get an updated * run_time for jobs rather than just its start_time */ error_code = slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL, &new_step_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_step_info_response_msg( old_step_ptr ); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_step_ptr = old_step_ptr; } } else { error_code = slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL, &new_step_ptr, show_flags); } if (error_code) { slurm_perror ("slurm_get_job_steps error"); return SLURM_ERROR; } old_step_ptr = new_step_ptr; if (params.verbose) { printf ("last_update_time=%ld records=%u\n", (long) new_step_ptr->last_update, new_step_ptr->job_step_count); } if (!params.format && !params.format_long) params.format = "%.15i %.8j %.9P %.8u %.9M %N"; if (!params.format_list) { if (params.format) parse_format(params.format); else if (params.format_long) parse_long_format(params.format_long); } print_steps_array( new_step_ptr->job_steps, new_step_ptr->job_step_count, params.format_list ); return SLURM_SUCCESS; }
/* * slurm_terminate_job_step - terminates a job step by sending a * REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step. * IN job_id - the job's id * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id * to terminate a job's batch script * RET 0 on success, otherwise return -1 and set errno to indicate the error */ extern int slurm_terminate_job_step (uint32_t job_id, uint32_t step_id) { resource_allocation_response_msg_t *alloc_info = NULL; job_step_info_response_msg_t *step_info = NULL; int rc = 0; int i; int save_errno = 0; if (slurm_allocation_lookup_lite(job_id, &alloc_info)) { return -1; } /* * The controller won't give us info about the batch script job step, * so we need to handle that seperately. */ if (step_id == SLURM_BATCH_SCRIPT) { rc = _terminate_batch_script_step(alloc_info); slurm_free_resource_allocation_response_msg(alloc_info); errno = rc; return rc ? -1 : 0; } /* * Otherwise, look through the list of job step info and find * the one matching step_id. Terminate that step. */ rc = slurm_get_job_steps((time_t)0, job_id, step_id, &step_info, SHOW_ALL); if (rc != 0) { save_errno = errno; goto fail; } for (i = 0; i < step_info->job_step_count; i++) { if ((step_info->job_steps[i].job_id == job_id) && (step_info->job_steps[i].step_id == step_id)) { rc = _terminate_job_step(&step_info->job_steps[i], alloc_info); save_errno = errno; break; } } slurm_free_job_step_info_response_msg(step_info); fail: slurm_free_resource_allocation_response_msg(alloc_info); errno = save_errno; return rc ? -1 : 0; }
/* Return the current time limit of the specified job/step_id or NO_VAL if the * information is not available */ static uint32_t _get_step_time(uint32_t job_id, uint32_t step_id) { uint32_t time_limit = NO_VAL; int i, rc; job_step_info_response_msg_t *resp; rc = slurm_get_job_steps((time_t) 0, job_id, step_id, &resp, SHOW_ALL); if (rc == SLURM_SUCCESS) { for (i = 0; i < resp->job_step_count; i++) { if ((resp->job_steps[i].job_id != job_id) || (resp->job_steps[i].step_id != step_id)) continue; /* should not happen */ time_limit = resp->job_steps[i].time_limit; break; } slurm_free_job_step_info_response_msg(resp); } else { error("Could not load state information for step %u.%u: %m", job_id, step_id); } return time_limit; }
int main(int argc, char **argv) { ListIterator itr = NULL; uint32_t req_cpufreq = NO_VAL; uint32_t stepid = NO_VAL; slurmdb_selected_step_t *selected_step = NULL; #ifdef HAVE_ALPS_CRAY error("The sstat command is not supported on Cray systems"); return 1; #endif #ifdef HAVE_BG error("The sstat command is not supported on IBM BlueGene systems"); return 1; #endif slurm_conf_init(NULL); print_fields_list = list_create(NULL); print_fields_itr = list_iterator_create(print_fields_list); parse_command_line(argc, argv); if (!params.opt_job_list || !list_count(params.opt_job_list)) { error("You didn't give me any jobs to stat."); return 1; } print_fields_header(print_fields_list); itr = list_iterator_create(params.opt_job_list); while ((selected_step = list_next(itr))) { char *nodelist = NULL; bool free_nodelist = false; if (selected_step->stepid == INFINITE) { /* get the batch step info */ job_info_msg_t *job_ptr = NULL; hostlist_t hl; if (slurm_load_job( &job_ptr, selected_step->jobid, SHOW_ALL)) { error("couldn't get info for job %u", selected_step->jobid); continue; } stepid = NO_VAL; hl = hostlist_create(job_ptr->job_array[0].nodes); nodelist = hostlist_pop(hl); free_nodelist = true; hostlist_destroy(hl); slurm_free_job_info_msg(job_ptr); } else if (selected_step->stepid != NO_VAL) { stepid = selected_step->stepid; } else if (params.opt_all_steps) { job_step_info_response_msg_t *step_ptr = NULL; int i = 0; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } for (i = 0; i < step_ptr->job_step_count; i++) { _do_stat(selected_step->jobid, step_ptr->job_steps[i].step_id, step_ptr->job_steps[i].nodes, step_ptr->job_steps[i].cpu_freq); } slurm_free_job_step_info_response_msg(step_ptr); continue; } else { /* get the first running step to query against. */ job_step_info_response_msg_t *step_ptr = NULL; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } if (!step_ptr->job_step_count) { error("no steps running for job %u", selected_step->jobid); continue; } stepid = step_ptr->job_steps[0].step_id; nodelist = step_ptr->job_steps[0].nodes; req_cpufreq = step_ptr->job_steps[0].cpu_freq; } _do_stat(selected_step->jobid, stepid, nodelist, req_cpufreq); if (free_nodelist && nodelist) free(nodelist); } list_iterator_destroy(itr); xfree(params.opt_field_list); if (params.opt_job_list) list_destroy(params.opt_job_list); if (print_fields_itr) list_iterator_destroy(print_fields_itr); if (print_fields_list) list_destroy(print_fields_list); return 0; }
/* * scontrol_print_step - print the specified job step's information * IN job_step_id_str - job step's id or NULL to print information * about all job steps */ extern void scontrol_print_step (char *job_step_id_str) { int error_code, i, print_cnt = 0; uint32_t job_id = NO_VAL, step_id = NO_VAL; uint16_t array_id = (uint16_t) NO_VAL; char *next_str; job_step_info_response_msg_t *job_step_info_ptr; job_step_info_t * job_step_ptr; static uint32_t last_job_id = 0, last_array_id, last_step_id = 0; static job_step_info_response_msg_t *old_job_step_info_ptr = NULL; static uint16_t last_show_flags = 0xffff; uint16_t show_flags = 0; if (job_step_id_str) { job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10); if (next_str[0] == '_') array_id = (uint16_t) strtol(next_str+1, &next_str, 10); if (next_str[0] == '.') step_id = (uint32_t) strtol (next_str+1, NULL, 10); } if (all_flag) show_flags |= SHOW_ALL; if ((old_job_step_info_ptr) && (last_job_id == job_id) && (last_array_id == array_id) && (last_step_id == step_id)) { if (last_show_flags != show_flags) old_job_step_info_ptr->last_update = (time_t) 0; error_code = slurm_get_job_steps ( old_job_step_info_ptr->last_update, job_id, step_id, &job_step_info_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_step_info_response_msg ( old_job_step_info_ptr); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { job_step_info_ptr = old_job_step_info_ptr; error_code = SLURM_SUCCESS; if (quiet_flag == -1) printf ("slurm_get_job_steps no change in data\n"); } } else { if (old_job_step_info_ptr) { slurm_free_job_step_info_response_msg ( old_job_step_info_ptr); old_job_step_info_ptr = NULL; } error_code = slurm_get_job_steps ( (time_t) 0, job_id, step_id, &job_step_info_ptr, show_flags); } if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror ("slurm_get_job_steps error"); return; } old_job_step_info_ptr = job_step_info_ptr; last_show_flags = show_flags; last_job_id = job_id; last_step_id = step_id; if (quiet_flag == -1) { char time_str[32]; slurm_make_time_str ((time_t *)&job_step_info_ptr->last_update, time_str, sizeof(time_str)); printf ("last_update_time=%s, records=%d\n", time_str, job_step_info_ptr->job_step_count); } job_step_ptr = job_step_info_ptr->job_steps ; for (i = 0, job_step_ptr = job_step_info_ptr->job_steps; i < job_step_info_ptr->job_step_count; i++, job_step_ptr++) { if ((array_id != (uint16_t) NO_VAL) && (array_id != job_step_ptr->array_task_id)) continue; slurm_print_job_step_info(stdout, job_step_ptr, one_liner); print_cnt++; } if (print_cnt == 0) { if (job_step_id_str) { exit_code = 1; if (quiet_flag != 1) { if (array_id == (uint16_t) NO_VAL) { printf ("Job step %u.%u not found\n", job_id, step_id); } else { printf ("Job step %u_%u.%u not found\n", job_id, array_id, step_id); } } } else if (quiet_flag != 1) printf ("No job steps in the system\n"); } }