/*
 * scontrol_load_job - load job records via slurm_load_job()/slurm_load_jobs()
 *	and hand them back through *job_buffer_pptr
 *
 * A previously loaded message is kept in old_job_info_ptr. It is reused when
 * the load call fails with SLURM_NO_CHANGE_IN_DATA and freed when a new
 * message is obtained.
 *
 * OUT job_buffer_pptr - set to the current job message on success
 * IN job_id - job to load, or 0 to load all jobs
 * RET SLURM_SUCCESS or the error code returned by the load call
 */
extern int scontrol_load_job(job_info_msg_t ** job_buffer_pptr, uint32_t job_id)
{
	static uint16_t last_show_flags = 0xffff;
	job_info_msg_t *fresh = NULL;
	uint16_t show_flags = 0;
	time_t since = (time_t) 0;
	int rc;

	/* Build the request flags from the option variables */
	if (all_flag)
		show_flags |= SHOW_ALL;
	if (detail_flag) {
		show_flags |= SHOW_DETAIL;
		if (detail_flag > 1)
			show_flags |= SHOW_DETAIL2;
	}
	if (federation_flag)
		show_flags |= SHOW_FEDERATION;
	if (local_flag)
		show_flags |= SHOW_LOCAL;
	if (sibling_flag)
		show_flags |= SHOW_FEDERATION | SHOW_SIBLING;

	if (old_job_info_ptr) {
		/* Different flags than last time: force a full reload */
		if (show_flags != last_show_flags)
			old_job_info_ptr->last_update = (time_t) 0;
		since = old_job_info_ptr->last_update;
	}

	if (job_id)
		rc = slurm_load_job(&fresh, job_id, show_flags);
	else
		rc = slurm_load_jobs(since, &fresh, show_flags);

	if (old_job_info_ptr) {
		if (rc == SLURM_SUCCESS) {
			slurm_free_job_info_msg(old_job_info_ptr);
		} else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			/* Nothing new: keep serving the cached message */
			fresh = old_job_info_ptr;
			rc = SLURM_SUCCESS;
			if (quiet_flag == -1)
				printf ("slurm_load_jobs no change in data\n");
		}
	}

	if (rc != SLURM_SUCCESS)
		return rc;

	old_job_info_ptr = fresh;
	/* A single-job message must not be used as a baseline later on */
	if (job_id)
		old_job_info_ptr->last_update = (time_t) 0;
	last_show_flags = show_flags;
	*job_buffer_pptr = fresh;
	return rc;
}
/* * slurm_xlate_job_id - Translate a Slurm job ID string into a slurm job ID * number. If this job ID contains an array index, map this to the * equivalent Slurm job ID number (e.g. "123_2" to 124) * * IN job_id_str - String containing a single job ID number * RET - equivalent job ID number or 0 on error */ extern uint32_t slurm_xlate_job_id(char *job_id_str) { char *next_str; uint32_t i, job_id; uint16_t array_id; job_info_msg_t *resp; slurm_job_info_t *job_ptr; job_id = (uint32_t) strtol(job_id_str, &next_str, 10); if (next_str[0] == '\0') return job_id; if (next_str[0] != '_') return (uint32_t) 0; array_id = (uint16_t) strtol(next_str + 1, &next_str, 10); if (next_str[0] != '\0') return (uint32_t) 0; if (slurm_load_job(&resp, job_id, SHOW_ALL) != 0) return (uint32_t) 0; job_id = 0; for (i = 0, job_ptr = resp->job_array; i < resp->record_count; i++, job_ptr++) { if (job_ptr->array_task_id == array_id) { job_id = job_ptr->job_id; break; } } slurm_free_job_info_msg(resp); return job_id; }
static bool _is_single_job(char *job_id_str) { uint32_t job_id, task_id; char *next_str = NULL; int rc; job_info_msg_t *resp; bool is_single = false; job_id = (uint32_t)strtol(job_id_str, &next_str, 10); if (next_str[0] == '_') { task_id = (uint32_t)strtol(next_str+1, &next_str, 10); if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return is_single; } } else if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return is_single; } else { task_id = NO_VAL; } rc = slurm_load_job(&resp, job_id, SHOW_ALL); if (rc == SLURM_SUCCESS) { if (resp->record_count == 0) { error("Job ID %s not found", job_id_str); slurm_free_job_info_msg(resp); return is_single; } if ((resp->record_count > 1) && (task_id == NO_VAL)) { error("Job resizing not supported for job arrays"); slurm_free_job_info_msg(resp); return is_single; } is_single = true; /* Do not bother to validate */ slurm_free_job_info_msg(resp); } else { error("Could not load state information for job %s: %m", job_id_str); } return is_single; }
/*
 * _get_job_info - parse "<job_id>[_<task_id>]" and load that job's records
 *
 * IN jobid - job ID string; strings longer than 63 characters are rejected
 * OUT task_id - written only when the string contains a '_' separator
 * RET the message filled in by slurm_load_job(), or NULL on a parse or load
 *	error (a diagnostic is printed on stderr, except for over-long input)
 */
static job_info_msg_t * _get_job_info(const char *jobid, uint32_t *task_id)
{
	char id_copy[64];
	char *sep;
	char *end;
	size_t id_len = strlen(jobid);
	uint32_t parsed_job_id;
	job_info_msg_t *msg;

	if (id_len >= sizeof(id_copy))
		return NULL;
	/* Work on a private copy so the separator can be overwritten */
	memcpy(id_copy, jobid, id_len + 1);

	sep = strchr(id_copy, '_');
	if (sep != NULL) {
		*sep++ = '\0';
		*task_id = (uint32_t)strtol(sep, &end, 10);
		if (*end != '\0') {
			fprintf(stderr, "Invalid task_id specified\n");
			return NULL;
		}
	}

	parsed_job_id = (uint32_t)strtol(id_copy, &end, 10);
	if (*end != '\0') {
		fprintf(stderr, "Invalid job_id specified\n");
		return NULL;
	}

	if (slurm_load_job(&msg, parsed_job_id, SHOW_ALL) < 0) {
		slurm_perror("slurm_load_job");
		return NULL;
	}

	return msg;
}
/* Return the current time limit of the specified job_id or NO_VAL if the * information is not available */ static uint32_t _get_job_time(uint32_t job_id) { uint32_t time_limit = NO_VAL; int i, rc; job_info_msg_t *resp; rc = slurm_load_job(&resp, job_id, SHOW_ALL); if (rc == SLURM_SUCCESS) { for (i = 0; i < resp->record_count; i++) { if (resp->job_array[i].job_id != job_id) continue; /* should not happen */ time_limit = resp->job_array[i].time_limit; break; } slurm_free_job_info_msg(resp); } else { error("Could not load state information for job %u: %m", job_id); } return time_limit; }
/* _print_job - print the specified job's information */ static int _print_job ( bool clear_old ) { static job_info_msg_t * old_job_ptr = NULL, * new_job_ptr; int error_code; uint16_t show_flags = 0; uint32_t job_id = 0; if (params.all_flag || (params.job_list && list_count(params.job_list))) show_flags |= SHOW_ALL; /* We require detail data when CPUs are requested */ if (params.format && strstr(params.format, "C")) show_flags |= SHOW_DETAIL; if (params.job_list && (list_count(params.job_list) == 1)) { ListIterator iterator; uint32_t *job_id_ptr; iterator = list_iterator_create(params.job_list); job_id_ptr = list_next(iterator); job_id = *job_id_ptr; list_iterator_destroy(iterator); } if (old_job_ptr) { if (clear_old) old_job_ptr->last_update = 0; if (job_id) { error_code = slurm_load_job( &new_job_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs( old_job_ptr->last_update, &new_job_ptr, show_flags); } if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg( old_job_ptr ); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_job_ptr = old_job_ptr; } } else if (job_id) { error_code = slurm_load_job(&new_job_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags); } if (error_code) { slurm_perror ("slurm_load_jobs error"); return SLURM_ERROR; } old_job_ptr = new_job_ptr; if (job_id) old_job_ptr->last_update = (time_t) 0; if (params.verbose) { printf ("last_update_time=%ld\n", (long) new_job_ptr->last_update); } if (params.format == NULL) { if (params.long_list) params.format = "%.7i %.9P %.8j %.8u %.8T %.10M %.9l " "%.6D %R"; else params.format = "%.7i %.9P %.8j %.8u %.2t %.10M %.6D %R"; } if (params.format_list == NULL) parse_format(params.format); print_jobs_array( new_job_ptr->job_array, new_job_ptr->record_count , params.format_list ) ; return SLURM_SUCCESS; }
int main(int argc, char **argv) { ListIterator itr = NULL; uint32_t req_cpufreq = NO_VAL; uint32_t stepid = NO_VAL; slurmdb_selected_step_t *selected_step = NULL; #ifdef HAVE_ALPS_CRAY error("The sstat command is not supported on Cray systems"); return 1; #endif #ifdef HAVE_BG error("The sstat command is not supported on IBM BlueGene systems"); return 1; #endif slurm_conf_init(NULL); print_fields_list = list_create(NULL); print_fields_itr = list_iterator_create(print_fields_list); parse_command_line(argc, argv); if (!params.opt_job_list || !list_count(params.opt_job_list)) { error("You didn't give me any jobs to stat."); return 1; } print_fields_header(print_fields_list); itr = list_iterator_create(params.opt_job_list); while ((selected_step = list_next(itr))) { char *nodelist = NULL; bool free_nodelist = false; if (selected_step->stepid == INFINITE) { /* get the batch step info */ job_info_msg_t *job_ptr = NULL; hostlist_t hl; if (slurm_load_job( &job_ptr, selected_step->jobid, SHOW_ALL)) { error("couldn't get info for job %u", selected_step->jobid); continue; } stepid = NO_VAL; hl = hostlist_create(job_ptr->job_array[0].nodes); nodelist = hostlist_pop(hl); free_nodelist = true; hostlist_destroy(hl); slurm_free_job_info_msg(job_ptr); } else if (selected_step->stepid != NO_VAL) { stepid = selected_step->stepid; } else if (params.opt_all_steps) { job_step_info_response_msg_t *step_ptr = NULL; int i = 0; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } for (i = 0; i < step_ptr->job_step_count; i++) { _do_stat(selected_step->jobid, step_ptr->job_steps[i].step_id, step_ptr->job_steps[i].nodes, step_ptr->job_steps[i].cpu_freq); } slurm_free_job_step_info_response_msg(step_ptr); continue; } else { /* get the first running step to query against. 
*/ job_step_info_response_msg_t *step_ptr = NULL; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } if (!step_ptr->job_step_count) { error("no steps running for job %u", selected_step->jobid); continue; } stepid = step_ptr->job_steps[0].step_id; nodelist = step_ptr->job_steps[0].nodes; req_cpufreq = step_ptr->job_steps[0].cpu_freq; } _do_stat(selected_step->jobid, stepid, nodelist, req_cpufreq); if (free_nodelist && nodelist) free(nodelist); } list_iterator_destroy(itr); xfree(params.opt_field_list); if (params.opt_job_list) list_destroy(params.opt_job_list); if (print_fields_itr) list_iterator_destroy(print_fields_itr); if (print_fields_list) list_destroy(print_fields_list); return 0; }
static void slurmdrmaa_job_update_status( fsd_job_t *self ) { job_info_msg_t *job_info = NULL; slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self; fsd_log_enter(( "({job_id=%s})", self->job_id )); fsd_mutex_lock( &self->session->drm_connection_mutex ); TRY { if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) { int _slurm_errno = slurm_get_errno(); if (_slurm_errno == ESLURM_INVALID_JOB_ID) { self->on_missing(self); } else { fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id); } } if (job_info) { fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason)); switch(job_info->job_array[0].job_state & JOB_STATE_BASE) { case JOB_PENDING: switch(job_info->job_array[0].state_reason) { #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0) case WAIT_HELD_USER: /* job is held by user */ fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD")); self->state = DRMAA_PS_USER_ON_HOLD; break; #endif case WAIT_HELD: /* job is held by administrator */ fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD")); self->state = DRMAA_PS_SYSTEM_ON_HOLD; break; default: fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE")); self->state = DRMAA_PS_QUEUED_ACTIVE; } break; case JOB_RUNNING: fsd_log_debug(("interpreting as DRMAA_PS_RUNNING")); self->state = DRMAA_PS_RUNNING; break; case JOB_SUSPENDED: if(slurm_self->user_suspended == true) { fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED")); self->state = DRMAA_PS_USER_SUSPENDED; } else { fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED")); self->state = DRMAA_PS_SYSTEM_SUSPENDED; } break; case JOB_COMPLETE: fsd_log_debug(("interpreting as DRMAA_PS_DONE")); self->state = DRMAA_PS_DONE; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; case JOB_CANCELLED: 
fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)")); self->state = DRMAA_PS_FAILED; self->exit_status = -1; case JOB_FAILED: case JOB_TIMEOUT: case JOB_NODE_FAIL: #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0) case JOB_PREEMPTED: #endif fsd_log_debug(("interpreting as DRMAA_PS_FAILED")); self->state = DRMAA_PS_FAILED; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; default: /*unknown state */ fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state)); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) { fsd_log_debug(("Epilog completing")); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) { fsd_log_debug(("Nodes booting")); } if (self->exit_status == -1) /* input,output,error path failure etc*/ self->state = DRMAA_PS_FAILED; self->last_update_time = time(NULL); if( self->state >= DRMAA_PS_DONE ) { fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status))); fsd_cond_broadcast( &self->status_cond ); } } } FINALLY { if(job_info != NULL) slurm_free_job_info_msg (job_info); fsd_mutex_unlock( &self->session->drm_connection_mutex ); } END_TRY fsd_log_return(( "" )); }
/* Return the current time limit of the specified job_id or NO_VAL if the * information is not available */ static uint32_t _get_job_time(const char *job_id_str) { uint32_t job_id, task_id; char *next_str = NULL; uint32_t time_limit = NO_VAL; int i, rc; job_info_msg_t *resp; bitstr_t *array_bitmap; job_id = (uint32_t)strtol(job_id_str, &next_str, 10); if (next_str[0] == '_') { task_id = (uint32_t)strtol(next_str+1, &next_str, 10); if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return time_limit; } } else if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return time_limit; } else { task_id = NO_VAL; } rc = slurm_load_job(&resp, job_id, SHOW_ALL); if (rc == SLURM_SUCCESS) { if (resp->record_count == 0) { error("Job ID %s not found", job_id_str); slurm_free_job_info_msg(resp); return time_limit; } if ((resp->record_count > 1) && (task_id == NO_VAL)) { error("TimeLimit increment/decrement not supported " "for job arrays"); slurm_free_job_info_msg(resp); return time_limit; } for (i = 0; i < resp->record_count; i++) { if ((resp->job_array[i].job_id == job_id) && (resp->job_array[i].array_task_id == NO_VAL) && (resp->job_array[i].array_bitmap == NULL)) { /* Regular job match */ time_limit = resp->job_array[i].time_limit; break; } if (resp->job_array[i].array_job_id != job_id) continue; array_bitmap = (bitstr_t *) resp->job_array[i].array_bitmap; if ((task_id == NO_VAL) || (resp->job_array[i].array_task_id == task_id) || (array_bitmap && (task_id < bit_size(array_bitmap)) && bit_test(array_bitmap, task_id))) { /* Array job with task_id match */ time_limit = resp->job_array[i].time_limit; break; } } slurm_free_job_info_msg(resp); } else { error("Could not load state information for job %s: %m", job_id_str); } return time_limit; }
/*
 * _x11_init_remote_batch - set up X11 forwarding for a batch step
 *
 * Reads the DISPLAY value inherited by the batch step, loads the job record
 * to find its owner and alloc_node, runs the X11_LIBEXEC_PROG helper through
 * popen() and stores the DISPLAY value the helper prints back into the job
 * environment.
 *
 * IN sp - spank handle used to read and set DISPLAY
 * IN jobid, stepid - passed to the helper as "-i <jobid>.<stepid>"
 * RET 0 on success, a negative code identifying the failing stage otherwise
 */
int _x11_init_remote_batch(spank_t sp,uint32_t jobid,uint32_t stepid)
{
	int status;
	int written;

	FILE* f;
	char localhost[256];
	char* cmd_pattern= X11_LIBEXEC_PROG " -u %s -s \"%s\" -o \"%s\" -f %s -d %s -t %s -i %u.%u -cwg %s &";
	char* cmd;
	size_t cmd_length;
	char display[256];

	struct passwd user_pwent;
	struct passwd *p_pwent = NULL;
	/*
	 * sysconf() may return -1 when there is no fixed limit; converting
	 * that to size_t would ask for an enormous stack buffer.
	 */
	long pwent_size_hint = sysconf(_SC_GETPW_R_SIZE_MAX);
	size_t pwent_buffer_length =
		(pwent_size_hint > 0) ? (size_t) pwent_size_hint : 16384;
	char pwent_buffer[pwent_buffer_length];

	job_info_msg_t * job_buffer_ptr = NULL;
	job_info_t* job_ptr;

	/*
	 * get current hostname
	 */
	if ( gethostname(localhost,sizeof(localhost)) != 0 ) {
		status = -20;
		goto exit;
	}
	/* gethostname() need not terminate the string when it truncates */
	localhost[sizeof(localhost) - 1] = '\0';

	/*
	 * the batch script inherits the DISPLAY value of the
	 * submission command. We will use it on the allocation node
	 * for proper establishment of a working X11 ssh tunnel
	 */
	if ( spank_getenv(sp,"DISPLAY",display,256) != ESPANK_SUCCESS ) {
		ERROR("x11: unable to read batch step "
		      "inherited DISPLAY value");
		status = -1;
		goto exit;
	}

	/* get job infos */
	status = slurm_load_job(&job_buffer_ptr,jobid,SHOW_ALL);
	if ( status != 0 ) {
		ERROR("x11: unable to get job infos");
		status = -3;
		goto exit;
	}

	/* check infos validity  */
	if ( job_buffer_ptr->record_count != 1 ) {
		ERROR("x11: job infos are invalid");
		status = -4;
		goto clean_exit;
	}
	job_ptr = job_buffer_ptr->job_array;

	/*
	 * get user name; getpwuid_r() reports "no such entry" by returning 0
	 * with a NULL result, in which case user_pwent must not be read
	 */
	status = getpwuid_r(job_ptr->user_id,&user_pwent,pwent_buffer,
			    pwent_buffer_length,&p_pwent) ;
	if ( status != 0 || p_pwent == NULL ) {
		ERROR("x11: unable to get username for uid=%u : %s",job_ptr->user_id,
		      (status != 0) ? strerror(status) : "no matching passwd entry") ;
		status = -10;
		goto clean_exit;
	}

	/*
	 * build the command line that will be used to forward the
	 * alloc node X11 tunnel
	 *
	 * NOTE(review): the command is run by a shell through popen() and
	 * embeds DISPLAY and the ssh/helper arguments without escaping;
	 * confirm these values can be trusted.
	 */
	cmd_length = strlen(cmd_pattern) + 128 ;
	cmd = malloc(cmd_length);
	written = -1;
	if ( cmd != NULL )
		written = snprintf(cmd,cmd_length,cmd_pattern,user_pwent.pw_name,
				   (ssh_cmd == NULL) ? DEFAULT_SSH_CMD : ssh_cmd,
				   (ssh_args == NULL) ? DEFAULT_SSH_ARGS : ssh_args,
				   job_ptr->alloc_node,display,localhost,jobid,stepid,
				   (helpertask_args == NULL) ? DEFAULT_HELPERTASK_ARGS : helpertask_args);

	/* allocation failure, output error or truncated command */
	if ( written < 0 || (size_t) written >= cmd_length ) {
		ERROR("x11: error while building cmd");
		status = -2;
	}
	else {
		INFO("x11: batch mode : executing %s",cmd);
		/* execute the command to retrieve the DISPLAY value to use */
		f = popen(cmd,"r");
		if ( f != NULL ) {
			if ( fscanf(f,"%255s",display) == 1 ) {
				if ( spank_setenv(sp,"DISPLAY",display,1)
				     != ESPANK_SUCCESS ) {
					ERROR("x11: unable to set DISPLAY"
					      " in job env");
					status = -5;
				}
				else {
					INFO("x11: now using DISPLAY=%s",
					     display);
					status=0;
				}
			}
			else {
				ERROR("x11: unable to get a DISPLAY value");
				status = -6;
			}
			pclose(f);
		}
		else {
			ERROR("x11: unable to exec get cmd '%s'",cmd);
			status = -3;
		}
	}

	free(cmd);

 clean_exit:
	slurm_free_job_info_msg(job_buffer_ptr);

 exit:
	return status;
}
/*
 * srun call, the client node connects the allocated node(s)
 *
 * Does nothing when x11_mode is X11_MODE_NONE or X11_MODE_BATCH, or when no
 * local DISPLAY is set. Otherwise loads the job record and passes its node
 * list to _x11_connect_nodes().
 *
 * RET 0 when skipped, a negative code when the job or step ID or the job
 *	record cannot be obtained or is invalid, otherwise the result of
 *	_x11_connect_nodes()
 */
int slurm_spank_local_user_init (spank_t sp, int ac, char **av)
{
	int status;

	uint32_t jobid;
	uint32_t stepid;

	job_info_msg_t * job_buffer_ptr = NULL;
	job_info_t* job_ptr;

	/* only handle interactive usage */
	if ( x11_mode == X11_MODE_NONE ||
	     x11_mode == X11_MODE_BATCH )
		return 0;

	/*
	 * check DISPLAY value; jobid is not known yet at this point, so it
	 * must not be passed to the log call
	 */
	if ( getenv("DISPLAY") == NULL ) {
		ERROR("x11: no local DISPLAY defined, skipping");
		return 0;
	}

	/* get job id */
	if ( spank_get_item (sp, S_JOB_ID, &jobid)
	     != ESPANK_SUCCESS ) {
		status = -1;
		goto exit;
	}

	/* get job step id */
	if ( spank_get_item (sp, S_JOB_STEPID, &stepid)
	     != ESPANK_SUCCESS ) {
		status = -1;
		goto exit;
	}

	/* get job infos */
	status = slurm_load_job(&job_buffer_ptr,jobid,SHOW_ALL);
	if ( status != 0 ) {
		ERROR("x11: unable to get job infos");
		status = -3;
		goto exit;
	}

	/* check infos validity  */
	if ( job_buffer_ptr->record_count != 1 ) {
		ERROR("x11: job infos are invalid");
		status = -4;
		goto clean_exit;
	}
	job_ptr = job_buffer_ptr->job_array;

	/* check allocated nodes var */
	if ( job_ptr->nodes == NULL ) {
		ERROR("x11: job has no allocated nodes defined");
		status = -5;
		goto clean_exit;
	}

	/* connect required nodes */
	status = _x11_connect_nodes(job_ptr->nodes,jobid,stepid);

 clean_exit:
	slurm_free_job_info_msg(job_buffer_ptr);

 exit:
	return status;
}
/* _print_job - print the specified job's information */ static int _print_job ( bool clear_old ) { static job_info_msg_t * old_job_ptr = NULL, * new_job_ptr; int error_code; uint16_t show_flags = 0; if (params.all_flag || (params.job_list && list_count(params.job_list))) show_flags |= SHOW_ALL; /* We require detail data when CPUs are requested */ if (params.format && strstr(params.format, "C")) show_flags |= SHOW_DETAIL; if (old_job_ptr) { if (clear_old) old_job_ptr->last_update = 0; if (params.job_id) { error_code = slurm_load_job( &new_job_ptr, params.job_id, show_flags); } else if (params.user_id) { error_code = slurm_load_job_user(&new_job_ptr, params.user_id, show_flags); } else { error_code = slurm_load_jobs( old_job_ptr->last_update, &new_job_ptr, show_flags); } if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg( old_job_ptr ); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_job_ptr = old_job_ptr; } } else if (params.job_id) { error_code = slurm_load_job(&new_job_ptr, params.job_id, show_flags); } else if (params.user_id) { error_code = slurm_load_job_user(&new_job_ptr, params.user_id, show_flags); } else { error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags); } if (error_code) { slurm_perror ("slurm_load_jobs error"); return SLURM_ERROR; } old_job_ptr = new_job_ptr; if (params.job_id || params.job_id) old_job_ptr->last_update = (time_t) 0; if (params.verbose) { printf ("last_update_time=%ld records=%u\n", (long) new_job_ptr->last_update, new_job_ptr->record_count); } if (!params.format && !params.format_long) { if (params.long_list) { xstrcat(params.format, "%.18i %.9P %.8j %.8u %.8T %.10M %.9l %.6D %R"); } else { xstrcat(params.format, "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R"); } } if (!params.format_list) { if (params.format) parse_format(params.format); else if (params.format_long) parse_long_format(params.format_long); } print_jobs_array(new_job_ptr->job_array, new_job_ptr->record_count, 
params.format_list) ; return SLURM_SUCCESS; }