/* _proc_cluster - process job cancellation on a specific cluster */ static int _proc_cluster(void) { int filter_cnt = 0; int rc; if (has_default_opt() && !has_job_steps()) { rc = _signal_job_by_str(); return rc; } _load_job_records(); rc = _verify_job_ids(); if ((opt.account) || (opt.job_name) || (opt.nodelist) || (opt.partition) || (opt.qos) || (opt.reservation) || (opt.state != JOB_END) || (opt.user_name) || (opt.wckey)) { filter_cnt = _filter_job_records(); } rc = MAX(_cancel_jobs(filter_cnt), rc); slurm_free_job_info_msg(job_buffer_ptr); return rc; }
/* * slurm_xlate_job_id - Translate a Slurm job ID string into a slurm job ID * number. If this job ID contains an array index, map this to the * equivalent Slurm job ID number (e.g. "123_2" to 124) * * IN job_id_str - String containing a single job ID number * RET - equivalent job ID number or 0 on error */ extern uint32_t slurm_xlate_job_id(char *job_id_str) { char *next_str; uint32_t i, job_id; uint16_t array_id; job_info_msg_t *resp; slurm_job_info_t *job_ptr; job_id = (uint32_t) strtol(job_id_str, &next_str, 10); if (next_str[0] == '\0') return job_id; if (next_str[0] != '_') return (uint32_t) 0; array_id = (uint16_t) strtol(next_str + 1, &next_str, 10); if (next_str[0] != '\0') return (uint32_t) 0; if (slurm_load_job(&resp, job_id, SHOW_ALL) != 0) return (uint32_t) 0; job_id = 0; for (i = 0, job_ptr = resp->job_array; i < resp->record_count; i++, job_ptr++) { if (job_ptr->array_task_id == array_id) { job_id = job_ptr->job_id; break; } } slurm_free_job_info_msg(resp); return job_id; }
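A minimal caller sketch for the routine above (illustrative only; it assumes the function is reachable through the Slurm headers and treats the documented 0 return as failure):

/* Hedged usage sketch, not part of the original sources: translate a
 * "jobid[_taskid]" string given on the command line into a numeric job ID. */
#include <stdio.h>
#include <slurm/slurm.h>

int main(int argc, char **argv)
{
    uint32_t job_id;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <job_id[_task_id]>\n", argv[0]);
        return 1;
    }
    job_id = slurm_xlate_job_id(argv[1]);
    if (job_id == 0) {      /* 0 indicates a parse or lookup failure */
        fprintf(stderr, "could not translate job ID %s\n", argv[1]);
        return 1;
    }
    printf("numeric job ID: %u\n", job_id);
    return 0;
}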
static int _get_job_size(uint32_t job_id) { job_info_msg_t *job_buffer_ptr; job_info_t * job_ptr; int i, size = 1; hostlist_t hl; if (slurm_load_jobs((time_t) 0, &job_buffer_ptr, SHOW_ALL)) { slurm_perror("slurm_load_jobs"); return 1; } for (i = 0; i < job_buffer_ptr->record_count; i++) { job_ptr = &job_buffer_ptr->job_array[i]; if (job_ptr->job_id != job_id) continue; hl = hostlist_create(job_ptr->nodes); if (hl) { size = hostlist_count(hl); hostlist_destroy(hl); } break; } slurm_free_job_info_msg (job_buffer_ptr); #if _DEBUG printf("Size is %d\n", size); #endif return size; }
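The hostlist helpers used above can also be exercised on their own through the public libslurm API, where they carry a slurm_ prefix. A small sketch, assuming a made-up node expression "node[01-04]":

/* Illustrative sketch only: count the hosts in a bracketed node expression.
 * "node[01-04]" is a hypothetical range, not taken from the sources above. */
#include <stdio.h>
#include <slurm/slurm.h>

int main(void)
{
    hostlist_t hl = slurm_hostlist_create("node[01-04]");

    if (hl) {
        printf("node count: %d\n", slurm_hostlist_count(hl)); /* prints 4 */
        slurm_hostlist_destroy(hl);
    }
    return 0;
}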
// Get bar summaries for cluster nodes void ClusterMenu::get_lines() { // First we set the time of this update last_update = std::chrono::steady_clock::now(); // Call SLURM API to write node information to pointer // Free pointer memory first if it has been previously set if (node_info_buffer_ptr != NULL) { slurm_free_node_info_msg(node_info_buffer_ptr); } slurm_load_node ((time_t) NULL, &node_info_buffer_ptr, SHOW_ALL); // Create a NodeContainer struct and populate with node information node_container.populate_nodes_from_slurm(node_info_buffer_ptr); // Call API function, pass job_info_ptr as reference (double pointer); flags must be SHOW_DETAIL to get job allocations // Free pointer memory first if it has been previously set if (job_info_buffer_ptr != NULL) { slurm_free_job_info_msg(job_info_buffer_ptr); } slurm_load_jobs((time_t) NULL, &job_info_buffer_ptr, SHOW_DETAIL); // Populate nodes with job allocations node_container.populate_job_allocations_from_slurm(job_info_buffer_ptr); // Get line content lines = node_container.get_node_bar_summary(32); // Record largest line for later use in horizontal scrolling get_longest_line(); }
/* Load current job table information into *job_buffer_pptr */ extern int scontrol_load_job(job_info_msg_t ** job_buffer_pptr, uint32_t job_id) { int error_code; static uint16_t last_show_flags = 0xffff; uint16_t show_flags = 0; job_info_msg_t * job_info_ptr = NULL; if (all_flag) show_flags |= SHOW_ALL; if (detail_flag) { show_flags |= SHOW_DETAIL; if (detail_flag > 1) show_flags |= SHOW_DETAIL2; } if (federation_flag) show_flags |= SHOW_FEDERATION; if (local_flag) show_flags |= SHOW_LOCAL; if (sibling_flag) show_flags |= SHOW_FEDERATION | SHOW_SIBLING; if (old_job_info_ptr) { if (last_show_flags != show_flags) old_job_info_ptr->last_update = (time_t) 0; if (job_id) { error_code = slurm_load_job(&job_info_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs( old_job_info_ptr->last_update, &job_info_ptr, show_flags); } if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg (old_job_info_ptr); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { job_info_ptr = old_job_info_ptr; error_code = SLURM_SUCCESS; if (quiet_flag == -1) printf ("slurm_load_jobs no change in data\n"); } } else if (job_id) { error_code = slurm_load_job(&job_info_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs((time_t) NULL, &job_info_ptr, show_flags); } if (error_code == SLURM_SUCCESS) { old_job_info_ptr = job_info_ptr; if (job_id) old_job_info_ptr->last_update = (time_t) 0; last_show_flags = show_flags; *job_buffer_pptr = job_info_ptr; } return error_code; }
static bool _is_single_job(char *job_id_str) { uint32_t job_id, task_id; char *next_str = NULL; int rc; job_info_msg_t *resp; bool is_single = false; job_id = (uint32_t)strtol(job_id_str, &next_str, 10); if (next_str[0] == '_') { task_id = (uint32_t)strtol(next_str+1, &next_str, 10); if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return is_single; } } else if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return is_single; } else { task_id = NO_VAL; } rc = slurm_load_job(&resp, job_id, SHOW_ALL); if (rc == SLURM_SUCCESS) { if (resp->record_count == 0) { error("Job ID %s not found", job_id_str); slurm_free_job_info_msg(resp); return is_single; } if ((resp->record_count > 1) && (task_id == NO_VAL)) { error("Job resizing not supported for job arrays"); slurm_free_job_info_msg(resp); return is_single; } is_single = true; /* Do not bother to validate */ slurm_free_job_info_msg(resp); } else { error("Could not load state information for job %s: %m", job_id_str); } return is_single; }
static hostlist_t _slurm_wcoll (List joblist) { int i; hostlist_t hl = NULL; job_info_msg_t * msg; int32_t envjobid = 0; int alljobids = 0; if ((joblist == NULL) && (envjobid = _slurm_jobid()) < 0) return (NULL); if (slurm_load_jobs((time_t) NULL, &msg, 1) < 0) errx ("Unable to contact slurm controller: %s\n", slurm_strerror (errno)); /* * Check for "all" in joblist */ alljobids = _alljobids_requested (joblist); for (i = 0; i < msg->record_count; i++) { job_info_t *j = &msg->job_array[i]; if (alljobids && j->job_state == JOB_RUNNING) hl = _hl_append (hl, j->nodes); else if (!joblist && (j->job_id == envjobid)) { /* * Only use SLURM_JOBID environment variable if user * didn't override with -j option */ hl = hostlist_create (j->nodes); break; } else if (_jobid_requested (joblist, j->job_id)) { hl = _hl_append (hl, j->nodes); /* * Exit when there is no more jobids to search */ if (list_count (joblist) == 0) break; } } slurm_free_job_info_msg (msg); if (hl) hostlist_uniq (hl); return (hl); }
/* Translate a job name to relevant job IDs * NOTE: xfree the return value to avoid memory leak */ static char *_job_name2id(char *job_name, uint32_t job_uid) { int i, rc; job_info_msg_t *resp; slurm_job_info_t *job_ptr; char *job_id_str = NULL, *sep = ""; xassert(job_name); rc = scontrol_load_job(&resp, 0); if (rc == SLURM_SUCCESS) { if (resp->record_count == 0) { error("JobName %s not found", job_name); slurm_free_job_info_msg(resp); return job_id_str; } for (i = 0, job_ptr = resp->job_array; i < resp->record_count; i++, job_ptr++) { if ((job_uid != NO_VAL) && (job_uid != job_ptr->user_id)) continue; if (!job_ptr->name || xstrcmp(job_name, job_ptr->name)) continue; if (job_ptr->array_task_id != NO_VAL) { xstrfmtcat(job_id_str, "%s%u_%u", sep, job_ptr->array_job_id, job_ptr->array_task_id); } else { xstrfmtcat(job_id_str, "%s%u", sep, job_ptr->job_id); } sep = ","; } if (!job_id_str) { if (job_uid == NO_VAL) { error("No jobs with name \'%s\'", job_name); } else { error("No jobs with user ID %u and name \'%s\'", job_uid, job_name); } } } else { error("Could not load state information: %m"); } return job_id_str; }
/* main is used here for testing purposes only */ int main (int argc, char *argv[]) { static time_t last_update_time = (time_t) NULL; int error_code; job_info_msg_t * job_info_msg_ptr = NULL; error_code = slurm_load_jobs (last_update_time, &job_info_msg_ptr, 1); if (error_code) { slurm_perror ("slurm_load_jobs"); return (error_code); } slurm_print_job_info_msg ( stdout, job_info_msg_ptr, 1 ) ; slurm_free_job_info_msg ( job_info_msg_ptr ) ; return (0); }
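If per-record output is preferred over slurm_print_job_info_msg(), the loaded buffer can be walked entry by entry with slurm_print_job_info(), its per-job counterpart. A sketch under the assumption that the buffer was filled as in the test program above:

/* Sketch: print each loaded job record on a single line (one_liner = 1). */
#include <stdio.h>
#include <slurm/slurm.h>

static void _print_jobs_one_per_line(job_info_msg_t *job_info_msg_ptr)
{
    uint32_t i;

    for (i = 0; i < job_info_msg_ptr->record_count; i++)
        slurm_print_job_info(stdout, &job_info_msg_ptr->job_array[i], 1);
}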
/* Return the current time limit of the specified job_id or NO_VAL if the * information is not available */ static uint32_t _get_job_time(uint32_t job_id) { uint32_t time_limit = NO_VAL; int i, rc; job_info_msg_t *resp; rc = slurm_load_job(&resp, job_id, SHOW_ALL); if (rc == SLURM_SUCCESS) { for (i = 0; i < resp->record_count; i++) { if (resp->job_array[i].job_id != job_id) continue; /* should not happen */ time_limit = resp->job_array[i].time_limit; break; } slurm_free_job_info_msg(resp); } else { error("Could not load state information for job %u: %m", job_id); } return time_limit; }
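A hedged sketch of how such a helper might feed slurm_update_job() to extend a job's limit. The wrapper name and the 30-minute increment are illustrative, it assumes the limit is expressed in minutes (as Slurm time limits are), and it is meant to live in the same source file as _get_job_time():

/* Illustrative only: bump a job's time limit by 30 minutes using the helper
 * above. _extend_job_time() is a hypothetical wrapper, not original code. */
static int _extend_job_time(uint32_t job_id)
{
    job_desc_msg_t job_msg;
    uint32_t cur = _get_job_time(job_id);

    if ((cur == NO_VAL) || (cur == INFINITE))
        return SLURM_ERROR;         /* unknown or unlimited time limit */

    slurm_init_job_desc_msg(&job_msg);
    job_msg.job_id = job_id;
    job_msg.time_limit = cur + 30;  /* minutes */
    return slurm_update_job(&job_msg);
}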
int main() {
    // Initialise container for all node information
    NodeContainer node_container;

    // Declare a pointer to which the SLURM API writes node information
    node_info_msg_t * node_info_buffer_ptr = NULL;

    // Call SLURM API to write node information to pointer; bail out if the call fails
    if (slurm_load_node((time_t) NULL, &node_info_buffer_ptr, SHOW_ALL)) {
        slurm_perror("slurm_load_node");
        return 1;
    }

    // Populate the NodeContainer with node information
    node_container.populate_nodes_from_slurm(node_info_buffer_ptr);

    // Declare a pointer to which the SLURM API writes job information
    job_info_msg_t * job_info_buffer_ptr = NULL;

    // Call API function, passing job_info_buffer_ptr by reference (double pointer);
    // flags must be SHOW_DETAIL to get job allocations
    if (slurm_load_jobs((time_t) NULL, &job_info_buffer_ptr, SHOW_DETAIL)) {
        slurm_perror("slurm_load_jobs");
        slurm_free_node_info_msg(node_info_buffer_ptr);
        return 1;
    }

    // Populate nodes with job allocations
    node_container.populate_job_allocations_from_slurm(job_info_buffer_ptr);

    // Get lines for output
    std::vector<std::string> lines = node_container.get_node_bar_summary(32);

    // Print output
    for (std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); ++it) {
        printf("%s\n", it->c_str());
    }

    // Clean up and deallocate memory for SLURM pointers
    slurm_free_node_info_msg(node_info_buffer_ptr);
    slurm_free_job_info_msg(job_info_buffer_ptr);

    return 0;
}
/* _print_job - print the specified job's information */ static int _print_job ( bool clear_old ) { static job_info_msg_t * old_job_ptr = NULL, * new_job_ptr; int error_code; uint16_t show_flags = 0; uint32_t job_id = 0; if (params.all_flag || (params.job_list && list_count(params.job_list))) show_flags |= SHOW_ALL; /* We require detail data when CPUs are requested */ if (params.format && strstr(params.format, "C")) show_flags |= SHOW_DETAIL; if (params.job_list && (list_count(params.job_list) == 1)) { ListIterator iterator; uint32_t *job_id_ptr; iterator = list_iterator_create(params.job_list); job_id_ptr = list_next(iterator); job_id = *job_id_ptr; list_iterator_destroy(iterator); } if (old_job_ptr) { if (clear_old) old_job_ptr->last_update = 0; if (job_id) { error_code = slurm_load_job( &new_job_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs( old_job_ptr->last_update, &new_job_ptr, show_flags); } if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg( old_job_ptr ); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_job_ptr = old_job_ptr; } } else if (job_id) { error_code = slurm_load_job(&new_job_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags); } if (error_code) { slurm_perror ("slurm_load_jobs error"); return SLURM_ERROR; } old_job_ptr = new_job_ptr; if (job_id) old_job_ptr->last_update = (time_t) 0; if (params.verbose) { printf ("last_update_time=%ld\n", (long) new_job_ptr->last_update); } if (params.format == NULL) { if (params.long_list) params.format = "%.7i %.9P %.8j %.8u %.8T %.10M %.9l " "%.6D %R"; else params.format = "%.7i %.9P %.8j %.8u %.2t %.10M %.6D %R"; } if (params.format_list == NULL) parse_format(params.format); print_jobs_array( new_job_ptr->job_array, new_job_ptr->record_count , params.format_list ) ; return SLURM_SUCCESS; }
int get_batch_queues(bridge_batch_manager_t* p_batch_manager, bridge_batch_queue_t** p_p_batch_queues, int* p_batch_queues_nb, char* batch_queue_name) { int fstatus=-1; int i,j; int queue_nb=0; int stored_queue_nb=0; bridge_batch_queue_t* bn; partition_info_msg_t* ppim; partition_info_t* ppi; job_info_msg_t* pjim; job_info_t* pji; node_info_msg_t* pnim; node_info_t* pni; /* get slurm partition infos */ if (slurm_load_partitions(0,&ppim,SHOW_ALL) != 0) { DEBUG3_LOGGER("unable to get slurm partitions infos"); ppim=NULL; goto exit; } /* get nodes status */ if(slurm_load_node(0,&pnim,SHOW_ALL)) { DEBUG3_LOGGER("unable to get nodes informations"); slurm_free_partition_info_msg(ppim); pnim=NULL; goto exit; } /* get slurm job infos */ if (slurm_load_jobs(0,&pjim,SHOW_ALL) != 0) { DEBUG3_LOGGER("unable to get allocations informations"); slurm_free_partition_info_msg(ppim); slurm_free_node_info_msg(pnim); goto exit; } /* build/initialize storage structures */ queue_nb = ppim->record_count; if (*p_p_batch_queues != NULL) { if (*p_batch_queues_nb < queue_nb) queue_nb=*p_batch_queues_nb; } else { *p_p_batch_queues = (bridge_batch_queue_t*) malloc(queue_nb*(sizeof(bridge_batch_queue_t)+1)); if (*p_p_batch_queues == NULL) { *p_batch_queues_nb = 0; queue_nb = *p_batch_queues_nb; } else { *p_batch_queues_nb = queue_nb; } } stored_queue_nb=0; /* fill queue structures */ for (i=0; i<ppim->record_count && stored_queue_nb<queue_nb; i++) { /* get partition pointer */ ppi=ppim->partition_array+i; if (ppi->name == NULL) continue; /* queue name filter */ if (batch_queue_name != NULL && strcmp(batch_queue_name,ppi->name) != 0) continue; bn = &(*p_p_batch_queues)[stored_queue_nb]; /* put default values */ init_batch_queue(p_batch_manager,bn); /* queue Name */ bn->name=strdup(ppi->name); bn->default_queue = (uint32_t) ( ppi->flags | PART_FLAG_DEFAULT); bn->priority = (uint32_t) ppi->priority; /* queue activity */ if(ppi->state_up == PARTITION_UP) { bn->activity = BRIDGE_BATCH_QUEUE_ACTIVITY_ACTIVE ; bn->state = BRIDGE_BATCH_QUEUE_STATE_OPENED ; } else if (ppi->state_up == PARTITION_DRAIN) { bn->activity = BRIDGE_BATCH_QUEUE_ACTIVITY_ACTIVE ; bn->state = BRIDGE_BATCH_QUEUE_STATE_CLOSED ; } else if (ppi->state_up == PARTITION_DOWN) { bn->activity = BRIDGE_BATCH_QUEUE_ACTIVITY_INACTIVE ; bn->state = BRIDGE_BATCH_QUEUE_STATE_OPENED ; } else if (ppi->state_up == PARTITION_INACTIVE) { bn->activity = BRIDGE_BATCH_QUEUE_ACTIVITY_INACTIVE ; bn->state = BRIDGE_BATCH_QUEUE_STATE_CLOSED ; } else { bn->activity = BRIDGE_BATCH_QUEUE_ACTIVITY_UNKNOWN ; bn->state = BRIDGE_BATCH_QUEUE_STATE_UNKNOWN ; } /* max times */ if ( ppi->max_time != INFINITE ) bn->seq_time_max = (uint32_t) ppi->max_time * 60 ; else bn->seq_time_max = NO_LIMIT; bn->par_time_max = bn->seq_time_max ; /* slurm */ for ( j=0 ; j < pjim->record_count ; j++ ) { pji=pjim->job_array+j; if ( strcmp(pji->partition,ppi->name) != 0 ) continue; switch ( pji->job_state & JOB_STATE_BASE ) { case JOB_PENDING : bn->jobs_nb++; bn->pending_jobs_nb++; break; case JOB_RUNNING : bn->jobs_nb++; bn->running_jobs_nb++; break; case JOB_SUSPENDED : bn->jobs_nb++; bn->syssuspended_jobs_nb++; break; } } /* Slurm does not provide information about Min and Max cpus per * partition. 
So we use the following method : * * if partition->name ~= /.*_seq/ min=max=1 * otherwise, calculate it using MinNodes, MaxNodes and nodes * informations */ int done = 0 ; char * p; p = rindex(ppi->name,'_'); if ( p != NULL ) { if ( strcmp(p+1,"seq") == 0 ) { done = 1; bn->par_cores_nb_min = 1; bn->par_cores_nb_max = 1; } } if ( ! done ) { /* use partition nodes information to build the min and max */ /* number of cores (only min and max nodes number are provided */ /* by slurm so we have to build this information) */ uint32_t max_cpus_per_node=0; uint32_t min_cpus_per_node=-1; bridge_nodelist_t list1,list2; bridge_nodelist_init(&list1,NULL,0); bridge_nodelist_add_nodes(&list1,ppi->nodes); for ( j=0 ; j < pnim->record_count ; j++ ) { pni=pnim->node_array+j; bridge_nodelist_init(&list2,NULL,0); bridge_nodelist_add_nodes(&list2,pni->name); if(bridge_nodelist_intersects(&list1,&list2)==0) { bridge_nodelist_free_contents(&list2); continue; } if ( pni->cpus > max_cpus_per_node ) max_cpus_per_node = pni->cpus ; if ( pni->cpus < min_cpus_per_node ) min_cpus_per_node = pni->cpus ; bridge_nodelist_free_contents(&list2); } bridge_nodelist_free_contents(&list1); if ( max_cpus_per_node > 0 && ppi->max_nodes != INFINITE ) bn->par_cores_nb_max = max_cpus_per_node * ppi->max_nodes ; if ( min_cpus_per_node < (uint32_t) -1 && ppi->min_nodes > 1 ) bn->par_cores_nb_min = min_cpus_per_node * ppi->min_nodes ; } stored_queue_nb++; } fstatus=0; /* free slurm informations */ slurm_free_job_info_msg(pjim); slurm_free_node_info_msg(pnim); slurm_free_partition_info_msg(ppim); if(stored_queue_nb<queue_nb) { *p_p_batch_queues=(bridge_batch_queue_t*) realloc(*p_p_batch_queues, stored_queue_nb*(sizeof(bridge_batch_queue_t)+1)); if(*p_p_batch_queues==NULL) *p_batch_queues_nb=0; else *p_batch_queues_nb=stored_queue_nb; } exit: return fstatus; }
int main(int argc, char **argv) { ListIterator itr = NULL; uint32_t req_cpufreq = NO_VAL; uint32_t stepid = NO_VAL; slurmdb_selected_step_t *selected_step = NULL; #ifdef HAVE_ALPS_CRAY error("The sstat command is not supported on Cray systems"); return 1; #endif #ifdef HAVE_BG error("The sstat command is not supported on IBM BlueGene systems"); return 1; #endif slurm_conf_init(NULL); print_fields_list = list_create(NULL); print_fields_itr = list_iterator_create(print_fields_list); parse_command_line(argc, argv); if (!params.opt_job_list || !list_count(params.opt_job_list)) { error("You didn't give me any jobs to stat."); return 1; } print_fields_header(print_fields_list); itr = list_iterator_create(params.opt_job_list); while ((selected_step = list_next(itr))) { char *nodelist = NULL; bool free_nodelist = false; if (selected_step->stepid == INFINITE) { /* get the batch step info */ job_info_msg_t *job_ptr = NULL; hostlist_t hl; if (slurm_load_job( &job_ptr, selected_step->jobid, SHOW_ALL)) { error("couldn't get info for job %u", selected_step->jobid); continue; } stepid = NO_VAL; hl = hostlist_create(job_ptr->job_array[0].nodes); nodelist = hostlist_pop(hl); free_nodelist = true; hostlist_destroy(hl); slurm_free_job_info_msg(job_ptr); } else if (selected_step->stepid != NO_VAL) { stepid = selected_step->stepid; } else if (params.opt_all_steps) { job_step_info_response_msg_t *step_ptr = NULL; int i = 0; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } for (i = 0; i < step_ptr->job_step_count; i++) { _do_stat(selected_step->jobid, step_ptr->job_steps[i].step_id, step_ptr->job_steps[i].nodes, step_ptr->job_steps[i].cpu_freq); } slurm_free_job_step_info_response_msg(step_ptr); continue; } else { /* get the first running step to query against. */ job_step_info_response_msg_t *step_ptr = NULL; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } if (!step_ptr->job_step_count) { error("no steps running for job %u", selected_step->jobid); continue; } stepid = step_ptr->job_steps[0].step_id; nodelist = step_ptr->job_steps[0].nodes; req_cpufreq = step_ptr->job_steps[0].cpu_freq; } _do_stat(selected_step->jobid, stepid, nodelist, req_cpufreq); if (free_nodelist && nodelist) free(nodelist); } list_iterator_destroy(itr); xfree(params.opt_field_list); if (params.opt_job_list) list_destroy(params.opt_job_list); if (print_fields_itr) list_iterator_destroy(print_fields_itr); if (print_fields_list) list_destroy(print_fields_list); return 0; }
/* _print_job - print the specified job's information */
static int _print_job ( bool clear_old )
{
    static job_info_msg_t * old_job_ptr = NULL, * new_job_ptr;
    int error_code;
    uint16_t show_flags = 0;

    if (params.all_flag || (params.job_list && list_count(params.job_list)))
        show_flags |= SHOW_ALL;

    /* We require detail data when CPUs are requested */
    if (params.format && strstr(params.format, "C"))
        show_flags |= SHOW_DETAIL;

    if (old_job_ptr) {
        if (clear_old)
            old_job_ptr->last_update = 0;
        if (params.job_id) {
            error_code = slurm_load_job(&new_job_ptr, params.job_id,
                                        show_flags);
        } else if (params.user_id) {
            error_code = slurm_load_job_user(&new_job_ptr, params.user_id,
                                             show_flags);
        } else {
            error_code = slurm_load_jobs(old_job_ptr->last_update,
                                         &new_job_ptr, show_flags);
        }
        if (error_code == SLURM_SUCCESS)
            slurm_free_job_info_msg(old_job_ptr);
        else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
            error_code = SLURM_SUCCESS;
            new_job_ptr = old_job_ptr;
        }
    } else if (params.job_id) {
        error_code = slurm_load_job(&new_job_ptr, params.job_id, show_flags);
    } else if (params.user_id) {
        error_code = slurm_load_job_user(&new_job_ptr, params.user_id,
                                         show_flags);
    } else {
        error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags);
    }

    if (error_code) {
        slurm_perror("slurm_load_jobs error");
        return SLURM_ERROR;
    }

    old_job_ptr = new_job_ptr;
    if (params.job_id || params.user_id) /* partial data; force refresh next call */
        old_job_ptr->last_update = (time_t) 0;

    if (params.verbose) {
        printf("last_update_time=%ld records=%u\n",
               (long) new_job_ptr->last_update, new_job_ptr->record_count);
    }

    if (!params.format && !params.format_long) {
        if (params.long_list) {
            xstrcat(params.format,
                    "%.18i %.9P %.8j %.8u %.8T %.10M %.9l %.6D %R");
        } else {
            xstrcat(params.format,
                    "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R");
        }
    }

    if (!params.format_list) {
        if (params.format)
            parse_format(params.format);
        else if (params.format_long)
            parse_long_format(params.format_long);
    }

    print_jobs_array(new_job_ptr->job_array, new_job_ptr->record_count,
                     params.format_list);

    return SLURM_SUCCESS;
}
// Destructor; not sure whether calling the slurm_free* functions is necessary
ClusterMenu::~ClusterMenu() {
    // Free pointer memory
    slurm_free_node_info_msg(node_info_buffer_ptr);
    slurm_free_job_info_msg(job_info_buffer_ptr);
}
static void slurmdrmaa_job_update_status( fsd_job_t *self ) { job_info_msg_t *job_info = NULL; slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self; fsd_log_enter(( "({job_id=%s})", self->job_id )); fsd_mutex_lock( &self->session->drm_connection_mutex ); TRY { if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) { int _slurm_errno = slurm_get_errno(); if (_slurm_errno == ESLURM_INVALID_JOB_ID) { self->on_missing(self); } else { fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id); } } if (job_info) { fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason)); switch(job_info->job_array[0].job_state & JOB_STATE_BASE) { case JOB_PENDING: switch(job_info->job_array[0].state_reason) { #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0) case WAIT_HELD_USER: /* job is held by user */ fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD")); self->state = DRMAA_PS_USER_ON_HOLD; break; #endif case WAIT_HELD: /* job is held by administrator */ fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD")); self->state = DRMAA_PS_SYSTEM_ON_HOLD; break; default: fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE")); self->state = DRMAA_PS_QUEUED_ACTIVE; } break; case JOB_RUNNING: fsd_log_debug(("interpreting as DRMAA_PS_RUNNING")); self->state = DRMAA_PS_RUNNING; break; case JOB_SUSPENDED: if(slurm_self->user_suspended == true) { fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED")); self->state = DRMAA_PS_USER_SUSPENDED; } else { fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED")); self->state = DRMAA_PS_SYSTEM_SUSPENDED; } break; case JOB_COMPLETE: fsd_log_debug(("interpreting as DRMAA_PS_DONE")); self->state = DRMAA_PS_DONE; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; case JOB_CANCELLED: fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)")); self->state = DRMAA_PS_FAILED; self->exit_status = -1; case JOB_FAILED: case JOB_TIMEOUT: case JOB_NODE_FAIL: #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0) case JOB_PREEMPTED: #endif fsd_log_debug(("interpreting as DRMAA_PS_FAILED")); self->state = DRMAA_PS_FAILED; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; default: /*unknown state */ fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state)); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) { fsd_log_debug(("Epilog completing")); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) { fsd_log_debug(("Nodes booting")); } if (self->exit_status == -1) /* input,output,error path failure etc*/ self->state = DRMAA_PS_FAILED; self->last_update_time = time(NULL); if( self->state >= DRMAA_PS_DONE ) { fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status))); fsd_cond_broadcast( &self->status_cond ); } } } FINALLY { if(job_info != NULL) slurm_free_job_info_msg (job_info); fsd_mutex_unlock( &self->session->drm_connection_mutex ); } END_TRY fsd_log_return(( "" )); }
extern void _change_cluster_main(GtkComboBox *combo, gpointer extra) { GtkTreeModel *model; display_data_t *display_data; GtkTreeIter iter; slurmdb_cluster_rec_t *cluster_rec = NULL; char *tmp, *ui_description; GError *error = NULL; GtkWidget *node_tab = NULL; int rc; bool got_grid = 0; if (!gtk_combo_box_get_active_iter(combo, &iter)) { g_print("nothing selected\n"); return; } model = gtk_combo_box_get_model(combo); if (!model) { g_print("nothing selected\n"); return; } gtk_tree_model_get(model, &iter, 1, &cluster_rec, -1); if (!cluster_rec) { g_print("no cluster_rec pointer here!"); return; } /* From testing it doesn't appear you can get here without a legitimate change, so there isn't a need to check if we are going back to the same cluster we were just at. */ /* if (working_cluster_rec) { */ /* if (!xstrcmp(cluster_rec->name, working_cluster_rec->name)) */ /* return; */ /* } */ /* free old info under last cluster */ slurm_free_block_info_msg(g_block_info_ptr); g_block_info_ptr = NULL; slurm_free_front_end_info_msg(g_front_end_info_ptr); g_front_end_info_ptr = NULL; slurm_free_burst_buffer_info_msg(g_bb_info_ptr); g_bb_info_ptr = NULL; slurm_free_job_info_msg(g_job_info_ptr); g_job_info_ptr = NULL; slurm_free_node_info_msg(g_node_info_ptr); g_node_info_ptr = NULL; slurm_free_partition_info_msg(g_part_info_ptr); g_part_info_ptr = NULL; slurm_free_reservation_info_msg(g_resv_info_ptr); g_resv_info_ptr = NULL; slurm_free_ctl_conf(g_ctl_info_ptr); g_ctl_info_ptr = NULL; slurm_free_job_step_info_response_msg(g_step_info_ptr); g_step_info_ptr = NULL; slurm_free_topo_info_msg(g_topo_info_msg_ptr); g_topo_info_msg_ptr = NULL; /* set up working_cluster_rec */ if (cluster_dims > 1) { /* reset from a multi-dim cluster */ working_sview_config.grid_x_width = default_sview_config.grid_x_width; working_sview_config.grid_hori = default_sview_config.grid_hori; working_sview_config.grid_vert = default_sview_config.grid_vert; } gtk_table_set_col_spacings(main_grid_table, 0); gtk_table_set_row_spacings(main_grid_table, 0); if (!orig_cluster_name) orig_cluster_name = slurm_get_cluster_name(); if (!xstrcmp(cluster_rec->name, orig_cluster_name)) working_cluster_rec = NULL; else working_cluster_rec = cluster_rec; cluster_dims = slurmdb_setup_cluster_dims(); cluster_flags = slurmdb_setup_cluster_flags(); display_data = main_display_data; while (display_data++) { if (display_data->id == -1) break; if (cluster_flags & CLUSTER_FLAG_BG) { switch(display_data->id) { case BLOCK_PAGE: display_data->show = true; break; case NODE_PAGE: display_data->name = "Midplanes"; break; default: break; } } else { switch(display_data->id) { case BLOCK_PAGE: display_data->show = false; break; case NODE_PAGE: display_data->name = "Nodes"; break; default: break; } } } /* set up menu */ ui_description = _get_ui_description(); gtk_ui_manager_remove_ui(g_ui_manager, g_menu_id); if (!(g_menu_id = gtk_ui_manager_add_ui_from_string( g_ui_manager, ui_description, -1, &error))) { xfree(ui_description); g_error("building menus failed: %s", error->message); g_error_free (error); exit (0); } xfree(ui_description); /* make changes for each object */ cluster_change_block(); cluster_change_front_end(); cluster_change_resv(); cluster_change_part(); cluster_change_job(); cluster_change_node(); cluster_change_bb(); /* destroy old stuff */ if (grid_button_list) { FREE_NULL_LIST(grid_button_list); got_grid = 1; } select_g_ba_fini(); /* sorry popups can't survive a cluster change */ if (popup_list) list_flush(popup_list); if (signal_params_list) 
list_flush(signal_params_list); if (signal_params_list) list_flush(signal_params_list); if (g_switch_nodes_maps) free_switch_nodes_maps(g_switch_nodes_maps); /* change the node tab name if needed */ node_tab = gtk_notebook_get_nth_page( GTK_NOTEBOOK(main_notebook), NODE_PAGE); node_tab = gtk_notebook_get_tab_label(GTK_NOTEBOOK(main_notebook), node_tab); #ifdef GTK2_USE_GET_FOCUS /* ok, now we have a table which we have set up to contain an * event_box which contains the label we are interested. We * setup this label to be the focus child of the table, so all * we have to do is grab that and we are set. */ node_tab = gtk_container_get_focus_child(GTK_CONTAINER(node_tab)); #else /* See above comment. Since gtk_container_get_focus_child * doesn't exist yet we will just traverse the children until * we find the label widget and then break. */ { int i = 0; GList *children = gtk_container_get_children( GTK_CONTAINER(node_tab)); while ((node_tab = g_list_nth_data(children, i++))) { int j = 0; GList *children2 = gtk_container_get_children( GTK_CONTAINER(node_tab)); while ((node_tab = g_list_nth_data(children2, j++))) { if (GTK_IS_LABEL(node_tab)) break; } g_list_free(children2); if (node_tab) break; } g_list_free(children); } #endif if (node_tab) gtk_label_set_text(GTK_LABEL(node_tab), main_display_data[NODE_PAGE].name); /* The name in the visible tabs is easier since it is really just a button with a label on it. */ if (default_sview_config.page_check_widget[NODE_PAGE]) { gtk_button_set_label(GTK_BUTTON(default_sview_config. page_check_widget[NODE_PAGE]), main_display_data[NODE_PAGE].name); } /* reinit */ rc = get_system_stats(main_grid_table); if (rc == SLURM_SUCCESS) { /* It turns out if we didn't have the grid (cluster not responding) before the new grid doesn't get set up correctly. Redoing the system_stats fixes it. There is probably a better way of doing this, but it doesn't happen very often and isn't that bad to handle every once in a while. */ if (!got_grid) { /* I know we just did this before, but it needs to be done again here. */ FREE_NULL_LIST(grid_button_list); get_system_stats(main_grid_table); } refresh_main(NULL, NULL); } tmp = g_strdup_printf("Cluster changed to %s", cluster_rec->name); display_edit_note(tmp); g_free(tmp); }
extern void get_job(void) { int error_code = -1, i, recs; static int printed_jobs = 0; static int count = 0; static job_info_msg_t *job_info_ptr = NULL, *new_job_ptr = NULL; job_info_t *job_ptr = NULL; uint16_t show_flags = 0; bitstr_t *nodes_req = NULL; static uint16_t last_flags = 0; if (params.all_flag) show_flags |= SHOW_ALL; if (job_info_ptr) { if (show_flags != last_flags) job_info_ptr->last_update = 0; error_code = slurm_load_jobs(job_info_ptr->last_update, &new_job_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg(job_info_ptr); else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_job_ptr = job_info_ptr; } } else error_code = slurm_load_jobs((time_t) NULL, &new_job_ptr, show_flags); last_flags = show_flags; if (error_code) { if (quiet_flag != 1) { if (!params.commandline) { mvwprintw(text_win, main_ycord, 1, "slurm_load_jobs: %s", slurm_strerror(slurm_get_errno())); main_ycord++; } else { printf("slurm_load_jobs: %s\n", slurm_strerror(slurm_get_errno())); } } } if (!params.no_header) _print_header_job(); if (new_job_ptr) recs = new_job_ptr->record_count; else recs = 0; if (!params.commandline) if ((text_line_cnt+printed_jobs) > count) text_line_cnt--; printed_jobs = 0; count = 0; if (params.hl) nodes_req = get_requested_node_bitmap(); for (i = 0; i < recs; i++) { job_ptr = &(new_job_ptr->job_array[i]); if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_COMPLETING(job_ptr)) continue; /* job has completed */ if (nodes_req) { int overlap = 0; bitstr_t *loc_bitmap = bit_alloc(bit_size(nodes_req)); inx2bitstr(loc_bitmap, job_ptr->node_inx); overlap = bit_overlap(loc_bitmap, nodes_req); FREE_NULL_BITMAP(loc_bitmap); if (!overlap) continue; } if (job_ptr->node_inx[0] != -1) { int j = 0; job_ptr->num_nodes = 0; while (job_ptr->node_inx[j] >= 0) { job_ptr->num_nodes += (job_ptr->node_inx[j + 1] + 1) - job_ptr->node_inx[j]; set_grid_inx(job_ptr->node_inx[j], job_ptr->node_inx[j + 1], count); j += 2; } if (!params.commandline) { if ((count >= text_line_cnt) && (printed_jobs < (getmaxy(text_win) - 4))) { job_ptr->num_cpus = (int)letters[count%62]; wattron(text_win, COLOR_PAIR(colors[count%6])); _print_text_job(job_ptr); wattroff(text_win, COLOR_PAIR(colors[count%6])); printed_jobs++; } } else { job_ptr->num_cpus = (int)letters[count%62]; _print_text_job(job_ptr); } count++; } if (count == 128) count = 0; } for (i = 0; i < recs; i++) { job_ptr = &(new_job_ptr->job_array[i]); if (!IS_JOB_PENDING(job_ptr)) continue; /* job has completed */ if (!params.commandline) { if ((count>=text_line_cnt) && (printed_jobs < (getmaxy(text_win) - 4))) { xfree(job_ptr->nodes); job_ptr->nodes = xstrdup("waiting..."); job_ptr->num_cpus = (int) letters[count%62]; wattron(text_win, COLOR_PAIR(colors[count%6])); _print_text_job(job_ptr); wattroff(text_win, COLOR_PAIR(colors[count%6])); printed_jobs++; } } else { xfree(job_ptr->nodes); job_ptr->nodes = xstrdup("waiting..."); job_ptr->num_cpus = (int) letters[count%62]; _print_text_job(job_ptr); printed_jobs++; } count++; if (count == 128) count = 0; } if (params.commandline && params.iterate) printf("\n"); if (!params.commandline) main_ycord++; job_info_ptr = new_job_ptr; return; }
/*
 * srun call, the client node connects the allocated node(s)
 */
int slurm_spank_local_user_init (spank_t sp, int ac, char **av)
{
    int status;
    uint32_t jobid;
    uint32_t stepid;
    job_info_msg_t * job_buffer_ptr;
    job_info_t* job_ptr;

    /* only handle interactive usage */
    if ( x11_mode == X11_MODE_NONE || x11_mode == X11_MODE_BATCH )
        return 0;

    /* check DISPLAY value */
    if ( getenv("DISPLAY") == NULL ) {
        ERROR("x11: no local DISPLAY defined, skipping");
        return 0;
    }

    /* get job id */
    if ( spank_get_item (sp, S_JOB_ID, &jobid) != ESPANK_SUCCESS ) {
        status = -1;
        goto exit;
    }

    /* get job step id */
    if ( spank_get_item (sp, S_JOB_STEPID, &stepid) != ESPANK_SUCCESS ) {
        status = -1;
        goto exit;
    }

    /* get job infos */
    status = slurm_load_job(&job_buffer_ptr, jobid, SHOW_ALL);
    if ( status != 0 ) {
        ERROR("x11: unable to get job infos");
        status = -3;
        goto exit;
    }

    /* check infos validity */
    if ( job_buffer_ptr->record_count != 1 ) {
        ERROR("x11: job infos are invalid");
        status = -4;
        goto clean_exit;
    }
    job_ptr = job_buffer_ptr->job_array;

    /* check allocated nodes var */
    if ( job_ptr->nodes == NULL ) {
        ERROR("x11: job has no allocated nodes defined");
        status = -5;
        goto clean_exit;
    }

    /* connect required nodes */
    status = _x11_connect_nodes(job_ptr->nodes, jobid, stepid);

clean_exit:
    slurm_free_job_info_msg(job_buffer_ptr);

exit:
    return status;
}
int _x11_init_remote_batch(spank_t sp,uint32_t jobid,uint32_t stepid) { int status; FILE* f; char localhost[256]; char* cmd_pattern= X11_LIBEXEC_PROG " -u %s -s \"%s\" -o \"%s\" -f %s -d %s -t %s -i %u.%u -cwg %s &"; char* cmd; size_t cmd_length; char display[256]; struct passwd user_pwent; struct passwd *p_pwent; size_t pwent_buffer_length = sysconf(_SC_GETPW_R_SIZE_MAX); char pwent_buffer[pwent_buffer_length]; job_info_msg_t * job_buffer_ptr; job_info_t* job_ptr; /* * get current hostname */ if ( gethostname(localhost,256) != 0 ) { status = -20; goto exit; } /* * the batch script inherits the DISPLAY value of the * submission command. We will use it on the allocation node * for proper establishment of a working X11 ssh tunnel */ if ( spank_getenv(sp,"DISPLAY",display,256) != ESPANK_SUCCESS ) { ERROR("x11: unable to read batch step " "inherited DISPLAY value"); status = -1; goto exit; } /* get job infos */ status = slurm_load_job(&job_buffer_ptr,jobid,SHOW_ALL); if ( status != 0 ) { ERROR("x11: unable to get job infos"); status = -3; goto exit; } /* check infos validity */ if ( job_buffer_ptr->record_count != 1 ) { ERROR("x11: job infos are invalid"); status = -4; goto clean_exit; } job_ptr = job_buffer_ptr->job_array; /* get user name */ status = getpwuid_r(job_ptr->user_id,&user_pwent,pwent_buffer, pwent_buffer_length,&p_pwent) ; if (status) { error("x11: unable to get username for uid=%u : %s",job_ptr->user_id, strerror(status)) ; status = -10; goto clean_exit; } /* * build the command line that will be used to forward the * alloc node X11 tunnel */ cmd_length = strlen(cmd_pattern) + 128 ; cmd = (char*) malloc(cmd_length*sizeof(char)); if ( cmd == NULL || snprintf(cmd,cmd_length,cmd_pattern,user_pwent.pw_name, (ssh_cmd == NULL) ? DEFAULT_SSH_CMD : ssh_cmd, (ssh_args == NULL) ? DEFAULT_SSH_ARGS : ssh_args, job_ptr->alloc_node,display,localhost,jobid,stepid, (helpertask_args == NULL) ? DEFAULT_HELPERTASK_ARGS : helpertask_args) >= cmd_length ) { ERROR("x11: error while building cmd"); status = -2; } else { INFO("x11: batch mode : executing %s",cmd); /* execute the command to retrieve the DISPLAY value to use */ f = popen(cmd,"r"); if ( f != NULL ) { if ( fscanf(f,"%255s",display) == 1 ) { if ( spank_setenv(sp,"DISPLAY",display,1) != ESPANK_SUCCESS ) { ERROR("x11: unable to set DISPLAY" " in job env"); status = -5; } else { INFO("x11: now using DISPLAY=%s", display); status=0; } } else { ERROR("x11: unable to get a DISPLAY value"); status = -6; } pclose(f); } else { ERROR("x11: unable to exec get cmd '%s'",cmd); status = -3; } } if ( cmd != NULL ) free(cmd); clean_exit: slurm_free_job_info_msg(job_buffer_ptr); exit: return status; }
/* Return the current time limit of the specified job_id or NO_VAL if the * information is not available */ static uint32_t _get_job_time(const char *job_id_str) { uint32_t job_id, task_id; char *next_str = NULL; uint32_t time_limit = NO_VAL; int i, rc; job_info_msg_t *resp; bitstr_t *array_bitmap; job_id = (uint32_t)strtol(job_id_str, &next_str, 10); if (next_str[0] == '_') { task_id = (uint32_t)strtol(next_str+1, &next_str, 10); if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return time_limit; } } else if (next_str[0] != '\0') { error("Invalid job ID %s", job_id_str); return time_limit; } else { task_id = NO_VAL; } rc = slurm_load_job(&resp, job_id, SHOW_ALL); if (rc == SLURM_SUCCESS) { if (resp->record_count == 0) { error("Job ID %s not found", job_id_str); slurm_free_job_info_msg(resp); return time_limit; } if ((resp->record_count > 1) && (task_id == NO_VAL)) { error("TimeLimit increment/decrement not supported " "for job arrays"); slurm_free_job_info_msg(resp); return time_limit; } for (i = 0; i < resp->record_count; i++) { if ((resp->job_array[i].job_id == job_id) && (resp->job_array[i].array_task_id == NO_VAL) && (resp->job_array[i].array_bitmap == NULL)) { /* Regular job match */ time_limit = resp->job_array[i].time_limit; break; } if (resp->job_array[i].array_job_id != job_id) continue; array_bitmap = (bitstr_t *) resp->job_array[i].array_bitmap; if ((task_id == NO_VAL) || (resp->job_array[i].array_task_id == task_id) || (array_bitmap && (task_id < bit_size(array_bitmap)) && bit_test(array_bitmap, task_id))) { /* Array job with task_id match */ time_limit = resp->job_array[i].time_limit; break; } } slurm_free_job_info_msg(resp); } else { error("Could not load state information for job %s: %m", job_id_str); } return time_limit; }
/* _get_job_ids() */ static uint32_t * _get_job_ids(const char *jobid, uint32_t *num_ids) { job_info_msg_t *job_info; uint32_t *job_ids; uint32_t task_id; int i; int cc; task_id = 0; job_info = _get_job_info(jobid, &task_id); if (job_info == NULL) return NULL; if (_is_array_task_id(jobid)) { job_ids = xmalloc(sizeof(uint32_t)); *num_ids = 1; /* Search for the job_id of the specified * task. */ for (cc = 0; cc < job_info->record_count; cc++) { if (task_id == job_info->job_array[cc].array_task_id) { job_ids[0] = job_info->job_array[cc].job_id; break; } } slurm_free_job_info_msg(job_info); return job_ids; } if (job_info->record_count == 1) { /* No task elements beside the * job itself so it cannot be * a job array. */ job_ids = xmalloc(sizeof(uint32_t)); *num_ids = 1; job_ids[0] = job_info->job_array[0].job_id; slurm_free_job_info_msg(job_info); return job_ids; } *num_ids = job_info->record_count; job_ids = xmalloc((*num_ids) * sizeof(uint32_t)); /* First save the pending jobs */ i = 0; for (cc = 0; cc < job_info->record_count; cc++) { if (job_info->job_array[cc].job_state == JOB_PENDING) { job_ids[i] = job_info->job_array[cc].job_id; ++i; } } /* then the rest of the states */ for (cc = 0; cc < job_info->record_count; cc++) { if (job_info->job_array[cc].job_state != JOB_PENDING) { job_ids[i] = job_info->job_array[cc].job_id; ++i; } } xassert(i == *num_ids); slurm_free_job_info_msg(job_info); return job_ids; }