/* _print_job_step - print the specified job step's information */ static int _print_job_steps( bool clear_old ) { int error_code; static job_step_info_response_msg_t * old_step_ptr = NULL; static job_step_info_response_msg_t * new_step_ptr; uint16_t show_flags = 0; if (params.all_flag) show_flags |= SHOW_ALL; if (old_step_ptr) { if (clear_old) old_step_ptr->last_update = 0; /* Use a last_update time of 0 so that we can get an updated * run_time for jobs rather than just its start_time */ error_code = slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL, &new_step_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_step_info_response_msg( old_step_ptr ); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_step_ptr = old_step_ptr; } } else { error_code = slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL, &new_step_ptr, show_flags); } if (error_code) { slurm_perror ("slurm_get_job_steps error"); return SLURM_ERROR; } old_step_ptr = new_step_ptr; if (params.verbose) { printf ("last_update_time=%ld records=%u\n", (long) new_step_ptr->last_update, new_step_ptr->job_step_count); } if (!params.format && !params.format_long) params.format = "%.15i %.8j %.9P %.8u %.9M %N"; if (!params.format_list) { if (params.format) parse_format(params.format); else if (params.format_long) parse_long_format(params.format_long); } print_steps_array( new_step_ptr->job_steps, new_step_ptr->job_step_count, params.format_list ); return SLURM_SUCCESS; }
/* * slurm_terminate_job_step - terminates a job step by sending a * REQUEST_TERMINATE_TASKS rpc to all slurmd of a job step. * IN job_id - the job's id * IN step_id - the job step's id - use SLURM_BATCH_SCRIPT as the step_id * to terminate a job's batch script * RET 0 on success, otherwise return -1 and set errno to indicate the error */ extern int slurm_terminate_job_step (uint32_t job_id, uint32_t step_id) { resource_allocation_response_msg_t *alloc_info = NULL; job_step_info_response_msg_t *step_info = NULL; int rc = 0; int i; int save_errno = 0; if (slurm_allocation_lookup_lite(job_id, &alloc_info)) { return -1; } /* * The controller won't give us info about the batch script job step, * so we need to handle that seperately. */ if (step_id == SLURM_BATCH_SCRIPT) { rc = _terminate_batch_script_step(alloc_info); slurm_free_resource_allocation_response_msg(alloc_info); errno = rc; return rc ? -1 : 0; } /* * Otherwise, look through the list of job step info and find * the one matching step_id. Terminate that step. */ rc = slurm_get_job_steps((time_t)0, job_id, step_id, &step_info, SHOW_ALL); if (rc != 0) { save_errno = errno; goto fail; } for (i = 0; i < step_info->job_step_count; i++) { if ((step_info->job_steps[i].job_id == job_id) && (step_info->job_steps[i].step_id == step_id)) { rc = _terminate_job_step(&step_info->job_steps[i], alloc_info); save_errno = errno; break; } } slurm_free_job_step_info_response_msg(step_info); fail: slurm_free_resource_allocation_response_msg(alloc_info); errno = save_errno; return rc ? -1 : 0; }
/* Return the current time limit of the specified job/step_id or NO_VAL if the * information is not available */ static uint32_t _get_step_time(uint32_t job_id, uint32_t step_id) { uint32_t time_limit = NO_VAL; int i, rc; job_step_info_response_msg_t *resp; rc = slurm_get_job_steps((time_t) 0, job_id, step_id, &resp, SHOW_ALL); if (rc == SLURM_SUCCESS) { for (i = 0; i < resp->job_step_count; i++) { if ((resp->job_steps[i].job_id != job_id) || (resp->job_steps[i].step_id != step_id)) continue; /* should not happen */ time_limit = resp->job_steps[i].time_limit; break; } slurm_free_job_step_info_response_msg(resp); } else { error("Could not load state information for step %u.%u: %m", job_id, step_id); } return time_limit; }
int main(int argc, char **argv) { ListIterator itr = NULL; uint32_t req_cpufreq = NO_VAL; uint32_t stepid = NO_VAL; slurmdb_selected_step_t *selected_step = NULL; #ifdef HAVE_ALPS_CRAY error("The sstat command is not supported on Cray systems"); return 1; #endif #ifdef HAVE_BG error("The sstat command is not supported on IBM BlueGene systems"); return 1; #endif slurm_conf_init(NULL); print_fields_list = list_create(NULL); print_fields_itr = list_iterator_create(print_fields_list); parse_command_line(argc, argv); if (!params.opt_job_list || !list_count(params.opt_job_list)) { error("You didn't give me any jobs to stat."); return 1; } print_fields_header(print_fields_list); itr = list_iterator_create(params.opt_job_list); while ((selected_step = list_next(itr))) { char *nodelist = NULL; bool free_nodelist = false; if (selected_step->stepid == INFINITE) { /* get the batch step info */ job_info_msg_t *job_ptr = NULL; hostlist_t hl; if (slurm_load_job( &job_ptr, selected_step->jobid, SHOW_ALL)) { error("couldn't get info for job %u", selected_step->jobid); continue; } stepid = NO_VAL; hl = hostlist_create(job_ptr->job_array[0].nodes); nodelist = hostlist_pop(hl); free_nodelist = true; hostlist_destroy(hl); slurm_free_job_info_msg(job_ptr); } else if (selected_step->stepid != NO_VAL) { stepid = selected_step->stepid; } else if (params.opt_all_steps) { job_step_info_response_msg_t *step_ptr = NULL; int i = 0; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } for (i = 0; i < step_ptr->job_step_count; i++) { _do_stat(selected_step->jobid, step_ptr->job_steps[i].step_id, step_ptr->job_steps[i].nodes, step_ptr->job_steps[i].cpu_freq); } slurm_free_job_step_info_response_msg(step_ptr); continue; } else { /* get the first running step to query against. 
*/ job_step_info_response_msg_t *step_ptr = NULL; if (slurm_get_job_steps( 0, selected_step->jobid, NO_VAL, &step_ptr, SHOW_ALL)) { error("couldn't get steps for job %u", selected_step->jobid); continue; } if (!step_ptr->job_step_count) { error("no steps running for job %u", selected_step->jobid); continue; } stepid = step_ptr->job_steps[0].step_id; nodelist = step_ptr->job_steps[0].nodes; req_cpufreq = step_ptr->job_steps[0].cpu_freq; } _do_stat(selected_step->jobid, stepid, nodelist, req_cpufreq); if (free_nodelist && nodelist) free(nodelist); } list_iterator_destroy(itr); xfree(params.opt_field_list); if (params.opt_job_list) list_destroy(params.opt_job_list); if (print_fields_itr) list_iterator_destroy(print_fields_itr); if (print_fields_list) list_destroy(print_fields_list); return 0; }
/* * scontrol_print_step - print the specified job step's information * IN job_step_id_str - job step's id or NULL to print information * about all job steps */ extern void scontrol_print_step (char *job_step_id_str) { int error_code, i, print_cnt = 0; uint32_t job_id = NO_VAL, step_id = NO_VAL; uint16_t array_id = (uint16_t) NO_VAL; char *next_str; job_step_info_response_msg_t *job_step_info_ptr; job_step_info_t * job_step_ptr; static uint32_t last_job_id = 0, last_array_id, last_step_id = 0; static job_step_info_response_msg_t *old_job_step_info_ptr = NULL; static uint16_t last_show_flags = 0xffff; uint16_t show_flags = 0; if (job_step_id_str) { job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10); if (next_str[0] == '_') array_id = (uint16_t) strtol(next_str+1, &next_str, 10); if (next_str[0] == '.') step_id = (uint32_t) strtol (next_str+1, NULL, 10); } if (all_flag) show_flags |= SHOW_ALL; if ((old_job_step_info_ptr) && (last_job_id == job_id) && (last_array_id == array_id) && (last_step_id == step_id)) { if (last_show_flags != show_flags) old_job_step_info_ptr->last_update = (time_t) 0; error_code = slurm_get_job_steps ( old_job_step_info_ptr->last_update, job_id, step_id, &job_step_info_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_step_info_response_msg ( old_job_step_info_ptr); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { job_step_info_ptr = old_job_step_info_ptr; error_code = SLURM_SUCCESS; if (quiet_flag == -1) printf ("slurm_get_job_steps no change in data\n"); } } else { if (old_job_step_info_ptr) { slurm_free_job_step_info_response_msg ( old_job_step_info_ptr); old_job_step_info_ptr = NULL; } error_code = slurm_get_job_steps ( (time_t) 0, job_id, step_id, &job_step_info_ptr, show_flags); } if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror ("slurm_get_job_steps error"); return; } old_job_step_info_ptr = job_step_info_ptr; last_show_flags = show_flags; last_job_id = job_id; last_step_id = step_id; 
if (quiet_flag == -1) { char time_str[32]; slurm_make_time_str ((time_t *)&job_step_info_ptr->last_update, time_str, sizeof(time_str)); printf ("last_update_time=%s, records=%d\n", time_str, job_step_info_ptr->job_step_count); } job_step_ptr = job_step_info_ptr->job_steps ; for (i = 0, job_step_ptr = job_step_info_ptr->job_steps; i < job_step_info_ptr->job_step_count; i++, job_step_ptr++) { if ((array_id != (uint16_t) NO_VAL) && (array_id != job_step_ptr->array_task_id)) continue; slurm_print_job_step_info(stdout, job_step_ptr, one_liner); print_cnt++; } if (print_cnt == 0) { if (job_step_id_str) { exit_code = 1; if (quiet_flag != 1) { if (array_id == (uint16_t) NO_VAL) { printf ("Job step %u.%u not found\n", job_id, step_id); } else { printf ("Job step %u_%u.%u not found\n", job_id, array_id, step_id); } } } else if (quiet_flag != 1) printf ("No job steps in the system\n"); } }
/*
 * _change_cluster_main - combo box callback that switches sview to the
 *	cluster currently selected in the combo box
 * IN combo - combo box whose active row carries the cluster record in
 *	model column 1
 * IN extra - unused
 *
 * Frees all information cached for the previous cluster, updates
 * working_cluster_rec and the cluster dimension/flag globals, rebuilds the
 * menu, notifies every page of the change, renames the node tab to match
 * the new cluster type and finally reloads the grid.
 */
extern void _change_cluster_main(GtkComboBox *combo, gpointer extra)
{
	GtkTreeModel *model;
	display_data_t *display_data;
	GtkTreeIter iter;
	slurmdb_cluster_rec_t *cluster_rec = NULL;
	char *tmp, *ui_description;
	GError *error = NULL;
	GtkWidget *node_tab = NULL;
	int rc;
	bool got_grid = false;

	if (!gtk_combo_box_get_active_iter(combo, &iter)) {
		g_print("nothing selected\n");
		return;
	}
	model = gtk_combo_box_get_model(combo);
	if (!model) {
		g_print("nothing selected\n");
		return;
	}

	gtk_tree_model_get(model, &iter, 1, &cluster_rec, -1);
	if (!cluster_rec) {
		g_print("no cluster_rec pointer here!");
		return;
	}

	/* From testing it doesn't appear you can get here without a
	 * legitimate change, so there isn't a need to check if we are
	 * going back to the same cluster we were just at. */

	/* free old info under last cluster */
	slurm_free_block_info_msg(g_block_info_ptr);
	g_block_info_ptr = NULL;
	slurm_free_front_end_info_msg(g_front_end_info_ptr);
	g_front_end_info_ptr = NULL;
	slurm_free_burst_buffer_info_msg(g_bb_info_ptr);
	g_bb_info_ptr = NULL;
	slurm_free_job_info_msg(g_job_info_ptr);
	g_job_info_ptr = NULL;
	slurm_free_node_info_msg(g_node_info_ptr);
	g_node_info_ptr = NULL;
	slurm_free_partition_info_msg(g_part_info_ptr);
	g_part_info_ptr = NULL;
	slurm_free_reservation_info_msg(g_resv_info_ptr);
	g_resv_info_ptr = NULL;
	slurm_free_ctl_conf(g_ctl_info_ptr);
	g_ctl_info_ptr = NULL;
	slurm_free_job_step_info_response_msg(g_step_info_ptr);
	g_step_info_ptr = NULL;
	slurm_free_topo_info_msg(g_topo_info_msg_ptr);
	g_topo_info_msg_ptr = NULL;

	/* set up working_cluster_rec */
	if (cluster_dims > 1) {
		/* reset from a multi-dim cluster */
		working_sview_config.grid_x_width =
			default_sview_config.grid_x_width;
		working_sview_config.grid_hori =
			default_sview_config.grid_hori;
		working_sview_config.grid_vert =
			default_sview_config.grid_vert;
	}
	gtk_table_set_col_spacings(main_grid_table, 0);
	gtk_table_set_row_spacings(main_grid_table, 0);

	if (!orig_cluster_name)
		orig_cluster_name = slurm_get_cluster_name();
	/* a NULL working_cluster_rec stands for the original cluster */
	if (!xstrcmp(cluster_rec->name, orig_cluster_name))
		working_cluster_rec = NULL;
	else
		working_cluster_rec = cluster_rec;
	cluster_dims = slurmdb_setup_cluster_dims();
	cluster_flags = slurmdb_setup_cluster_flags();

	/* Walk the page table up to its id == -1 terminator. As before, the
	 * walk starts at the second entry; NOTE(review): confirm that
	 * skipping main_display_data[0] is intentional. */
	for (display_data = main_display_data + 1;
	     display_data->id != -1; display_data++) {
		if (cluster_flags & CLUSTER_FLAG_BG) {
			switch (display_data->id) {
			case BLOCK_PAGE:
				display_data->show = true;
				break;
			case NODE_PAGE:
				display_data->name = "Midplanes";
				break;
			default:
				break;
			}
		} else {
			switch (display_data->id) {
			case BLOCK_PAGE:
				display_data->show = false;
				break;
			case NODE_PAGE:
				display_data->name = "Nodes";
				break;
			default:
				break;
			}
		}
	}

	/* set up menu */
	ui_description = _get_ui_description();
	gtk_ui_manager_remove_ui(g_ui_manager, g_menu_id);
	if (!(g_menu_id = gtk_ui_manager_add_ui_from_string(
		      g_ui_manager, ui_description, -1, &error))) {
		xfree(ui_description);
		/* NOTE(review): GLib documents g_error() as fatal, so the
		 * two statements after it are presumably never reached. */
		g_error("building menus failed: %s", error->message);
		g_error_free (error);
		exit (0);
	}
	xfree(ui_description);

	/* make changes for each object */
	cluster_change_block();
	cluster_change_front_end();
	cluster_change_resv();
	cluster_change_part();
	cluster_change_job();
	cluster_change_node();
	cluster_change_bb();

	/* destroy old stuff */
	if (grid_button_list) {
		FREE_NULL_LIST(grid_button_list);
		got_grid = true;
	}

	select_g_ba_fini();

	/* sorry popups can't survive a cluster change */
	if (popup_list)
		list_flush(popup_list);
	if (signal_params_list)
		list_flush(signal_params_list);
	if (g_switch_nodes_maps)
		free_switch_nodes_maps(g_switch_nodes_maps);

	/* change the node tab name if needed */
	node_tab = gtk_notebook_get_nth_page(
		GTK_NOTEBOOK(main_notebook), NODE_PAGE);
	node_tab = gtk_notebook_get_tab_label(GTK_NOTEBOOK(main_notebook),
					      node_tab);
#ifdef GTK2_USE_GET_FOCUS
	/* ok, now we have a table which we have set up to contain an
	 * event_box which contains the label we are interested.  We
	 * setup this label to be the focus child of the table, so all
	 * we have to do is grab that and we are set. */
	node_tab = gtk_container_get_focus_child(GTK_CONTAINER(node_tab));
#else
	/* See above comment.  Since gtk_container_get_focus_child
	 * doesn't exist yet we will just traverse the children until
	 * we find the label widget and then break. */
	{
		int i = 0;
		GList *children = gtk_container_get_children(
			GTK_CONTAINER(node_tab));
		while ((node_tab = g_list_nth_data(children, i++))) {
			int j = 0;
			GList *children2 = gtk_container_get_children(
				GTK_CONTAINER(node_tab));
			while ((node_tab = g_list_nth_data(children2, j++))) {
				if (GTK_IS_LABEL(node_tab))
					break;
			}
			g_list_free(children2);
			if (node_tab)
				break;
		}
		g_list_free(children);
	}
#endif
	if (node_tab)
		gtk_label_set_text(GTK_LABEL(node_tab),
				   main_display_data[NODE_PAGE].name);

	/* The name in the visible tabs is easier since it is really
	 * just a button with a label on it. */
	if (default_sview_config.page_check_widget[NODE_PAGE]) {
		gtk_button_set_label(
			GTK_BUTTON(default_sview_config.
				   page_check_widget[NODE_PAGE]),
			main_display_data[NODE_PAGE].name);
	}

	/* reinit */
	rc = get_system_stats(main_grid_table);

	if (rc == SLURM_SUCCESS) {
		/* It turns out if we didn't have the grid (cluster
		 * not responding) before the new grid doesn't get set
		 * up correctly.  Redoing the system_stats fixes it.
		 * There is probably a better way of doing this, but
		 * it doesn't happen very often and isn't that bad to
		 * handle every once in a while. */
		if (!got_grid) {
			/* I know we just did this before, but it
			 * needs to be done again here. */
			FREE_NULL_LIST(grid_button_list);
			get_system_stats(main_grid_table);
		}

		refresh_main(NULL, NULL);
	}

	tmp = g_strdup_printf("Cluster changed to %s", cluster_rec->name);
	display_edit_note(tmp);
	g_free(tmp);
}