struct ipmipower_connection *
ipmipower_connection_array_create(const char *hostname, unsigned int *len)
{
    char *str = NULL;
    int index = 0;
    hostlist_t hl = NULL;
    hostlist_iterator_t itr = NULL;
    struct ipmipower_connection *ics;
    int size = sizeof(struct ipmipower_connection);
    int hl_count;
    int errcount = 0;
    int emfilecount = 0;

    assert(hostname && len);

    *len = 0;

    if (!(hl = hostlist_create(hostname))) {
        ipmipower_output(MSG_TYPE_HOSTNAME_INVALID, hostname);
        return NULL;
    }

    if (!(itr = hostlist_iterator_create(hl)))
        ierr_exit("hostlist_iterator_create() error");

    hostlist_uniq(hl);

    hl_count = hostlist_count(hl);

    ics = (struct ipmipower_connection *)Malloc(size * hl_count);
    memset(ics, '\0', (size * hl_count));

    while ((str = hostlist_next(itr))) {
        ics[index].ipmi_fd = -1;
        ics[index].ping_fd = -1;

        /* cleanup only at the end, gather all error outputs for
         * later */
        if (_connection_setup(&ics[index], str) < 0) {
            if (errno == EMFILE && !emfilecount) {
                cbuf_printf(ttyout, "file descriptor limit reached\n");
                emfilecount++;
            }
            errcount++;
        }

        free(str);
        index++;
    }

    hostlist_iterator_destroy(itr);
    hostlist_destroy(hl);

    if (errcount) {
        int i;
        for (i = 0; i < hl_count; i++) {
            close(ics[i].ipmi_fd);
            close(ics[i].ping_fd);
            if (ics[i].ipmi_in)
                cbuf_destroy(ics[i].ipmi_in);
            if (ics[i].ipmi_out)
                cbuf_destroy(ics[i].ipmi_out);
            if (ics[i].ping_in)
                cbuf_destroy(ics[i].ping_in);
            if (ics[i].ping_out)
                cbuf_destroy(ics[i].ping_out);
        }
        Free(ics);
        return NULL;
    }

    *len = hl_count;
    return ics;
}
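/*
 * Example (not part of the original source): a minimal sketch of how a
 * caller might use ipmipower_connection_array_create().  The hostname
 * string, the demo function name, and the destroy counterpart are
 * assumptions for illustration; only the create function above is taken
 * from the code.
 */
static int _demo_create_connections(void)
{
    unsigned int num = 0;
    struct ipmipower_connection *ics;

    /* Expand the ranged hostname and set up one connection per host;
     * on failure the function has already reported the error. */
    if (!(ics = ipmipower_connection_array_create("node[1-4]", &num)))
        return -1;

    /* Each of ics[0..num-1] now holds its ipmi_fd/ping_fd sockets. */

    ipmipower_connection_array_destroy(ics, num);   /* assumed counterpart */
    return 0;
}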
extern int sacctmgr_list_cluster(int argc, char *argv[]) { int rc = SLURM_SUCCESS; slurmdb_cluster_cond_t *cluster_cond = xmalloc(sizeof(slurmdb_cluster_cond_t)); List cluster_list; int i=0; ListIterator itr = NULL; ListIterator itr2 = NULL; slurmdb_cluster_rec_t *cluster = NULL; char *tmp_char = NULL; int field_count = 0; print_field_t *field = NULL; List format_list = list_create(slurm_destroy_char); List print_fields_list; /* types are of print_field_t */ slurmdb_init_cluster_cond(cluster_cond, 0); cluster_cond->cluster_list = list_create(slurm_destroy_char); for (i=0; i<argc; i++) { int command_len = strlen(argv[i]); if (!strncasecmp(argv[i], "Where", MAX(command_len, 5)) || !strncasecmp(argv[i], "Set", MAX(command_len, 3))) i++; _set_cond(&i, argc, argv, cluster_cond, format_list); } if(exit_code) { slurmdb_destroy_cluster_cond(cluster_cond); list_destroy(format_list); return SLURM_ERROR; } if(!list_count(format_list)) { slurm_addto_char_list(format_list, "Cl,Controlh,Controlp,RPC"); if(!without_limits) slurm_addto_char_list(format_list, "Fa,GrpJ,GrpN,GrpS,MaxJ,MaxN," "MaxS,MaxW,QOS,DefaultQOS"); } cluster_cond->with_deleted = with_deleted; print_fields_list = sacctmgr_process_format_list(format_list); list_destroy(format_list); if(exit_code) { slurmdb_destroy_cluster_cond(cluster_cond); list_destroy(print_fields_list); return SLURM_ERROR; } cluster_list = acct_storage_g_get_clusters(db_conn, my_uid, cluster_cond); slurmdb_destroy_cluster_cond(cluster_cond); if(!cluster_list) { exit_code=1; fprintf(stderr, " Problem with query.\n"); list_destroy(print_fields_list); return SLURM_ERROR; } itr = list_iterator_create(cluster_list); itr2 = list_iterator_create(print_fields_list); print_fields_header(print_fields_list); field_count = list_count(print_fields_list); while((cluster = list_next(itr))) { int curr_inx = 1; slurmdb_association_rec_t *assoc = cluster->root_assoc; /* set up the working cluster rec so nodecnt's and node names * are handled correctly */ working_cluster_rec = cluster; while((field = list_next(itr2))) { switch(field->type) { case PRINT_CLUSTER: field->print_routine(field, cluster->name, (curr_inx == field_count)); break; case PRINT_CHOST: field->print_routine(field, cluster->control_host, (curr_inx == field_count)); break; case PRINT_CPORT: field->print_routine(field, cluster->control_port, (curr_inx == field_count)); break; case PRINT_CLASS: field->print_routine(field, get_classification_str( cluster-> classification), (curr_inx == field_count)); break; case PRINT_CPUS: { char tmp_char[9]; convert_num_unit((float)cluster->cpu_count, tmp_char, sizeof(tmp_char), UNIT_NONE); field->print_routine(field, tmp_char, (curr_inx == field_count)); break; } case PRINT_DQOS: if(!g_qos_list) { g_qos_list = acct_storage_g_get_qos( db_conn, my_uid, NULL); } tmp_char = slurmdb_qos_str(g_qos_list, assoc->def_qos_id); field->print_routine( field, tmp_char, (curr_inx == field_count)); break; case PRINT_FAIRSHARE: field->print_routine( field, assoc->shares_raw, (curr_inx == field_count)); break; case PRINT_FLAGS: { char *tmp_char = slurmdb_cluster_flags_2_str( cluster->flags); field->print_routine( field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; } case PRINT_GRPC: field->print_routine(field, assoc->grp_cpus, (curr_inx == field_count)); break; case PRINT_GRPJ: field->print_routine(field, assoc->grp_jobs, (curr_inx == field_count)); break; case PRINT_GRPN: field->print_routine(field, assoc->grp_nodes, (curr_inx == field_count)); break; case PRINT_GRPS: 
field->print_routine(field, assoc->grp_submit_jobs, (curr_inx == field_count)); break; case PRINT_MAXCM: field->print_routine( field, assoc->max_cpu_mins_pj, (curr_inx == field_count)); break; case PRINT_MAXC: field->print_routine(field, assoc->max_cpus_pj, (curr_inx == field_count)); break; case PRINT_MAXJ: field->print_routine(field, assoc->max_jobs, (curr_inx == field_count)); break; case PRINT_MAXN: field->print_routine(field, assoc->max_nodes_pj, (curr_inx == field_count)); break; case PRINT_MAXS: field->print_routine(field, assoc->max_submit_jobs, (curr_inx == field_count)); break; case PRINT_MAXW: field->print_routine( field, assoc->max_wall_pj, (curr_inx == field_count)); break; case PRINT_NODECNT: { hostlist_t hl = hostlist_create(cluster->nodes); int cnt = 0; if(hl) { cnt = hostlist_count(hl); hostlist_destroy(hl); } field->print_routine( field, cnt, (curr_inx == field_count)); break; } case PRINT_CLUSTER_NODES: field->print_routine( field, cluster->nodes, (curr_inx == field_count)); break; case PRINT_QOS: if(!g_qos_list) g_qos_list = acct_storage_g_get_qos( db_conn, my_uid, NULL); field->print_routine(field, g_qos_list, assoc->qos_list, (curr_inx == field_count)); break; case PRINT_QOS_RAW: field->print_routine(field, assoc->qos_list, (curr_inx == field_count)); break; case PRINT_RPC_VERSION: field->print_routine( field, cluster->rpc_version, (curr_inx == field_count)); break; case PRINT_SELECT: field->print_routine( field, cluster->plugin_id_select, (curr_inx == field_count)); break; default: field->print_routine( field, NULL, (curr_inx == field_count)); break; } curr_inx++; } list_iterator_reset(itr2); printf("\n"); } /* clear the working cluster rec */ working_cluster_rec = NULL; list_iterator_destroy(itr2); list_iterator_destroy(itr); list_destroy(cluster_list); list_destroy(print_fields_list); return rc; }
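/*
 * Example (illustration only, not sacctmgr code): the per-cluster values
 * printed above all come from the cluster record and its root association.
 * This sketch prints a few of the same fields directly; the function name
 * is hypothetical, the field names are those referenced in the code above.
 */
static void _demo_print_cluster(slurmdb_cluster_rec_t *cluster)
{
    slurmdb_association_rec_t *assoc = cluster->root_assoc;

    printf("%s %s:%u shares=%u maxjobs=%u\n",
           cluster->name, cluster->control_host, cluster->control_port,
           assoc->shares_raw, assoc->max_jobs);
}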
extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn,
					slurmdb_job_cond_t *job_cond,
					void **curr_cluster)
{
    List local_cluster_list = NULL;
    time_t now = time(NULL);
    MYSQL_RES *result = NULL;
    MYSQL_ROW row;
    hostlist_t temp_hl = NULL;
    hostlist_iterator_t h_itr = NULL;
    char *query = NULL;
    int dims = 0;

    if (!job_cond || !job_cond->used_nodes)
        return NULL;

    if (!job_cond->cluster_list || list_count(job_cond->cluster_list) != 1) {
        error("If you are doing a query against nodes "
              "you must only have 1 cluster "
              "you are asking for.");
        return NULL;
    }

    /* get the dimensions of this cluster so we know how to deal
       with the hostlists */
    query = xstrdup_printf("select dimensions from %s where name='%s'",
                           cluster_table,
                           (char *)list_peek(job_cond->cluster_list));

    debug4("%d(%s:%d) query\n%s",
           mysql_conn->conn, THIS_FILE, __LINE__, query);
    if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
        xfree(query);
        return NULL;
    }
    xfree(query);

    if (!(row = mysql_fetch_row(result))) {
        error("Couldn't get the dimensions of cluster '%s'.",
              (char *)list_peek(job_cond->cluster_list));
        mysql_free_result(result);
        return NULL;
    }
    dims = atoi(row[0]);
    /* the dimensions result is no longer needed */
    mysql_free_result(result);

    temp_hl = hostlist_create_dims(job_cond->used_nodes, dims);
    if (hostlist_count(temp_hl) <= 0) {
        error("we didn't get any real hosts to look for.");
        goto no_hosts;
    }
    h_itr = hostlist_iterator_create(temp_hl);

    query = xstrdup_printf("select cluster_nodes, time_start, "
                           "time_end from \"%s_%s\" where node_name='' "
                           "&& cluster_nodes !=''",
                           (char *)list_peek(job_cond->cluster_list),
                           event_table);

    if (job_cond->usage_start) {
        if (!job_cond->usage_end)
            job_cond->usage_end = now;

        xstrfmtcat(query, " && ((time_start < %ld) "
                   "&& (time_end >= %ld || time_end = 0))",
                   job_cond->usage_end, job_cond->usage_start);
    }

    debug3("%d(%s:%d) query\n%s",
           mysql_conn->conn, THIS_FILE, __LINE__, query);
    if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
        xfree(query);
        goto no_hosts;
    }
    xfree(query);

    local_cluster_list = list_create(_destroy_local_cluster);
    while ((row = mysql_fetch_row(result))) {
        char *host = NULL;
        int loc = 0;
        local_cluster_t *local_cluster = xmalloc(sizeof(local_cluster_t));

        local_cluster->hl = hostlist_create_dims(row[0], dims);
        local_cluster->start = slurm_atoul(row[1]);
        local_cluster->end = slurm_atoul(row[2]);
        local_cluster->asked_bitmap =
            bit_alloc(hostlist_count(local_cluster->hl));
        while ((host = hostlist_next_dims(h_itr, dims))) {
            if ((loc = hostlist_find(local_cluster->hl, host)) != -1)
                bit_set(local_cluster->asked_bitmap, loc);
            free(host);
        }
        hostlist_iterator_reset(h_itr);
        if (bit_ffs(local_cluster->asked_bitmap) != -1) {
            list_append(local_cluster_list, local_cluster);
            if (local_cluster->end == 0) {
                local_cluster->end = now;
                (*curr_cluster) = local_cluster;
            }
        } else
            _destroy_local_cluster(local_cluster);
    }
    mysql_free_result(result);

    if (!list_count(local_cluster_list)) {
        list_destroy(local_cluster_list);
        local_cluster_list = NULL;
        goto no_hosts;
    }

no_hosts:
    hostlist_iterator_destroy(h_itr);
    hostlist_destroy(temp_hl);

    return local_cluster_list;
}
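/*
 * Example (illustration only): the membership test used above.  For each
 * requested host, hostlist_find() gives its index within the cluster's node
 * set, and the matching bit is set in asked_bitmap.  The node names and the
 * demo function name are made up; the calls are the ones used above.
 */
static void _demo_mark_requested(void)
{
    hostlist_t cluster_hl = hostlist_create("tux[0-7]");
    bitstr_t *asked = bit_alloc(hostlist_count(cluster_hl));
    int loc = hostlist_find(cluster_hl, "tux3");    /* 3 */

    if (loc != -1)
        bit_set(asked, loc);
    /* bit_ffs(asked) != -1, so this cluster snapshot would be kept */
    FREE_NULL_BITMAP(asked);
    hostlist_destroy(cluster_hl);
}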
static int _resources_set(char ***env)
{
    char *p = NULL;

    /* Initialize all memory pointers that would be allocated to NULL
     * So in case of error exit we will know what to xfree
     */
    _pmixp_job_info.job_hl = hostlist_create("");
    _pmixp_job_info.step_hl = hostlist_create("");
    _pmixp_job_info.hostname = NULL;

    /* Save step host list */
    p = getenvp(*env, PMIXP_STEP_NODES_ENV);
    if (!p) {
        PMIXP_ERROR_NO(ENOENT, "Environment variable %s not found",
                       PMIXP_STEP_NODES_ENV);
        goto err_exit;
    }
    hostlist_push(_pmixp_job_info.step_hl, p);

    /* Extract our node name */
    p = hostlist_nth(_pmixp_job_info.step_hl, _pmixp_job_info.node_id);
    _pmixp_job_info.hostname = xstrdup(p);
    free(p);

    /* Determine job-wide node id and job-wide node count */
    p = getenvp(*env, PMIXP_JOB_NODES_ENV);
    if (p == NULL) {
        p = getenvp(*env, PMIXP_JOB_NODES_ENV_DEP);
        if (p == NULL) {
            /* shouldn't happen if we are under SLURM! */
            PMIXP_ERROR_NO(ENOENT,
                           "Neither of nodelist environment variables: %s OR %s was found!",
                           PMIXP_JOB_NODES_ENV, PMIXP_JOB_NODES_ENV_DEP);
            goto err_exit;
        }
    }
    hostlist_push(_pmixp_job_info.job_hl, p);
    _pmixp_job_info.nnodes_job = hostlist_count(_pmixp_job_info.job_hl);
    _pmixp_job_info.node_id_job = hostlist_find(_pmixp_job_info.job_hl,
                                                _pmixp_job_info.hostname);

    /* FIXME!! --------------------------------------------------------- */
    /* TODO: _get_task_count not always works well.
     * if (_get_task_count(env, &_pmixp_job_info.ntasks_job,
     *                     &_pmixp_job_info.ncpus_job) < 0) {
     *     _pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
     *     _pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;
     * }
     * xassert(_pmixp_job_info.ntasks <= _pmixp_job_info.ntasks_job);
     */
    _pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
    _pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;

    /* Save task-to-node mapping */
    p = getenvp(*env, PMIXP_SLURM_MAPPING_ENV);
    if (p == NULL) {
        /* Direct modex won't work */
        PMIXP_ERROR_NO(ENOENT, "No %s environment variable found!",
                       PMIXP_SLURM_MAPPING_ENV);
        goto err_exit;
    }

    _pmixp_job_info.task_map_packed = xstrdup(p);

    return SLURM_SUCCESS;
err_exit:
    hostlist_destroy(_pmixp_job_info.job_hl);
    hostlist_destroy(_pmixp_job_info.step_hl);
    if (NULL != _pmixp_job_info.hostname) {
        xfree(_pmixp_job_info.hostname);
    }
    return SLURM_ERROR;
}
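/*
 * Example (illustration only): the hostlist arithmetic used above, with a
 * literal nodelist instead of the PMIXP_* environment variables.  Pushing a
 * ranged string expands it, and hostlist_find() then gives this node's
 * zero-based index within the job.  The nodelist and function name are
 * assumptions.
 */
static void _demo_node_rank(void)
{
    hostlist_t job_hl = hostlist_create("");
    char *me;
    int nnodes, node_id;

    hostlist_push(job_hl, "node[1-4]");     /* job-wide nodelist (assumed) */
    me = hostlist_nth(job_hl, 2);           /* "node3" */
    nnodes = hostlist_count(job_hl);        /* 4 */
    node_id = hostlist_find(job_hl, me);    /* 2 */

    free(me);                               /* hostlist_nth() result uses free() */
    hostlist_destroy(job_hl);
    (void) nnodes;
    (void) node_id;
}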
/*
 * Read a Slurm hostfile specified by "filename".  "filename" must contain
 * a list of Slurm NodeNames, one per line.  Reads up to "n" number of
 * hostnames from the file.  Returns a string representing a hostlist ranged
 * string of the contents of the file.  This is a helper function, it does
 * not contact any Slurm daemons.
 *
 * Returns a string representing the hostlist.  Returns NULL if there are
 * fewer than "n" hostnames in the file, or if an error occurs.  If "n" ==
 * NO_VAL then the entire file is read in.
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(const char *filename, int n)
{
    FILE *fp = NULL;
    char in_line[BUFFER_SIZE];    /* input line */
    int i, j;
    int line_size;
    int line_num = 0;
    hostlist_t hostlist = NULL;
    char *nodelist = NULL, *end_part = NULL;
    char *asterisk, *tmp_text = NULL, *save_ptr = NULL, *host_name;
    int total_file_len = 0;

    if (filename == NULL || strlen(filename) == 0)
        return NULL;

    if ((fp = fopen(filename, "r")) == NULL) {
        error("slurm_allocate_resources error opening file %s, %m",
              filename);
        return NULL;
    }

    hostlist = hostlist_create(NULL);
    if (hostlist == NULL) {
        fclose(fp);
        return NULL;
    }

    while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {

        line_size = strlen(in_line);
        for (i = 0; i < line_size; i++) {
            if (in_line[i] == '\n') {
                in_line[i] = '\0';
                break;
            }
            if (in_line[i] == '\0')
                break;
            if (in_line[i] != '#')
                continue;
            if ((i > 0) && (in_line[i - 1] == '\\')) {
                for (j = i; j < line_size; j++) {
                    in_line[j - 1] = in_line[j];
                }
                line_size--;
                continue;
            }
            in_line[i] = '\0';
            break;
        }

        /*
         * Get the string length again just in case it changed from
         * the above loop
         */
        line_size = strlen(in_line);
        total_file_len += line_size;

        /*
         * If there was an end section from before set it up to be on
         * the front of this next chunk.
         */
        if (end_part) {
            tmp_text = end_part;
            end_part = NULL;
        }

        if (line_size == (BUFFER_SIZE - 1)) {
            /*
             * If we filled up the buffer get the end past the last
             * comma.  We will tack it on the next pass through.
             */
            char *last_comma = strrchr(in_line, ',');
            if (!last_comma) {
                error("Line %d, of hostfile %s too long",
                      line_num, filename);
                fclose(fp);
                hostlist_destroy(hostlist);
                return NULL;
            }
            end_part = xstrdup(last_comma + 1);
            *last_comma = '\0';
        } else
            line_num++;

        xstrcat(tmp_text, in_line);

        /* Skip this line */
        if (tmp_text[0] == '\0')
            continue;

        if (!isalpha(tmp_text[0]) && !isdigit(tmp_text[0])) {
            error("Invalid hostfile %s contents on line %d",
                  filename, line_num);
            fclose(fp);
            hostlist_destroy(hostlist);
            xfree(end_part);
            xfree(tmp_text);
            return NULL;
        }

        host_name = strtok_r(tmp_text, ",", &save_ptr);
        while (host_name) {
            if ((asterisk = strchr(host_name, '*')) &&
                (i = atoi(asterisk + 1))) {
                asterisk[0] = '\0';

                /*
                 * Don't forget the extra space potentially
                 * needed
                 */
                total_file_len += strlen(host_name) * i;

                for (j = 0; j < i; j++)
                    hostlist_push_host(hostlist, host_name);
            } else {
                hostlist_push_host(hostlist, host_name);
            }
            host_name = strtok_r(NULL, ",", &save_ptr);
        }
        xfree(tmp_text);

        if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
            break;
    }
    fclose(fp);

    if (hostlist_count(hostlist) <= 0) {
        error("Hostlist is empty!");
        goto cleanup_hostfile;
    }
    if (hostlist_count(hostlist) < n) {
        error("Too few NodeNames in Slurm Hostfile");
        goto cleanup_hostfile;
    }

    total_file_len += 1024;
    nodelist = (char *)malloc(total_file_len);
    if (!nodelist) {
        error("Nodelist malloc failed");
        goto cleanup_hostfile;
    }

    if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
        error("Hostlist is too long for the allocate RPC!");
        free(nodelist);
        nodelist = NULL;
        goto cleanup_hostfile;
    }

    debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
    hostlist_destroy(hostlist);
    xfree(end_part);
    xfree(tmp_text);

    return nodelist;
}
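/*
 * Example (hypothetical caller, not part of this file): expand a hostfile
 * into a ranged nodelist.  The path and function name are made up; NO_VAL
 * asks for the whole file, and the result is freed with free() as the
 * comment above documents.
 */
static void _demo_read_hostfile(void)
{
    char *nodelist = slurm_read_hostfile("/tmp/hostfile", NO_VAL);

    if (nodelist) {
        printf("hostfile expands to %s\n", nodelist);
        free(nodelist);
    }
}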
/* * slurm_sprint_job_info - output information about a specific Slurm * job based upon message as loaded using slurm_load_jobs * IN job_ptr - an individual job information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ extern char * slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) { int i, j; char time_str[32], *group_name, *user_name; char tmp1[128], tmp2[128], tmp3[128], tmp4[128], tmp5[128], *tmp6_ptr; char tmp_line[512]; char *ionodes = NULL; uint16_t exit_status = 0, term_sig = 0; job_resources_t *job_resrcs = job_ptr->job_resrcs; char *out = NULL; time_t run_time; uint32_t min_nodes, max_nodes = 0; char *nodelist = "NodeList"; bitstr_t *core_bitmap; char *host; int sock_inx, sock_reps, last; int abs_node_inx, rel_node_inx; int bit_inx, bit_reps; uint32_t *last_mem_alloc_ptr = NULL; uint32_t last_mem_alloc = NO_VAL; char *last_hosts; hostlist_t hl, hl_last; char select_buf[122]; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); if (cluster_flags & CLUSTER_FLAG_BG) { nodelist = "MidplaneList"; select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); } /****** Line 1 ******/ snprintf(tmp_line, sizeof(tmp_line), "JobId=%u ", job_ptr->job_id); out = xstrdup(tmp_line); if (job_ptr->array_job_id) { snprintf(tmp_line, sizeof(tmp_line), "ArrayJobId=%u ArrayTaskId=%u ", job_ptr->array_job_id, job_ptr->array_task_id); xstrcat(out, tmp_line); } snprintf(tmp_line, sizeof(tmp_line), "Name=%s", job_ptr->name); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 2 ******/ user_name = uid_to_string((uid_t) job_ptr->user_id); group_name = gid_to_string((gid_t) job_ptr->group_id); snprintf(tmp_line, sizeof(tmp_line), "UserId=%s(%u) GroupId=%s(%u)", user_name, job_ptr->user_id, group_name, job_ptr->group_id); xfree(user_name); xfree(group_name); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 3 ******/ snprintf(tmp_line, sizeof(tmp_line), "Priority=%u Account=%s QOS=%s", job_ptr->priority, job_ptr->account, job_ptr->qos); xstrcat(out, tmp_line); if (slurm_get_track_wckey()) { snprintf(tmp_line, sizeof(tmp_line), " WCKey=%s", job_ptr->wckey); xstrcat(out, tmp_line); } if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 4 ******/ if (job_ptr->state_desc) { /* Replace white space with underscore for easier parsing */ for (j=0; job_ptr->state_desc[j]; j++) { if (isspace((int)job_ptr->state_desc[j])) job_ptr->state_desc[j] = '_'; } tmp6_ptr = job_ptr->state_desc; } else tmp6_ptr = job_reason_string(job_ptr->state_reason); snprintf(tmp_line, sizeof(tmp_line), "JobState=%s Reason=%s Dependency=%s", job_state_string(job_ptr->job_state), tmp6_ptr, job_ptr->dependency); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 5 ******/ snprintf(tmp_line, sizeof(tmp_line), "Requeue=%u Restarts=%u BatchFlag=%u ", job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag); xstrcat(out, tmp_line); if (WIFSIGNALED(job_ptr->exit_code)) term_sig = WTERMSIG(job_ptr->exit_code); exit_status = WEXITSTATUS(job_ptr->exit_code); snprintf(tmp_line, sizeof(tmp_line), "ExitCode=%u:%u", exit_status, term_sig); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 5a (optional) ******/ if (!(job_ptr->show_flags & SHOW_DETAIL)) goto line6; if 
(WIFSIGNALED(job_ptr->derived_ec)) term_sig = WTERMSIG(job_ptr->derived_ec); else term_sig = 0; exit_status = WEXITSTATUS(job_ptr->derived_ec); snprintf(tmp_line, sizeof(tmp_line), "DerivedExitCode=%u:%u", exit_status, term_sig); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 6 ******/ line6: snprintf(tmp_line, sizeof(tmp_line), "RunTime="); xstrcat(out, tmp_line); if (IS_JOB_PENDING(job_ptr)) run_time = 0; else if (IS_JOB_SUSPENDED(job_ptr)) run_time = job_ptr->pre_sus_time; else { time_t end_time; if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0)) end_time = time(NULL); else end_time = job_ptr->end_time; if (job_ptr->suspend_time) { run_time = (time_t) (difftime(end_time, job_ptr->suspend_time) + job_ptr->pre_sus_time); } else run_time = (time_t) difftime(end_time, job_ptr->start_time); } secs2time_str(run_time, tmp1, sizeof(tmp1)); sprintf(tmp_line, "%s ", tmp1); xstrcat(out, tmp_line); snprintf(tmp_line, sizeof(tmp_line), "TimeLimit="); xstrcat(out, tmp_line); if (job_ptr->time_limit == NO_VAL) sprintf(tmp_line, "Partition_Limit"); else { mins2time_str(job_ptr->time_limit, tmp_line, sizeof(tmp_line)); } xstrcat(out, tmp_line); snprintf(tmp_line, sizeof(tmp_line), " TimeMin="); xstrcat(out, tmp_line); if (job_ptr->time_min == 0) sprintf(tmp_line, "N/A"); else { mins2time_str(job_ptr->time_min, tmp_line, sizeof(tmp_line)); } xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 7 ******/ slurm_make_time_str((time_t *)&job_ptr->submit_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "SubmitTime=%s ", time_str); xstrcat(out, tmp_line); slurm_make_time_str((time_t *)&job_ptr->eligible_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "EligibleTime=%s", time_str); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 8 (optional) ******/ if (job_ptr->resize_time) { slurm_make_time_str((time_t *)&job_ptr->resize_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "ResizeTime=%s", time_str); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } /****** Line 9 ******/ slurm_make_time_str((time_t *)&job_ptr->start_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "StartTime=%s ", time_str); xstrcat(out, tmp_line); snprintf(tmp_line, sizeof(tmp_line), "EndTime="); xstrcat(out, tmp_line); if ((job_ptr->time_limit == INFINITE) && (job_ptr->end_time > time(NULL))) sprintf(tmp_line, "Unknown"); else { slurm_make_time_str ((time_t *)&job_ptr->end_time, time_str, sizeof(time_str)); sprintf(tmp_line, "%s", time_str); } xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 10 ******/ if (job_ptr->preempt_time == 0) sprintf(tmp_line, "PreemptTime=None "); else { slurm_make_time_str((time_t *)&job_ptr->preempt_time, time_str, sizeof(time_str)); snprintf(tmp_line, sizeof(tmp_line), "PreemptTime=%s ", time_str); } xstrcat(out, tmp_line); if (job_ptr->suspend_time) { slurm_make_time_str ((time_t *)&job_ptr->suspend_time, time_str, sizeof(time_str)); } else { strncpy(time_str, "None", sizeof(time_str)); } snprintf(tmp_line, sizeof(tmp_line), "SuspendTime=%s SecsPreSuspend=%ld", time_str, (long int)job_ptr->pre_sus_time); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 11 ******/ snprintf(tmp_line, sizeof(tmp_line), "Partition=%s AllocNode:Sid=%s:%u", 
job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 12 ******/ snprintf(tmp_line, sizeof(tmp_line), "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 13 ******/ xstrfmtcat(out, "%s=", nodelist); xstrcat(out, job_ptr->nodes); if (job_ptr->nodes && ionodes) { snprintf(tmp_line, sizeof(tmp_line), "[%s]", ionodes); xstrcat(out, tmp_line); xfree(ionodes); } if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 14 (optional) ******/ if (job_ptr->batch_host) { snprintf(tmp_line, sizeof(tmp_line), "BatchHost=%s", job_ptr->batch_host); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); if ((min_nodes == 0) || (min_nodes == NO_VAL)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } else if (job_ptr->max_nodes) max_nodes = min_nodes; } else { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } _sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus, job_ptr->max_cpus); _sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes); if (job_ptr->sockets_per_node == (uint16_t) NO_VAL) strcpy(tmp3, "*"); else snprintf(tmp3, sizeof(tmp3), "%u", job_ptr->sockets_per_node); if (job_ptr->cores_per_socket == (uint16_t) NO_VAL) strcpy(tmp4, "*"); else snprintf(tmp4, sizeof(tmp4), "%u", job_ptr->cores_per_socket); if (job_ptr->threads_per_core == (uint16_t) NO_VAL) strcpy(tmp5, "*"); else snprintf(tmp5, sizeof(tmp5), "%u", job_ptr->threads_per_core); snprintf(tmp_line, sizeof(tmp_line), "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%s:%s:%s", tmp2, tmp1, job_ptr->cpus_per_task, tmp3, tmp4, tmp5); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); if (!job_resrcs) goto line15; if (cluster_flags & CLUSTER_FLAG_BG) { if ((job_resrcs->cpu_array_cnt > 0) && (job_resrcs->cpu_array_value) && (job_resrcs->cpu_array_reps)) { int length = 0; xstrcat(out, "CPUs="); length += 10; for (i = 0; i < job_resrcs->cpu_array_cnt; i++) { if (length > 70) { /* skip to last CPU group entry */ if (i < job_resrcs->cpu_array_cnt - 1) { continue; } /* add ellipsis before last entry */ xstrcat(out, "...,"); length += 4; } snprintf(tmp_line, sizeof(tmp_line), "%d", job_resrcs->cpus[i]); xstrcat(out, tmp_line); length += strlen(tmp_line); if (job_resrcs->cpu_array_reps[i] > 1) { snprintf(tmp_line, sizeof(tmp_line), "*%d", job_resrcs->cpu_array_reps[i]); xstrcat(out, tmp_line); length += strlen(tmp_line); } if (i < job_resrcs->cpu_array_cnt - 1) { xstrcat(out, ","); length++; } } if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } } else { if (!job_resrcs->core_bitmap) goto line15; last = bit_fls(job_resrcs->core_bitmap); if (last == -1) goto line15; hl = hostlist_create(job_ptr->nodes); if (!hl) { error("slurm_sprint_job_info: hostlist_create: %s", job_ptr->nodes); return NULL; } hl_last = hostlist_create(NULL); if (!hl_last) { error("slurm_sprint_job_info: hostlist_create: NULL"); hostlist_destroy(hl); return NULL; } bit_inx = 0; i = sock_inx = sock_reps = 0; abs_node_inx = job_ptr->node_inx[i]; /* tmp1[] stores the current cpu(s) allocated */ tmp2[0] = '\0'; /* stores last cpu(s) allocated */ for (rel_node_inx=0; rel_node_inx < 
job_resrcs->nhosts; rel_node_inx++) { if (sock_reps >= job_resrcs->sock_core_rep_count[sock_inx]) { sock_inx++; sock_reps = 0; } sock_reps++; bit_reps = job_resrcs->sockets_per_node[sock_inx] * job_resrcs->cores_per_socket[sock_inx]; core_bitmap = bit_alloc(bit_reps); for (j=0; j < bit_reps; j++) { if (bit_test(job_resrcs->core_bitmap, bit_inx)) bit_set(core_bitmap, j); bit_inx++; } bit_fmt(tmp1, sizeof(tmp1), core_bitmap); FREE_NULL_BITMAP(core_bitmap); host = hostlist_shift(hl); /* * If the allocation values for this host are not the same as the * last host, print the report of the last group of hosts that had * identical allocation values. */ if (strcmp(tmp1, tmp2) || (last_mem_alloc_ptr != job_resrcs->memory_allocated) || (job_resrcs->memory_allocated && (last_mem_alloc != job_resrcs->memory_allocated[rel_node_inx]))) { if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc( hl_last); snprintf(tmp_line, sizeof(tmp_line), " Nodes=%s CPU_IDs=%s Mem=%u", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0); xfree(last_hosts); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); hostlist_destroy(hl_last); hl_last = hostlist_create(NULL); } strcpy(tmp2, tmp1); last_mem_alloc_ptr = job_resrcs->memory_allocated; if (last_mem_alloc_ptr) last_mem_alloc = job_resrcs-> memory_allocated[rel_node_inx]; else last_mem_alloc = NO_VAL; } hostlist_push_host(hl_last, host); free(host); if (bit_inx > last) break; if (abs_node_inx > job_ptr->node_inx[i+1]) { i += 2; abs_node_inx = job_ptr->node_inx[i]; } else { abs_node_inx++; } } if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc(hl_last); snprintf(tmp_line, sizeof(tmp_line), " Nodes=%s CPU_IDs=%s Mem=%u", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0); xfree(last_hosts); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); } hostlist_destroy(hl); hostlist_destroy(hl_last); } /****** Line 15 ******/ line15: if (job_ptr->pn_min_memory & MEM_PER_CPU) { job_ptr->pn_min_memory &= (~MEM_PER_CPU); tmp6_ptr = "CPU"; } else tmp6_ptr = "Node"; if (cluster_flags & CLUSTER_FLAG_BG) { convert_num_unit((float)job_ptr->pn_min_cpus, tmp1, sizeof(tmp1), UNIT_NONE); snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%s", tmp1); } else { snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%u", job_ptr->pn_min_cpus); } xstrcat(out, tmp_line); convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1), UNIT_MEGA); convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2), UNIT_MEGA); snprintf(tmp_line, sizeof(tmp_line), " MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 16 ******/ snprintf(tmp_line, sizeof(tmp_line), "Features=%s Gres=%s Reservation=%s", job_ptr->features, job_ptr->gres, job_ptr->resv_name); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 17 ******/ snprintf(tmp_line, sizeof(tmp_line), "Shared=%s Contiguous=%d Licenses=%s Network=%s", (job_ptr->shared == 0 ? "0" : job_ptr->shared == 1 ? 
"1" : "OK"), job_ptr->contiguous, job_ptr->licenses, job_ptr->network); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 18 ******/ snprintf(tmp_line, sizeof(tmp_line), "Command=%s", job_ptr->command); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); /****** Line 19 ******/ snprintf(tmp_line, sizeof(tmp_line), "WorkDir=%s", job_ptr->work_dir); xstrcat(out, tmp_line); if (cluster_flags & CLUSTER_FLAG_BG) { /****** Line 20 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BG_ID); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "Block_ID=%s", select_buf); xstrcat(out, tmp_line); } /****** Line 21 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MIXED_SHORT); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); xstrcat(out, select_buf); } if (cluster_flags & CLUSTER_FLAG_BGL) { /****** Line 22 (optional) ******/ select_g_select_jobinfo_sprint( job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BLRTS_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "BlrtsImage=%s", select_buf); xstrcat(out, tmp_line); } } /****** Line 23 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_LINUX_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); if (cluster_flags & CLUSTER_FLAG_BGL) snprintf(tmp_line, sizeof(tmp_line), "LinuxImage=%s", select_buf); else snprintf(tmp_line, sizeof(tmp_line), "CnloadImage=%s", select_buf); xstrcat(out, tmp_line); } /****** Line 24 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MLOADER_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "MloaderImage=%s", select_buf); xstrcat(out, tmp_line); } /****** Line 25 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_RAMDISK_IMAGE); if (select_buf[0] != '\0') { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); if (cluster_flags & CLUSTER_FLAG_BGL) snprintf(tmp_line, sizeof(tmp_line), "RamDiskImage=%s", select_buf); else snprintf(tmp_line, sizeof(tmp_line), "IoloadImage=%s", select_buf); xstrcat(out, tmp_line); } } /****** Line 26 (optional) ******/ if (job_ptr->comment) { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); snprintf(tmp_line, sizeof(tmp_line), "Comment=%s ", job_ptr->comment); xstrcat(out, tmp_line); } /****** Line 27 (optional) ******/ if (job_ptr->batch_script) { if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); xstrcat(out, "BatchScript=\n"); xstrcat(out, job_ptr->batch_script); } /****** Line 28 (optional) ******/ if (job_ptr->req_switch) { char time_buf[32]; if (one_liner) xstrcat(out, " "); else xstrcat(out, "\n "); secs2time_str((time_t) job_ptr->wait4switch, time_buf, sizeof(time_buf)); snprintf(tmp_line, sizeof(tmp_line), "Switches=%u@%s\n", job_ptr->req_switch, time_buf); xstrcat(out, tmp_line); } /****** Line 29 (optional) ******/ if (one_liner) xstrcat(out, "\n"); else xstrcat(out, "\n\n"); return out; }
static slurmdb_job_rec_t *_slurmdb_create_job_rec(
    filetxt_job_rec_t *filetxt_job, slurmdb_job_cond_t *job_cond)
{
    slurmdb_job_rec_t *slurmdb_job = NULL;
    ListIterator itr = NULL;
    filetxt_step_rec_t *filetxt_step = NULL;

    if (!job_cond)
        goto no_cond;

    if (job_cond->state_list && list_count(job_cond->state_list)) {
        char *object = NULL;
        itr = list_iterator_create(job_cond->state_list);
        while ((object = list_next(itr))) {
            if (atoi(object) == filetxt_job->status) {
                list_iterator_destroy(itr);
                goto foundstate;
            }
        }
        list_iterator_destroy(itr);
        return NULL;    /* no match */
    }

foundstate:
no_cond:
    slurmdb_job = slurmdb_create_job_rec();
    slurmdb_job->associd = 0;
    slurmdb_job->account = xstrdup(filetxt_job->account);
    slurmdb_job->blockid = xstrdup(filetxt_job->header.blockid);
    slurmdb_job->cluster = NULL;
    slurmdb_job->elapsed = filetxt_job->elapsed;
    slurmdb_job->eligible = filetxt_job->header.job_submit;
    slurmdb_job->end = filetxt_job->header.timestamp;
    slurmdb_job->exitcode = filetxt_job->exitcode;
    slurmdb_job->gid = filetxt_job->header.gid;
    slurmdb_job->jobid = filetxt_job->header.jobnum;
    slurmdb_job->jobname = xstrdup(filetxt_job->jobname);
    slurmdb_job->partition = xstrdup(filetxt_job->header.partition);
    slurmdb_job->req_cpus = filetxt_job->ncpus;
    slurmdb_job->alloc_cpus = filetxt_job->ncpus;
    if (filetxt_job->nodes) {
        hostlist_t hl = hostlist_create(filetxt_job->nodes);
        slurmdb_job->alloc_nodes = hostlist_count(hl);
        hostlist_destroy(hl);
    }
    slurmdb_job->nodes = xstrdup(filetxt_job->nodes);
    slurmdb_job->priority = filetxt_job->priority;
    slurmdb_job->requid = filetxt_job->requid;
    memcpy(&slurmdb_job->stats, &filetxt_job->stats, sizeof(slurmdb_stats_t));
    slurmdb_job->show_full = filetxt_job->show_full;
    slurmdb_job->start = filetxt_job->header.timestamp -
        slurmdb_job->elapsed;
    slurmdb_job->state = filetxt_job->status;

    slurmdb_job->steps = list_create(slurmdb_destroy_step_rec);
    if (filetxt_job->steps) {
        itr = list_iterator_create(filetxt_job->steps);
        while ((filetxt_step = list_next(itr))) {
            slurmdb_step_rec_t *step =
                _slurmdb_create_step_rec(filetxt_step);
            if (step) {
                step->job_ptr = slurmdb_job;
                if (!slurmdb_job->first_step_ptr)
                    slurmdb_job->first_step_ptr = step;
                list_append(slurmdb_job->steps, step);
            }
        }
        list_iterator_destroy(itr);
    }
    slurmdb_job->submit = filetxt_job->header.job_submit;
    slurmdb_job->sys_cpu_sec = filetxt_job->rusage.ru_stime.tv_sec;
    slurmdb_job->sys_cpu_usec = filetxt_job->rusage.ru_stime.tv_usec;
    slurmdb_job->tot_cpu_sec = filetxt_job->tot_cpu_sec;
    slurmdb_job->tot_cpu_usec = filetxt_job->tot_cpu_usec;
    slurmdb_job->track_steps = filetxt_job->track_steps;
    slurmdb_job->uid = filetxt_job->header.uid;
    slurmdb_job->user = NULL;
    slurmdb_job->user_cpu_sec = filetxt_job->rusage.ru_utime.tv_sec;
    slurmdb_job->user_cpu_usec = filetxt_job->rusage.ru_utime.tv_usec;

    return slurmdb_job;
}
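/*
 * Example (illustration only): two conversions done above, shown with
 * literal values.  The job start time is back-computed from the record
 * timestamp, and the allocated node count comes from expanding the node
 * range string.  All values and the function name are made up.
 */
static void _demo_job_rec_fields(void)
{
    time_t timestamp = 1600000300, elapsed = 300;
    time_t start = timestamp - elapsed;             /* 1600000000 */
    hostlist_t hl = hostlist_create("tux[0-3]");
    int alloc_nodes = hostlist_count(hl);           /* 4 */

    hostlist_destroy(hl);
    (void) start;
    (void) alloc_nodes;
}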
/* * slurm_sprint_job_info - output information about a specific Slurm * job based upon message as loaded using slurm_load_jobs * IN job_ptr - an individual job information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ extern char * slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) { int i, j, k; char time_str[32], *group_name, *user_name; char *gres_last = "", tmp1[128], tmp2[128]; char *tmp6_ptr; char tmp_line[1024 * 128]; char tmp_path[MAXPATHLEN]; char *ionodes = NULL; uint16_t exit_status = 0, term_sig = 0; job_resources_t *job_resrcs = job_ptr->job_resrcs; char *out = NULL; time_t run_time; uint32_t min_nodes, max_nodes = 0; char *nodelist = "NodeList"; bitstr_t *cpu_bitmap; char *host; int sock_inx, sock_reps, last; int abs_node_inx, rel_node_inx; int64_t nice; int bit_inx, bit_reps; uint64_t *last_mem_alloc_ptr = NULL; uint64_t last_mem_alloc = NO_VAL64; char *last_hosts; hostlist_t hl, hl_last; char select_buf[122]; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); uint32_t threads; char *line_end = (one_liner) ? " " : "\n "; if (cluster_flags & CLUSTER_FLAG_BG) { nodelist = "MidplaneList"; select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); } /****** Line 1 ******/ xstrfmtcat(out, "JobId=%u ", job_ptr->job_id); if (job_ptr->array_job_id) { if (job_ptr->array_task_str) { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%s ", job_ptr->array_job_id, job_ptr->array_task_str); } else { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%u ", job_ptr->array_job_id, job_ptr->array_task_id); } } xstrfmtcat(out, "JobName=%s", job_ptr->name); xstrcat(out, line_end); /****** Line 2 ******/ user_name = uid_to_string((uid_t) job_ptr->user_id); group_name = gid_to_string((gid_t) job_ptr->group_id); xstrfmtcat(out, "UserId=%s(%u) GroupId=%s(%u) MCS_label=%s", user_name, job_ptr->user_id, group_name, job_ptr->group_id, (job_ptr->mcs_label==NULL) ? 
"N/A" : job_ptr->mcs_label); xfree(user_name); xfree(group_name); xstrcat(out, line_end); /****** Line 3 ******/ nice = ((int64_t)job_ptr->nice) - NICE_OFFSET; xstrfmtcat(out, "Priority=%u Nice=%"PRIi64" Account=%s QOS=%s", job_ptr->priority, nice, job_ptr->account, job_ptr->qos); if (slurm_get_track_wckey()) xstrfmtcat(out, " WCKey=%s", job_ptr->wckey); xstrcat(out, line_end); /****** Line 4 ******/ xstrfmtcat(out, "JobState=%s ", job_state_string(job_ptr->job_state)); if (job_ptr->state_desc) { /* Replace white space with underscore for easier parsing */ for (j=0; job_ptr->state_desc[j]; j++) { if (isspace((int)job_ptr->state_desc[j])) job_ptr->state_desc[j] = '_'; } xstrfmtcat(out, "Reason=%s ", job_ptr->state_desc); } else xstrfmtcat(out, "Reason=%s ", job_reason_string(job_ptr->state_reason)); xstrfmtcat(out, "Dependency=%s", job_ptr->dependency); xstrcat(out, line_end); /****** Line 5 ******/ xstrfmtcat(out, "Requeue=%u Restarts=%u BatchFlag=%u Reboot=%u ", job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag, job_ptr->reboot); if (WIFSIGNALED(job_ptr->exit_code)) term_sig = WTERMSIG(job_ptr->exit_code); exit_status = WEXITSTATUS(job_ptr->exit_code); xstrfmtcat(out, "ExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); /****** Line 5a (optional) ******/ if (job_ptr->show_flags & SHOW_DETAIL) { if (WIFSIGNALED(job_ptr->derived_ec)) term_sig = WTERMSIG(job_ptr->derived_ec); else term_sig = 0; exit_status = WEXITSTATUS(job_ptr->derived_ec); xstrfmtcat(out, "DerivedExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); } /****** Line 6 ******/ if (IS_JOB_PENDING(job_ptr)) run_time = 0; else if (IS_JOB_SUSPENDED(job_ptr)) run_time = job_ptr->pre_sus_time; else { time_t end_time; if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0)) end_time = time(NULL); else end_time = job_ptr->end_time; if (job_ptr->suspend_time) { run_time = (time_t) (difftime(end_time, job_ptr->suspend_time) + job_ptr->pre_sus_time); } else run_time = (time_t) difftime(end_time, job_ptr->start_time); } secs2time_str(run_time, time_str, sizeof(time_str)); xstrfmtcat(out, "RunTime=%s ", time_str); if (job_ptr->time_limit == NO_VAL) xstrcat(out, "TimeLimit=Partition_Limit "); else { mins2time_str(job_ptr->time_limit, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeLimit=%s ", time_str); } if (job_ptr->time_min == 0) xstrcat(out, "TimeMin=N/A"); else { mins2time_str(job_ptr->time_min, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeMin=%s", time_str); } xstrcat(out, line_end); /****** Line 7 ******/ slurm_make_time_str(&job_ptr->submit_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SubmitTime=%s ", time_str); slurm_make_time_str(&job_ptr->eligible_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EligibleTime=%s", time_str); xstrcat(out, line_end); /****** Line 8 (optional) ******/ if (job_ptr->resize_time) { slurm_make_time_str(&job_ptr->resize_time, time_str, sizeof(time_str)); xstrfmtcat(out, "ResizeTime=%s", time_str); xstrcat(out, line_end); } /****** Line 9 ******/ slurm_make_time_str(&job_ptr->start_time, time_str, sizeof(time_str)); xstrfmtcat(out, "StartTime=%s ", time_str); if ((job_ptr->time_limit == INFINITE) && (job_ptr->end_time > time(NULL))) xstrcat(out, "EndTime=Unknown "); else { slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EndTime=%s ", time_str); } if (job_ptr->deadline) { slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str)); xstrfmtcat(out, "Deadline=%s", time_str); } else { xstrcat(out, "Deadline=N/A"); 
} xstrcat(out, line_end); /****** Line 10 ******/ if (job_ptr->preempt_time == 0) xstrcat(out, "PreemptTime=None "); else { slurm_make_time_str(&job_ptr->preempt_time, time_str, sizeof(time_str)); xstrfmtcat(out, "PreemptTime=%s ", time_str); } if (job_ptr->suspend_time) { slurm_make_time_str(&job_ptr->suspend_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SuspendTime=%s ", time_str); } else xstrcat(out, "SuspendTime=None "); xstrfmtcat(out, "SecsPreSuspend=%ld", (long int)job_ptr->pre_sus_time); xstrcat(out, line_end); /****** Line 11 ******/ xstrfmtcat(out, "Partition=%s AllocNode:Sid=%s:%u", job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid); xstrcat(out, line_end); /****** Line 12 ******/ xstrfmtcat(out, "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes); xstrcat(out, line_end); /****** Line 13 ******/ xstrfmtcat(out, "%s=%s", nodelist, job_ptr->nodes); if (job_ptr->nodes && ionodes) { xstrfmtcat(out, "[%s]", ionodes); xfree(ionodes); } if (job_ptr->sched_nodes) xstrfmtcat(out, " Sched%s=%s", nodelist, job_ptr->sched_nodes); xstrcat(out, line_end); /****** Line 14 (optional) ******/ if (job_ptr->batch_host) { xstrfmtcat(out, "BatchHost=%s", job_ptr->batch_host); xstrcat(out, line_end); } /****** Line 14a (optional) ******/ if (job_ptr->fed_siblings) { xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s", job_ptr->fed_origin_str, job_ptr->fed_siblings_str); xstrcat(out, line_end); } /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); if ((min_nodes == 0) || (min_nodes == NO_VAL)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } else if (job_ptr->max_nodes) max_nodes = min_nodes; } else if (IS_JOB_PENDING(job_ptr)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; if (max_nodes && (max_nodes < min_nodes)) min_nodes = max_nodes; } else { min_nodes = job_ptr->num_nodes; max_nodes = 0; } _sprint_range(tmp_line, sizeof(tmp_line), min_nodes, max_nodes); xstrfmtcat(out, "NumNodes=%s ", tmp_line); _sprint_range(tmp_line, sizeof(tmp_line), job_ptr->num_cpus, job_ptr->max_cpus); xstrfmtcat(out, "NumCPUs=%s ", tmp_line); xstrfmtcat(out, "NumTasks=%u ", job_ptr->num_tasks); xstrfmtcat(out, "CPUs/Task=%u ", job_ptr->cpus_per_task); if (job_ptr->boards_per_node == (uint16_t) NO_VAL) xstrcat(out, "ReqB:S:C:T=*:"); else xstrfmtcat(out, "ReqB:S:C:T=%u:", job_ptr->boards_per_node); if (job_ptr->sockets_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->sockets_per_board); if (job_ptr->cores_per_socket == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->cores_per_socket); if (job_ptr->threads_per_core == (uint16_t) NO_VAL) xstrcat(out, "*"); else xstrfmtcat(out, "%u", job_ptr->threads_per_core); xstrcat(out, line_end); /****** Line 16 ******/ /* Tres should already of been converted at this point from simple */ xstrfmtcat(out, "TRES=%s", job_ptr->tres_alloc_str ? 
job_ptr->tres_alloc_str : job_ptr->tres_req_str); xstrcat(out, line_end); /****** Line 17 ******/ if (job_ptr->sockets_per_node == (uint16_t) NO_VAL) xstrcat(out, "Socks/Node=* "); else xstrfmtcat(out, "Socks/Node=%u ", job_ptr->sockets_per_node); if (job_ptr->ntasks_per_node == (uint16_t) NO_VAL) xstrcat(out, "NtasksPerN:B:S:C=*:"); else xstrfmtcat(out, "NtasksPerN:B:S:C=%u:", job_ptr->ntasks_per_node); if (job_ptr->ntasks_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->ntasks_per_board); if ((job_ptr->ntasks_per_socket == (uint16_t) NO_VAL) || (job_ptr->ntasks_per_socket == (uint16_t) INFINITE)) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->ntasks_per_socket); if ((job_ptr->ntasks_per_core == (uint16_t) NO_VAL) || (job_ptr->ntasks_per_core == (uint16_t) INFINITE)) xstrcat(out, "* "); else xstrfmtcat(out, "%u ", job_ptr->ntasks_per_core); if (job_ptr->core_spec == (uint16_t) NO_VAL) xstrcat(out, "CoreSpec=*"); else if (job_ptr->core_spec & CORE_SPEC_THREAD) xstrfmtcat(out, "ThreadSpec=%d", (job_ptr->core_spec & (~CORE_SPEC_THREAD))); else xstrfmtcat(out, "CoreSpec=%u", job_ptr->core_spec); xstrcat(out, line_end); if (job_resrcs && cluster_flags & CLUSTER_FLAG_BG) { if ((job_resrcs->cpu_array_cnt > 0) && (job_resrcs->cpu_array_value) && (job_resrcs->cpu_array_reps)) { int length = 0; xstrcat(out, "CPUs="); for (i = 0; i < job_resrcs->cpu_array_cnt; i++) { /* only print 60 characters worth of this record */ if (length > 60) { /* skip to last CPU group entry */ if (i < job_resrcs->cpu_array_cnt - 1) { continue; } /* add ellipsis before last entry */ xstrcat(out, "...,"); } length += xstrfmtcat(out, "%d", job_resrcs->cpus[i]); if (job_resrcs->cpu_array_reps[i] > 1) { length += xstrfmtcat(out, "*%d", job_resrcs->cpu_array_reps[i]); } if (i < job_resrcs->cpu_array_cnt - 1) { xstrcat(out, ","); length++; } } xstrcat(out, line_end); } } else if (job_resrcs && job_resrcs->core_bitmap && ((last = bit_fls(job_resrcs->core_bitmap)) != -1)) { hl = hostlist_create(job_resrcs->nodes); if (!hl) { error("slurm_sprint_job_info: hostlist_create: %s", job_resrcs->nodes); return NULL; } hl_last = hostlist_create(NULL); if (!hl_last) { error("slurm_sprint_job_info: hostlist_create: NULL"); hostlist_destroy(hl); return NULL; } bit_inx = 0; i = sock_inx = sock_reps = 0; abs_node_inx = job_ptr->node_inx[i]; gres_last = ""; /* tmp1[] stores the current cpu(s) allocated */ tmp2[0] = '\0'; /* stores last cpu(s) allocated */ for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts; rel_node_inx++) { if (sock_reps >= job_resrcs->sock_core_rep_count[sock_inx]) { sock_inx++; sock_reps = 0; } sock_reps++; bit_reps = job_resrcs->sockets_per_node[sock_inx] * job_resrcs->cores_per_socket[sock_inx]; host = hostlist_shift(hl); threads = _threads_per_core(host); cpu_bitmap = bit_alloc(bit_reps * threads); for (j = 0; j < bit_reps; j++) { if (bit_test(job_resrcs->core_bitmap, bit_inx)){ for (k = 0; k < threads; k++) bit_set(cpu_bitmap, (j * threads) + k); } bit_inx++; } bit_fmt(tmp1, sizeof(tmp1), cpu_bitmap); FREE_NULL_BITMAP(cpu_bitmap); /* * If the allocation values for this host are not the * same as the last host, print the report of the last * group of hosts that had identical allocation values. 
*/ if (xstrcmp(tmp1, tmp2) || ((rel_node_inx < job_ptr->gres_detail_cnt) && xstrcmp(job_ptr->gres_detail_str[rel_node_inx], gres_last)) || (last_mem_alloc_ptr != job_resrcs->memory_allocated) || (job_resrcs->memory_allocated && (last_mem_alloc != job_resrcs->memory_allocated[rel_node_inx]))) { if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc( hl_last); xstrfmtcat(out, " Nodes=%s CPU_IDs=%s " "Mem=%"PRIu64" GRES_IDX=%s", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0, gres_last); xfree(last_hosts); xstrcat(out, line_end); hostlist_destroy(hl_last); hl_last = hostlist_create(NULL); } strcpy(tmp2, tmp1); if (rel_node_inx < job_ptr->gres_detail_cnt) { gres_last = job_ptr-> gres_detail_str[rel_node_inx]; } else { gres_last = ""; } last_mem_alloc_ptr = job_resrcs->memory_allocated; if (last_mem_alloc_ptr) last_mem_alloc = job_resrcs-> memory_allocated[rel_node_inx]; else last_mem_alloc = NO_VAL64; } hostlist_push_host(hl_last, host); free(host); if (bit_inx > last) break; if (abs_node_inx > job_ptr->node_inx[i+1]) { i += 2; abs_node_inx = job_ptr->node_inx[i]; } else { abs_node_inx++; } } if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc(hl_last); xstrfmtcat(out, " Nodes=%s CPU_IDs=%s Mem=%"PRIu64" GRES_IDX=%s", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0, gres_last); xfree(last_hosts); xstrcat(out, line_end); } hostlist_destroy(hl); hostlist_destroy(hl_last); } /****** Line 18 ******/ if (job_ptr->pn_min_memory & MEM_PER_CPU) { job_ptr->pn_min_memory &= (~MEM_PER_CPU); tmp6_ptr = "CPU"; } else tmp6_ptr = "Node"; if (cluster_flags & CLUSTER_FLAG_BG) { convert_num_unit((float)job_ptr->pn_min_cpus, tmp1, sizeof(tmp1), UNIT_NONE, NO_VAL, CONVERT_NUM_UNIT_EXACT); xstrfmtcat(out, "MinCPUsNode=%s ", tmp1); } else { xstrfmtcat(out, "MinCPUsNode=%u ", job_ptr->pn_min_cpus); } convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1), UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT); convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2), UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT); xstrfmtcat(out, "MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2); xstrcat(out, line_end); /****** Line ******/ secs2time_str((time_t)job_ptr->delay_boot, tmp1, sizeof(tmp1)); xstrfmtcat(out, "Features=%s DelayBoot=%s", job_ptr->features, tmp1); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "Gres=%s Reservation=%s", job_ptr->gres, job_ptr->resv_name); xstrcat(out, line_end); /****** Line 20 ******/ xstrfmtcat(out, "OverSubscribe=%s Contiguous=%d Licenses=%s Network=%s", job_share_string(job_ptr->shared), job_ptr->contiguous, job_ptr->licenses, job_ptr->network); xstrcat(out, line_end); /****** Line 21 ******/ xstrfmtcat(out, "Command=%s", job_ptr->command); xstrcat(out, line_end); /****** Line 22 ******/ xstrfmtcat(out, "WorkDir=%s", job_ptr->work_dir); if (cluster_flags & CLUSTER_FLAG_BG) { /****** Line 23 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BG_ID); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "Block_ID=%s", select_buf); } /****** Line 24 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MIXED_SHORT); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrcat(out, select_buf); } /****** Line 26 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_LINUX_IMAGE); if 
(select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "CnloadImage=%s", select_buf); } /****** Line 27 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MLOADER_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "MloaderImage=%s", select_buf); } /****** Line 28 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_RAMDISK_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "IoloadImage=%s", select_buf); } } /****** Line (optional) ******/ if (job_ptr->admin_comment) { xstrcat(out, line_end); xstrfmtcat(out, "AdminComment=%s ", job_ptr->admin_comment); } /****** Line (optional) ******/ if (job_ptr->comment) { xstrcat(out, line_end); xstrfmtcat(out, "Comment=%s ", job_ptr->comment); } /****** Line 30 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stderr(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdErr=%s", tmp_path); } /****** Line 31 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stdin(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdIn=%s", tmp_path); } /****** Line 32 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stdout(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdOut=%s", tmp_path); } /****** Line 33 (optional) ******/ if (job_ptr->batch_script) { xstrcat(out, line_end); xstrcat(out, "BatchScript=\n"); xstrcat(out, job_ptr->batch_script); } /****** Line 34 (optional) ******/ if (job_ptr->req_switch) { char time_buf[32]; xstrcat(out, line_end); secs2time_str((time_t) job_ptr->wait4switch, time_buf, sizeof(time_buf)); xstrfmtcat(out, "Switches=%u@%s\n", job_ptr->req_switch, time_buf); } /****** Line 35 (optional) ******/ if (job_ptr->burst_buffer) { xstrcat(out, line_end); xstrfmtcat(out, "BurstBuffer=%s", job_ptr->burst_buffer); } /****** Line (optional) ******/ if (job_ptr->burst_buffer_state) { xstrcat(out, line_end); xstrfmtcat(out, "BurstBufferState=%s", job_ptr->burst_buffer_state); } /****** Line 36 (optional) ******/ if (cpu_freq_debug(NULL, NULL, tmp1, sizeof(tmp1), job_ptr->cpu_freq_gov, job_ptr->cpu_freq_min, job_ptr->cpu_freq_max, NO_VAL) != 0) { xstrcat(out, line_end); xstrcat(out, tmp1); } /****** Line 37 ******/ xstrcat(out, line_end); xstrfmtcat(out, "Power=%s", power_flags_str(job_ptr->power_flags)); /****** Line 38 (optional) ******/ if (job_ptr->bitflags) { xstrcat(out, line_end); if (job_ptr->bitflags & GRES_ENFORCE_BIND) xstrcat(out, "GresEnforceBind=Yes"); if (job_ptr->bitflags & KILL_INV_DEP) xstrcat(out, "KillOInInvalidDependent=Yes"); if (job_ptr->bitflags & NO_KILL_INV_DEP) xstrcat(out, "KillOInInvalidDependent=No"); if (job_ptr->bitflags & SPREAD_JOB) xstrcat(out, "SpreadJob=Yes"); } /****** END OF JOB RECORD ******/ if (one_liner) xstrcat(out, "\n"); else xstrcat(out, "\n\n"); return out; }
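/*
 * Example (assumed caller, not part of this file): format every job
 * returned by slurm_load_jobs() with the function above.  The output buffer
 * is built with xstrdup()/xstrcat(), so it is released with xfree() here;
 * the demo function name is an assumption.
 */
static void _demo_print_all_jobs(void)
{
    job_info_msg_t *msg = NULL;
    uint32_t i;

    if (slurm_load_jobs((time_t) 0, &msg, SHOW_ALL) != SLURM_SUCCESS)
        return;

    for (i = 0; i < msg->record_count; i++) {
        char *txt = slurm_sprint_job_info(&msg->job_array[i], 0);
        if (txt) {
            printf("%s", txt);
            xfree(txt);
        }
    }
    slurm_free_job_info_msg(msg);
}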
/* * parse_command_line, fill in params data structure with data */ extern void parse_command_line(int argc, char **argv) { char *env_val = NULL; int opt_char; int option_index; hostlist_t host_list; bool long_form = false; bool opt_a_set = false, opt_p_set = false; bool env_a_set = false, env_p_set = false; static struct option long_options[] = { {"all", no_argument, 0, 'a'}, {"bg", no_argument, 0, 'b'}, {"dead", no_argument, 0, 'd'}, {"exact", no_argument, 0, 'e'}, {"federation",no_argument, 0, OPT_LONG_FEDR}, {"help", no_argument, 0, OPT_LONG_HELP}, {"hide", no_argument, 0, OPT_LONG_HIDE}, {"iterate", required_argument, 0, 'i'}, {"local", no_argument, 0, OPT_LONG_LOCAL}, {"long", no_argument, 0, 'l'}, {"cluster", required_argument, 0, 'M'}, {"clusters", required_argument, 0, 'M'}, {"nodes", required_argument, 0, 'n'}, {"noconvert", no_argument, 0, OPT_LONG_NOCONVERT}, {"noheader", no_argument, 0, 'h'}, {"Node", no_argument, 0, 'N'}, {"format", required_argument, 0, 'o'}, {"Format", required_argument, 0, 'O'}, {"partition", required_argument, 0, 'p'}, {"responding",no_argument, 0, 'r'}, {"list-reasons", no_argument, 0, 'R'}, {"summarize", no_argument, 0, 's'}, {"sort", required_argument, 0, 'S'}, {"states", required_argument, 0, 't'}, {"reservation",no_argument, 0, 'T'}, {"usage", no_argument, 0, OPT_LONG_USAGE}, {"verbose", no_argument, 0, 'v'}, {"version", no_argument, 0, 'V'}, {NULL, 0, 0, 0} }; params.convert_flags = CONVERT_NUM_UNIT_EXACT; if (slurmctld_conf.fed_params && strstr(slurmctld_conf.fed_params, "fed_display")) params.federation_flag = true; if (getenv("SINFO_ALL")) { env_a_set = true; params.all_flag = true; } if (getenv("SINFO_FEDERATION")) params.federation_flag = true; if (getenv("SINFO_LOCAL")) params.local = true; if ( ( env_val = getenv("SINFO_PARTITION") ) ) { env_p_set = true; params.partition = xstrdup(env_val); params.part_list = _build_part_list(env_val); params.all_flag = true; } if (env_a_set && env_p_set) { error("Conflicting options, SINFO_ALL and SINFO_PARTITION, specified. 
" "Please choose one or the other."); exit(1); } if ( ( env_val = getenv("SINFO_SORT") ) ) params.sort = xstrdup(env_val); if ( ( env_val = getenv("SLURM_CLUSTERS") ) ) { if (!(params.clusters = slurmdb_get_info_cluster(env_val))) { print_db_notok(env_val, 1); exit(1); } working_cluster_rec = list_peek(params.clusters); params.local = true; } while ((opt_char = getopt_long(argc, argv, "abdehi:lM:n:No:O:p:rRsS:t:TvV", long_options, &option_index)) != -1) { switch (opt_char) { case (int)'?': fprintf(stderr, "Try \"sinfo --help\" for more information\n"); exit(1); break; case (int)'a': opt_a_set = true; xfree(params.partition); FREE_NULL_LIST(params.part_list); params.all_flag = true; break; case (int)'b': params.cluster_flags = slurmdb_setup_cluster_flags(); if (params.cluster_flags & CLUSTER_FLAG_BG) params.bg_flag = true; else { error("Must be on a BG system to use --bg " "option, if using --cluster option " "put the --bg option " "after the --cluster option."); exit(1); } break; case (int)'d': params.dead_nodes = true; break; case (int)'e': params.exact_match = true; break; case (int)'h': params.no_header = true; break; case (int) 'i': params.iterate= atoi(optarg); if (params.iterate <= 0) { error ("Error: invalid entry for " "--iterate=%s", optarg); exit(1); } break; case (int) 'l': params.long_output = true; break; case (int) 'M': FREE_NULL_LIST(params.clusters); if (!(params.clusters = slurmdb_get_info_cluster(optarg))) { print_db_notok(optarg, 0); exit(1); } working_cluster_rec = list_peek(params.clusters); params.local = true; break; case OPT_LONG_NOCONVERT: params.convert_flags |= CONVERT_NUM_UNIT_NO; break; case (int) 'n': xfree(params.nodes); params.nodes = xstrdup(optarg); /* * confirm valid nodelist entry */ host_list = hostlist_create(params.nodes); if (!host_list) { error("'%s' invalid entry for --nodes", optarg); exit(1); } if (hostlist_count(host_list) == 1) { params.node_name_single = true; xfree(params.nodes); params.nodes = hostlist_deranged_string_xmalloc(host_list); } else params.node_name_single = false; hostlist_destroy(host_list); break; case (int) 'N': params.node_flag = true; break; case (int) 'o': xfree(params.format); params.format = xstrdup(optarg); break; case (int) 'O': long_form = true; xfree(params.format); params.format = xstrdup(optarg); break; case (int) 'p': opt_p_set = true; xfree(params.partition); FREE_NULL_LIST(params.part_list); params.partition = xstrdup(optarg); params.part_list = _build_part_list(optarg); params.all_flag = true; break; case (int) 'r': params.responding_nodes = true; break; case (int) 'R': params.list_reasons = true; break; case (int) 's': params.summarize = true; break; case (int) 'S': xfree(params.sort); params.sort = xstrdup(optarg); break; case (int) 't': xfree(params.states); params.states = xstrdup(optarg); if (!(params.state_list = _build_state_list(optarg))) { error ("valid states: %s", _node_state_list ()); exit (1); } break; case (int) 'T': params.reservation_flag = true; break; case (int) 'v': params.verbose++; break; case (int) 'V': print_slurm_version (); exit(0); case (int) OPT_LONG_FEDR: params.federation_flag = true; break; case (int) OPT_LONG_HELP: _help(); exit(0); case (int) OPT_LONG_USAGE: _usage(); exit(0); case OPT_LONG_HIDE: params.all_flag = false; break; case OPT_LONG_LOCAL: params.local = true; break; } } if (opt_a_set && opt_p_set) { error("Conflicting options, -a and -p, specified. 
" "Please choose one or the other."); exit(1); } params.cluster_flags = slurmdb_setup_cluster_flags(); if (params.federation_flag && !params.clusters && !params.local) { void *ptr = NULL; char *cluster_name = slurm_get_cluster_name(); if (slurm_load_federation(&ptr) || !cluster_in_federation(ptr, cluster_name)) { /* Not in federation */ params.local = true; slurm_destroy_federation_rec(ptr); } else { params.fed = (slurmdb_federation_rec_t *) ptr; } xfree(cluster_name); } if ( params.format == NULL ) { if ( params.summarize ) { params.part_field_flag = true; /* compute size later */ if (params.cluster_flags & CLUSTER_FLAG_BG) params.format = "%9P %.5a %.10l %.32F %N"; else params.format = "%9P %.5a %.10l %.16F %N"; } else if ( params.node_flag ) { params.node_field_flag = true; /* compute size later */ params.part_field_flag = true; /* compute size later */ params.format = params.long_output ? "%N %.6D %.9P %.11T %.4c %.8z %.6m %.8d %.6w %.8f %20E" : "%N %.6D %.9P %6t"; } else if (params.list_reasons) { params.format = params.long_output ? "%20E %12U %19H %6t %N" : "%20E %9u %19H %N"; } else if ((env_val = getenv ("SINFO_FORMAT"))) { params.format = xstrdup(env_val); } else if (params.fed) { params.part_field_flag = true; /* compute size later */ params.format = params.long_output ? "%9P %8V %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" : "%9P %8V %.5a %.10l %.6D %.6t %N"; } else { params.part_field_flag = true; /* compute size later */ params.format = params.long_output ? "%9P %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" : "%9P %.5a %.10l %.6D %.6t %N"; } } if (long_form) _parse_long_format(params.format); else _parse_format(params.format); if (params.list_reasons && (params.state_list == NULL)) { params.states = xstrdup("down,fail,drain,error"); if (!(params.state_list = _build_state_list (params.states))) fatal ("Unable to build state list for -R!"); } if (params.dead_nodes || params.nodes || params.partition || params.responding_nodes ||params.state_list) params.filtering = true; if (params.verbose) _print_options(); }
/* build maps for task layout on nodes */ static int _init_task_layout(slurm_step_layout_t *step_layout, const char *arbitrary_nodes, uint16_t *cpus_per_node, uint32_t *cpu_count_reps, uint16_t cpus_per_task, uint16_t task_dist, uint16_t plane_size) { int cpu_cnt = 0, cpu_inx = 0, i; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); /* char *name = NULL; */ uint16_t cpus[step_layout->node_cnt]; if (step_layout->node_cnt == 0) return SLURM_ERROR; if (step_layout->tasks) /* layout already completed */ return SLURM_SUCCESS; if ((int)cpus_per_task < 1 || cpus_per_task == (uint16_t)NO_VAL) cpus_per_task = 1; step_layout->plane_size = plane_size; step_layout->tasks = xmalloc(sizeof(uint16_t) * step_layout->node_cnt); step_layout->tids = xmalloc(sizeof(uint32_t *) * step_layout->node_cnt); if (!(cluster_flags & CLUSTER_FLAG_BG)) { hostlist_t hl = hostlist_create(step_layout->node_list); /* make sure the number of nodes we think we have * is the correct number */ i = hostlist_count(hl); if (step_layout->node_cnt > i) step_layout->node_cnt = i; hostlist_destroy(hl); } debug("laying out the %u tasks on %u hosts %s dist %u", step_layout->task_cnt, step_layout->node_cnt, step_layout->node_list, task_dist); if (step_layout->node_cnt < 1) { error("no hostlist given can't layout tasks"); return SLURM_ERROR; } for (i=0; i<step_layout->node_cnt; i++) { /* name = hostlist_shift(hl); */ /* if (!name) { */ /* error("hostlist incomplete for this job request"); */ /* hostlist_destroy(hl); */ /* return SLURM_ERROR; */ /* } */ /* debug2("host %d = %s", i, name); */ /* free(name); */ cpus[i] = (cpus_per_node[cpu_inx] / cpus_per_task); if (cpus[i] == 0) { /* this can be a result of a heterogeneous allocation * (e.g. 4 cpus on one node and 2 on the second with * cpus_per_task=3) */ cpus[i] = 1; } //info("got %d cpus", cpus[i]); if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) { /* move to next record */ cpu_inx++; cpu_cnt = 0; } } if ((task_dist == SLURM_DIST_CYCLIC) || (task_dist == SLURM_DIST_CYCLIC_CYCLIC) || (task_dist == SLURM_DIST_CYCLIC_BLOCK)) return _task_layout_cyclic(step_layout, cpus); else if (task_dist == SLURM_DIST_ARBITRARY && !(cluster_flags & CLUSTER_FLAG_FE)) return _task_layout_hostfile(step_layout, arbitrary_nodes); else if (task_dist == SLURM_DIST_PLANE) return _task_layout_plane(step_layout, cpus); else return _task_layout_block(step_layout, cpus); }
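A hedged sketch of the expansion performed in the loop above: cpus_per_node[] and cpu_count_reps[] form a run-length-encoded pair (e.g. {4,2} with reps {3,1} means three nodes with 4 CPUs, then one node with 2), and each expanded value is divided by cpus_per_task with a floor of one slot. expand_cpus() is a hypothetical helper for illustration only.

static void expand_cpus(const uint16_t *cpus_per_node,
			const uint32_t *cpu_count_reps,
			uint16_t cpus_per_task, uint16_t *cpus, int node_cnt)
{
	int cpu_inx = 0, cpu_cnt = 0, i;

	for (i = 0; i < node_cnt; i++) {
		cpus[i] = cpus_per_node[cpu_inx] / cpus_per_task;
		if (cpus[i] == 0)	/* heterogeneous allocation corner case */
			cpus[i] = 1;
		if ((uint32_t)(++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
			cpu_inx++;	/* move to the next RLE record */
			cpu_cnt = 0;
		}
	}
}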
/* use specific set run tasks on each host listed in hostfile * XXX: Need to handle over-subscribe. */ static int _task_layout_hostfile(slurm_step_layout_t *step_layout, const char *arbitrary_nodes) { int i=0, j, taskid = 0, task_cnt=0; hostlist_iterator_t itr = NULL, itr_task = NULL; char *host = NULL; char *host_task = NULL; hostlist_t job_alloc_hosts = NULL; hostlist_t step_alloc_hosts = NULL; debug2("job list is %s", step_layout->node_list); job_alloc_hosts = hostlist_create(step_layout->node_list); itr = hostlist_iterator_create(job_alloc_hosts); if (!arbitrary_nodes) { error("no hostlist given for arbitrary dist"); return SLURM_ERROR; } debug2("list is %s", arbitrary_nodes); step_alloc_hosts = hostlist_create(arbitrary_nodes); if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) { error("Asked for %u tasks have %d in the nodelist. " "Check your nodelist, or set the -n option to be %d", step_layout->task_cnt, hostlist_count(step_alloc_hosts), hostlist_count(step_alloc_hosts)); return SLURM_ERROR; } itr_task = hostlist_iterator_create(step_alloc_hosts); while((host = hostlist_next(itr))) { step_layout->tasks[i] = 0; while((host_task = hostlist_next(itr_task))) { if (!strcmp(host, host_task)) { step_layout->tasks[i]++; task_cnt++; } free(host_task); if (task_cnt >= step_layout->task_cnt) break; } debug3("%s got %u tasks", host, step_layout->tasks[i]); if (step_layout->tasks[i] == 0) goto reset_hosts; step_layout->tids[i] = xmalloc(sizeof(uint32_t) * step_layout->tasks[i]); taskid = 0; j = 0; hostlist_iterator_reset(itr_task); while((host_task = hostlist_next(itr_task))) { if (!strcmp(host, host_task)) { step_layout->tids[i][j] = taskid; j++; } taskid++; free(host_task); if (j >= step_layout->tasks[i]) break; } i++; reset_hosts: hostlist_iterator_reset(itr_task); free(host); if (i > step_layout->task_cnt) break; } hostlist_iterator_destroy(itr); hostlist_iterator_destroy(itr_task); hostlist_destroy(job_alloc_hosts); hostlist_destroy(step_alloc_hosts); if (task_cnt != step_layout->task_cnt) { error("Asked for %u tasks but placed %d. Check your nodelist", step_layout->task_cnt, task_cnt); return SLURM_ERROR; } return SLURM_SUCCESS; }
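/* Note: the variant of _task_layout_hostfile() that follows replaces the
 * per-pair strcmp() matching above with a one-time lookup of node_record
 * pointers (via find_node_record_no_alias), so each allocated host is matched
 * against the step hosts by pointer equality instead of string comparison. */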
/* use specific set run tasks on each host listed in hostfile * XXX: Need to handle over-subscribe. */ static int _task_layout_hostfile(slurm_step_layout_t *step_layout, const char *arbitrary_nodes) { int i=0, j, taskid = 0, task_cnt=0; hostlist_iterator_t itr = NULL, itr_task = NULL; char *host = NULL; hostlist_t job_alloc_hosts = NULL; hostlist_t step_alloc_hosts = NULL; int step_inx = 0, step_hosts_cnt = 0; struct node_record **step_hosts_ptrs = NULL; struct node_record *host_ptr = NULL; debug2("job list is %s", step_layout->node_list); if (!arbitrary_nodes) { error("no hostlist given for arbitrary dist"); return SLURM_ERROR; } debug2("list is %s", arbitrary_nodes); step_alloc_hosts = hostlist_create(arbitrary_nodes); if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) { error("Asked for %u tasks have %d in the nodelist. " "Check your nodelist, or set the -n option to be %d", step_layout->task_cnt, hostlist_count(step_alloc_hosts), hostlist_count(step_alloc_hosts)); hostlist_destroy(step_alloc_hosts); return SLURM_ERROR; } job_alloc_hosts = hostlist_create(step_layout->node_list); itr = hostlist_iterator_create(job_alloc_hosts); itr_task = hostlist_iterator_create(step_alloc_hosts); /* * Build array of pointers so that we can do pointer comparisons rather * than strcmp's on nodes. */ step_hosts_cnt = hostlist_count(step_alloc_hosts); step_hosts_ptrs = xmalloc(sizeof(struct node_record *) * step_hosts_cnt); step_inx = 0; while((host = hostlist_next(itr_task))) { step_hosts_ptrs[step_inx++] = find_node_record_no_alias(host); free(host); } while((host = hostlist_next(itr))) { host_ptr = find_node_record(host); step_layout->tasks[i] = 0; for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) { if (host_ptr == step_hosts_ptrs[step_inx]) { step_layout->tasks[i]++; task_cnt++; } if (task_cnt >= step_layout->task_cnt) break; } debug3("%s got %u tasks", host, step_layout->tasks[i]); if (step_layout->tasks[i] == 0) goto reset_hosts; step_layout->tids[i] = xmalloc(sizeof(uint32_t) * step_layout->tasks[i]); taskid = 0; j = 0; for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) { if (host_ptr == step_hosts_ptrs[step_inx]) { step_layout->tids[i][j] = taskid; j++; } taskid++; if (j >= step_layout->tasks[i]) break; } i++; reset_hosts: free(host); if (i > step_layout->task_cnt) break; } hostlist_iterator_destroy(itr); hostlist_iterator_destroy(itr_task); hostlist_destroy(job_alloc_hosts); hostlist_destroy(step_alloc_hosts); xfree(step_hosts_ptrs); if (task_cnt != step_layout->task_cnt) { error("Asked for %u tasks but placed %d. Check your nodelist", step_layout->task_cnt, task_cnt); return SLURM_ERROR; } return SLURM_SUCCESS; }
/* build maps for task layout on nodes */ static int _init_task_layout(slurm_step_layout_req_t *step_layout_req, slurm_step_layout_t *step_layout, const char *arbitrary_nodes) { int cpu_cnt = 0, cpu_inx = 0, cpu_task_cnt = 0, cpu_task_inx = 0, i; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); uint16_t cpus[step_layout->node_cnt]; uint16_t cpus_per_task[1]; uint32_t cpus_task_reps[1]; if (step_layout->node_cnt == 0) return SLURM_ERROR; if (step_layout->tasks) /* layout already completed */ return SLURM_SUCCESS; if (!step_layout_req->cpus_per_task) { cpus_per_task[0] = 1; cpus_task_reps[0] = step_layout_req->num_hosts; step_layout_req->cpus_per_task = cpus_per_task; step_layout_req->cpus_task_reps = cpus_task_reps; } if (((int)step_layout_req->cpus_per_task[0] < 1) || (step_layout_req->cpus_per_task[0] == NO_VAL16)) { step_layout_req->cpus_per_task[0] = 1; step_layout_req->cpus_task_reps[0] = step_layout_req->num_hosts; } step_layout->plane_size = step_layout_req->plane_size; step_layout->tasks = xmalloc(sizeof(uint16_t) * step_layout->node_cnt); step_layout->tids = xmalloc(sizeof(uint32_t *) * step_layout->node_cnt); if (!(cluster_flags & CLUSTER_FLAG_BG)) { hostlist_t hl = hostlist_create(step_layout->node_list); /* make sure the number of nodes we think we have * is the correct number */ i = hostlist_count(hl); if (step_layout->node_cnt > i) step_layout->node_cnt = i; hostlist_destroy(hl); } debug("laying out the %u tasks on %u hosts %s dist %u", step_layout->task_cnt, step_layout->node_cnt, step_layout->node_list, step_layout->task_dist); if (step_layout->node_cnt < 1) { error("no hostlist given can't layout tasks"); return SLURM_ERROR; } /* hostlist_t hl = hostlist_create(step_layout->node_list); */ for (i=0; i<step_layout->node_cnt; i++) { /* char *name = hostlist_shift(hl); */ /* if (!name) { */ /* error("hostlist incomplete for this job request"); */ /* hostlist_destroy(hl); */ /* return SLURM_ERROR; */ /* } */ /* debug2("host %d = %s", i, name); */ /* free(name); */ cpus[i] = (step_layout_req->cpus_per_node[cpu_inx] / step_layout_req->cpus_per_task[cpu_task_inx]); if (cpus[i] == 0) { /* this can be a result of a heterogeneous allocation * (e.g. 4 cpus on one node and 2 on the second with * step_layout_req->cpus_per_task=3) */ cpus[i] = 1; } if (step_layout->plane_size && (step_layout->plane_size != NO_VAL16) && ((step_layout->task_dist & SLURM_DIST_STATE_BASE) != SLURM_DIST_PLANE)) { /* plane_size when dist != plane is used to convey ntasks_per_node. Adjust the number of cpus to reflect that. */ uint16_t cpus_per_node = step_layout->plane_size * step_layout_req->cpus_per_task[cpu_task_inx]; if (cpus[i] > cpus_per_node) cpus[i] = cpus_per_node; } /* info("got %d cpus", cpus[i]); */ if ((++cpu_cnt) >= step_layout_req->cpu_count_reps[cpu_inx]) { /* move to next record */ cpu_inx++; cpu_cnt = 0; } if ((++cpu_task_cnt) >= step_layout_req->cpus_task_reps[cpu_task_inx]) { /* move to next record */ cpu_task_inx++; cpu_task_cnt = 0; } } if ((step_layout->task_dist & SLURM_DIST_NODEMASK) == SLURM_DIST_NODECYCLIC) return _task_layout_cyclic(step_layout, cpus); else if (((step_layout->task_dist & SLURM_DIST_STATE_BASE) == SLURM_DIST_ARBITRARY) && !(cluster_flags & CLUSTER_FLAG_FE)) return _task_layout_hostfile(step_layout, arbitrary_nodes); else if ((step_layout->task_dist & SLURM_DIST_STATE_BASE) == SLURM_DIST_PLANE) return _task_layout_plane(step_layout, cpus); else return _task_layout_block(step_layout, cpus); }
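A hedged sketch of the clamp applied above when the distribution is not PLANE: in that case plane_size carries ntasks_per_node, so the per-node value is capped at plane_size * cpus_per_task. The numbers in the comments are hypothetical and clamp_node_cpus() is illustration only.

static uint16_t clamp_node_cpus(uint16_t node_cpus, uint16_t cpus_per_task,
				uint16_t ntasks_per_node)
{
	/* e.g. node_cpus = 16, cpus_per_task = 2  ->  16 / 2 = 8 */
	uint16_t val = node_cpus / cpus_per_task;
	/* cap = 3 * 2 = 6 when ntasks_per_node = 3, so 8 is clamped to 6 */
	uint16_t cap = ntasks_per_node * cpus_per_task;

	if (val == 0)
		val = 1;		/* same floor as the loop above */
	return (val > cap) ? cap : val;
}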
/* * Create an srun job structure for a step w/out an allocation response msg. * (i.e. inside an allocation) */ srun_job_t * job_step_create_allocation(resource_allocation_response_msg_t *resp) { uint32_t job_id = resp->job_id; srun_job_t *job = NULL; allocation_info_t *ai = xmalloc(sizeof(*ai)); hostlist_t hl = NULL; char *buf = NULL; int count = 0; uint32_t alloc_count = 0; ai->jobid = job_id; ai->stepid = NO_VAL; ai->nodelist = opt.alloc_nodelist; hl = hostlist_create(ai->nodelist); hostlist_uniq(hl); alloc_count = hostlist_count(hl); ai->nnodes = alloc_count; hostlist_destroy(hl); if (opt.exc_nodes) { hostlist_t exc_hl = hostlist_create(opt.exc_nodes); hostlist_t inc_hl = NULL; char *node_name = NULL; hl = hostlist_create(ai->nodelist); if(opt.nodelist) { inc_hl = hostlist_create(opt.nodelist); } hostlist_uniq(hl); //info("using %s or %s", opt.nodelist, ai->nodelist); while ((node_name = hostlist_shift(exc_hl))) { int inx = hostlist_find(hl, node_name); if (inx >= 0) { debug("excluding node %s", node_name); hostlist_delete_nth(hl, inx); ai->nnodes--; /* decrement node count */ } if(inc_hl) { inx = hostlist_find(inc_hl, node_name); if (inx >= 0) { error("Requested node %s is also " "in the excluded list.", node_name); error("Job not submitted."); hostlist_destroy(exc_hl); hostlist_destroy(inc_hl); goto error; } } free(node_name); } hostlist_destroy(exc_hl); /* we need to set this here so if there are more nodes * available than we requested we can set it * straight. If there is no exclude list then we set * the vars then. */ if (!opt.nodes_set) { /* we don't want to set the number of nodes = * to the number of requested processes unless we * know it is less than the number of nodes * in the allocation */ if(opt.ntasks_set && (opt.ntasks < ai->nnodes)) opt.min_nodes = opt.ntasks; else opt.min_nodes = ai->nnodes; opt.nodes_set = true; } if(!opt.max_nodes) opt.max_nodes = opt.min_nodes; if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes)) ai->nnodes = opt.max_nodes; count = hostlist_count(hl); if(!count) { error("Hostlist is now nothing! Can't run job."); hostlist_destroy(hl); goto error; } if(inc_hl) { count = hostlist_count(inc_hl); if(count < ai->nnodes) { /* add more nodes to get correct number for allocation */ hostlist_t tmp_hl = hostlist_copy(hl); int i=0; int diff = ai->nnodes - count; buf = hostlist_ranged_string_xmalloc(inc_hl); hostlist_delete(tmp_hl, buf); xfree(buf); while ((node_name = hostlist_shift(tmp_hl)) && (i < diff)) { hostlist_push(inc_hl, node_name); i++; } hostlist_destroy(tmp_hl); } buf = hostlist_ranged_string_xmalloc(inc_hl); hostlist_destroy(inc_hl); xfree(opt.nodelist); opt.nodelist = buf; } else { if (count > ai->nnodes) { /* remove more nodes than needed for allocation */ int i=0; for (i=count; i>ai->nnodes; i--) hostlist_delete_nth(hl, i); } xfree(opt.nodelist); opt.nodelist = hostlist_ranged_string_xmalloc(hl); } hostlist_destroy(hl); } else { if (!opt.nodes_set) { /* we don't want to set the number of nodes = * to the number of requested processes unless we * know it is less than the number of nodes * in the allocation */ if(opt.ntasks_set && (opt.ntasks < ai->nnodes)) opt.min_nodes = opt.ntasks; else opt.min_nodes = ai->nnodes; opt.nodes_set = true; } if(!opt.max_nodes) opt.max_nodes = opt.min_nodes; if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes)) ai->nnodes = opt.max_nodes; /* Don't reset the ai->nodelist because that is the * nodelist we want to say the allocation is under * opt.nodelist is what is used for the allocation. 
*/ /* xfree(ai->nodelist); */ /* ai->nodelist = xstrdup(buf); */ } /* get the correct number of hosts to run tasks on */ if (opt.nodelist) { hl = hostlist_create(opt.nodelist); if (opt.distribution != SLURM_DIST_ARBITRARY) hostlist_uniq(hl); if (!hostlist_count(hl)) { error("Hostlist is now nothing! Can not run job."); hostlist_destroy(hl); goto error; } buf = hostlist_ranged_string_xmalloc(hl); count = hostlist_count(hl); hostlist_destroy(hl); /* Don't reset the ai->nodelist because that is the * nodelist we want to say the allocation is under * opt.nodelist is what is used for the allocation. */ /* xfree(ai->nodelist); */ /* ai->nodelist = xstrdup(buf); */ xfree(opt.nodelist); opt.nodelist = buf; } if (opt.distribution == SLURM_DIST_ARBITRARY) { if (count != opt.ntasks) { error("You asked for %d tasks but specified %d nodes", opt.ntasks, count); goto error; } } if (ai->nnodes == 0) { error("No nodes in allocation, can't run job"); goto error; } ai->num_cpu_groups = resp->num_cpu_groups; ai->cpus_per_node = resp->cpus_per_node; ai->cpu_count_reps = resp->cpu_count_reps; /* info("looking for %d nodes out of %s with a must list of %s", */ /* ai->nnodes, ai->nodelist, opt.nodelist); */ /* * Create job */ job = _job_create_structure(ai); error: xfree(ai); return (job); }
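A hedged sketch of the exclude-list pruning used above: each host named in the exclude expression is located in the allocation hostlist and deleted. prune_excluded() is a hypothetical helper; only hostlist calls already used in this file are assumed.

static int prune_excluded(hostlist_t alloc_hl, const char *exc_nodes)
{
	hostlist_t exc_hl = hostlist_create(exc_nodes);
	char *node_name;
	int inx, removed = 0;

	if (!exc_hl)
		return -1;
	while ((node_name = hostlist_shift(exc_hl))) {
		if ((inx = hostlist_find(alloc_hl, node_name)) >= 0) {
			hostlist_delete_nth(alloc_hl, inx);
			removed++;
		}
		free(node_name);	/* hostlist_shift() hands back a malloc'd copy */
	}
	hostlist_destroy(exc_hl);
	return removed;
}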
void *_fwd_tree_thread(void *arg) { fwd_tree_t *fwd_tree = (fwd_tree_t *)arg; List ret_list = NULL; char *name = NULL; char *buf = NULL; slurm_msg_t send_msg; slurm_msg_t_init(&send_msg); send_msg.msg_type = fwd_tree->orig_msg->msg_type; send_msg.data = fwd_tree->orig_msg->data; send_msg.protocol_version = fwd_tree->orig_msg->protocol_version; /* repeat until we are sure the message was sent */ while ((name = hostlist_shift(fwd_tree->tree_hl))) { if (slurm_conf_get_addr(name, &send_msg.address) == SLURM_ERROR) { error("fwd_tree_thread: can't find address for host " "%s, check slurm.conf", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward(&fwd_tree->ret_list, name, SLURM_UNKNOWN_FORWARD_ADDR); slurm_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); continue; } send_msg.forward.timeout = fwd_tree->timeout; if ((send_msg.forward.cnt = hostlist_count(fwd_tree->tree_hl))){ buf = hostlist_ranged_string_xmalloc( fwd_tree->tree_hl); send_msg.forward.nodelist = buf; } else send_msg.forward.nodelist = NULL; if (send_msg.forward.nodelist && send_msg.forward.nodelist[0]) { debug3("Tree sending to %s along with %s", name, send_msg.forward.nodelist); } else debug3("Tree sending to %s", name); ret_list = slurm_send_addr_recv_msgs(&send_msg, name, fwd_tree->timeout); xfree(send_msg.forward.nodelist); if (ret_list) { int ret_cnt = list_count(ret_list); /* This is most common if a slurmd is running an older version of Slurm than the originator of the message. */ if ((ret_cnt <= send_msg.forward.cnt) && (errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR)) { error("fwd_tree_thread: %s failed to forward " "the message, expecting %d ret got only " "%d", name, send_msg.forward.cnt + 1, ret_cnt); if (ret_cnt > 1) { /* not likely */ ret_data_info_t *ret_data_info = NULL; ListIterator itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr))) { if (xstrcmp(ret_data_info-> node_name, name)) hostlist_delete_host( fwd_tree-> tree_hl, ret_data_info-> node_name); } list_iterator_destroy(itr); } } slurm_mutex_lock(fwd_tree->tree_mutex); list_transfer(fwd_tree->ret_list, ret_list); slurm_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); FREE_NULL_LIST(ret_list); /* try next node */ if (ret_cnt <= send_msg.forward.cnt) { free(name); /* Abandon tree. This way if all the * nodes in the branch are down we * don't have to time out for each * node serially. */ _start_msg_tree_internal( fwd_tree->tree_hl, NULL, fwd_tree, hostlist_count(fwd_tree->tree_hl)); continue; } } else { /* This should never happen (when this was * written slurm_send_addr_recv_msgs always * returned a list */ error("fwd_tree_thread: no return list given from " "slurm_send_addr_recv_msgs spawned for %s", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward( &fwd_tree->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); slurm_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); continue; } free(name); /* check for error and try again */ if (errno == SLURM_COMMUNICATIONS_CONNECTION_ERROR) continue; break; } _destroy_tree_fwd(fwd_tree); return NULL; }
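A hedged sketch of how the thread above prepares each hop: one destination is shifted off the remaining tree hostlist, and whatever is left becomes the nodelist (and count) that destination must forward to in turn. next_hop() is a hypothetical helper built only from calls already used above.

static char *next_hop(hostlist_t tree_hl, char **fwd_nodelist, int *fwd_cnt)
{
	char *name = hostlist_shift(tree_hl);	/* this hop's destination */

	if (!name)
		return NULL;
	*fwd_cnt = hostlist_count(tree_hl);
	*fwd_nodelist = *fwd_cnt ?
		hostlist_ranged_string_xmalloc(tree_hl) : NULL;
	return name;	/* caller free()s name and xfree()s the nodelist */
}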
/* * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin */ int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs, size_t nprocs, pmixp_coll_type_t type) { hostlist_t hl; int max_depth, width, depth, i; char *p; #ifndef NDEBUG coll->magic = PMIXP_COLL_STATE_MAGIC; #endif coll->type = type; coll->state = PMIXP_COLL_SYNC; coll->pset.procs = xmalloc(sizeof(*procs) * nprocs); coll->pset.nprocs = nprocs; memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs); if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) { /* TODO: provide ranges output routine */ PMIXP_ERROR("Bad ranges information"); goto err_exit; } #ifdef PMIXP_COLL_DEBUG /* if we debug collectives - store a copy of a full * hostlist to resolve participant id to the hostname */ coll->peers_hl = hostlist_copy(hl); #endif width = slurm_get_tree_width(); coll->peers_cnt = hostlist_count(hl); coll->my_peerid = hostlist_find(hl, pmixp_info_hostname()); reverse_tree_info(coll->my_peerid, coll->peers_cnt, width, &coll->prnt_peerid, &coll->chldrn_cnt, &depth, &max_depth); /* We are interested in the number of direct children */ coll->seq = 0; coll->contrib_children = 0; coll->contrib_local = false; coll->chldrn_ids = xmalloc(sizeof(int) * width); coll->contrib_chld = xmalloc(sizeof(int) * width); coll->chldrn_cnt = reverse_tree_direct_children(coll->my_peerid, coll->peers_cnt, width, depth, coll->chldrn_ids); if (coll->prnt_peerid == -1) { /* if we are the root of the tree: * - we don't have a parent; * - we keep the full list of all children (we don't want * ourselves in it) */ coll->prnt_host = NULL; coll->all_chldrn_hl = hostlist_copy(hl); hostlist_delete_host(coll->all_chldrn_hl, pmixp_info_hostname()); coll->chldrn_str = hostlist_ranged_string_xmalloc(coll->all_chldrn_hl); } else { /* for all other nodes in the tree we need to know: * - nodename of our parent; * - we don't need the list of all children and hl anymore */ /* * setup parent id's */ p = hostlist_nth(hl, coll->prnt_peerid); coll->prnt_host = xstrdup(p); free(p); /* reset prnt_peerid to the global peer */ coll->prnt_peerid = pmixp_info_job_hostid(coll->prnt_host); /* * setup root id's * (we need this for the SLURM API communication case) */ p = hostlist_nth(hl, 0); coll->root_host = xstrdup(p); free(p); /* reset root_peerid to the global peer */ coll->root_peerid = pmixp_info_job_hostid(coll->root_host); /* use empty hostlist here */ coll->all_chldrn_hl = hostlist_create(""); coll->chldrn_str = NULL; } /* fix up children peer ids to the global ones */ for(i=0; i<coll->chldrn_cnt; i++){ p = hostlist_nth(hl, coll->chldrn_ids[i]); coll->chldrn_ids[i] = pmixp_info_job_hostid(p); free(p); } hostlist_destroy(hl); /* Collective state */ coll->ufwd_buf = pmixp_server_buf_new(); coll->dfwd_buf = pmixp_server_buf_new(); _reset_coll_ufwd(coll); _reset_coll_dfwd(coll); coll->cbdata = NULL; coll->cbfunc = NULL; /* init fine-grained lock */ slurm_mutex_init(&coll->lock); return SLURM_SUCCESS; err_exit: return SLURM_ERROR; }
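A hedged sketch of the peer-id bookkeeping above: within one collective a peer id is simply an index into the participant hostlist, so hostname and id convert with hostlist_find()/hostlist_nth(). Both helpers are hypothetical wrappers around calls already used above.

static int peerid_of(hostlist_t hl, const char *hostname)
{
	return hostlist_find(hl, hostname);	/* -1 if not a participant */
}

static char *hostname_of(hostlist_t hl, int peerid)
{
	return hostlist_nth(hl, peerid);	/* caller free()s the copy */
}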
void *_forward_thread(void *arg) { forward_msg_t *fwd_msg = (forward_msg_t *)arg; forward_struct_t *fwd_struct = fwd_msg->fwd_struct; Buf buffer = init_buf(BUF_SIZE); /* probably enough for header */ List ret_list = NULL; int fd = -1; ret_data_info_t *ret_data_info = NULL; char *name = NULL; hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist); slurm_addr_t addr; char *buf = NULL; int steps = 0; int start_timeout = fwd_msg->timeout; /* repeat until we are sure the message was sent */ while ((name = hostlist_shift(hl))) { if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) { error("forward_thread: can't find address for host " "%s, check slurm.conf", name); slurm_mutex_lock(&fwd_struct->forward_mutex); mark_as_failed_forward(&fwd_struct->ret_list, name, SLURM_UNKNOWN_FORWARD_ADDR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(&fwd_struct->forward_mutex); continue; } goto cleanup; } if ((fd = slurm_open_msg_conn(&addr)) < 0) { error("forward_thread to %s: %m", name); slurm_mutex_lock(&fwd_struct->forward_mutex); mark_as_failed_forward( &fwd_struct->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(&fwd_struct->forward_mutex); /* Abandon tree. This way if all the * nodes in the branch are down we * don't have to time out for each * node serially. */ _forward_msg_internal(hl, NULL, fwd_struct, &fwd_msg->header, 0, hostlist_count(hl)); continue; } goto cleanup; } buf = hostlist_ranged_string_xmalloc(hl); xfree(fwd_msg->header.forward.nodelist); fwd_msg->header.forward.nodelist = buf; fwd_msg->header.forward.cnt = hostlist_count(hl); #if 0 info("sending %d forwards (%s) to %s", fwd_msg->header.forward.cnt, fwd_msg->header.forward.nodelist, name); #endif if (fwd_msg->header.forward.nodelist[0]) { debug3("forward: send to %s along with %s", name, fwd_msg->header.forward.nodelist); } else debug3("forward: send to %s ", name); pack_header(&fwd_msg->header, buffer); /* add forward data to buffer */ if (remaining_buf(buffer) < fwd_struct->buf_len) { int new_size = buffer->processed + fwd_struct->buf_len; new_size += 1024; /* padded for paranoia */ xrealloc_nz(buffer->head, new_size); buffer->size = new_size; } if (fwd_struct->buf_len) { memcpy(&buffer->head[buffer->processed], fwd_struct->buf, fwd_struct->buf_len); buffer->processed += fwd_struct->buf_len; } /* * forward message */ if (slurm_msg_sendto(fd, get_buf_data(buffer), get_buf_offset(buffer), SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) { error("forward_thread: slurm_msg_sendto: %m"); slurm_mutex_lock(&fwd_struct->forward_mutex); mark_as_failed_forward(&fwd_struct->ret_list, name, errno); free(name); if (hostlist_count(hl) > 0) { free_buf(buffer); buffer = init_buf(fwd_struct->buf_len); slurm_mutex_unlock(&fwd_struct->forward_mutex); close(fd); fd = -1; /* Abandon tree. This way if all the * nodes in the branch are down we * don't have to time out for each * node serially. */ _forward_msg_internal(hl, NULL, fwd_struct, &fwd_msg->header, 0, hostlist_count(hl)); continue; } goto cleanup; } /* These messages don't have a return message, but if * we got here things worked out so make note of the * list of nodes as success. 
*/ if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) || (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) || (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) { slurm_mutex_lock(&fwd_struct->forward_mutex); ret_data_info = xmalloc(sizeof(ret_data_info_t)); list_push(fwd_struct->ret_list, ret_data_info); ret_data_info->node_name = xstrdup(name); free(name); while ((name = hostlist_shift(hl))) { ret_data_info = xmalloc(sizeof(ret_data_info_t)); list_push(fwd_struct->ret_list, ret_data_info); ret_data_info->node_name = xstrdup(name); free(name); } goto cleanup; } if (fwd_msg->header.forward.cnt > 0) { static int message_timeout = -1; if (message_timeout < 0) message_timeout = slurm_get_msg_timeout() * 1000; if (!fwd_msg->header.forward.tree_width) fwd_msg->header.forward.tree_width = slurm_get_tree_width(); steps = (fwd_msg->header.forward.cnt+1) / fwd_msg->header.forward.tree_width; fwd_msg->timeout = (message_timeout*steps); /* info("got %d * %d = %d", message_timeout, */ /* steps, fwd_msg->timeout); */ steps++; fwd_msg->timeout += (start_timeout*steps); /* info("now + %d*%d = %d", start_timeout, */ /* steps, fwd_msg->timeout); */ } ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout); /* info("sent %d forwards got %d back", */ /* fwd_msg->header.forward.cnt, list_count(ret_list)); */ if (!ret_list || (fwd_msg->header.forward.cnt != 0 && list_count(ret_list) <= 1)) { slurm_mutex_lock(&fwd_struct->forward_mutex); mark_as_failed_forward(&fwd_struct->ret_list, name, errno); free(name); FREE_NULL_LIST(ret_list); if (hostlist_count(hl) > 0) { free_buf(buffer); buffer = init_buf(fwd_struct->buf_len); slurm_mutex_unlock(&fwd_struct->forward_mutex); close(fd); fd = -1; continue; } goto cleanup; } else if ((fwd_msg->header.forward.cnt+1) != list_count(ret_list)) { /* this should never be called since the above should catch the failed forwards and pipe them back down, but this is here so we never have to worry about a locked mutex */ ListIterator itr = NULL; char *tmp = NULL; int first_node_found = 0; hostlist_iterator_t host_itr = hostlist_iterator_create(hl); error("We shouldn't be here. We forwarded to %d " "but only got %d back", (fwd_msg->header.forward.cnt+1), list_count(ret_list)); while ((tmp = hostlist_next(host_itr))) { int node_found = 0; itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr))) { if (!ret_data_info->node_name) { first_node_found = 1; ret_data_info->node_name = xstrdup(name); } if (!xstrcmp(tmp, ret_data_info->node_name)) { node_found = 1; break; } } list_iterator_destroy(itr); if (!node_found) { mark_as_failed_forward( &fwd_struct->ret_list, tmp, SLURM_COMMUNICATIONS_CONNECTION_ERROR); } free(tmp); } hostlist_iterator_destroy(host_itr); if (!first_node_found) { mark_as_failed_forward( &fwd_struct->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); } } break; } slurm_mutex_lock(&fwd_struct->forward_mutex); if (ret_list) { while ((ret_data_info = list_pop(ret_list)) != NULL) { if (!ret_data_info->node_name) { ret_data_info->node_name = xstrdup(name); } list_push(fwd_struct->ret_list, ret_data_info); debug3("got response from %s", ret_data_info->node_name); } FREE_NULL_LIST(ret_list); } free(name); cleanup: if ((fd >= 0) && close(fd) < 0) error ("close(%d): %m", fd); hostlist_destroy(hl); destroy_forward(&fwd_msg->header.forward); free_buf(buffer); slurm_cond_signal(&fwd_struct->notify); slurm_mutex_unlock(&fwd_struct->forward_mutex); xfree(fwd_msg); return (NULL); }
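A worked example of the receive-timeout scaling above, with hypothetical numbers:

/* forward.cnt = 20 remaining nodes, tree_width = 8,
 * message_timeout = 10000 ms, start_timeout = 10000 ms:
 *   steps   = (20 + 1) / 8        = 2
 *   timeout = 10000 * 2           = 20000 ms
 *   steps   = 2 + 1               = 3
 *   timeout = 20000 + 10000 * 3   = 50000 ms
 * i.e. the receive timeout grows with the depth of the remaining fan-out. */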
static int eliminate_nodes (char **hosts) { hostlist_t hl = NULL; hostlist_t hlnew = NULL; hostlist_iterator_t hitr = NULL; ipmidetect_t id = NULL; char *host = NULL; char hostbuf[HOSTLIST_BUFLEN + 1]; int rv = -1; assert (hosts); assert (*hosts); if (!(id = ipmidetect_handle_create ())) { fprintf (stderr, "ipmidetect_handle_create\n"); goto cleanup; } if (ipmidetect_load_data (id, NULL, 0, 0) < 0) { if (ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT || ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT_TIMEOUT) fprintf (stderr, "Error connecting to ipmidetect daemon\n"); else fprintf (stderr, "ipmidetect_load_data: %s\n", ipmidetect_errormsg (id)); goto cleanup; } if (!(hl = hostlist_create (*hosts))) { fprintf (stderr, "hostlist_create: %s\n", strerror (errno)); goto cleanup; } if (!(hlnew = hostlist_create (*hosts))) { fprintf (stderr, "hostlist_create: %s\n", strerror (errno)); goto cleanup; } if (!(hitr = hostlist_iterator_create (hl))) { fprintf (stderr, "hostlist_iterator_create: %s\n", strerror (errno)); goto cleanup; } while ((host = hostlist_next (hitr))) { int ret; if ((ret = ipmidetect_is_node_detected (id, host)) < 0) { if (ipmidetect_errnum (id) == IPMIDETECT_ERR_NOTFOUND) fprintf (stderr, "Node '%s' unrecognized by ipmidetect\n", host); else fprintf (stderr, "ipmidetect_is_node_detected: %s\n", ipmidetect_errormsg (id)); goto cleanup; } if (!ret) hostlist_delete (hlnew, host); free (host); } host = NULL; if (!hostlist_count (hlnew)) { rv = 0; goto cleanup; } memset (hostbuf, '\0', HOSTLIST_BUFLEN + 1); if (hostlist_ranged_string (hlnew, HOSTLIST_BUFLEN, hostbuf) < 0) { fprintf (stderr, "hostlist_ranged_string: truncation\n"); goto cleanup; } free (*hosts); if (!(*hosts = strdup (hostbuf))) { fprintf (stderr, "strdup: %s\n", strerror (errno)); goto cleanup; } rv = hostlist_count (hlnew); cleanup: if (id) ipmidetect_handle_destroy (id); if (hitr) hostlist_iterator_destroy (hitr); if (hl) hostlist_destroy (hl); if (hlnew) hostlist_destroy (hlnew); free (host); return (rv); }
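A hedged usage sketch for eliminate_nodes() above: the caller owns a malloc'd range string, which is rewritten in place when at least one node is detected; "node[1-4]" is a hypothetical host range.

static void eliminate_nodes_example(void)
{
	char *hosts = strdup("node[1-4]");
	int detected;

	if (!hosts)
		return;
	detected = eliminate_nodes(&hosts);
	if (detected < 0)
		fprintf(stderr, "eliminate_nodes failed\n");
	else if (!detected)
		fprintf(stderr, "no nodes detected\n");
	else
		printf("%d node(s) detected: %s\n", detected, hosts);
	free(hosts);
}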
/* * Read a SLURM hostfile specified by "filename". "filename" must contain * a list of SLURM NodeNames, one per line. Reads up to "n" number of hostnames * from the file. Returns a string representing a hostlist ranged string of * the contents of the file. This is a helper function, it does not * contact any SLURM daemons. * * Returns a string representing the hostlist. Returns NULL if there are fewer * than "n" hostnames in the file, or if an error occurs. If "n" == * NO_VAL then the entire file is read in * * Returned string must be freed with free(). */ char *slurm_read_hostfile(char *filename, int n) { FILE *fp = NULL; char in_line[BUFFER_SIZE]; /* input line */ int i, j; int line_size; int line_num = 0; hostlist_t hostlist = NULL; char *nodelist = NULL; if (filename == NULL || strlen(filename) == 0) return NULL; if ((fp = fopen(filename, "r")) == NULL) { error("slurm_allocate_resources error opening file %s, %m", filename); return NULL; } hostlist = hostlist_create(NULL); if (hostlist == NULL) { fclose(fp); return NULL; } while (fgets(in_line, BUFFER_SIZE, fp) != NULL) { line_num++; line_size = strlen(in_line); if (line_size == (BUFFER_SIZE - 1)) { error ("Line %d, of hostfile %s too long", line_num, filename); fclose (fp); hostlist_destroy(hostlist); return NULL; } for (i = 0; i < line_size; i++) { if (in_line[i] == '\n') { in_line[i] = '\0'; break; } if (in_line[i] == '\0') break; if (in_line[i] != '#') continue; if ((i > 0) && (in_line[i - 1] == '\\')) { for (j = i; j < line_size; j++) { in_line[j - 1] = in_line[j]; } line_size--; continue; } in_line[i] = '\0'; break; } hostlist_push(hostlist, in_line); if (n != (int)NO_VAL && hostlist_count(hostlist) == n) break; } fclose(fp); if (hostlist_count(hostlist) <= 0) { error("Hostlist is empty!"); goto cleanup_hostfile; } if (hostlist_count(hostlist) < n) { error("Too few NodeNames in SLURM Hostfile"); goto cleanup_hostfile; } nodelist = (char *)malloc(0xffff); if (!nodelist) { error("Nodelist xmalloc failed"); goto cleanup_hostfile; } if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) { error("Hostlist is too long for the allocate RPC!"); free(nodelist); nodelist = NULL; goto cleanup_hostfile; } debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist); cleanup_hostfile: hostlist_destroy(hostlist); return nodelist; }
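A hedged usage sketch for slurm_read_hostfile() above: "hosts.txt" is a hypothetical file of NodeNames, one per line; per the comment above, passing NO_VAL reads the whole file, and the returned range string is released with free().

static void read_hostfile_example(void)
{
	char fname[] = "hosts.txt";
	char *nodelist = slurm_read_hostfile(fname, NO_VAL);

	if (!nodelist)
		return;		/* open/parse error or too few hosts */
	debug2("hostfile expands to %s", nodelist);
	free(nodelist);
}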
/* * Convert all MPS records to a new entries in a list where each File is a * unique device (i.e. convert a record with "File=nvidia[0-3]" into 4 separate * records). Similar to _build_gpu_list(), but we copy more fields, divide the * "Count" across all MPS records and remove from the original list. */ static List _build_mps_list(List gres_list) { ListIterator itr; gres_slurmd_conf_t *gres_record, *mps_record; List mps_list; hostlist_t hl; char *f_name; uint64_t count_per_file; int mps_no_file_recs = 0, mps_file_recs = 0; if (gres_list == NULL) return NULL; mps_list = list_create(_delete_gres_list); itr = list_iterator_create(gres_list); while ((gres_record = list_next(itr))) { if (xstrcmp(gres_record->name, "mps")) continue; if (!gres_record->file) { if (mps_no_file_recs) fatal("gres/mps: bad configuration, multiple configurations without \"File\""); if (mps_file_recs) fatal("gres/mps: multiple configurations with and without \"File\""); mps_no_file_recs++; mps_record = xmalloc(sizeof(gres_slurmd_conf_t)); mps_record->config_flags = gres_record->config_flags; if (gres_record->type_name) mps_record->config_flags |= GRES_CONF_HAS_TYPE; mps_record->count = gres_record->count; mps_record->cpu_cnt = gres_record->cpu_cnt; mps_record->cpus = xstrdup(gres_record->cpus); if (gres_record->cpus_bitmap) { mps_record->cpus_bitmap = bit_copy(gres_record->cpus_bitmap); } mps_record->name = xstrdup(gres_record->name); mps_record->plugin_id = gres_record->plugin_id; mps_record->type_name = xstrdup(gres_record->type_name); list_append(mps_list, mps_record); } else { mps_file_recs++; if (mps_no_file_recs) fatal("gres/mps: multiple configurations with and without \"File\""); hl = hostlist_create(gres_record->file); count_per_file = gres_record->count/hostlist_count(hl); while ((f_name = hostlist_shift(hl))) { mps_record =xmalloc(sizeof(gres_slurmd_conf_t)); mps_record->config_flags = gres_record->config_flags; if (gres_record->type_name) { mps_record->config_flags |= GRES_CONF_HAS_TYPE; } mps_record->count = count_per_file; mps_record->cpu_cnt = gres_record->cpu_cnt; mps_record->cpus = xstrdup(gres_record->cpus); if (gres_record->cpus_bitmap) { mps_record->cpus_bitmap = bit_copy(gres_record->cpus_bitmap); } mps_record->file = xstrdup(f_name); mps_record->name = xstrdup(gres_record->name); mps_record->plugin_id = gres_record->plugin_id; mps_record->type_name = xstrdup(gres_record->type_name); list_append(mps_list, mps_record); free(f_name); } hostlist_destroy(hl); } (void) list_delete_item(itr); } list_iterator_destroy(itr); return mps_list; }
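A hedged sketch of the Count division above: a shared MPS count is split evenly across every device named by the File= expression. The gres.conf-style values in the comments are hypothetical; mps_count_per_file() is illustration only.

static uint64_t mps_count_per_file(uint64_t total_count, const char *file_expr)
{
	hostlist_t hl = hostlist_create(file_expr);	/* e.g. "nvidia[0-3]" */
	uint64_t per_file;

	if (!hl)
		return 0;
	if (!hostlist_count(hl)) {
		hostlist_destroy(hl);
		return 0;
	}
	per_file = total_count / hostlist_count(hl);	/* e.g. 400 / 4 = 100 */
	hostlist_destroy(hl);
	return per_file;
}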
/* * _set_collectors call the split_hostlist API on the all nodes hostlist * to set the node to be used as a collector for unsolicited node aggregation. * * If this node is a forwarding node (first node in any hostlist), * then its collector and backup are the ControlMachine and it's backup. * * Otherwise, we find the hostlist containing this node. * The forwarding node in that hostlist becomes a collector, the next node * which is not this node becomes the backup. * That list is split, we iterate through it and searching for a list in * which this node is a forwarding node. If found, we set the collector and * backup, else this process is repeated. */ static void _set_collectors(char *this_node_name) { slurm_ctl_conf_t *conf; hostlist_t nodes; hostlist_t* hll = NULL; char *parent = NULL, *backup = NULL; char addrbuf[32]; int i, j, f; int hl_count = 0; uint16_t parent_port; uint16_t backup_port; bool found = false; bool ctldparent = true; #ifdef HAVE_FRONT_END return; /* on a FrontEnd system this would never be useful. */ #endif if (!run_in_daemon("slurmd")) return; /* Only compute nodes have collectors */ /* Set the initial iteration, collector is controller, * full list is split */ xassert(this_node_name); conf = slurm_conf_lock(); nodes = _get_all_nodes(); parent = strdup(conf->control_addr); if (conf->backup_addr) { backup = strdup(conf->backup_addr); } parent_port = conf->slurmctld_port; backup_port = parent_port; slurm_conf_unlock(); while (!found) { if ( route_g_split_hostlist(nodes, &hll, &hl_count) ) { error("unable to split forward hostlist"); goto clean; /* collector addrs remains null */ } /* Find which hostlist contains this node */ for (i=0; i < hl_count; i++) { f = hostlist_find(hll[i], this_node_name); if (f != -1) break; } if (i == hl_count) { fatal("ROUTE -- %s not found in node_record_table", this_node_name); } if (f == 0) { /* we are a forwarded to node, * so our parent is parent */ if (hostlist_count(hll[i]) > 1) this_is_collector = true; xfree(msg_collect_node); msg_collect_node = xmalloc(sizeof(slurm_addr_t)); if (ctldparent) slurm_set_addr(msg_collect_node, parent_port, parent); else { slurm_conf_get_addr(parent, msg_collect_node); msg_collect_node->sin_port = htons(parent_port); } if (debug_flags & DEBUG_FLAG_ROUTE) { slurm_print_slurm_addr(msg_collect_node, addrbuf, 32); info("ROUTE -- message collector address is %s", addrbuf); } xfree(msg_collect_backup); if (backup) { msg_collect_backup = xmalloc(sizeof(slurm_addr_t)); if (ctldparent) { slurm_set_addr(msg_collect_backup, backup_port, backup); } else { slurm_conf_get_addr(backup, msg_collect_backup); msg_collect_backup->sin_port = htons(backup_port); } if (debug_flags & DEBUG_FLAG_ROUTE) { slurm_print_slurm_addr( msg_collect_backup, addrbuf, 32); info("ROUTE -- message collector backup" " address is %s", addrbuf); } } else { if (debug_flags & DEBUG_FLAG_ROUTE) { info("ROUTE -- no message collector " "backup"); } } found = true; goto clean; } /* We are not a forwarding node, the first node in this list * will split the forward_list. * We also know that the forwarding node is not a controller. 
* * clean up parent context */ ctldparent = false; hostlist_destroy(nodes); if (parent) free(parent); if (backup) free(backup); nodes = hostlist_copy(hll[i]); for (j=0; j < hl_count; j++) { hostlist_destroy(hll[j]); } xfree(hll); /* set our parent, backup, and continue search */ parent = hostlist_shift(nodes); backup = hostlist_nth(nodes, 0); if (strcmp(backup, this_node_name) == 0) { free(backup); backup = NULL; if (hostlist_count(nodes) > 1) backup = hostlist_nth(nodes, 1); } parent_port = slurm_conf_get_port(parent); if (backup) { backup_port = slurm_conf_get_port(backup); } else backup_port = 0; } clean: if (debug_flags & DEBUG_FLAG_ROUTE) { if (this_is_collector) info("ROUTE -- %s is a collector node", this_node_name); else info("ROUTE -- %s is a leaf node", this_node_name); } hostlist_destroy(nodes); if (parent) free(parent); if (backup) free(backup); for (i=0; i < hl_count; i++) { hostlist_destroy(hll[i]); } xfree(hll); }
void print_fields(type_t type, void *object) { if (!object) { fatal ("Job or step record is NULL"); return; } slurmdb_job_rec_t *job = (slurmdb_job_rec_t *)object; slurmdb_step_rec_t *step = (slurmdb_step_rec_t *)object; jobcomp_job_rec_t *job_comp = (jobcomp_job_rec_t *)object; print_field_t *field = NULL; int curr_inx = 1; struct passwd *pw = NULL; struct group *gr = NULL; char outbuf[FORMAT_STRING_SIZE]; bool got_stats = false; int cpu_tres_rec_count = 0; int step_cpu_tres_rec_count = 0; switch(type) { case JOB: step = NULL; if (!job->track_steps) step = (slurmdb_step_rec_t *)job->first_step_ptr; /* set this to avoid printing out info for things that don't mean anything. Like an allocation that never ran anything. */ if (!step) job->track_steps = 1; else step_cpu_tres_rec_count = slurmdb_find_tres_count_in_string( step->tres_alloc_str, TRES_CPU); if (job->stats.cpu_min != NO_VAL) got_stats = true; job_comp = NULL; cpu_tres_rec_count = slurmdb_find_tres_count_in_string( job->tres_alloc_str, TRES_CPU); break; case JOBSTEP: job = step->job_ptr; if (step->stats.cpu_min != NO_VAL) got_stats = true; if (!(step_cpu_tres_rec_count = slurmdb_find_tres_count_in_string( step->tres_alloc_str, TRES_CPU))) step_cpu_tres_rec_count = slurmdb_find_tres_count_in_string( job->tres_alloc_str, TRES_CPU); job_comp = NULL; break; case JOBCOMP: job = NULL; step = NULL; break; default: break; } list_iterator_reset(print_fields_itr); while((field = list_next(print_fields_itr))) { char *tmp_char = NULL, id[FORMAT_STRING_SIZE]; int tmp_int = NO_VAL, tmp_int2 = NO_VAL; double tmp_dub = (double)NO_VAL; uint32_t tmp_uint32 = (uint32_t)NO_VAL; uint64_t tmp_uint64 = (uint64_t)NO_VAL; memset(&outbuf, 0, sizeof(outbuf)); switch(field->type) { case PRINT_ALLOC_CPUS: switch(type) { case JOB: tmp_int = cpu_tres_rec_count; // we want to use the step info if (!step) break; case JOBSTEP: tmp_int = step_cpu_tres_rec_count; break; case JOBCOMP: default: tmp_int = job_comp->proc_cnt; break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_ALLOC_GRES: switch(type) { case JOB: tmp_char = job->alloc_gres; break; case JOBSTEP: tmp_char = step->job_ptr->alloc_gres; break; case JOBCOMP: default: tmp_char = NULL; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_ACCOUNT: switch(type) { case JOB: tmp_char = job->account; break; case JOBSTEP: tmp_char = step->job_ptr->account; break; case JOBCOMP: default: tmp_char = "n/a"; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_ACT_CPUFREQ: if (got_stats) { switch (type) { case JOB: if (!job->track_steps) tmp_dub = step->stats.act_cpufreq; break; case JOBSTEP: tmp_dub = step->stats.act_cpufreq; break; default: break; } } if (!fuzzy_equal(tmp_dub, NO_VAL)) _local_convert_num_unit2((double)tmp_dub, outbuf, sizeof(outbuf), UNIT_KILO, 1000, false); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_ASSOCID: switch(type) { case JOB: tmp_int = job->associd; break; case JOBSTEP: tmp_int = step->job_ptr->associd; break; case JOBCOMP: default: tmp_int = NO_VAL; break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_AVECPU: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job->stats.cpu_ave; break; case JOBSTEP: tmp_dub = step->stats.cpu_ave; break; case JOBCOMP: default: break; } } if (!fuzzy_equal(tmp_dub, NO_VAL)) tmp_char = _elapsed_time((long)tmp_dub, 0); field->print_routine(field, 
tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_AVEDISKREAD: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job-> stats.disk_read_ave; break; case JOBSTEP: tmp_dub = step->stats.disk_read_ave; break; case JOBCOMP: default: break; } } _print_small_double(outbuf, sizeof(outbuf), tmp_dub, UNIT_MEGA); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_AVEDISKWRITE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job-> stats.disk_write_ave; break; case JOBSTEP: tmp_dub = step->stats.disk_write_ave; break; case JOBCOMP: default: break; } } _print_small_double(outbuf, sizeof(outbuf), tmp_dub, UNIT_MEGA); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_AVEPAGES: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job->stats.pages_ave; break; case JOBSTEP: tmp_dub = step->stats.pages_ave; break; case JOBCOMP: default: break; } } if (!fuzzy_equal(tmp_dub, NO_VAL)) _local_convert_num_unit((double)tmp_dub, outbuf, sizeof(outbuf), UNIT_KILO); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_AVERSS: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job->stats.rss_ave; break; case JOBSTEP: tmp_dub = step->stats.rss_ave; break; case JOBCOMP: default: break; } } if (!fuzzy_equal(tmp_dub, NO_VAL)) _local_convert_num_unit((double)tmp_dub, outbuf, sizeof(outbuf), UNIT_KILO); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_AVEVSIZE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job->stats.vsize_ave; break; case JOBSTEP: tmp_dub = step->stats.vsize_ave; break; case JOBCOMP: default: break; } } if (!fuzzy_equal(tmp_dub, NO_VAL)) _local_convert_num_unit((double)tmp_dub, outbuf, sizeof(outbuf), UNIT_KILO); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_BLOCKID: switch(type) { case JOB: tmp_char = job->blockid; break; case JOBSTEP: break; case JOBCOMP: tmp_char = job_comp->blockid; break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_CLUSTER: switch(type) { case JOB: tmp_char = job->cluster; break; case JOBSTEP: tmp_char = step->job_ptr->cluster; break; case JOBCOMP: default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_COMMENT: switch(type) { case JOB: tmp_char = job->derived_es; break; case JOBSTEP: case JOBCOMP: default: tmp_char = NULL; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_CONSUMED_ENERGY: if (got_stats) { switch (type) { case JOB: if (!job->track_steps) tmp_dub = step-> stats.consumed_energy; break; case JOBSTEP: tmp_dub = step->stats.consumed_energy; break; default: break; } } if (!fuzzy_equal(tmp_dub, NO_VAL)) _local_convert_num_unit2((double)tmp_dub, outbuf, sizeof(outbuf), UNIT_NONE, 1000, false); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_CONSUMED_ENERGY_RAW: if (got_stats) { switch (type) { case JOB: if (!job->track_steps) tmp_dub = step-> stats.consumed_energy; break; case JOBSTEP: tmp_dub = step->stats.consumed_energy; break; default: break; } } field->print_routine(field, tmp_dub, (curr_inx == field_count)); break; case PRINT_CPU_TIME: switch(type) { case JOB: tmp_uint64 = (uint64_t)job->elapsed * (uint64_t)cpu_tres_rec_count; break; case JOBSTEP: tmp_uint64 = (uint64_t)step->elapsed * 
(uint64_t)step_cpu_tres_rec_count; break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_uint64, (curr_inx == field_count)); break; case PRINT_CPU_TIME_RAW: switch(type) { case JOB: tmp_uint64 = (uint64_t)job->elapsed * (uint64_t)cpu_tres_rec_count; break; case JOBSTEP: tmp_uint64 = (uint64_t)step->elapsed * (uint64_t)step_cpu_tres_rec_count; break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_uint64, (curr_inx == field_count)); break; case PRINT_DERIVED_EC: tmp_int2 = 0; switch(type) { case JOB: tmp_int = job->derived_ec; if (tmp_int == NO_VAL) tmp_int = 0; if (WIFSIGNALED(tmp_int)) tmp_int2 = WTERMSIG(tmp_int); snprintf(outbuf, sizeof(outbuf), "%d:%d", WEXITSTATUS(tmp_int), tmp_int2); break; case JOBSTEP: case JOBCOMP: default: outbuf[0] = '\0'; break; } field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_ELAPSED: switch(type) { case JOB: tmp_int = job->elapsed; break; case JOBSTEP: tmp_int = step->elapsed; break; case JOBCOMP: tmp_int = job_comp->elapsed_time; break; default: tmp_int = NO_VAL; break; } field->print_routine(field, (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_ELIGIBLE: switch(type) { case JOB: tmp_int = job->eligible; break; case JOBSTEP: tmp_int = step->start; break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_END: switch(type) { case JOB: tmp_int = job->end; break; case JOBSTEP: tmp_int = step->end; break; case JOBCOMP: tmp_int = parse_time(job_comp->end_time, 1); break; default: tmp_int = NO_VAL; break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_EXITCODE: tmp_int = 0; tmp_int2 = 0; switch(type) { case JOB: tmp_int = job->exitcode; break; case JOBSTEP: tmp_int = step->exitcode; break; case JOBCOMP: default: break; } if (tmp_int != NO_VAL) { if (WIFSIGNALED(tmp_int)) tmp_int2 = WTERMSIG(tmp_int); tmp_int = WEXITSTATUS(tmp_int); if (tmp_int >= 128) tmp_int -= 128; } snprintf(outbuf, sizeof(outbuf), "%d:%d", tmp_int, tmp_int2); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_GID: switch(type) { case JOB: tmp_int = job->gid; break; case JOBSTEP: tmp_int = NO_VAL; break; case JOBCOMP: tmp_int = job_comp->gid; break; default: tmp_int = NO_VAL; break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_GROUP: switch(type) { case JOB: tmp_int = job->gid; break; case JOBSTEP: tmp_int = NO_VAL; break; case JOBCOMP: tmp_int = job_comp->gid; break; default: tmp_int = NO_VAL; break; } tmp_char = NULL; if ((gr=getgrgid(tmp_int))) tmp_char=gr->gr_name; field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_JOBID: if (type == JOBSTEP) job = step->job_ptr; if (job) { if (job->array_task_str) { _xlate_task_str(job); snprintf(id, FORMAT_STRING_SIZE, "%u_[%s]", job->array_job_id, job->array_task_str); } else if (job->array_task_id != NO_VAL) snprintf(id, FORMAT_STRING_SIZE, "%u_%u", job->array_job_id, job->array_task_id); else snprintf(id, FORMAT_STRING_SIZE, "%u", job->jobid); } switch (type) { case JOB: tmp_char = xstrdup(id); break; case JOBSTEP: if (step->stepid == SLURM_BATCH_SCRIPT) { tmp_char = xstrdup_printf( "%s.batch", id); } else if (step->stepid == SLURM_EXTERN_CONT) { tmp_char = xstrdup_printf( "%s.extern", id); } else { tmp_char = xstrdup_printf( "%s.%u", id, step->stepid); } break; case JOBCOMP: tmp_char = xstrdup_printf("%u", job_comp->jobid); break; default: 
break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_JOBIDRAW: switch (type) { case JOB: tmp_char = xstrdup_printf("%u", job->jobid); break; case JOBSTEP: if (step->stepid == SLURM_BATCH_SCRIPT) { tmp_char = xstrdup_printf( "%u.batch", step->job_ptr->jobid); } else if (step->stepid == SLURM_EXTERN_CONT) { tmp_char = xstrdup_printf( "%u.extern", step->job_ptr->jobid); } else { tmp_char = xstrdup_printf( "%u.%u", step->job_ptr->jobid, step->stepid); } break; case JOBCOMP: tmp_char = xstrdup_printf("%u", job_comp->jobid); break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_JOBNAME: switch(type) { case JOB: tmp_char = job->jobname; break; case JOBSTEP: tmp_char = step->stepname; break; case JOBCOMP: tmp_char = job_comp->jobname; break; default: tmp_char = NULL; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_LAYOUT: switch(type) { case JOB: /* below really should be step. It is not a typo */ if (!job->track_steps) tmp_char = slurm_step_layout_type_name( step->task_dist); break; case JOBSTEP: tmp_char = slurm_step_layout_type_name( step->task_dist); break; case JOBCOMP: break; default: tmp_char = NULL; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_MAXDISKREAD: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job-> stats.disk_read_max; break; case JOBSTEP: tmp_dub = step->stats.disk_read_max; break; case JOBCOMP: default: break; } } _print_small_double(outbuf, sizeof(outbuf), tmp_dub, UNIT_MEGA); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_MAXDISKREADNODE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_char = find_hostname( job->stats. disk_read_max_nodeid, job->nodes); break; case JOBSTEP: tmp_char = find_hostname( step->stats. disk_read_max_nodeid, step->nodes); break; case JOBCOMP: default: tmp_char = NULL; break; } } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MAXDISKREADTASK: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint32 = job->stats. disk_read_max_taskid; break; case JOBSTEP: tmp_uint32 = step->stats. disk_read_max_taskid; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_MAXDISKWRITE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job->stats. disk_write_max; break; case JOBSTEP: tmp_dub = step->stats.disk_write_max; break; case JOBCOMP: default: break; } } _print_small_double(outbuf, sizeof(outbuf), tmp_dub, UNIT_MEGA); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_MAXDISKWRITENODE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_char = find_hostname( job->stats. disk_write_max_nodeid, job->nodes); break; case JOBSTEP: tmp_char = find_hostname( step->stats. disk_write_max_nodeid, step->nodes); break; case JOBCOMP: default: tmp_char = NULL; break; } } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MAXDISKWRITETASK: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint32 = job->stats. disk_write_max_taskid; break; case JOBSTEP: tmp_uint32 = step->stats. 
disk_write_max_taskid; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; } field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_MAXPAGES: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint64 = job->stats.pages_max; break; case JOBSTEP: tmp_uint64 = step->stats.pages_max; break; case JOBCOMP: default: break; } if (tmp_uint64 != (uint64_t)NO_VAL) _local_convert_num_unit( (double)tmp_uint64, outbuf, sizeof(outbuf), UNIT_KILO); } field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_MAXPAGESNODE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_char = find_hostname( job->stats. pages_max_nodeid, job->nodes); break; case JOBSTEP: tmp_char = find_hostname( step->stats.pages_max_nodeid, step->nodes); break; case JOBCOMP: default: tmp_char = NULL; break; } } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MAXPAGESTASK: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint32 = job->stats. pages_max_taskid; break; case JOBSTEP: tmp_uint32 = step->stats. pages_max_taskid; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; } field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_MAXRSS: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint64 = job->stats.rss_max; break; case JOBSTEP: tmp_uint64 = step->stats.rss_max; break; case JOBCOMP: default: break; } if (tmp_uint64 != (uint64_t)NO_VAL) _local_convert_num_unit( (double)tmp_uint64, outbuf, sizeof(outbuf), UNIT_KILO); } field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_MAXRSSNODE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_char = find_hostname( job->stats. rss_max_nodeid, job->nodes); break; case JOBSTEP: tmp_char = find_hostname( step->stats.rss_max_nodeid, step->nodes); break; case JOBCOMP: default: tmp_char = NULL; break; } } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MAXRSSTASK: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint32 = job->stats. rss_max_taskid; break; case JOBSTEP: tmp_uint32 = step->stats.rss_max_taskid; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; } field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_MAXVSIZE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint64 = job->stats. vsize_max; break; case JOBSTEP: tmp_uint64 = step->stats.vsize_max; break; case JOBCOMP: default: tmp_uint64 = (uint64_t)NO_VAL; break; } if (tmp_uint64 != (uint64_t)NO_VAL) _local_convert_num_unit( (double)tmp_uint64, outbuf, sizeof(outbuf), UNIT_KILO); } field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_MAXVSIZENODE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_char = find_hostname( job->stats. vsize_max_nodeid, job->nodes); break; case JOBSTEP: tmp_char = find_hostname( step->stats.vsize_max_nodeid, step->nodes); break; case JOBCOMP: default: tmp_char = NULL; break; } } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MAXVSIZETASK: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint32 = job->stats. 
vsize_max_taskid; break; case JOBSTEP: tmp_uint32 = step->stats. vsize_max_taskid; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; } field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_MINCPU: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_dub = job->stats.cpu_min; break; case JOBSTEP: tmp_dub = step->stats.cpu_min; break; case JOBCOMP: default: break; } if (!fuzzy_equal(tmp_dub, NO_VAL)) tmp_char = _elapsed_time( (long)tmp_dub, 0); } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MINCPUNODE: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_char = find_hostname( job->stats. cpu_min_nodeid, job->nodes); break; case JOBSTEP: tmp_char = find_hostname( step->stats.cpu_min_nodeid, step->nodes); break; case JOBCOMP: default: tmp_char = NULL; break; } } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_MINCPUTASK: if (got_stats) { switch(type) { case JOB: if (!job->track_steps) tmp_uint32 = job->stats. cpu_min_taskid; break; case JOBSTEP: tmp_uint32 = step->stats.cpu_min_taskid; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; } field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_NODELIST: switch(type) { case JOB: tmp_char = job->nodes; break; case JOBSTEP: tmp_char = step->nodes; break; case JOBCOMP: tmp_char = job_comp->nodelist; break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_NNODES: switch(type) { case JOB: tmp_int = job->alloc_nodes; tmp_char = job->nodes; break; case JOBSTEP: tmp_int = step->nnodes; tmp_char = step->nodes; break; case JOBCOMP: tmp_int = job_comp->node_cnt; tmp_char = job_comp->nodelist; break; default: break; } if (!tmp_int) { hostlist_t hl = hostlist_create(tmp_char); tmp_int = hostlist_count(hl); hostlist_destroy(hl); } _local_convert_num_unit((double)tmp_int, outbuf, sizeof(outbuf), UNIT_NONE); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_NTASKS: switch(type) { case JOB: if (!job->track_steps && !step) tmp_int = cpu_tres_rec_count; // we want to use the step info if (!step) break; case JOBSTEP: tmp_int = step->ntasks; break; case JOBCOMP: default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_PRIO: switch(type) { case JOB: tmp_int = job->priority; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_PARTITION: switch(type) { case JOB: tmp_char = job->partition; break; case JOBSTEP: break; case JOBCOMP: tmp_char = job_comp->partition; break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_QOS: switch(type) { case JOB: tmp_int = job->qosid; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } if (!g_qos_list) { slurmdb_qos_cond_t qos_cond; memset(&qos_cond, 0, sizeof(slurmdb_qos_cond_t)); qos_cond.with_deleted = 1; g_qos_list = slurmdb_qos_get( acct_db_conn, &qos_cond); } tmp_char = _find_qos_name_from_list(g_qos_list, tmp_int); field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_QOSRAW: switch(type) { case JOB: tmp_int = job->qosid; break; case JOBSTEP: break; case JOBCOMP: break; default: 
break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_REQ_CPUFREQ_MIN: switch (type) { case JOB: if (!job->track_steps && !step) tmp_dub = NO_VAL; // we want to use the step info if (!step) break; case JOBSTEP: tmp_dub = step->req_cpufreq_min; break; default: break; } cpu_freq_to_string(outbuf, sizeof(outbuf), tmp_dub); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_REQ_CPUFREQ_MAX: switch (type) { case JOB: if (!job->track_steps && !step) tmp_dub = NO_VAL; // we want to use the step info if (!step) break; case JOBSTEP: tmp_dub = step->req_cpufreq_max; break; default: break; } cpu_freq_to_string(outbuf, sizeof(outbuf), tmp_dub); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_REQ_CPUFREQ_GOV: switch (type) { case JOB: if (!job->track_steps && !step) tmp_dub = NO_VAL; // we want to use the step info if (!step) break; case JOBSTEP: tmp_dub = step->req_cpufreq_gov; break; default: break; } cpu_freq_to_string(outbuf, sizeof(outbuf), tmp_dub); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_REQ_CPUS: switch(type) { case JOB: tmp_int = job->req_cpus; break; case JOBSTEP: tmp_int = step_cpu_tres_rec_count; break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_REQ_GRES: switch(type) { case JOB: tmp_char = job->req_gres; break; case JOBSTEP: tmp_char = step->job_ptr->req_gres; break; case JOBCOMP: default: tmp_char = NULL; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_REQ_MEM: switch(type) { case JOB: tmp_uint32 = job->req_mem; break; case JOBSTEP: tmp_uint32 = step->job_ptr->req_mem; break; case JOBCOMP: default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 != (uint32_t)NO_VAL) { bool per_cpu = false; if (tmp_uint32 & MEM_PER_CPU) { tmp_uint32 &= (~MEM_PER_CPU); per_cpu = true; } _local_convert_num_unit((double)tmp_uint32, outbuf, sizeof(outbuf), UNIT_MEGA); if (per_cpu) sprintf(outbuf+strlen(outbuf), "c"); else sprintf(outbuf+strlen(outbuf), "n"); } field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_RESERVATION: switch(type) { case JOB: if (job->resv_name) { tmp_char = job->resv_name; } else { tmp_char = NULL; } break; case JOBSTEP: tmp_char = NULL; break; case JOBCOMP: tmp_char = NULL; break; default: tmp_char = NULL; break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_RESERVATION_ID: switch(type) { case JOB: if (job->resvid) tmp_uint32 = job->resvid; else tmp_uint32 = NO_VAL; break; case JOBSTEP: tmp_uint32 = NO_VAL; break; case JOBCOMP: tmp_uint32 = NO_VAL; break; default: tmp_uint32 = NO_VAL; break; } if (tmp_uint32 == (uint32_t)NO_VAL) tmp_uint32 = NO_VAL; field->print_routine(field, tmp_uint32, (curr_inx == field_count)); break; case PRINT_RESV: switch(type) { case JOB: if (job->start) tmp_int = job->start - job->eligible; else tmp_int = time(NULL) - job->eligible; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } field->print_routine(field, (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_RESV_CPU: switch(type) { case JOB: if (job->start) tmp_int = (job->start - job->eligible) * job->req_cpus; else tmp_int = (time(NULL) - job->eligible) * job->req_cpus; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } field->print_routine(field, (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_RESV_CPU_RAW: 
switch(type) { case JOB: if (job->start) tmp_int = (job->start - job->eligible) * job->req_cpus; else tmp_int = (time(NULL) - job->eligible) * job->req_cpus; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_START: switch(type) { case JOB: tmp_int = job->start; break; case JOBSTEP: tmp_int = step->start; break; case JOBCOMP: tmp_int = parse_time(job_comp->start_time, 1); break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_STATE: switch(type) { case JOB: tmp_int = job->state; tmp_int2 = job->requid; break; case JOBSTEP: tmp_int = step->state; tmp_int2 = step->requid; break; case JOBCOMP: tmp_char = job_comp->state; break; default: break; } if (((tmp_int & JOB_STATE_BASE) == JOB_CANCELLED) && (tmp_int2 != -1)) snprintf(outbuf, FORMAT_STRING_SIZE, "%s by %d", job_state_string(tmp_int), tmp_int2); else if (tmp_int != NO_VAL) snprintf(outbuf, FORMAT_STRING_SIZE, "%s", job_state_string(tmp_int)); else if (tmp_char) snprintf(outbuf, FORMAT_STRING_SIZE, "%s", tmp_char); field->print_routine(field, outbuf, (curr_inx == field_count)); break; case PRINT_SUBMIT: switch(type) { case JOB: tmp_int = job->submit; break; case JOBSTEP: tmp_int = step->start; break; case JOBCOMP: tmp_int = parse_time(job_comp->start_time, 1); break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_SUSPENDED: switch(type) { case JOB: tmp_int = job->suspended; break; case JOBSTEP: tmp_int = step->suspended; break; case JOBCOMP: break; default: break; } field->print_routine(field, (uint64_t)tmp_int, (curr_inx == field_count)); break; case PRINT_SYSTEMCPU: if (got_stats) { switch(type) { case JOB: tmp_int = job->sys_cpu_sec; tmp_int2 = job->sys_cpu_usec; break; case JOBSTEP: tmp_int = step->sys_cpu_sec; tmp_int2 = step->sys_cpu_usec; break; case JOBCOMP: default: break; } tmp_char = _elapsed_time(tmp_int, tmp_int2); } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_TIMELIMIT: switch(type) { case JOB: if (job->timelimit == INFINITE) tmp_char = "UNLIMITED"; else if (job->timelimit == NO_VAL) tmp_char = "Partition_Limit"; else if (job->timelimit) { char tmp1[128]; mins2time_str(job->timelimit, tmp1, sizeof(tmp1)); tmp_char = tmp1; } break; case JOBSTEP: break; case JOBCOMP: tmp_char = job_comp->timelimit; break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_TOTALCPU: switch(type) { case JOB: tmp_int = job->tot_cpu_sec; tmp_int2 = job->tot_cpu_usec; break; case JOBSTEP: tmp_int = step->tot_cpu_sec; tmp_int2 = step->tot_cpu_usec; break; case JOBCOMP: break; default: break; } tmp_char = _elapsed_time(tmp_int, tmp_int2); field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_TRES: switch(type) { case JOB: tmp_char = job->tres_alloc_str; break; case JOBSTEP: tmp_char = step->tres_alloc_str; break; case JOBCOMP: default: tmp_char = NULL; break; } if (!g_tres_list) { slurmdb_tres_cond_t tres_cond; memset(&tres_cond, 0, sizeof(slurmdb_tres_cond_t)); tres_cond.with_deleted = 1; g_tres_list = slurmdb_tres_get( acct_db_conn, &tres_cond); } tmp_char = slurmdb_make_tres_string_from_simple( tmp_char, g_tres_list); field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_UID: switch(type) { case JOB: if (job->user) { if ((pw=getpwnam(job->user))) 
tmp_int = pw->pw_uid; } else tmp_int = job->uid; break; case JOBSTEP: break; case JOBCOMP: tmp_int = job_comp->uid; break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; case PRINT_USER: switch(type) { case JOB: if (job->user) tmp_char = job->user; else if (job->uid != -1) { if ((pw=getpwuid(job->uid))) tmp_char = pw->pw_name; } break; case JOBSTEP: break; case JOBCOMP: tmp_char = job_comp->uid_name; break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_USERCPU: if (got_stats) { switch(type) { case JOB: tmp_int = job->user_cpu_sec; tmp_int2 = job->user_cpu_usec; break; case JOBSTEP: tmp_int = step->user_cpu_sec; tmp_int2 = step->user_cpu_usec; break; case JOBCOMP: default: break; } tmp_char = _elapsed_time(tmp_int, tmp_int2); } field->print_routine(field, tmp_char, (curr_inx == field_count)); xfree(tmp_char); break; case PRINT_WCKEY: switch(type) { case JOB: tmp_char = job->wckey; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_char, (curr_inx == field_count)); break; case PRINT_WCKEYID: switch(type) { case JOB: tmp_int = job->wckeyid; break; case JOBSTEP: break; case JOBCOMP: break; default: break; } field->print_routine(field, tmp_int, (curr_inx == field_count)); break; default: break; } curr_inx++; } printf("\n"); }
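/*
 * Minimal sketch of the field-dispatch pattern used above, assuming the
 * print_field_t layout from sacct/sacctmgr (a type tag plus a
 * print_routine callback).  The _print_one_record name and the "12345"
 * value are hypothetical placeholders, not part of the original code.
 */
static void _print_one_record(List print_fields_list, int field_count)
{
	ListIterator itr = list_iterator_create(print_fields_list);
	print_field_t *field;
	int curr_inx = 1;

	while ((field = list_next(itr))) {
		switch (field->type) {
		case PRINT_JOBIDRAW:	/* one case per requested column */
			field->print_routine(field, "12345",
					     (curr_inx == field_count));
			break;
		default:		/* unknown column: print nothing */
			field->print_routine(field, NULL,
					     (curr_inx == field_count));
			break;
		}
		curr_inx++;
	}
	list_iterator_destroy(itr);
	printf("\n");	/* one output line per record */
}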
/* * Read a SLURM hostfile specified by "filename". "filename" must contain * a list of SLURM NodeNames, one per line. Reads up to "n" number of hostnames * from the file. Returns a string representing a hostlist ranged string of * the contents of the file. This is a helper function, it does not * contact any SLURM daemons. * * Returns a string representing the hostlist. Returns NULL if there are fewer * than "n" hostnames in the file, or if an error occurs. If "n" == * NO_VAL then the entire file is read in * * Returned string must be freed with free(). */ char *slurm_read_hostfile(char *filename, int n) { FILE *fp = NULL; char in_line[BUFFER_SIZE]; /* input line */ int i, j; int line_size; int line_num = 0; hostlist_t hostlist = NULL; char *nodelist = NULL; char *asterisk, *tmp_text, *save_ptr = NULL, *host_name; int total_file_len = 0; if (filename == NULL || strlen(filename) == 0) return NULL; if ((fp = fopen(filename, "r")) == NULL) { error("slurm_allocate_resources error opening file %s, %m", filename); return NULL; } hostlist = hostlist_create(NULL); if (hostlist == NULL) { fclose(fp); return NULL; } while (fgets(in_line, BUFFER_SIZE, fp) != NULL) { line_num++; if (!isalpha(in_line[0]) && !isdigit(in_line[0])) { error ("Invalid hostfile %s contents on line %d", filename, line_num); fclose (fp); hostlist_destroy(hostlist); return NULL; } line_size = strlen(in_line); total_file_len += line_size; if (line_size == (BUFFER_SIZE - 1)) { error ("Line %d, of hostfile %s too long", line_num, filename); fclose (fp); hostlist_destroy(hostlist); return NULL; } for (i = 0; i < line_size; i++) { if (in_line[i] == '\n') { in_line[i] = '\0'; break; } if (in_line[i] == '\0') break; if (in_line[i] != '#') continue; if ((i > 0) && (in_line[i - 1] == '\\')) { for (j = i; j < line_size; j++) { in_line[j - 1] = in_line[j]; } line_size--; continue; } in_line[i] = '\0'; break; } tmp_text = xstrdup(in_line); host_name = strtok_r(tmp_text, ",", &save_ptr); while (host_name) { if ((asterisk = strchr(host_name, '*')) && (i = atoi(asterisk + 1))) { asterisk[0] = '\0'; for (j = 0; j < i; j++) hostlist_push_host(hostlist, host_name); } else { hostlist_push_host(hostlist, host_name); } host_name = strtok_r(NULL, ",", &save_ptr); } xfree(tmp_text); if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n)) break; } fclose(fp); if (hostlist_count(hostlist) <= 0) { error("Hostlist is empty!"); goto cleanup_hostfile; } if (hostlist_count(hostlist) < n) { error("Too few NodeNames in SLURM Hostfile"); goto cleanup_hostfile; } total_file_len += 1024; nodelist = (char *)malloc(total_file_len); if (!nodelist) { error("Nodelist xmalloc failed"); goto cleanup_hostfile; } if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) { error("Hostlist is too long for the allocate RPC!"); free(nodelist); nodelist = NULL; goto cleanup_hostfile; } debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist); cleanup_hostfile: hostlist_destroy(hostlist); return nodelist; }
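/*
 * Example use of slurm_read_hostfile(), a minimal sketch.  The filename
 * and the task count of 4 are hypothetical; per the comment above, the
 * returned ranged string comes from malloc() and must be released with
 * free(), not xfree().
 */
static void _hostfile_example(void)
{
	char *nodelist = slurm_read_hostfile("/tmp/hostfile", 4);

	if (nodelist) {
		info("hostfile expands to %s", nodelist);
		free(nodelist);
	}
}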
/* * start_msg_tree - logic to begin the forward tree and * accumulate the return codes from processes getting the * the forwarded message * * IN: hl - hostlist_t - list of every node to send message to * IN: msg - slurm_msg_t - message to send. * IN: timeout - int - how long to wait in milliseconds. * RET List - List containing the responses of the childern * (if any) we forwarded the message to. List * containing type (ret_data_info_t). */ extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout) { int *span = NULL; fwd_tree_t *fwd_tree = NULL; pthread_mutex_t tree_mutex; pthread_cond_t notify; int j = 0, count = 0; List ret_list = NULL; char *name = NULL; int thr_count = 0; int host_count = 0; xassert(hl); xassert(msg); hostlist_uniq(hl); host_count = hostlist_count(hl); span = set_span(host_count, 0); slurm_mutex_init(&tree_mutex); pthread_cond_init(¬ify, NULL); ret_list = list_create(destroy_data_info); while ((name = hostlist_shift(hl))) { pthread_attr_t attr_agent; pthread_t thread_agent; int retries = 0; slurm_attr_init(&attr_agent); if (pthread_attr_setdetachstate (&attr_agent, PTHREAD_CREATE_DETACHED)) error("pthread_attr_setdetachstate error %m"); fwd_tree = xmalloc(sizeof(fwd_tree_t)); fwd_tree->orig_msg = msg; fwd_tree->ret_list = ret_list; fwd_tree->timeout = timeout; fwd_tree->notify = ¬ify; fwd_tree->p_thr_count = &thr_count; fwd_tree->tree_mutex = &tree_mutex; if (fwd_tree->timeout <= 0) { /* convert secs to msec */ fwd_tree->timeout = slurm_get_msg_timeout() * 1000; } fwd_tree->tree_hl = hostlist_create(name); free(name); for (j = 0; j < span[thr_count]; j++) { name = hostlist_shift(hl); if (!name) break; hostlist_push(fwd_tree->tree_hl, name); free(name); } /* * Lock and increase thread counter, we need that to protect * the start_msg_tree waiting loop that was originally designed * around a "while ((count < host_count))" loop. In case where a * fwd thread was not able to get all the return codes from * children, the waiting loop was deadlocked. */ slurm_mutex_lock(&tree_mutex); thr_count++; slurm_mutex_unlock(&tree_mutex); while (pthread_create(&thread_agent, &attr_agent, _fwd_tree_thread, (void *)fwd_tree)) { error("pthread_create error %m"); if (++retries > MAX_RETRIES) fatal("Can't create pthread"); sleep(1); /* sleep and try again */ } slurm_attr_destroy(&attr_agent); } xfree(span); slurm_mutex_lock(&tree_mutex); count = list_count(ret_list); debug2("Tree head got back %d looking for %d", count, host_count); while (thr_count > 0) { pthread_cond_wait(¬ify, &tree_mutex); count = list_count(ret_list); debug2("Tree head got back %d", count); } xassert(count >= host_count); /* Tree head did not get all responses, * but no more active fwd threads!*/ slurm_mutex_unlock(&tree_mutex); slurm_mutex_destroy(&tree_mutex); pthread_cond_destroy(¬ify); return ret_list; }
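/*
 * Sketch of driving start_msg_tree().  The hostname range and request
 * message are hypothetical; the list handling follows the code above
 * (the returned List holds ret_data_info_t entries and was created with
 * destroy_data_info, so list_destroy() releases it), and a timeout of 0
 * falls back to the configured message timeout.
 */
static void _tree_send_example(slurm_msg_t *req_msg)
{
	hostlist_t hl = hostlist_create("node[01-32]");	/* hypothetical */
	List ret_list;

	ret_list = start_msg_tree(hl, req_msg, 0); /* 0 => default timeout */
	if (ret_list) {
		debug2("forwarded message, got %d responses",
		       list_count(ret_list));
		list_destroy(ret_list);
	}
	hostlist_destroy(hl);
}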
/* * build_all_frontend_info - get a array of slurm_conf_frontend_t structures * from the slurm.conf reader, build table, and set values * is_slurmd_context: set to true if run from slurmd * RET 0 if no error, error code otherwise */ extern int build_all_frontend_info (bool is_slurmd_context) { slurm_conf_frontend_t **ptr_array; #ifdef HAVE_FRONT_END slurm_conf_frontend_t *fe_single, *fe_line; int i, count, max_rc = SLURM_SUCCESS; bool front_end_debug; if (slurm_get_debug_flags() & DEBUG_FLAG_FRONT_END) front_end_debug = true; else front_end_debug = false; count = slurm_conf_frontend_array(&ptr_array); if (count == 0) fatal("No FrontendName information available!"); for (i = 0; i < count; i++) { hostlist_t hl_name, hl_addr; char *fe_name, *fe_addr; fe_line = ptr_array[i]; hl_name = hostlist_create(fe_line->frontends); if (hl_name == NULL) fatal("Invalid FrontendName:%s", fe_line->frontends); hl_addr = hostlist_create(fe_line->addresses); if (hl_addr == NULL) fatal("Invalid FrontendAddr:%s", fe_line->addresses); if (hostlist_count(hl_name) != hostlist_count(hl_addr)) { fatal("Inconsistent node count between " "FrontendName(%s) and FrontendAddr(%s)", fe_line->frontends, fe_line->addresses); } while ((fe_name = hostlist_shift(hl_name))) { fe_addr = hostlist_shift(hl_addr); fe_single = xmalloc(sizeof(slurm_conf_frontend_t)); list_append(front_end_list, fe_single); fe_single->frontends = xstrdup(fe_name); fe_single->addresses = xstrdup(fe_addr); free(fe_name); free(fe_addr); if (fe_line->allow_groups && fe_line->allow_groups[0]) { fe_single->allow_groups = xstrdup(fe_line->allow_groups); } if (fe_line->allow_users && fe_line->allow_users[0]) { fe_single->allow_users = xstrdup(fe_line->allow_users); } if (fe_line->deny_groups && fe_line->deny_groups[0]) { fe_single->deny_groups = xstrdup(fe_line->deny_groups); } if (fe_line->deny_users && fe_line->deny_users[0]) { fe_single->deny_users = xstrdup(fe_line->deny_users); } fe_single->port = fe_line->port; if (fe_line->reason && fe_line->reason[0]) fe_single->reason = xstrdup(fe_line->reason); fe_single->node_state = fe_line->node_state; if (front_end_debug && !is_slurmd_context) _dump_front_end(fe_single); } hostlist_destroy(hl_addr); hostlist_destroy(hl_name); } return max_rc; #else if (slurm_conf_frontend_array(&ptr_array) != 0) fatal("FrontendName information configured!"); return SLURM_SUCCESS; #endif }
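/*
 * Hedged sketch of walking the front_end_list that
 * build_all_frontend_info() populates.  Only fields assigned above
 * (frontends, addresses, port) are referenced; the logging itself is
 * illustrative.
 */
static void _dump_frontends(void)
{
	ListIterator itr = list_iterator_create(front_end_list);
	slurm_conf_frontend_t *fe;

	while ((fe = list_next(itr)))
		info("frontend %s addr %s port %u",
		     fe->frontends, fe->addresses, fe->port);
	list_iterator_destroy(itr);
}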
/* * route_p_split_hostlist - logic to split an input hostlist into * a set of hostlists to forward to. * * IN: hl - hostlist_t - list of every node to send message to * will be empty on return; * OUT: sp_hl - hostlist_t** - the array of hostlists that will be malloced * OUT: count - int* - the count of created hostlists * RET: SLURM_SUCCESS - int * * Note: created hostlist will have to be freed independently using * hostlist_destroy by the caller. * Note: the hostlist_t array will have to be xfree. */ extern int route_p_split_hostlist(hostlist_t hl, hostlist_t** sp_hl, int* count) { int i, j, k, hl_ndx, msg_count, sw_count, lst_count; char *buf; bitstr_t *nodes_bitmap = NULL; /* nodes in message list */ bitstr_t *fwd_bitmap = NULL; /* nodes in forward list */ msg_count = hostlist_count(hl); if (switch_record_cnt == 0) { /* configs have not already been processed */ slurm_conf_init(NULL); if (init_node_conf()) { fatal("ROUTE: Failed to init slurm config"); } if (build_all_nodeline_info(false)) { fatal("ROUTE: Failed to build node config"); } rehash_node(); if (slurm_topo_build_config() != SLURM_SUCCESS) { fatal("ROUTE: Failed to build topology config"); } } *sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t)); /* create bitmap of nodes to send message too */ if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) { buf = hostlist_ranged_string_xmalloc(hl); fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf); } /* Find lowest level switch containing all the nodes in the list */ j = 0; for (i = 0; i <= switch_levels; i++) { for (j=0; j<switch_record_cnt; j++) { if (switch_record_table[j].level == i) { if (bit_super_set(nodes_bitmap, switch_record_table[j]. node_bitmap)) { /* All nodes in message list are in * this switch */ break; } } } if (j < switch_record_cnt) { /* Got here via break after bit_super_set */ break; // 'j' is our switch } /* else, no switches at this level reach all nodes */ } if (i > switch_levels) { /* This can only happen if trying to schedule multiple physical * clusters as a single logical cluster under the control of a * single slurmctld daemon, and sending something like a * node_registation request to all nodes. * Revert to default behavior*/ if (debug_flags & DEBUG_FLAG_ROUTE) { buf = hostlist_ranged_string_xmalloc(hl); debug("ROUTE: didn't find switch containing nodes=%s", buf); xfree(buf); } FREE_NULL_BITMAP(nodes_bitmap); xfree(*sp_hl); return route_split_hostlist_treewidth(hl, sp_hl, count); } if (switch_record_table[j].level == 0) { /* This is a leaf switch. Construct list based on TreeWidth */ FREE_NULL_BITMAP(nodes_bitmap); xfree(*sp_hl); return route_split_hostlist_treewidth(hl, sp_hl, count); } /* loop through children, construction a hostlist for each child switch * with nodes in the message list */ hl_ndx = 0; lst_count = 0; for (i=0; i < switch_record_table[j].num_switches; i++) { k = switch_record_table[j].switch_index[i]; fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap); bit_and(fwd_bitmap, nodes_bitmap); sw_count = bit_set_count(fwd_bitmap); if (sw_count == 0) { continue; /* no nodes on this switch in message list */ } (*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap); /* Now remove nodes from this switch from message list */ bit_not(fwd_bitmap); bit_and(nodes_bitmap, fwd_bitmap); FREE_NULL_BITMAP(fwd_bitmap); if (debug_flags & DEBUG_FLAG_ROUTE) { buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]); debug("ROUTE: ... 
sublist[%d] switch=%s :: %s", i, switch_record_table[k].name, buf); xfree(buf); } hl_ndx++; lst_count += sw_count; if (lst_count == msg_count) break; /* all nodes in message are in a child list */ } FREE_NULL_BITMAP(nodes_bitmap); *count = hl_ndx; return SLURM_SUCCESS; }
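/*
 * Caller-side sketch for route_p_split_hostlist(), following the notes
 * above: each created hostlist must be destroyed individually and the
 * array itself xfree()'d.  The _split_example name is hypothetical.
 */
static void _split_example(hostlist_t hl)
{
	hostlist_t *sp_hl = NULL;
	int count = 0, i;

	if (route_p_split_hostlist(hl, &sp_hl, &count) != SLURM_SUCCESS)
		return;
	for (i = 0; i < count; i++) {
		char *buf = hostlist_ranged_string_xmalloc(sp_hl[i]);
		debug("forward sublist %d: %s", i, buf);
		xfree(buf);
		hostlist_destroy(sp_hl[i]);
	}
	xfree(sp_hl);
}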
/* * setup_cluster_nodes - get cluster record list within requested * time period with used nodes. Used for deciding whether a nodelist is * overlapping with the required nodes. */ extern cluster_nodes_t * setup_cluster_nodes(pgsql_conn_t *pg_conn, slurmdb_job_cond_t *job_cond) { DEF_VARS; cluster_nodes_t *cnodes = NULL; time_t now = time(NULL); hostlist_t temp_hl = NULL; hostlist_iterator_t h_itr = NULL; if (!job_cond || !job_cond->used_nodes) return NULL; if (!job_cond->cluster_list || list_count(job_cond->cluster_list) != 1) { error("If you are doing a query against nodes " "you must only have 1 cluster " "you are asking for."); return NULL; } temp_hl = hostlist_create(job_cond->used_nodes); if (!hostlist_count(temp_hl)) { error("we didn't get any real hosts to look for."); hostlist_destroy(temp_hl); return NULL; } query = xstrdup_printf("SELECT cluster_nodes, time_start, " "time_end FROM %s.%s WHERE node_name='' " "AND cluster_nodes !=''", (char *)list_peek(job_cond->cluster_list), event_table); if (job_cond->usage_start) { if (!job_cond->usage_end) job_cond->usage_end = now; xstrfmtcat(query, " AND ((time_start<%ld) " "AND (time_end>=%ld OR time_end=0))", job_cond->usage_end, job_cond->usage_start); } result = DEF_QUERY_RET; if (!result) { hostlist_destroy(temp_hl); return NULL; } h_itr = hostlist_iterator_create(temp_hl); cnodes = xmalloc(sizeof(cluster_nodes_t)); cnodes->cluster_list = list_create(_destroy_local_cluster); FOR_EACH_ROW { char *host = NULL; int loc = 0; local_cluster_t *local_cluster = xmalloc(sizeof(local_cluster_t)); local_cluster->hl = hostlist_create(ROW(0)); local_cluster->start = atoi(ROW(1)); local_cluster->end = atoi(ROW(2)); local_cluster->asked_bitmap = bit_alloc(hostlist_count(local_cluster->hl)); while((host = hostlist_next(h_itr))) { if ((loc = hostlist_find( local_cluster->hl, host)) != -1) bit_set(local_cluster->asked_bitmap, loc); free(host); } hostlist_iterator_reset(h_itr); if (bit_ffs(local_cluster->asked_bitmap) != -1) { list_append(cnodes->cluster_list, local_cluster); if (local_cluster->end == 0) { local_cluster->end = now; cnodes->curr_cluster = local_cluster; } } else _destroy_local_cluster(local_cluster); } END_EACH_ROW; PQclear(result); hostlist_iterator_destroy(h_itr); if (!list_count(cnodes->cluster_list)) { destroy_cluster_nodes(cnodes); cnodes = NULL; } hostlist_destroy(temp_hl); return cnodes; }
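/*
 * Minimal sketch of using setup_cluster_nodes().  The connection and job
 * condition come from the caller; destroy_cluster_nodes(), referenced in
 * the function above, releases the result.  The per-row overlap test
 * against the returned cluster list happens elsewhere in the query code.
 */
static void _node_filter_example(pgsql_conn_t *pg_conn,
				 slurmdb_job_cond_t *job_cond)
{
	cluster_nodes_t *cnodes = setup_cluster_nodes(pg_conn, job_cond);

	if (!cnodes)
		return;	/* no node filter requested, or nothing matched */
	/* ... run the job query, consulting cnodes for each returned row ... */
	destroy_cluster_nodes(cnodes);
}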
int pstdout_launch(const char *hostnames, Pstdout_Thread pstdout_func, void *arg) { struct pstdout_thread_data **tdata = NULL; struct pstdout_state pstate; unsigned int pstate_init = 0; hostlist_iterator_t hitr = NULL; hostlist_t h = NULL; int h_count = 0; char *host = NULL; int exit_code = -1; sighandler_t sighandler_save = NULL; int sighandler_set = 0; int rc; int i; if (!pstdout_initialized) { pstdout_errnum = PSTDOUT_ERR_UNINITIALIZED; return -1; } if (!pstdout_func) { pstdout_errnum = PSTDOUT_ERR_PARAMETERS; return -1; } if ((rc = pthread_mutex_lock(&pstdout_launch_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } /* Special case */ if (!hostnames) { if (_pstdout_state_init(&pstate, NULL) < 0) goto cleanup; pstate_init++; exit_code = pstdout_func(&pstate, NULL, arg); pstdout_errnum = PSTDOUT_ERR_SUCCESS; goto cleanup; } if (!(h = hostlist_create(hostnames))) { pstdout_errnum = PSTDOUT_ERR_OUTMEM; goto cleanup; } h_count = hostlist_count(h); /* Sanity check */ if (h_count <= 0) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "h_count = %d\n", h_count); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } /* Special case */ if (h_count == 1) { if (_pstdout_state_init(&pstate, hostnames) < 0) goto cleanup; pstate_init++; exit_code = pstdout_func(&pstate, hostnames, arg); pstdout_errnum = PSTDOUT_ERR_SUCCESS; goto cleanup; } if ((sighandler_save = signal(SIGINT, _pstdout_sigint)) == SIG_ERR) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "signal\n"); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } sighandler_set++; if (!(hitr = hostlist_iterator_create(h))) { pstdout_errnum = PSTDOUT_ERR_OUTMEM; goto cleanup; } if (!(tdata = (struct pstdout_thread_data **)malloc(sizeof(struct pstdout_thread_data *) * h_count))) { pstdout_errnum = PSTDOUT_ERR_OUTMEM; goto cleanup; } memset(tdata, '\0', sizeof(struct pstdout_thread_data *) * h_count); i = 0; while ((host = hostlist_next(hitr))) { if (!(tdata[i] = (struct pstdout_thread_data *)malloc(sizeof(struct pstdout_thread_data)))) { pstdout_errnum = PSTDOUT_ERR_OUTMEM; goto cleanup; } memset(tdata[i], '\0', sizeof(struct pstdout_thread_data)); if (!(tdata[i]->hostname = strdup(host))) { pstdout_errnum = PSTDOUT_ERR_OUTMEM; goto cleanup; } tdata[i]->pstdout_func = pstdout_func; tdata[i]->arg = arg; if ((rc = pthread_attr_init(&(tdata[i]->attr)))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_attr_init: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } if ((rc = pthread_attr_setdetachstate(&(tdata[i]->attr), PTHREAD_CREATE_DETACHED))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_attr_setdetachstate: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } free(host); i++; } host = NULL; hostlist_iterator_destroy(hitr); hitr = NULL; hostlist_destroy(h); h = NULL; /* Launch threads up to fanout */ for (i = 0; i < h_count; i++) { if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } if (pstdout_threadcount == pstdout_fanout) { if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_cond_wait: %s\n", 
strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } } if ((rc = pthread_create(&(tdata[i]->tid), &(tdata[i]->attr), _pstdout_func_entry, (void *) tdata[i]))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_create: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } pstdout_threadcount++; if ((rc = pthread_mutex_unlock(&pstdout_threadcount_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } } /* Wait for Threads to finish */ if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } while (pstdout_threadcount > 0) { if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } } if (_pstdout_output_consolidated_finish() < 0) goto cleanup; /* Determine exit code */ exit_code = 0; for (i = 0; i < h_count; i++) { if (tdata[i]->exit_code > exit_code) exit_code = tdata[i]->exit_code; } cleanup: /* Cannot pass NULL for key, so just pass dummy key */ list_delete_all(pstdout_consolidated_stdout, _pstdout_consolidated_data_delete_all, ""); list_delete_all(pstdout_consolidated_stderr, _pstdout_consolidated_data_delete_all, ""); if (pstate_init) _pstdout_state_cleanup(&pstate); if (tdata) { for (i = 0; i < h_count; i++) { if (tdata[i]) { free(tdata[i]->hostname); pthread_attr_destroy(&(tdata[i]->attr)); free(tdata[i]); } } free(tdata); } if (hitr) hostlist_iterator_destroy(hitr); if (h) hostlist_destroy(h); free(host); if ((rc = pthread_mutex_unlock(&pstdout_launch_mutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc)); /* Don't change error code, just move on */ } if (sighandler_set) signal(SIGINT, sighandler_save); return exit_code; }
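/*
 * Hedged sketch of a pstdout_launch() caller, assuming the usual freeipmi
 * pstdout API (pstdout_init(), pstdout_printf()).  The host range and the
 * _ping_host callback are hypothetical.
 */
static int _ping_host(pstdout_state_t pstate, const char *hostname, void *arg)
{
	/* per-host work runs in its own thread; output goes through pstate */
	pstdout_printf(pstate, "contacting %s\n", hostname);
	return 0;
}

static int _launch_example(void)
{
	if (pstdout_init() < 0)
		return -1;
	/* fans out across the hosts; the highest per-host exit code is returned */
	return pstdout_launch("host[0-15]", _ping_host, NULL);
}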
/* * _build_single_nodeline_info - From the slurm.conf reader, build table, * and set values * RET 0 if no error, error code otherwise * Note: Operates on common variables * default_node_record - default node configuration values */ static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr, struct config_record *config_ptr) { int error_code = SLURM_SUCCESS; struct node_record *node_rec = NULL; hostlist_t address_list = NULL; hostlist_t alias_list = NULL; hostlist_t hostname_list = NULL; hostlist_t port_list = NULL; char *address = NULL; char *alias = NULL; char *hostname = NULL; char *port_str = NULL; int state_val = NODE_STATE_UNKNOWN; int address_count, alias_count, hostname_count, port_count; uint16_t port = 0; if (node_ptr->state != NULL) { state_val = state_str2int(node_ptr->state, node_ptr->nodenames); if (state_val == NO_VAL) goto cleanup; } if ((address_list = hostlist_create(node_ptr->addresses)) == NULL) { fatal("Unable to create NodeAddr list from %s", node_ptr->addresses); error_code = errno; goto cleanup; } if ((alias_list = hostlist_create(node_ptr->nodenames)) == NULL) { fatal("Unable to create NodeName list from %s", node_ptr->nodenames); error_code = errno; goto cleanup; } if ((hostname_list = hostlist_create(node_ptr->hostnames)) == NULL) { fatal("Unable to create NodeHostname list from %s", node_ptr->hostnames); error_code = errno; goto cleanup; } if (node_ptr->port_str && node_ptr->port_str[0] && (node_ptr->port_str[0] != '[') && (strchr(node_ptr->port_str, '-') || strchr(node_ptr->port_str, ','))) { xstrfmtcat(port_str, "[%s]", node_ptr->port_str); port_list = hostlist_create(port_str); xfree(port_str); } else { port_list = hostlist_create(node_ptr->port_str); } if (port_list == NULL) { error("Unable to create Port list from %s", node_ptr->port_str); error_code = errno; goto cleanup; } /* some sanity checks */ address_count = hostlist_count(address_list); alias_count = hostlist_count(alias_list); hostname_count = hostlist_count(hostname_list); port_count = hostlist_count(port_list); #ifdef HAVE_FRONT_END if ((hostname_count != alias_count) && (hostname_count != 1)) { error("NodeHostname count must equal that of NodeName " "records of there must be no more than one"); goto cleanup; } if ((address_count != alias_count) && (address_count != 1)) { error("NodeAddr count must equal that of NodeName " "records of there must be no more than one"); goto cleanup; } #else #ifdef MULTIPLE_SLURMD if ((address_count != alias_count) && (address_count != 1)) { error("NodeAddr count must equal that of NodeName " "records of there must be no more than one"); goto cleanup; } #else if (address_count < alias_count) { error("At least as many NodeAddr are required as NodeName"); goto cleanup; } if (hostname_count < alias_count) { error("At least as many NodeHostname are required " "as NodeName"); goto cleanup; } #endif /* MULTIPLE_SLURMD */ #endif /* HAVE_FRONT_END */ if ((port_count != alias_count) && (port_count > 1)) { error("Port count must equal that of NodeName " "records or there must be no more than one"); goto cleanup; } /* now build the individual node structures */ while ((alias = hostlist_shift(alias_list))) { if (address_count > 0) { address_count--; if (address) free(address); address = hostlist_shift(address_list); } if (hostname_count > 0) { hostname_count--; if (hostname) free(hostname); hostname = hostlist_shift(hostname_list); } if (port_count > 0) { int port_int; port_count--; if (port_str) free(port_str); port_str = hostlist_shift(port_list); port_int = 
atoi(port_str); if ((port_int <= 0) || (port_int > 0xffff)) fatal("Invalid Port %s", node_ptr->port_str); port = port_int; } /* find_node_record locks this to get the * alias so we need to unlock */ node_rec = find_node_record(alias); if (node_rec == NULL) { node_rec = create_node_record(config_ptr, alias); if ((state_val != NO_VAL) && (state_val != NODE_STATE_UNKNOWN)) node_rec->node_state = state_val; node_rec->last_response = (time_t) 0; node_rec->comm_name = xstrdup(address); node_rec->node_hostname = xstrdup(hostname); node_rec->port = port; node_rec->weight = node_ptr->weight; node_rec->features = xstrdup(node_ptr->feature); node_rec->reason = xstrdup(node_ptr->reason); } else { /* FIXME - maybe should be fatal? */ error("Reconfiguration for node %s, ignoring!", alias); } free(alias); } /* free allocated storage */ cleanup: if (address) free(address); if (hostname) free(hostname); if (port_str) free(port_str); if (address_list) hostlist_destroy(address_list); if (alias_list) hostlist_destroy(alias_list); if (hostname_list) hostlist_destroy(hostname_list); if (port_list) hostlist_destroy(port_list); return error_code; }
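/*
 * Small illustration of the Port handling above: a bare range such as
 * "7001-7004" is wrapped in brackets so hostlist_create() parses it as a
 * range rather than a single token, and one port is then shifted off per
 * NodeName.  The values here are hypothetical.
 */
static void _port_range_example(void)
{
	char *port_str = NULL, *one_port;
	hostlist_t port_list;

	xstrfmtcat(port_str, "[%s]", "7001-7004");
	port_list = hostlist_create(port_str);
	xfree(port_str);

	while ((one_port = hostlist_shift(port_list))) {
		info("next node gets port %s", one_port);
		free(one_port);
	}
	hostlist_destroy(port_list);
}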
static void _progress_fan_in(pmixp_coll_t *coll) { pmixp_srv_cmd_t type; const char *addr = pmixp_info_srv_addr(); char *hostlist = NULL; int rc, is_p2p = 0; Buf root_buf; PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d", pmixp_info_namespace(), pmixp_info_nodeid(), coll->contrib_local, coll->contrib_cntr); /* lock the collective */ slurm_mutex_lock(&coll->lock); pmixp_coll_sanity_check(coll); if (PMIXP_COLL_FAN_IN != coll->state) { /* In case of race condition between libpmix and * slurm threads progress_fan_in can be called * after we moved to the next step. */ goto unlock; } if (!coll->contrib_local || coll->contrib_cntr != coll->children_cnt) { /* Not yet ready to go to the next step */ goto unlock; } /* The root of the collective will have parent_host == NULL */ if (NULL != coll->parent_host) { hostlist = xstrdup(coll->parent_host); type = PMIXP_MSG_FAN_IN; PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state", pmixp_info_namespace(), pmixp_info_nodeid()); is_p2p = 1; } else { if (0 < hostlist_count(coll->all_children)) { hostlist = hostlist_ranged_string_xmalloc( coll->all_children); type = PMIXP_MSG_FAN_OUT; pmixp_debug_hang(0); } rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf); xassert(0 == rc); PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)", pmixp_info_namespace(), pmixp_info_nodeid()); } PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(), pmixp_info_nodeid(), hostlist); /* Check for the singletone case */ if (NULL != hostlist) { if( 0 == coll->seq && NULL != coll->parent_host ){ /* This is the first message sent to the parent. * There might be a race condition where parent * is not ready to receive the messages. * Use zero-size message to check parent status first * and then send the full message. */ pmixp_server_health_chk(hostlist, addr); } rc = pmixp_server_send(hostlist, type, coll->seq, addr, get_buf_data(coll->buf), get_buf_offset(coll->buf), is_p2p); if (SLURM_SUCCESS != rc) { PMIXP_ERROR( "Cannot send data (size = %lu), to hostlist:\n%s", (uint64_t) get_buf_offset(coll->buf), hostlist); /* return error indication to PMIx. Nodes that haven't received data * will exit by a timeout. * FIXME: do we need to do something with successfuly finished nodes? */ goto unlock; } } /* transit to the next state */ _fan_in_finished(coll); /* if we are root - push data to PMIx here. * Originally there was a homogenuous solution: root nodename was in the hostlist. * However this may lead to the undesired side effects: we are blocked here sending * data and cannot receive (it will be triggered in this thread after we will leave * this callback), so we have to rely on buffering on the SLURM side. * Better not to do so. */ if (NULL == coll->parent_host) { /* if I am the root - pass the data to PMIx and reset collective here */ /* copy payload excluding reserved server header */ _progres_fan_out(coll, root_buf); } unlock: if (NULL != hostlist) { xfree(hostlist); } /* lock the */ slurm_mutex_unlock(&coll->lock); }