/* FIXME: client_id could be hostname or IP.
 * We probably want both to work for an exports match.
 */
static int
_match_export_hosts (Export *x, Npconn *conn)
{
    char *client_id = np_conn_get_client_id (conn);
    hostlist_t hl = NULL;
    int res = 0; /* no match */

    /* no client_id restrictions */
    if (!x->hosts) {
        res = 1;
        goto done;
    }
    if (!(hl = hostlist_create (x->hosts))) {
        np_uerror (ENOMEM);
        goto done;
    }
    /* client_id found in exports */
    if (hostlist_find (hl, client_id) != -1) {
        res = 1;
        goto done;
    }
done:
    if (hl)
        hostlist_destroy (hl);
    return res;
}
/* FIXME: client_id could be hostname or IP.
 * We probably want both to work for an exports match.
 */
static int
_match_export_hosts (Export *x, Npconn *conn)
{
    char *client_id = np_conn_get_client_id (conn);
    hostlist_t hl = NULL;
    int res = 0; /* no match */

    /* privport is required */
    if (x->oflags & XFLAGS_PRIVPORT && !(conn->flags & CONN_FLAGS_PRIVPORT)) {
        np_uerror (EPERM);
        goto done;
    }
    /* no client_id restrictions */
    if (!x->hosts) {
        res = 1;
        goto done;
    }
    if (!(hl = hostlist_create (x->hosts))) {
        np_uerror (ENOMEM);
        goto done;
    }
    /* client_id found in exports */
    if (hostlist_find (hl, client_id) != -1) {
        res = 1;
        goto done;
    }
done:
    if (hl)
        hostlist_destroy (hl);
    return res;
}
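Both versions above rely on the same create/find/destroy idiom: a NULL export host string means "no restriction", otherwise the client must appear in the expanded list. A minimal, self-contained sketch of that idiom follows; the export string, client ids, and the header path are assumptions for illustration, not taken from the exports code itself.

#include <stdio.h>
#include "hostlist.h"   /* LLNL hostlist API (header path is an assumption) */

/* Sketch: NULL means "no restriction"; otherwise the client must be a
 * member of the expanded host range. Names are hypothetical. */
static int allowed(const char *hosts, const char *client_id)
{
    hostlist_t hl;
    int res;

    if (!hosts)
        return 1;                /* no client_id restrictions */
    if (!(hl = hostlist_create(hosts)))
        return 0;                /* treat alloc/parse failure as no match */
    res = (hostlist_find(hl, client_id) != -1);
    hostlist_destroy(hl);
    return res;
}

int main(void)
{
    printf("%d %d %d\n",
           allowed(NULL, "n1"),          /* 1: unrestricted */
           allowed("n[1-4]", "n3"),      /* 1: listed */
           allowed("n[1-4]", "n9"));     /* 0: not listed */
    return 0;
}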
int pmixp_coll_ring_init(pmixp_coll_t *coll, hostlist_t *hl)
{
#ifdef PMIXP_COLL_DEBUG
    PMIXP_DEBUG("called");
#endif
    int i;
    pmixp_coll_ring_ctx_t *coll_ctx = NULL;
    pmixp_coll_ring_t *ring = &coll->state.ring;
    char *p;
    int rel_id = hostlist_find(*hl, pmixp_info_hostname());

    /* compute the next absolute id of the neighbor */
    p = hostlist_nth(*hl, (rel_id + 1) % coll->peers_cnt);
    ring->next_peerid = pmixp_info_job_hostid(p);
    free(p);

    ring->fwrd_buf_pool = list_create(pmixp_free_buf);
    ring->ring_buf_pool = list_create(pmixp_free_buf);

    for (i = 0; i < PMIXP_COLL_RING_CTX_NUM; i++) {
        coll_ctx = &ring->ctx_array[i];
        coll_ctx->coll = coll;
        coll_ctx->in_use = false;
        coll_ctx->seq = coll->seq;
        coll_ctx->contrib_local = false;
        coll_ctx->contrib_prev = 0;
        coll_ctx->state = PMIXP_COLL_RING_SYNC;
        // TODO bit vector
        coll_ctx->contrib_map = xmalloc(sizeof(bool) * coll->peers_cnt);
    }

    return SLURM_SUCCESS;
}
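The ring neighbor above is computed as (rel_id + 1) % peers_cnt, so the last rank wraps back to rank 0. A standalone sketch of just that arithmetic, with a hypothetical peer count of 4:

#include <stdio.h>

/* Sketch of the ring-neighbor arithmetic used by pmixp_coll_ring_init():
 * each rank forwards to (rank + 1) mod N. The peer count is hypothetical. */
int main(void)
{
    int peers_cnt = 4;
    for (int rel_id = 0; rel_id < peers_cnt; rel_id++)
        printf("rank %d forwards to rank %d\n",
               rel_id, (rel_id + 1) % peers_cnt);
    return 0;   /* prints 0->1, 1->2, 2->3, 3->0 */
}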
int
wrap_hostlist_find(WRAPPERS_ARGS, hostlist_t hl, const char *hostname)
{
    assert(file && function);

    if (!hl || !hostname)
        WRAPPERS_ERR_INVALID_PARAMETERS("hostlist_find");

    /* -1 isn't an error, it indicates the host isn't found */
    return hostlist_find(hl, hostname);
}
int pmixp_coll_init(pmixp_coll_t *coll, pmixp_coll_type_t type,
                    const pmixp_proc_t *procs, size_t nprocs)
{
    int rc = SLURM_SUCCESS;
    hostlist_t hl;

    coll->seq = 0;
#ifndef NDEBUG
    coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
    coll->type = type;
    coll->pset.procs = xmalloc(sizeof(*procs) * nprocs);
    coll->pset.nprocs = nprocs;
    memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs);

    if (SLURM_SUCCESS != pmixp_hostset_from_ranges(procs, nprocs, &hl)) {
        /* TODO: provide ranges output routine */
        PMIXP_ERROR("Bad ranges information");
        rc = SLURM_ERROR;
        goto exit;
    }

    coll->peers_cnt = hostlist_count(hl);
    coll->my_peerid = hostlist_find(hl, pmixp_info_hostname());
#ifdef PMIXP_COLL_DEBUG
    /* if we debug collectives - store a copy of a full
     * hostlist to resolve participant id to the hostname */
    coll->peers_hl = hostlist_copy(hl);
#endif

    switch (type) {
    case PMIXP_COLL_TYPE_FENCE_TREE:
        rc = pmixp_coll_tree_init(coll, &hl);
        break;
    case PMIXP_COLL_TYPE_FENCE_RING:
        rc = pmixp_coll_ring_init(coll, &hl);
        break;
    default:
        PMIXP_ERROR("Unknown coll type");
        rc = SLURM_ERROR;
    }
    hostlist_destroy(hl);
    if (rc) {
        goto exit;
    }

exit:
    return rc;
}
extern int slurm_job_cpus_allocated_on_node(job_resources_t *job_resrcs_ptr,
                                            const char *node)
{
    hostlist_t node_hl;
    int node_id;

    if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
        slurm_seterrno_ret(EINVAL);

    node_hl = hostlist_create(job_resrcs_ptr->nodes);
    node_id = hostlist_find(node_hl, node);
    hostlist_destroy(node_hl);
    if (node_id == -1)
        return (0); /* No cpus allocated on this node */

    return slurm_job_cpus_allocated_on_node_id(job_resrcs_ptr, node_id);
}
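A hedged sketch of how a client might reach this API: load a job's details with slurm_load_job() and pass its job_resources_t along with a node name. The job id (1234) and node name ("node1") are hypothetical, and it is an assumption here that the show flag used returns a populated job_resrcs pointer for the job.

#include <stdio.h>
#include <slurm/slurm.h>

/* Sketch only; link with -lslurm. Assumes job 1234 exists and that
 * SHOW_DETAIL yields job_resrcs for it. */
int main(void)
{
    job_info_msg_t *msg = NULL;

    if (slurm_load_job(&msg, 1234, SHOW_DETAIL) != SLURM_SUCCESS) {
        slurm_perror("slurm_load_job");
        return 1;
    }
    printf("CPUs allocated on node1: %d\n",
           slurm_job_cpus_allocated_on_node(msg->job_array[0].job_resrcs,
                                            "node1"));
    slurm_free_job_info_msg(msg);
    return 0;
}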
bitstr_t *get_requested_node_bitmap(void)
{
    static bitstr_t *bitmap = NULL;
    static node_info_msg_t *old_node_ptr = NULL, *new_node_ptr;
    int error_code;
    int i = 0;
    node_info_t *node_ptr = NULL;

    if (!params.hl)
        return NULL;

    if (old_node_ptr) {
        error_code = slurm_load_node(old_node_ptr->last_update,
                                     &new_node_ptr, SHOW_ALL);
        if (error_code == SLURM_SUCCESS)
            slurm_free_node_info_msg(old_node_ptr);
        else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA)
            return bitmap;
    } else {
        error_code = slurm_load_node((time_t) NULL, &new_node_ptr,
                                     SHOW_ALL);
    }

    if (bitmap)
        FREE_NULL_BITMAP(bitmap);

    if (error_code) {
        slurm_perror("slurm_load_node");
        return NULL;
    }

    old_node_ptr = new_node_ptr;
    bitmap = bit_alloc(old_node_ptr->record_count);
    for (i = 0; i < old_node_ptr->record_count; i++) {
        node_ptr = &(old_node_ptr->node_array[i]);
        if (hostlist_find(params.hl, node_ptr->name) != -1)
            bit_set(bitmap, i);
    }

    return bitmap;
}
/*
 * validate_alloc_node - validate that the allocating node
 *	is allowed to use this partition
 * IN part_ptr - pointer to a partition
 * IN alloc_node - allocating node of the request
 * RET 1 if permitted to run, 0 otherwise
 */
extern int validate_alloc_node(struct part_record *part_ptr, char *alloc_node)
{
    int status;

    if (part_ptr->allow_alloc_nodes == NULL)
        return 1;	/* all allocating nodes allowed */
    if (alloc_node == NULL)
        return 1;	/* if no allocating node defined, let it go */

    hostlist_t hl = hostlist_create(part_ptr->allow_alloc_nodes);
    status = hostlist_find(hl, alloc_node);
    hostlist_destroy(hl);

    if (status == -1)
        status = 0;
    else
        status = 1;

    return status;
}
extern void scontrol_print_completing_job(job_info_t *job_ptr,
                                          node_info_msg_t *node_info_msg)
{
    int i;
    node_info_t *node_info;
    hostlist_t all_nodes, comp_nodes, down_nodes;
    char *node_buf;

    all_nodes  = hostlist_create(job_ptr->nodes);
    comp_nodes = hostlist_create("");
    down_nodes = hostlist_create("");

    for (i = 0; i < node_info_msg->record_count; i++) {
        node_info = &(node_info_msg->node_array[i]);
        if (IS_NODE_COMPLETING(node_info) &&
            (_in_node_bit_list(i, job_ptr->node_inx)))
            hostlist_push_host(comp_nodes, node_info->name);
        else if (IS_NODE_DOWN(node_info) &&
                 (hostlist_find(all_nodes, node_info->name) != -1))
            hostlist_push_host(down_nodes, node_info->name);
    }

    fprintf(stdout, "JobId=%u ", job_ptr->job_id);

    node_buf = hostlist_ranged_string_xmalloc(comp_nodes);
    if (node_buf && node_buf[0])
        fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf);
    xfree(node_buf);

    node_buf = hostlist_ranged_string_xmalloc(down_nodes);
    if (node_buf && node_buf[0])
        fprintf(stdout, "Nodes(DOWN)=%s ", node_buf);
    xfree(node_buf);
    fprintf(stdout, "\n");

    hostlist_destroy(all_nodes);
    hostlist_destroy(comp_nodes);
    hostlist_destroy(down_nodes);
}
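The function above builds host lists one name at a time and then collapses them into compact range strings for printing. A small sketch of that push/collapse idiom through SLURM's public API; the node names are hypothetical.

#include <stdio.h>
#include <slurm/slurm.h>

/* Sketch of the push/collapse idiom; link with -lslurm. */
int main(void)
{
    char buf[256];
    hostlist_t hl = slurm_hostlist_create("");

    slurm_hostlist_push_host(hl, "node1");
    slurm_hostlist_push_host(hl, "node2");
    slurm_hostlist_push_host(hl, "node3");

    slurm_hostlist_ranged_string(hl, sizeof(buf), buf);
    printf("%s\n", buf);    /* expected: node[1-3] */

    slurm_hostlist_destroy(hl);
    return 0;
}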
int slurm_job_cpus_allocated_str_on_node(char *cpus, size_t cpus_len,
                                         job_resources_t *job_resrcs_ptr,
                                         const char *node)
{
    hostlist_t node_hl;
    int node_id;

    if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
        slurm_seterrno_ret(EINVAL);

    node_hl = hostlist_create(job_resrcs_ptr->nodes);
    node_id = hostlist_find(node_hl, node);
    hostlist_destroy(node_hl);
    if (node_id == -1)
        return SLURM_ERROR;

    return slurm_job_cpus_allocated_str_on_node_id(cpus, cpus_len,
                                                   job_resrcs_ptr, node_id);
}
/*
 * setup_cluster_nodes - get cluster record list within requested
 *   time period with used nodes. Used for deciding whether a nodelist is
 *   overlapping with the required nodes.
 */
extern cluster_nodes_t *
setup_cluster_nodes(pgsql_conn_t *pg_conn, slurmdb_job_cond_t *job_cond)
{
    DEF_VARS;
    cluster_nodes_t *cnodes = NULL;
    time_t now = time(NULL);
    hostlist_t temp_hl = NULL;
    hostlist_iterator_t h_itr = NULL;

    if (!job_cond || !job_cond->used_nodes)
        return NULL;

    if (!job_cond->cluster_list || list_count(job_cond->cluster_list) != 1) {
        error("If you are doing a query against nodes "
              "you must only have 1 cluster "
              "you are asking for.");
        return NULL;
    }

    temp_hl = hostlist_create(job_cond->used_nodes);
    if (!hostlist_count(temp_hl)) {
        error("we didn't get any real hosts to look for.");
        hostlist_destroy(temp_hl);
        return NULL;
    }

    query = xstrdup_printf("SELECT cluster_nodes, time_start, "
                           "time_end FROM %s.%s WHERE node_name='' "
                           "AND cluster_nodes !=''",
                           (char *)list_peek(job_cond->cluster_list),
                           event_table);
    if (job_cond->usage_start) {
        if (!job_cond->usage_end)
            job_cond->usage_end = now;
        xstrfmtcat(query, " AND ((time_start<%ld) "
                   "AND (time_end>=%ld OR time_end=0))",
                   job_cond->usage_end, job_cond->usage_start);
    }
    result = DEF_QUERY_RET;
    if (!result) {
        hostlist_destroy(temp_hl);
        return NULL;
    }

    h_itr = hostlist_iterator_create(temp_hl);
    cnodes = xmalloc(sizeof(cluster_nodes_t));
    cnodes->cluster_list = list_create(_destroy_local_cluster);

    FOR_EACH_ROW {
        char *host = NULL;
        int loc = 0;
        local_cluster_t *local_cluster =
            xmalloc(sizeof(local_cluster_t));

        local_cluster->hl = hostlist_create(ROW(0));
        local_cluster->start = atoi(ROW(1));
        local_cluster->end   = atoi(ROW(2));
        local_cluster->asked_bitmap =
            bit_alloc(hostlist_count(local_cluster->hl));
        while ((host = hostlist_next(h_itr))) {
            if ((loc = hostlist_find(local_cluster->hl, host)) != -1)
                bit_set(local_cluster->asked_bitmap, loc);
            free(host);
        }
        hostlist_iterator_reset(h_itr);
        if (bit_ffs(local_cluster->asked_bitmap) != -1) {
            list_append(cnodes->cluster_list, local_cluster);
            if (local_cluster->end == 0) {
                local_cluster->end = now;
                cnodes->curr_cluster = local_cluster;
            }
        } else
            _destroy_local_cluster(local_cluster);
    } END_EACH_ROW;
    PQclear(result);
    hostlist_iterator_destroy(h_itr);

    if (!list_count(cnodes->cluster_list)) {
        destroy_cluster_nodes(cnodes);
        cnodes = NULL;
    }
    hostlist_destroy(temp_hl);
    return cnodes;
}
static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr,
                          uint32_t node_scaling)
{
    uint16_t base_state;
    uint16_t used_cpus = 0, error_cpus = 0;
    int total_cpus = 0, total_nodes = 0;
    /* since node_scaling could be less here, we need to use the
     * global node scaling which should never change. */
    int single_node_cpus = (node_ptr->cpus / g_node_scaling);

    base_state = node_ptr->node_state & NODE_STATE_BASE;

    if (sinfo_ptr->nodes_total == 0) {	/* first node added */
        sinfo_ptr->node_state  = node_ptr->node_state;
        sinfo_ptr->features    = node_ptr->features;
        sinfo_ptr->gres        = node_ptr->gres;
        sinfo_ptr->reason      = node_ptr->reason;
        sinfo_ptr->reason_time = node_ptr->reason_time;
        sinfo_ptr->reason_uid  = node_ptr->reason_uid;
        sinfo_ptr->min_cpus    = node_ptr->cpus;
        sinfo_ptr->max_cpus    = node_ptr->cpus;
        sinfo_ptr->min_sockets = node_ptr->sockets;
        sinfo_ptr->max_sockets = node_ptr->sockets;
        sinfo_ptr->min_cores   = node_ptr->cores;
        sinfo_ptr->max_cores   = node_ptr->cores;
        sinfo_ptr->min_threads = node_ptr->threads;
        sinfo_ptr->max_threads = node_ptr->threads;
        sinfo_ptr->min_disk    = node_ptr->tmp_disk;
        sinfo_ptr->max_disk    = node_ptr->tmp_disk;
        sinfo_ptr->min_mem     = node_ptr->real_memory;
        sinfo_ptr->max_mem     = node_ptr->real_memory;
        sinfo_ptr->min_weight  = node_ptr->weight;
        sinfo_ptr->max_weight  = node_ptr->weight;
        sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
        sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
        sinfo_ptr->max_cpus_per_node =
            sinfo_ptr->part_info->max_cpus_per_node;
        sinfo_ptr->version     = node_ptr->version;
    } else if (hostlist_find(sinfo_ptr->nodes, node_ptr->name) != -1) {
        /* we already have this node in this record,
         * just return, don't duplicate */
        return;
    } else {
        if (sinfo_ptr->min_cpus > node_ptr->cpus)
            sinfo_ptr->min_cpus = node_ptr->cpus;
        if (sinfo_ptr->max_cpus < node_ptr->cpus)
            sinfo_ptr->max_cpus = node_ptr->cpus;

        if (sinfo_ptr->min_sockets > node_ptr->sockets)
            sinfo_ptr->min_sockets = node_ptr->sockets;
        if (sinfo_ptr->max_sockets < node_ptr->sockets)
            sinfo_ptr->max_sockets = node_ptr->sockets;

        if (sinfo_ptr->min_cores > node_ptr->cores)
            sinfo_ptr->min_cores = node_ptr->cores;
        if (sinfo_ptr->max_cores < node_ptr->cores)
            sinfo_ptr->max_cores = node_ptr->cores;

        if (sinfo_ptr->min_threads > node_ptr->threads)
            sinfo_ptr->min_threads = node_ptr->threads;
        if (sinfo_ptr->max_threads < node_ptr->threads)
            sinfo_ptr->max_threads = node_ptr->threads;

        if (sinfo_ptr->min_disk > node_ptr->tmp_disk)
            sinfo_ptr->min_disk = node_ptr->tmp_disk;
        if (sinfo_ptr->max_disk < node_ptr->tmp_disk)
            sinfo_ptr->max_disk = node_ptr->tmp_disk;

        if (sinfo_ptr->min_mem > node_ptr->real_memory)
            sinfo_ptr->min_mem = node_ptr->real_memory;
        if (sinfo_ptr->max_mem < node_ptr->real_memory)
            sinfo_ptr->max_mem = node_ptr->real_memory;

        if (sinfo_ptr->min_weight > node_ptr->weight)
            sinfo_ptr->min_weight = node_ptr->weight;
        if (sinfo_ptr->max_weight < node_ptr->weight)
            sinfo_ptr->max_weight = node_ptr->weight;

        if (sinfo_ptr->min_cpu_load > node_ptr->cpu_load)
            sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
        if (sinfo_ptr->max_cpu_load < node_ptr->cpu_load)
            sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
    }

    hostlist_push_host(sinfo_ptr->nodes, node_ptr->name);
    if (params.match_flags.node_addr_flag)
        hostlist_push_host(sinfo_ptr->node_addr, node_ptr->node_addr);
    if (params.match_flags.hostnames_flag)
        hostlist_push_host(sinfo_ptr->hostnames, node_ptr->node_hostname);

    total_cpus = node_ptr->cpus;
    total_nodes = node_scaling;

    select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
                                 SELECT_NODEDATA_SUBCNT,
                                 NODE_STATE_ALLOCATED,
                                 &used_cpus);
    select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
                                 SELECT_NODEDATA_SUBCNT,
                                 NODE_STATE_ERROR,
                                 &error_cpus);

    if (params.cluster_flags & CLUSTER_FLAG_BG) {
        if (!params.match_flags.state_flag &&
            (used_cpus || error_cpus)) {
            /* We only get one shot at this (because all states
             * are combined together), so we need to make
             * sure we get all the subgrps accounted. (So use
             * g_node_scaling for safe measure) */
            total_nodes = g_node_scaling;

            sinfo_ptr->nodes_alloc += used_cpus;
            sinfo_ptr->nodes_other += error_cpus;
            sinfo_ptr->nodes_idle +=
                (total_nodes - (used_cpus + error_cpus));
            used_cpus  *= single_node_cpus;
            error_cpus *= single_node_cpus;
        } else {
            /* process only for this subgrp and then return */
            total_cpus = total_nodes * single_node_cpus;

            if ((base_state == NODE_STATE_ALLOCATED) ||
                (base_state == NODE_STATE_MIXED) ||
                (node_ptr->node_state & NODE_STATE_COMPLETING)) {
                sinfo_ptr->nodes_alloc += total_nodes;
                sinfo_ptr->cpus_alloc += total_cpus;
            } else if (IS_NODE_DRAIN(node_ptr) ||
                       (base_state == NODE_STATE_DOWN)) {
                sinfo_ptr->nodes_other += total_nodes;
                sinfo_ptr->cpus_other += total_cpus;
            } else {
                sinfo_ptr->nodes_idle += total_nodes;
                sinfo_ptr->cpus_idle += total_cpus;
            }
            sinfo_ptr->nodes_total += total_nodes;
            sinfo_ptr->cpus_total += total_cpus;
            return;
        }
    } else {
        if ((base_state == NODE_STATE_ALLOCATED) ||
            (base_state == NODE_STATE_MIXED) ||
            IS_NODE_COMPLETING(node_ptr))
            sinfo_ptr->nodes_alloc += total_nodes;
        else if (IS_NODE_DRAIN(node_ptr) ||
                 (base_state == NODE_STATE_DOWN))
            sinfo_ptr->nodes_other += total_nodes;
        else
            sinfo_ptr->nodes_idle += total_nodes;
    }

    sinfo_ptr->nodes_total += total_nodes;
    sinfo_ptr->cpus_alloc += used_cpus;
    sinfo_ptr->cpus_total += total_cpus;
    total_cpus -= used_cpus + error_cpus;

    if (error_cpus) {
        sinfo_ptr->cpus_idle += total_cpus;
        sinfo_ptr->cpus_other += error_cpus;
    } else if (IS_NODE_DRAIN(node_ptr) ||
               (base_state == NODE_STATE_DOWN)) {
        sinfo_ptr->cpus_other += total_cpus;
    } else
        sinfo_ptr->cpus_idle += total_cpus;
}
/*
 * _filter_out - Determine if the specified node should be filtered out or
 *	reported.
 * node_ptr IN - node to consider filtering out
 * RET - true if node should not be reported, false otherwise
 */
static bool _filter_out(node_info_t *node_ptr)
{
    static hostlist_t host_list = NULL;

    if (params.nodes) {
        if (host_list == NULL)
            host_list = hostlist_create(params.nodes);
        if (hostlist_find(host_list, node_ptr->name) == -1)
            return true;
    }

    if (params.dead_nodes && !IS_NODE_NO_RESPOND(node_ptr))
        return true;

    if (params.responding_nodes && IS_NODE_NO_RESPOND(node_ptr))
        return true;

    if (params.state_list) {
        int *node_state;
        bool match = false;
        uint16_t base_state;
        ListIterator iterator;
        uint16_t cpus = 0;
        node_info_t tmp_node, *tmp_node_ptr = &tmp_node;

        iterator = list_iterator_create(params.state_list);
        while ((node_state = list_next(iterator))) {
            tmp_node_ptr->node_state = *node_state;
            if (*node_state == NODE_STATE_DRAIN) {
                /* We search for anything that has the
                 * drain flag set */
                if (IS_NODE_DRAIN(node_ptr)) {
                    match = true;
                    break;
                }
            } else if (IS_NODE_DRAINING(tmp_node_ptr)) {
                /* We search for anything that gets mapped to
                 * DRAINING in node_state_string */
                if (IS_NODE_DRAINING(node_ptr)) {
                    match = true;
                    break;
                }
            } else if (IS_NODE_DRAINED(tmp_node_ptr)) {
                /* We search for anything that gets mapped to
                 * DRAINED in node_state_string */
                if (IS_NODE_DRAINED(node_ptr)) {
                    match = true;
                    break;
                }
            } else if (*node_state & NODE_STATE_FLAGS) {
                if (*node_state & node_ptr->node_state) {
                    match = true;
                    break;
                }
            } else if (*node_state == NODE_STATE_ERROR) {
                slurm_get_select_nodeinfo(
                    node_ptr->select_nodeinfo,
                    SELECT_NODEDATA_SUBCNT,
                    NODE_STATE_ERROR,
                    &cpus);
                if (cpus) {
                    match = true;
                    break;
                }
            } else if (*node_state == NODE_STATE_ALLOCATED) {
                slurm_get_select_nodeinfo(
                    node_ptr->select_nodeinfo,
                    SELECT_NODEDATA_SUBCNT,
                    NODE_STATE_ALLOCATED,
                    &cpus);
                if (params.cluster_flags & CLUSTER_FLAG_BG
                    && !cpus
                    && (IS_NODE_ALLOCATED(node_ptr) ||
                        IS_NODE_COMPLETING(node_ptr)))
                    cpus = node_ptr->cpus;
                if (cpus) {
                    match = true;
                    break;
                }
            } else if (*node_state == NODE_STATE_IDLE) {
                base_state = node_ptr->node_state &
                    (~NODE_STATE_NO_RESPOND);
                if (base_state == NODE_STATE_IDLE) {
                    match = true;
                    break;
                }
            } else {
                base_state =
                    node_ptr->node_state & NODE_STATE_BASE;
                if (base_state == *node_state) {
                    match = true;
                    break;
                }
            }
        }
        list_iterator_destroy(iterator);
        if (!match)
            return true;
    }

    return false;
}
/*
 * _build_sinfo_data - make a sinfo_data entry for each unique node
 *	configuration and add it to the sinfo_list for later printing.
 * sinfo_list IN/OUT - list of unique sinfo_data records to report
 * partition_msg IN - partition info message
 * node_msg IN - node info message
 * RET zero or error code
 */
static int _build_sinfo_data(List sinfo_list,
                             partition_info_msg_t *partition_msg,
                             node_info_msg_t *node_msg)
{
    pthread_attr_t attr_sinfo;
    pthread_t thread_sinfo;
    build_part_info_t *build_struct_ptr;
    node_info_t *node_ptr = NULL;
    partition_info_t *part_ptr = NULL;
    int j;

    g_node_scaling = node_msg->node_scaling;

    /* by default every partition is shown, even if no nodes */
    if ((!params.node_flag) && params.match_flags.partition_flag) {
        part_ptr = partition_msg->partition_array;
        for (j = 0; j < partition_msg->record_count; j++, part_ptr++) {
            if ((!params.partition) ||
                (_strcmp(params.partition, part_ptr->name) == 0)) {
                list_append(sinfo_list, _create_sinfo(
                                part_ptr, (uint16_t) j, NULL,
                                node_msg->node_scaling));
            }
        }
    }

    if (params.filtering) {
        for (j = 0; j < node_msg->record_count; j++) {
            node_ptr = &(node_msg->node_array[j]);
            if (node_ptr->name && _filter_out(node_ptr))
                xfree(node_ptr->name);
        }
    }

    /* make sinfo_list entries for every node in every partition */
    for (j = 0; j < partition_msg->record_count; j++, part_ptr++) {
        part_ptr = &(partition_msg->partition_array[j]);

        if (params.filtering && params.partition &&
            _strcmp(part_ptr->name, params.partition))
            continue;

        if (node_msg->record_count == 1) { /* node_name_single */
            int pos = -1;
            uint16_t subgrp_size = 0;
            hostlist_t hl;

            node_ptr = &(node_msg->node_array[0]);
            if ((node_ptr->name == NULL) ||
                (part_ptr->nodes == NULL))
                continue;
            hl = hostlist_create(part_ptr->nodes);
            pos = hostlist_find(hl, node_msg->node_array[0].name);
            hostlist_destroy(hl);
            if (pos < 0)
                continue;
            if (select_g_select_nodeinfo_get(
                    node_ptr->select_nodeinfo,
                    SELECT_NODEDATA_SUBGRP_SIZE,
                    0,
                    &subgrp_size) == SLURM_SUCCESS
                && subgrp_size) {
                _handle_subgrps(sinfo_list,
                                (uint16_t) j,
                                part_ptr,
                                node_ptr,
                                node_msg->node_scaling);
            } else {
                _insert_node_ptr(sinfo_list,
                                 (uint16_t) j,
                                 part_ptr,
                                 node_ptr,
                                 node_msg->node_scaling);
            }
            continue;
        }

        /* Process each partition using a separate thread */
        build_struct_ptr = xmalloc(sizeof(build_part_info_t));
        build_struct_ptr->node_msg   = node_msg;
        build_struct_ptr->part_num   = (uint16_t) j;
        build_struct_ptr->part_ptr   = part_ptr;
        build_struct_ptr->sinfo_list = sinfo_list;

        slurm_mutex_lock(&sinfo_cnt_mutex);
        sinfo_cnt++;
        slurm_mutex_unlock(&sinfo_cnt_mutex);

        slurm_attr_init(&attr_sinfo);
        if (pthread_attr_setdetachstate
            (&attr_sinfo, PTHREAD_CREATE_DETACHED))
            error("pthread_attr_setdetachstate error %m");
        while (pthread_create(&thread_sinfo, &attr_sinfo,
                              _build_part_info,
                              (void *) build_struct_ptr)) {
            error("pthread_create error %m");
            usleep(10000);	/* sleep and retry */
        }
        slurm_attr_destroy(&attr_sinfo);
    }

    slurm_mutex_lock(&sinfo_cnt_mutex);
    while (sinfo_cnt) {
        pthread_cond_wait(&sinfo_cnt_cond, &sinfo_cnt_mutex);
    }
    slurm_mutex_unlock(&sinfo_cnt_mutex);

    _sort_hostlist(sinfo_list);
    return SLURM_SUCCESS;
}
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
                    size_t nprocs, pmixp_coll_type_t type)
{
    hostlist_t hl;
    uint32_t nodeid = 0, nodes = 0;
    int parent_id, depth, max_depth, tmp;
    int width, my_nspace = -1;
    char *p;
    int i, *ch_nodeids = NULL;

#ifndef NDEBUG
    coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
    coll->type = type;
    coll->state = PMIXP_COLL_SYNC;
    coll->procs = xmalloc(sizeof(*procs) * nprocs);
    memcpy(coll->procs, procs, sizeof(*procs) * nprocs);
    coll->nprocs = nprocs;
    coll->my_nspace = my_nspace;

    if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
        /* TODO: provide ranges output routine */
        PMIXP_ERROR("Bad ranges information");
        goto err_exit;
    }

    width = slurm_get_tree_width();
    nodes = hostlist_count(hl);
    nodeid = hostlist_find(hl, pmixp_info_hostname());
    reverse_tree_info(nodeid, nodes, width, &parent_id, &tmp, &depth,
                      &max_depth);
    coll->children_cnt = tmp;
    coll->nodeid = nodeid;

    /* We are interested in the number of direct children */
    coll->seq = 0;
    coll->contrib_cntr = 0;
    coll->contrib_local = false;
    ch_nodeids = xmalloc(sizeof(int) * width);
    coll->ch_contribs = xmalloc(sizeof(int) * width);
    coll->children_cnt = reverse_tree_direct_children(nodeid, nodes, width,
                                                      depth, ch_nodeids);

    /* create the hostlist with the direct children's hostnames */
    coll->ch_hosts = hostlist_create("");
    for (i = 0; i < coll->children_cnt; i++) {
        char *hname = hostlist_nth(hl, ch_nodeids[i]);
        hostlist_push(coll->ch_hosts, hname);
        free(hname);	/* hostlist_push() copies the name */
    }
    /* just in case, shouldn't be needed */
    hostlist_uniq(coll->ch_hosts);
    xfree(ch_nodeids);

    if (parent_id == -1) {
        /* if we are the root of the tree:
         * - we don't have a parent;
         * - we have a large list of all children (we don't want
         *   ourselves there) */
        coll->parent_host = NULL;
        hostlist_delete_host(hl, pmixp_info_hostname());
        coll->all_children = hl;
    } else if (parent_id >= 0) {
        /* for all other nodes in the tree we need to know:
         * - the nodename of our parent;
         * - we don't need a list of all children and hl anymore */
        p = hostlist_nth(hl, parent_id);
        coll->parent_host = xstrdup(p);
        /* use empty hostlist here */
        coll->all_children = hostlist_create("");
        free(p);
        hostlist_destroy(hl);
    }

    /* Collective data */
    coll->buf = pmixp_server_new_buf();
    coll->serv_offs = get_buf_offset(coll->buf);

    if (SLURM_SUCCESS != _pack_ranges(coll)) {
        PMIXP_ERROR("Cannot pack ranges to coll message header!");
        goto err_exit;
    }

    /* Callback information */
    coll->cbdata = NULL;
    coll->cbfunc = NULL;

    /* init fine grained lock */
    slurm_mutex_init(&coll->lock);

    return SLURM_SUCCESS;
err_exit:
    return SLURM_ERROR;
}
/*
 * The pack_node_list may not be ordered across multiple components, which can
 * cause problems for some MPI implementations. Put the pack_node_list records
 * in alphabetic order and reorder pack_task_cnts pack_tids to match
 */
static void _reorder_pack_recs(char **in_node_list,
                               uint16_t **in_task_cnts,
                               uint32_t ***in_tids,
                               int total_nnodes)
{
    hostlist_t in_hl, out_hl;
    uint16_t *out_task_cnts = NULL;
    uint32_t **out_tids = NULL;
    char *hostname;
    int i, j;

    in_hl = hostlist_create(*in_node_list);
    if (!in_hl) {
        error("%s: Invalid hostlist(%s)", __func__, *in_node_list);
        return;
    }
    out_hl = hostlist_copy(in_hl);
    hostlist_sort(out_hl);
    hostlist_uniq(out_hl);
    i = hostlist_count(out_hl);
    if (i != total_nnodes) {
        error("%s: Invalid hostlist(%s) count(%d)", __func__,
              *in_node_list, total_nnodes);
        goto fini;
    }

    out_task_cnts = xmalloc(sizeof(uint16_t) * total_nnodes);
    out_tids = xmalloc(sizeof(uint32_t *) * total_nnodes);
    for (i = 0; i < total_nnodes; i++) {
        hostname = hostlist_nth(out_hl, i);
        if (!hostname) {
            error("%s: Invalid hostlist(%s) count(%d)", __func__,
                  *in_node_list, total_nnodes);
            break;
        }
        j = hostlist_find(in_hl, hostname);
        if (j == -1) {
            error("%s: Invalid hostlist(%s) parsing", __func__,
                  *in_node_list);
            free(hostname);
            break;
        }
        out_task_cnts[i] = in_task_cnts[0][j];
        out_tids[i] = in_tids[0][j];
        free(hostname);
    }

    if (i >= total_nnodes) {	/* Success */
        xfree(*in_node_list);
        *in_node_list = hostlist_ranged_string_xmalloc(out_hl);
        xfree(*in_task_cnts);
        *in_task_cnts = out_task_cnts;
        out_task_cnts = NULL;
        xfree(*in_tids);
        *in_tids = out_tids;
        out_tids = NULL;
    }

#if 0
    info("NODE_LIST[%d]:%s", total_nnodes, *in_node_list);
    for (i = 0; i < total_nnodes; i++) {
        info("TASK_CNT[%d]:%u", i, in_task_cnts[0][i]);
        for (j = 0; j < in_task_cnts[0][i]; j++) {
            info("TIDS[%d][%d]: %u", i, j, in_tids[0][i][j]);
        }
    }
#endif

fini:
    hostlist_destroy(in_hl);
    hostlist_destroy(out_hl);
    xfree(out_task_cnts);
    xfree(out_tids);
}
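The reorder above builds a permutation: position i in the sorted list maps back to position j = hostlist_find(in_hl, name) in the original list, and the task counts/ids follow that mapping. A small sketch of just the mapping, using the same internal hostlist calls the function itself uses (the include path and node names are assumptions):

#include <stdio.h>
#include <stdlib.h>
#include "src/common/hostlist.h"  /* SLURM internal API; path is an assumption */

/* Sketch of the sorted-to-original index mapping in _reorder_pack_recs(). */
int main(void)
{
    hostlist_t in_hl = hostlist_create("node3,node1,node2");
    hostlist_t out_hl = hostlist_copy(in_hl);

    hostlist_sort(out_hl);
    hostlist_uniq(out_hl);

    for (int i = 0; i < hostlist_count(out_hl); i++) {
        char *name = hostlist_nth(out_hl, i);  /* caller must free() */
        printf("sorted[%d]=%s came from original index %d\n",
               i, name, hostlist_find(in_hl, name));
        free(name);
    }
    hostlist_destroy(in_hl);
    hostlist_destroy(out_hl);
    return 0;
}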
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
    int nodeid;
    char *data = NULL;
    uint32_t size;
    char *state = NULL;

    PMIXP_DEBUG("%s:%d: get contribution from node %s",
                pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

    /* lock the structure */
    slurm_mutex_lock(&coll->lock);

    pmixp_coll_sanity_check(coll);

    /* fix the collective status if needed */
    if (PMIXP_COLL_SYNC == coll->state) {
        PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_IN",
                    pmixp_info_namespace(), pmixp_info_nodeid(),
                    nodename);
        coll->state = PMIXP_COLL_FAN_IN;
        coll->ts = time(NULL);
    } else if (PMIXP_COLL_FAN_OUT == coll->state) {
        PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_OUT_IN"
                    " (next collective!)",
                    pmixp_info_namespace(), pmixp_info_nodeid(),
                    nodename);
        coll->state = PMIXP_COLL_FAN_OUT_IN;
        coll->ts_next = time(NULL);
    }
    xassert(PMIXP_COLL_FAN_IN == coll->state ||
            PMIXP_COLL_FAN_OUT_IN == coll->state);

    /* Because of possible timeouts/delays in transmission we
     * can receive a contribution a second time. Avoid duplications
     * by checking our records. */
    nodeid = hostlist_find(coll->ch_hosts, nodename);
    xassert(0 <= nodeid);
    if (0 > nodeid) {
        /* protect ourselves if we are running with no asserts */
        goto proceed;
    }

    if (0 < coll->ch_contribs[nodeid]) {
        /* May be 0 or 1. If greater - transmission skew, ignore. */
        PMIXP_DEBUG("Multiple contributions from child_id=%d, hostname=%s",
                    nodeid, nodename);
        /* this is duplication, skip. */
        goto proceed;
    }

    data = get_buf_data(buf) + get_buf_offset(buf);
    size = remaining_buf(buf);
    grow_buf(coll->buf, size);
    memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf),
           data, size);
    set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

    /* increase number of individual contributions */
    coll->ch_contribs[nodeid]++;

    /* increase number of total contributions */
    coll->contrib_cntr++;

proceed:
    /* unlock the structure */
    slurm_mutex_unlock(&coll->lock);

    if (PMIXP_COLL_FAN_IN == coll->state) {
        /* make progress if we are in fan-in state */
        _progress_fan_in(coll);
    }

    switch (coll->state) {
    case PMIXP_COLL_SYNC:
        state = "sync";
        break;
    case PMIXP_COLL_FAN_IN:
        state = "fan-in";
        break;
    case PMIXP_COLL_FAN_OUT:
        state = "fan-out";
        break;
    case PMIXP_COLL_FAN_OUT_IN:
        state = "fan-out-in";
        break;
    }

    PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
                pmixp_info_namespace(), pmixp_info_nodeid(), nodename,
                state);

    return SLURM_SUCCESS;
}
void *
cerebrod_listener(void *arg)
{
    char buf[CEREBRO_MAX_PACKET_LEN];

    _cerebrod_listener_initialize();

    for (;;) {
        struct cerebrod_message *msg;
        char nodename_buf[CEREBRO_MAX_NODENAME_LEN+1];
        char nodename_key[CEREBRO_MAX_NODENAME_LEN+1];
        struct timeval tv;
        int in_cluster_flag, i, count;
        fd_set readfds;
        int recv_len = 0;
        int maxfd = 0;

        FD_ZERO(&readfds);
        Pthread_mutex_lock(&listener_fds_lock);
        for (i = 0; i < conf.listen_message_config_len; i++) {
            if (listener_fds[i] > maxfd)
                maxfd = listener_fds[i];
            FD_SET(listener_fds[i], &readfds);
        }

        count = Select(maxfd + 1, &readfds, NULL, NULL, NULL);

        for (i = 0; i < conf.listen_message_config_len; i++) {
            if (FD_ISSET(listener_fds[i], &readfds)) {
                if ((recv_len = recvfrom(listener_fds[i],
                                         buf,
                                         CEREBRO_MAX_PACKET_LEN,
                                         0, NULL, NULL)) < 0)
                    listener_fds[i] =
                        cerebrod_reinit_socket(listener_fds[i],
                                               i,
                                               _listener_setup_socket,
                                               "listener: recvfrom");
                break;
            }
        }
        Pthread_mutex_unlock(&listener_fds_lock);

        /* No packet read */
        if (recv_len <= 0)
            continue;

        if (recv_len >= CEREBRO_MAX_PACKET_LEN) {
            CEREBROD_DBG(("received truncated packet"));
            continue;
        }

        if (_cerebrod_message_check_version(buf, recv_len) < 0) {
            CEREBROD_DBG(("received invalid version packet"));
            continue;
        }

        if (!(msg = _cerebrod_message_unmarshall(buf, recv_len))) {
            CEREBROD_DBG(("received unmarshallable packet"));
            continue;
        }

        _cerebrod_message_dump(msg, "Received Message");

        /* Guarantee ending '\0' character */
        memset(nodename_buf, '\0', CEREBRO_MAX_NODENAME_LEN+1);
        memcpy(nodename_buf, msg->nodename, CEREBRO_MAX_NODENAME_LEN);

        if (!strlen(nodename_buf)) {
            CEREBROD_DBG(("received null nodename"));
            cerebrod_message_destroy(msg);
            continue;
        }

        if (found_clusterlist_module) {
            if ((in_cluster_flag =
                 clusterlist_module_node_in_cluster(clusterlist_handle,
                                                    nodename_buf)) < 0)
                CEREBROD_EXIT(("clusterlist_module_node_in_cluster: %s",
                               nodename_buf));

            /* Second chance, is this data being forwarded from another host */
            if (!in_cluster_flag) {
                if (Hostlist_find(conf.forward_host_accept,
                                  nodename_buf) >= 0)
                    in_cluster_flag++;
            }
        } else
            /* must assume it is in the cluster */
            /* Note, there is no need to handle 'forward_host_accept'
             * under this case, since we don't know if it is in the
             * cluster or not anyways. */
            in_cluster_flag = 1;

        if (!in_cluster_flag) {
            CEREBROD_DBG(("received non-cluster packet: %s",
                          nodename_buf));
            cerebrod_message_destroy(msg);
            continue;
        }

        memset(nodename_key, '\0', CEREBRO_MAX_NODENAME_LEN+1);

        if (found_clusterlist_module) {
            if (clusterlist_module_get_nodename(clusterlist_handle,
                                                nodename_buf,
                                                nodename_key,
                                                CEREBRO_MAX_NODENAME_LEN+1) < 0) {
                CEREBROD_DBG(("clusterlist_module_get_nodename: %s",
                              nodename_buf));
                cerebrod_message_destroy(msg);
                continue;
            }
        } else
            memcpy(nodename_key, nodename_buf,
                   CEREBRO_MAX_NODENAME_LEN+1);

        Gettimeofday(&tv, NULL);
        cerebrod_listener_data_update(nodename_key, msg, tv.tv_sec);

        /* Forward data as necessary. Note, there is no need to
         * marshall data, it should already be marshalled when we
         * read it earlier.
         */
        for (i = 0; i < conf.forward_message_config_len; i++) {
            /* if the forward destination is local to the machine,
             * don't forward */
            if (conf.forward_message_config[i].ip_is_local)
                continue;

            if (!forwarding_info[i].hosts
                || hostlist_find(forwarding_info[i].hosts,
                                 nodename_key) >= 0) {
                struct sockaddr *addr;
                struct sockaddr_in msgaddr;
                unsigned int addrlen;
                int rv;

                memset(&msgaddr, '\0', sizeof(struct sockaddr_in));
                msgaddr.sin_family = AF_INET;
                msgaddr.sin_port =
                    htons(conf.forward_message_config[i].destination_port);
                memcpy(&msgaddr.sin_addr,
                       &conf.forward_message_config[i].ip_in_addr,
                       sizeof(struct in_addr));

                addr = (struct sockaddr *)&msgaddr;
                addrlen = sizeof(struct sockaddr_in);

                _cerebrod_message_dump(msg, "Forwarding Message");

                Pthread_mutex_lock(&forwarding_info[i].lock);
                if ((rv = sendto(forwarding_info[i].fd,
                                 buf, recv_len, 0,
                                 addr, addrlen)) != recv_len) {
                    if (rv < 0)
                        forwarding_info[i].fd =
                            cerebrod_reinit_socket(forwarding_info[i].fd,
                                                   i,
                                                   _forwarding_setup_socket,
                                                   "forwarding: sendto");
                    else
                        CEREBROD_ERR(("sendto: invalid bytes sent"));
                }
                Pthread_mutex_unlock(&forwarding_info[i].lock);
            }
        }

        cerebrod_message_destroy(msg);
    }

    return NULL;	/* NOT REACHED */
}
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
                    size_t nprocs, pmixp_coll_type_t type)
{
    hostlist_t hl;
    int max_depth, width, depth, i;
    char *p;

#ifndef NDEBUG
    coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
    coll->type = type;
    coll->state = PMIXP_COLL_SYNC;
    coll->pset.procs = xmalloc(sizeof(*procs) * nprocs);
    coll->pset.nprocs = nprocs;
    memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs);

    if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
        /* TODO: provide ranges output routine */
        PMIXP_ERROR("Bad ranges information");
        goto err_exit;
    }
#ifdef PMIXP_COLL_DEBUG
    /* if we debug collectives - store a copy of the full
     * hostlist to resolve a participant id to its hostname */
    coll->peers_hl = hostlist_copy(hl);
#endif

    width = slurm_get_tree_width();
    coll->peers_cnt = hostlist_count(hl);
    coll->my_peerid = hostlist_find(hl, pmixp_info_hostname());
    reverse_tree_info(coll->my_peerid, coll->peers_cnt, width,
                      &coll->prnt_peerid, &coll->chldrn_cnt, &depth,
                      &max_depth);

    /* We are interested in the number of direct children */
    coll->seq = 0;
    coll->contrib_children = 0;
    coll->contrib_local = false;
    coll->chldrn_ids = xmalloc(sizeof(int) * width);
    coll->contrib_chld = xmalloc(sizeof(int) * width);
    coll->chldrn_cnt = reverse_tree_direct_children(coll->my_peerid,
                                                    coll->peers_cnt,
                                                    width, depth,
                                                    coll->chldrn_ids);
    if (coll->prnt_peerid == -1) {
        /* if we are the root of the tree:
         * - we don't have a parent;
         * - we have a large list of all children (we don't want
         *   ourselves there) */
        coll->prnt_host = NULL;
        coll->all_chldrn_hl = hostlist_copy(hl);
        hostlist_delete_host(coll->all_chldrn_hl,
                             pmixp_info_hostname());
        coll->chldrn_str =
            hostlist_ranged_string_xmalloc(coll->all_chldrn_hl);
    } else {
        /* for all other nodes in the tree we need to know:
         * - the nodename of our parent;
         * - we don't need a list of all children and hl anymore */

        /*
         * setup parent id's
         */
        p = hostlist_nth(hl, coll->prnt_peerid);
        coll->prnt_host = xstrdup(p);
        free(p);
        /* reset prnt_peerid to the global peer */
        coll->prnt_peerid = pmixp_info_job_hostid(coll->prnt_host);

        /*
         * setup root id's
         * (we need this for the SLURM API communication case)
         */
        p = hostlist_nth(hl, 0);
        coll->root_host = xstrdup(p);
        free(p);
        /* reset root_peerid to the global peer */
        coll->root_peerid = pmixp_info_job_hostid(coll->root_host);

        /* use empty hostlist here */
        coll->all_chldrn_hl = hostlist_create("");
        coll->chldrn_str = NULL;
    }

    /* fixup children peer ids to the global ones */
    for (i = 0; i < coll->chldrn_cnt; i++) {
        p = hostlist_nth(hl, coll->chldrn_ids[i]);
        coll->chldrn_ids[i] = pmixp_info_job_hostid(p);
        free(p);
    }
    hostlist_destroy(hl);

    /* Collective state */
    coll->ufwd_buf = pmixp_server_buf_new();
    coll->dfwd_buf = pmixp_server_buf_new();
    _reset_coll_ufwd(coll);
    _reset_coll_dfwd(coll);
    coll->cbdata = NULL;
    coll->cbfunc = NULL;

    /* init fine grained lock */
    slurm_mutex_init(&coll->lock);

    return SLURM_SUCCESS;
err_exit:
    return SLURM_ERROR;
}
int pmixp_coll_contrib_parent(pmixp_coll_t *coll, uint32_t peerid,
                              uint32_t seq, Buf buf)
{
#ifdef PMIXP_COLL_DEBUG
    char *nodename = NULL;
    int lpeerid = -1;
#endif
    char *data_src = NULL, *data_dst = NULL;
    uint32_t size;
    int expected_peerid;

    /* lock the structure */
    slurm_mutex_lock(&coll->lock);

    if (pmixp_info_srv_direct_conn()) {
        expected_peerid = coll->prnt_peerid;
    } else {
        expected_peerid = coll->root_peerid;
    }

    /* Sanity check */
    pmixp_coll_sanity_check(coll);

    if (expected_peerid != peerid) {
        char *nodename = pmixp_info_job_host(peerid);
        /* protect ourselves if we are running with no asserts */
        PMIXP_ERROR("%p: parent contrib from bad nodeid=%s:%u, "
                    "expect=%d",
                    coll, nodename, peerid, expected_peerid);
        xfree(nodename);
        goto proceed;
    }

#ifdef PMIXP_COLL_DEBUG
    nodename = pmixp_info_job_host(peerid);
    lpeerid = hostlist_find(coll->peers_hl, nodename);
    /* Mark this event */
    PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d): state=%s, size=%u",
                coll, nodename, peerid, lpeerid,
                pmixp_coll_state2str(coll->state), remaining_buf(buf));
#endif

    switch (coll->state) {
    case PMIXP_COLL_SYNC:
    case PMIXP_COLL_COLLECT:
        /* It looks like a retransmission attempt when the remote side
         * identified a transmission failure, but we actually
         * successfully received the message */
#ifdef PMIXP_COLL_DEBUG
        PMIXP_DEBUG("%p: prev contrib from %s:%d(%d): "
                    "seq=%u, cur_seq=%u, state=%s",
                    coll, nodename, peerid, lpeerid,
                    seq, coll->seq,
                    pmixp_coll_state2str(coll->state));
#endif
        /* sanity check */
        if ((coll->seq - 1) != seq) {
            /* FATAL: should not happen in normal workflow */
            char *nodename = pmixp_info_job_host(peerid);
            PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
                        "contrib_seq = %d, coll->seq = %d, "
                        "state=%s",
                        coll, nodename, peerid,
                        seq, coll->seq,
                        pmixp_coll_state2str(coll->state));
            xfree(nodename);
            xassert((coll->seq - 1) == seq);
            abort();
        }
        goto proceed;
    case PMIXP_COLL_UPFWD_WSC: {
        /* we are not actually ready to receive this contribution as
         * the upward portion of the collective wasn't received yet.
         * This should not happen as SAPI (SLURM API) is blocking and
         * we should transit to PMIXP_COLL_UPFWD_WPC immediately */
        /* FATAL: should not happen in normal workflow */
        char *nodename = pmixp_info_job_host(peerid);
        PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
                    "contrib_seq = %d, coll->seq = %d, "
                    "state=%s",
                    coll, nodename, peerid,
                    seq, coll->seq,
                    pmixp_coll_state2str(coll->state));
        xfree(nodename);
        xassert((coll->seq - 1) == seq);
        abort();
    }
    case PMIXP_COLL_UPFWD:
    case PMIXP_COLL_UPFWD_WPC:
        /* we were waiting for this */
        break;
    case PMIXP_COLL_DOWNFWD:
        /* It looks like a retransmission attempt when the remote side
         * identified a transmission failure, but we actually
         * successfully received the message */
#ifdef PMIXP_COLL_DEBUG
        PMIXP_DEBUG("%p: double contrib from %s:%d(%d) "
                    "seq=%u, cur_seq=%u, state=%s",
                    coll, nodename, peerid, lpeerid,
                    seq, coll->seq, pmixp_coll_state2str(coll->state));
#endif
        /* sanity check */
        if (coll->seq != seq) {
            char *nodename = pmixp_info_job_host(peerid);
            /* FATAL: should not happen in normal workflow */
            PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
                        "seq = %d, coll->seq = %d, state=%s",
                        coll, nodename, peerid,
                        seq, coll->seq,
                        pmixp_coll_state2str(coll->state));
            xassert((coll->seq - 1) == seq);
            xfree(nodename);
            abort();
        }
        goto proceed;
    default:
        /* should not happen in normal workflow */
        PMIXP_ERROR("%p: unknown collective state %s",
                    coll, pmixp_coll_state2str(coll->state));
        abort();
    }

    /* Because of possible timeouts/delays in transmission we
     * can receive a contribution a second time. Avoid duplications
     * by checking our records. */
    if (coll->contrib_prnt) {
        char *nodename = pmixp_info_job_host(peerid);
        /* May be 0 or 1. If greater - transmission skew, ignore.
         * NOTE: this output is not on the critical path -
         * don't preprocess it out */
        PMIXP_DEBUG("%p: multiple contributions from parent %s:%d",
                    coll, nodename, peerid);
        xfree(nodename);
        /* this is duplication, skip. */
        goto proceed;
    }
    coll->contrib_prnt = true;

    data_src = get_buf_data(buf) + get_buf_offset(buf);
    size = remaining_buf(buf);
    pmixp_server_buf_reserve(coll->dfwd_buf, size);

    data_dst = get_buf_data(coll->dfwd_buf) +
        get_buf_offset(coll->dfwd_buf);
    memcpy(data_dst, data_src, size);
    set_buf_offset(coll->dfwd_buf,
                   get_buf_offset(coll->dfwd_buf) + size);
proceed:
    _progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
    if (nodename) {
        PMIXP_DEBUG("%p: finish: node=%s:%d(%d), state=%s",
                    coll, nodename, peerid, lpeerid,
                    pmixp_coll_state2str(coll->state));
        xfree(nodename);
    }
#endif
    /* unlock the structure */
    slurm_mutex_unlock(&coll->lock);

    return SLURM_SUCCESS;
}
int pmixp_coll_contrib_child(pmixp_coll_t *coll, uint32_t peerid,
                             uint32_t seq, Buf buf)
{
    char *data_src = NULL, *data_dst = NULL;
    uint32_t size;
    int chld_id;

    /* lock the structure */
    slurm_mutex_lock(&coll->lock);
    pmixp_coll_sanity_check(coll);
    if (0 > (chld_id = _chld_id(coll, peerid))) {
        char *nodename = pmixp_info_job_host(peerid);
        char *avail_ids = _chld_ids_str(coll);
        PMIXP_DEBUG("%p: contribution from the non-child node "
                    "%s:%d, acceptable ids: %s",
                    coll, nodename, peerid, avail_ids);
        xfree(nodename);
        xfree(avail_ids);
    }

#ifdef PMIXP_COLL_DEBUG
    char *nodename = pmixp_info_job_host(peerid);
    int lpeerid = hostlist_find(coll->peers_hl, nodename);
    PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d:%d):, state=%s, size=%u",
                coll, nodename, peerid, lpeerid, chld_id,
                pmixp_coll_state2str(coll->state),
                remaining_buf(buf));
#endif

    switch (coll->state) {
    case PMIXP_COLL_SYNC:
        /* change the state */
        coll->ts = time(NULL);
        /* fall-thru */
    case PMIXP_COLL_COLLECT:
        /* sanity check */
        if (coll->seq != seq) {
            char *nodename = pmixp_info_job_host(peerid);
            /* FATAL: should not happen in normal workflow */
            PMIXP_ERROR("%p: unexpected contrib from %s:%d "
                        "(child #%d) seq = %d, coll->seq = %d, "
                        "state=%s",
                        coll, nodename, peerid, chld_id,
                        seq, coll->seq,
                        pmixp_coll_state2str(coll->state));
            xassert(coll->seq == seq);
            abort();
        }
        break;
    case PMIXP_COLL_UPFWD:
    case PMIXP_COLL_UPFWD_WSC: {
        /* FATAL: should not happen in normal workflow */
        /* declare nodename locally so this branch also builds
         * when PMIXP_COLL_DEBUG is not defined */
        char *nodename = pmixp_info_job_host(peerid);
        PMIXP_ERROR("%p: unexpected contrib from %s:%d, state = %s",
                    coll, nodename, peerid,
                    pmixp_coll_state2str(coll->state));
        xfree(nodename);
        xassert(0);
        abort();
    }
    case PMIXP_COLL_UPFWD_WPC:
    case PMIXP_COLL_DOWNFWD:
#ifdef PMIXP_COLL_DEBUG
        /* It looks like a retransmission attempt when the remote side
         * identified a transmission failure, but we actually
         * successfully received the message */
        PMIXP_DEBUG("%p: contrib for the next collective "
                    "from=%s:%d(%d:%d) contrib_seq=%u, coll->seq=%u, "
                    "state=%s",
                    coll, nodename, peerid, lpeerid, chld_id,
                    seq, coll->seq, pmixp_coll_state2str(coll->state));
#endif
        if ((coll->seq + 1) != seq) {
            char *nodename = pmixp_info_job_host(peerid);
            /* should not happen in normal workflow */
            PMIXP_ERROR("%p: unexpected contrib from %s:%d(x:%d) "
                        "seq = %d, coll->seq = %d, "
                        "state=%s",
                        coll, nodename, peerid, chld_id,
                        seq, coll->seq,
                        pmixp_coll_state2str(coll->state));
            xfree(nodename);
            xassert((coll->seq + 1) == seq);
            abort();
        }
        break;
    default:
        /* should not happen in normal workflow */
        PMIXP_ERROR("%p: unknown collective state %s",
                    coll, pmixp_coll_state2str(coll->state));
        abort();
    }

    /* Because of possible timeouts/delays in transmission we
     * can receive a contribution a second time. Avoid duplications
     * by checking our records. */
    if (coll->contrib_chld[chld_id]) {
        char *nodename = pmixp_info_job_host(peerid);
        /* May be 0 or 1. If greater - transmission skew, ignore.
         * NOTE: this output is not on the critical path -
         * don't preprocess it out */
        PMIXP_DEBUG("%p: multiple contribs from %s:%d(x:%d)",
                    coll, nodename, peerid, chld_id);
        /* this is duplication, skip. */
        xfree(nodename);
        goto proceed;
    }

    data_src = get_buf_data(buf) + get_buf_offset(buf);
    size = remaining_buf(buf);
    pmixp_server_buf_reserve(coll->ufwd_buf, size);

    data_dst = get_buf_data(coll->ufwd_buf) +
        get_buf_offset(coll->ufwd_buf);
    memcpy(data_dst, data_src, size);
    set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

    /* mark this individual contribution */
    coll->contrib_chld[chld_id] = true;

    /* increase number of total contributions */
    coll->contrib_children++;

proceed:
    _progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
    PMIXP_DEBUG("%p: finish: node=%s:%d(%d:%d), state=%s",
                coll, nodename, peerid, lpeerid, chld_id,
                pmixp_coll_state2str(coll->state));
    xfree(nodename);
#endif
    /* unlock the structure */
    slurm_mutex_unlock(&coll->lock);

    return SLURM_SUCCESS;
}
extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn,
                                        slurmdb_job_cond_t *job_cond,
                                        void **curr_cluster)
{
    List local_cluster_list = NULL;
    time_t now = time(NULL);
    MYSQL_RES *result = NULL;
    MYSQL_ROW row;
    hostlist_t temp_hl = NULL;
    hostlist_iterator_t h_itr = NULL;
    char *query = NULL;
    int dims = 0;

    if (!job_cond || !job_cond->used_nodes)
        return NULL;

    if (!job_cond->cluster_list
        || list_count(job_cond->cluster_list) != 1) {
        error("If you are doing a query against nodes "
              "you must only have 1 cluster "
              "you are asking for.");
        return NULL;
    }

    /* get the dimensions of this cluster so we know how to deal
       with the hostlists */
    query = xstrdup_printf("select dimensions, flags from %s where "
                           "name='%s'",
                           cluster_table,
                           (char *)list_peek(job_cond->cluster_list));

    debug4("%d(%s:%d) query\n%s",
           mysql_conn->conn, THIS_FILE, __LINE__, query);
    if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
        xfree(query);
        return NULL;
    }
    xfree(query);

    if (!(row = mysql_fetch_row(result))) {
        error("Couldn't get the dimensions of cluster '%s'.",
              (char *)list_peek(job_cond->cluster_list));
        mysql_free_result(result);
        return NULL;
    }

    /* On a Cray System when dealing with hostlists as we are here
       this always needs to be 1. */
    if (slurm_atoul(row[1]) & CLUSTER_FLAG_CRAY_A)
        dims = 1;
    else
        dims = atoi(row[0]);

    mysql_free_result(result);

    temp_hl = hostlist_create_dims(job_cond->used_nodes, dims);
    if (hostlist_count(temp_hl) <= 0) {
        error("we didn't get any real hosts to look for.");
        goto no_hosts;
    }
    h_itr = hostlist_iterator_create(temp_hl);

    query = xstrdup_printf("select cluster_nodes, time_start, "
                           "time_end from \"%s_%s\" where node_name='' "
                           "&& cluster_nodes !=''",
                           (char *)list_peek(job_cond->cluster_list),
                           event_table);

    if (job_cond->usage_start) {
        if (!job_cond->usage_end)
            job_cond->usage_end = now;

        xstrfmtcat(query, " && ((time_start < %ld) "
                   "&& (time_end >= %ld || time_end = 0))",
                   job_cond->usage_end, job_cond->usage_start);
    }

    if (debug_flags & DEBUG_FLAG_DB_JOB)
        DB_DEBUG(mysql_conn->conn, "query\n%s", query);
    if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
        xfree(query);
        goto no_hosts;
    }
    xfree(query);

    local_cluster_list = list_create(_destroy_local_cluster);
    while ((row = mysql_fetch_row(result))) {
        char *host = NULL;
        int loc = 0;
        local_cluster_t *local_cluster =
            xmalloc(sizeof(local_cluster_t));

        local_cluster->hl = hostlist_create_dims(row[0], dims);
        local_cluster->start = slurm_atoul(row[1]);
        local_cluster->end   = slurm_atoul(row[2]);
        local_cluster->asked_bitmap =
            bit_alloc(hostlist_count(local_cluster->hl));
        while ((host = hostlist_next_dims(h_itr, dims))) {
            if ((loc = hostlist_find(
                     local_cluster->hl, host)) != -1)
                bit_set(local_cluster->asked_bitmap, loc);
            free(host);
        }
        hostlist_iterator_reset(h_itr);
        if (bit_ffs(local_cluster->asked_bitmap) != -1) {
            list_append(local_cluster_list, local_cluster);
            if (local_cluster->end == 0) {
                local_cluster->end = now;
                (*curr_cluster) = local_cluster;
            } else if (!(*curr_cluster)
                       || (((local_cluster_t *)(*curr_cluster))->end
                           < local_cluster->end)) {
                (*curr_cluster) = local_cluster;
            }
        } else
            _destroy_local_cluster(local_cluster);
    }
    mysql_free_result(result);

    if (!list_count(local_cluster_list)) {
        FREE_NULL_LIST(local_cluster_list);
        local_cluster_list = NULL;
        goto no_hosts;
    }

no_hosts:
    hostlist_iterator_destroy(h_itr);
    hostlist_destroy(temp_hl);

    return local_cluster_list;
}
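The asked_bitmap built above marks which of the requested hosts fall inside each cluster's node set, keyed by each host's position in that set. A minimal sketch of the idiom, with a plain bool array standing in for SLURM's bitstring (bit_alloc/bit_set); the include path and node names are assumptions.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "src/common/hostlist.h"  /* SLURM internal API; path is an assumption */

/* Sketch of the asked-bitmap idiom: for each requested host, set the flag
 * at its index within the cluster's node list; hosts outside the cluster
 * (node42 here) are simply skipped. */
int main(void)
{
    hostlist_t cluster = hostlist_create("node[1-8]");
    hostlist_t asked   = hostlist_create("node2,node5,node42");
    bool *bitmap = calloc(hostlist_count(cluster), sizeof(bool));

    hostlist_iterator_t itr = hostlist_iterator_create(asked);
    char *host;
    while ((host = hostlist_next(itr))) {
        int loc = hostlist_find(cluster, host);
        if (loc != -1)
            bitmap[loc] = true;
        free(host);
    }
    hostlist_iterator_destroy(itr);

    for (int i = 0; i < hostlist_count(cluster); i++)
        printf("%d", bitmap[i] ? 1 : 0);   /* expected: 01001000 */
    printf("\n");

    free(bitmap);
    hostlist_destroy(cluster);
    hostlist_destroy(asked);
    return 0;
}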
/* Return false if this node's data needs to be added to sinfo's table of
 * data to print. Return true if it is duplicate/redundant data. */
static bool _match_node_data(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr)
{
    uint32_t tmp = 0;

    if (params.node_flag)
        return false;

    if (params.match_flags.hostnames_flag &&
        (hostlist_find(sinfo_ptr->hostnames,
                       node_ptr->node_hostname) == -1))
        return false;

    if (params.match_flags.node_addr_flag &&
        (hostlist_find(sinfo_ptr->node_addr, node_ptr->node_addr) == -1))
        return false;

    if (sinfo_ptr->nodes &&
        params.match_flags.features_flag &&
        (xstrcmp(node_ptr->features, sinfo_ptr->features)))
        return false;

    if (sinfo_ptr->nodes &&
        params.match_flags.features_act_flag &&
        (xstrcmp(node_ptr->features_act, sinfo_ptr->features_act)))
        return false;

    if (sinfo_ptr->nodes &&
        params.match_flags.gres_flag &&
        (xstrcmp(node_ptr->gres, sinfo_ptr->gres)))
        return false;

    if (sinfo_ptr->nodes &&
        params.match_flags.reason_flag &&
        (xstrcmp(node_ptr->reason, sinfo_ptr->reason)))
        return false;

    if (sinfo_ptr->nodes &&
        params.match_flags.reason_timestamp_flag &&
        (node_ptr->reason_time != sinfo_ptr->reason_time))
        return false;

    if (sinfo_ptr->nodes &&
        params.match_flags.reason_user_flag &&
        node_ptr->reason_uid != sinfo_ptr->reason_uid) {
        return false;
    }

    if (params.match_flags.state_flag) {
        char *state1, *state2;
        state1 = node_state_string(node_ptr->node_state);
        state2 = node_state_string(sinfo_ptr->node_state);
        if (xstrcmp(state1, state2))
            return false;
    }

    select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
                                 SELECT_NODEDATA_MEM_ALLOC,
                                 NODE_STATE_ALLOCATED,
                                 &tmp);
    if (params.match_flags.alloc_mem_flag &&
        (tmp != sinfo_ptr->alloc_memory))
        return false;

    /* If no need to exactly match sizes, just return here
     * otherwise check cpus, disk, memory and weight individually */
    if (!params.exact_match)
        return true;

    if (params.match_flags.cpus_flag &&
        (node_ptr->cpus != sinfo_ptr->min_cpus))
        return false;

    if (params.match_flags.sockets_flag &&
        (node_ptr->sockets != sinfo_ptr->min_sockets))
        return false;
    if (params.match_flags.cores_flag &&
        (node_ptr->cores != sinfo_ptr->min_cores))
        return false;
    if (params.match_flags.threads_flag &&
        (node_ptr->threads != sinfo_ptr->min_threads))
        return false;
    if (params.match_flags.sct_flag &&
        ((node_ptr->sockets != sinfo_ptr->min_sockets) ||
         (node_ptr->cores != sinfo_ptr->min_cores) ||
         (node_ptr->threads != sinfo_ptr->min_threads)))
        return false;
    if (params.match_flags.disk_flag &&
        (node_ptr->tmp_disk != sinfo_ptr->min_disk))
        return false;
    if (params.match_flags.memory_flag &&
        (node_ptr->real_memory != sinfo_ptr->min_mem))
        return false;
    if (params.match_flags.weight_flag &&
        (node_ptr->weight != sinfo_ptr->min_weight))
        return false;
    if (params.match_flags.cpu_load_flag &&
        (node_ptr->cpu_load != sinfo_ptr->min_cpu_load))
        return false;
    if (params.match_flags.free_mem_flag &&
        (node_ptr->free_mem != sinfo_ptr->min_free_mem))
        return false;
    if (params.match_flags.version_flag &&
        (node_ptr->version != sinfo_ptr->version))
        return false;

    return true;
}
/*
 * _set_collectors calls the split_hostlist API on the all-nodes hostlist
 * to set the node to be used as a collector for unsolicited node aggregation.
 *
 * If this node is a forwarding node (first node in any hostlist),
 * then its collector and backup are the ControlMachine and its backup.
 *
 * Otherwise, we find the hostlist containing this node.
 * The forwarding node in that hostlist becomes a collector, the next node
 * which is not this node becomes the backup.
 * That list is split, and we iterate through it searching for a list in
 * which this node is a forwarding node. If found, we set the collector and
 * backup, else this process is repeated.
 */
static void _set_collectors(char *this_node_name)
{
    slurm_ctl_conf_t *conf;
    hostlist_t  nodes;
    hostlist_t *hll = NULL;
    char *parent = NULL, *backup = NULL;
    char addrbuf[32];
    int i, j, f = -1;
    int hl_count = 0;
    uint16_t parent_port;
    uint16_t backup_port;
    bool found = false;
    bool ctldparent = true;

#ifdef HAVE_FRONT_END
    return; /* on a FrontEnd system this would never be useful. */
#endif

    if (!run_in_daemon("slurmd"))
        return; /* Only compute nodes have collectors */

    /* Set the initial iteration, collector is controller,
     * full list is split */
    xassert(this_node_name);

    conf = slurm_conf_lock();
    nodes = _get_all_nodes();
    parent = strdup(conf->control_addr);
    if (conf->backup_addr) {
        backup = strdup(conf->backup_addr);
    }
    parent_port = conf->slurmctld_port;
    backup_port = parent_port;
    slurm_conf_unlock();

    while (!found) {
        if (route_g_split_hostlist(nodes, &hll, &hl_count)) {
            error("unable to split forward hostlist");
            goto clean; /* collector addrs remain null */
        }
        /* Find which hostlist contains this node */
        for (i = 0; i < hl_count; i++) {
            f = hostlist_find(hll[i], this_node_name);
            if (f != -1)
                break;
        }
        if (i == hl_count) {
            fatal("ROUTE -- %s not found in node_record_table",
                  this_node_name);
        }

        if (f == 0) {
            /* we are a forwarded-to node,
             * so our parent is "parent" */
            if (hostlist_count(hll[i]) > 1)
                this_is_collector = true;
            xfree(msg_collect_node);
            msg_collect_node = xmalloc(sizeof(slurm_addr_t));
            if (ctldparent)
                slurm_set_addr(msg_collect_node, parent_port,
                               parent);
            else {
                slurm_conf_get_addr(parent, msg_collect_node);
                msg_collect_node->sin_port = htons(parent_port);
            }
            if (debug_flags & DEBUG_FLAG_ROUTE) {
                slurm_print_slurm_addr(msg_collect_node,
                                       addrbuf, 32);
                info("ROUTE -- message collector address is %s",
                     addrbuf);
            }
            xfree(msg_collect_backup);
            if (backup) {
                msg_collect_backup =
                    xmalloc(sizeof(slurm_addr_t));
                if (ctldparent) {
                    slurm_set_addr(msg_collect_backup,
                                   backup_port, backup);
                } else {
                    slurm_conf_get_addr(backup,
                                        msg_collect_backup);
                    msg_collect_backup->sin_port =
                        htons(backup_port);
                }
                if (debug_flags & DEBUG_FLAG_ROUTE) {
                    slurm_print_slurm_addr(
                        msg_collect_backup,
                        addrbuf, 32);
                    info("ROUTE -- message collector backup"
                         " address is %s", addrbuf);
                }
            } else {
                if (debug_flags & DEBUG_FLAG_ROUTE) {
                    info("ROUTE -- no message collector "
                         "backup");
                }
            }
            found = true;
            goto clean;
        }

        /* We are not a forwarding node, the first node in this list
         * will split the forward_list.
         * We also know that the forwarding node is not a controller.
         *
         * clean up parent context */
        ctldparent = false;
        hostlist_destroy(nodes);
        if (parent)
            free(parent);
        if (backup)
            free(backup);
        nodes = hostlist_copy(hll[i]);
        for (j = 0; j < hl_count; j++) {
            hostlist_destroy(hll[j]);
        }
        xfree(hll);

        /* set our parent, backup, and continue search */
        parent = hostlist_shift(nodes);
        backup = hostlist_nth(nodes, 0);
        if (strcmp(backup, this_node_name) == 0) {
            free(backup);
            backup = NULL;
            if (hostlist_count(nodes) > 1)
                backup = hostlist_nth(nodes, 1);
        }
        parent_port = slurm_conf_get_port(parent);
        if (backup) {
            backup_port = slurm_conf_get_port(backup);
        } else
            backup_port = 0;
    }
clean:
    if (debug_flags & DEBUG_FLAG_ROUTE) {
        if (this_is_collector)
            info("ROUTE -- %s is a collector node",
                 this_node_name);
        else
            info("ROUTE -- %s is a leaf node", this_node_name);
    }
    hostlist_destroy(nodes);
    if (parent)
        free(parent);
    if (backup)
        free(backup);
    for (i = 0; i < hl_count; i++) {
        hostlist_destroy(hll[i]);
    }
    xfree(hll);
}
static int _resources_set(char ***env)
{
    char *p = NULL;

    /* Initialize all memory pointers that would be allocated to NULL
     * so in case of an error exit we will know what to xfree */
    _pmixp_job_info.job_hl = hostlist_create("");
    _pmixp_job_info.step_hl = hostlist_create("");
    _pmixp_job_info.hostname = NULL;

    /* Save step host list */
    p = getenvp(*env, PMIXP_STEP_NODES_ENV);
    if (!p) {
        PMIXP_ERROR_NO(ENOENT, "Environment variable %s not found",
                       PMIXP_STEP_NODES_ENV);
        goto err_exit;
    }
    hostlist_push(_pmixp_job_info.step_hl, p);

    /* Extract our node name */
    p = hostlist_nth(_pmixp_job_info.step_hl, _pmixp_job_info.node_id);
    _pmixp_job_info.hostname = xstrdup(p);
    free(p);

    /* Determine job-wide node id and job-wide node count */
    p = getenvp(*env, PMIXP_JOB_NODES_ENV);
    if (p == NULL) {
        p = getenvp(*env, PMIXP_JOB_NODES_ENV_DEP);
        if (p == NULL) {
            /* shouldn't happen if we are under SLURM! */
            PMIXP_ERROR_NO(ENOENT, "Neither of nodelist environment variables: %s OR %s was found!",
                           PMIXP_JOB_NODES_ENV,
                           PMIXP_JOB_NODES_ENV_DEP);
            goto err_exit;
        }
    }
    hostlist_push(_pmixp_job_info.job_hl, p);
    _pmixp_job_info.nnodes_job = hostlist_count(_pmixp_job_info.job_hl);
    _pmixp_job_info.node_id_job =
        hostlist_find(_pmixp_job_info.job_hl,
                      _pmixp_job_info.hostname);

    /* FIXME!! --------------------------------------------------------- */
    /* TODO: _get_task_count not always works well.
    if (_get_task_count(env, &_pmixp_job_info.ntasks_job,
                        &_pmixp_job_info.ncpus_job) < 0) {
        _pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
        _pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;
    }
    xassert(_pmixp_job_info.ntasks <= _pmixp_job_info.ntasks_job);
    */
    _pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
    _pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;

    /* Save task-to-node mapping */
    p = getenvp(*env, PMIXP_SLURM_MAPPING_ENV);
    if (p == NULL) {
        /* Direct modex won't work */
        PMIXP_ERROR_NO(ENOENT, "No %s environment variable found!",
                       PMIXP_SLURM_MAPPING_ENV);
        goto err_exit;
    }
    _pmixp_job_info.task_map_packed = xstrdup(p);

    return SLURM_SUCCESS;
err_exit:
    hostlist_destroy(_pmixp_job_info.job_hl);
    hostlist_destroy(_pmixp_job_info.step_hl);
    if (NULL != _pmixp_job_info.hostname) {
        xfree(_pmixp_job_info.hostname);
    }
    return SLURM_ERROR;
}
extern int slurm_hostlist_find(hostlist_t hl, const char *hostname)
{
    return hostlist_find(hl, hostname);
}
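This is the thin public wrapper over the internal hostlist_find(). A short usage sketch through the public API in slurm.h; the node names are hypothetical.

#include <stdio.h>
#include <slurm/slurm.h>

/* Sketch: expand a bracketed range, then look up members by name.
 * Link with -lslurm. */
int main(void)
{
    hostlist_t hl = slurm_hostlist_create("tux[0-15]");

    printf("tux7  -> %d\n", slurm_hostlist_find(hl, "tux7"));   /* 7 */
    printf("tux99 -> %d\n", slurm_hostlist_find(hl, "tux99"));  /* -1 */

    slurm_hostlist_destroy(hl);
    return 0;
}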
/*
 * Create an srun job structure for a step w/out an allocation response msg.
 * (i.e. inside an allocation)
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
    uint32_t job_id = resp->job_id;
    srun_job_t *job = NULL;
    allocation_info_t *ai = xmalloc(sizeof(*ai));
    hostlist_t hl = NULL;
    char *buf = NULL;
    int count = 0;
    uint32_t alloc_count = 0;

    ai->jobid = job_id;
    ai->stepid = NO_VAL;
    ai->nodelist = opt.alloc_nodelist;
    hl = hostlist_create(ai->nodelist);
    hostlist_uniq(hl);
    alloc_count = hostlist_count(hl);
    ai->nnodes = alloc_count;
    hostlist_destroy(hl);

    if (opt.exc_nodes) {
        hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
        hostlist_t inc_hl = NULL;
        char *node_name = NULL;

        hl = hostlist_create(ai->nodelist);
        if (opt.nodelist) {
            inc_hl = hostlist_create(opt.nodelist);
        }
        hostlist_uniq(hl);
        //info("using %s or %s", opt.nodelist, ai->nodelist);
        while ((node_name = hostlist_shift(exc_hl))) {
            int inx = hostlist_find(hl, node_name);
            if (inx >= 0) {
                debug("excluding node %s", node_name);
                hostlist_delete_nth(hl, inx);
                ai->nnodes--;	/* decrement node count */
            }
            if (inc_hl) {
                inx = hostlist_find(inc_hl, node_name);
                if (inx >= 0) {
                    error("Requested node %s is also "
                          "in the excluded list.",
                          node_name);
                    error("Job not submitted.");
                    hostlist_destroy(exc_hl);
                    hostlist_destroy(inc_hl);
                    goto error;
                }
            }
            free(node_name);
        }
        hostlist_destroy(exc_hl);

        /* we need to set this here so if there are more nodes
         * available than we requested we can set it
         * straight. If there is no exclude list then we set
         * the vars then. */
        if (!opt.nodes_set) {
            /* we don't want to set the number of nodes =
             * to the number of requested processes unless we
             * know it is less than the number of nodes
             * in the allocation */
            if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
                opt.min_nodes = opt.ntasks;
            else
                opt.min_nodes = ai->nnodes;
            opt.nodes_set = true;
        }
        if (!opt.max_nodes)
            opt.max_nodes = opt.min_nodes;
        if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
            ai->nnodes = opt.max_nodes;

        count = hostlist_count(hl);
        if (!count) {
            error("Hostlist is now nothing! Can't run job.");
            hostlist_destroy(hl);
            goto error;
        }
        if (inc_hl) {
            count = hostlist_count(inc_hl);
            if (count < ai->nnodes) {
                /* add more nodes to get the correct number
                 * for the allocation */
                hostlist_t tmp_hl = hostlist_copy(hl);
                int i = 0;
                int diff = ai->nnodes - count;
                buf = hostlist_ranged_string_xmalloc(inc_hl);
                hostlist_delete(tmp_hl, buf);
                xfree(buf);
                while ((node_name = hostlist_shift(tmp_hl)) &&
                       (i < diff)) {
                    hostlist_push(inc_hl, node_name);
                    i++;
                }
                hostlist_destroy(tmp_hl);
            }
            buf = hostlist_ranged_string_xmalloc(inc_hl);
            hostlist_destroy(inc_hl);
            xfree(opt.nodelist);
            opt.nodelist = buf;
        } else {
            if (count > ai->nnodes) {
                /* remove more nodes than needed for
                 * the allocation */
                int i = 0;
                for (i = count; i > ai->nnodes; i--)
                    hostlist_delete_nth(hl, i);
            }
            xfree(opt.nodelist);
            opt.nodelist = hostlist_ranged_string_xmalloc(hl);
        }

        hostlist_destroy(hl);
    } else {
        if (!opt.nodes_set) {
            /* we don't want to set the number of nodes =
             * to the number of requested processes unless we
             * know it is less than the number of nodes
             * in the allocation */
            if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
                opt.min_nodes = opt.ntasks;
            else
                opt.min_nodes = ai->nnodes;
            opt.nodes_set = true;
        }
        if (!opt.max_nodes)
            opt.max_nodes = opt.min_nodes;
        if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
            ai->nnodes = opt.max_nodes;
        /* Don't reset the ai->nodelist because that is the
         * nodelist we want to say the allocation is under
         * opt.nodelist is what is used for the allocation.
         */
        /* xfree(ai->nodelist); */
        /* ai->nodelist = xstrdup(buf); */
    }

    /* get the correct number of hosts to run tasks on */
    if (opt.nodelist) {
        hl = hostlist_create(opt.nodelist);
        if (opt.distribution != SLURM_DIST_ARBITRARY)
            hostlist_uniq(hl);
        if (!hostlist_count(hl)) {
            error("Hostlist is now nothing! Can not run job.");
            hostlist_destroy(hl);
            goto error;
        }

        buf = hostlist_ranged_string_xmalloc(hl);
        count = hostlist_count(hl);
        hostlist_destroy(hl);
        /* Don't reset the ai->nodelist because that is the
         * nodelist we want to say the allocation is under
         * opt.nodelist is what is used for the allocation.
         */
        /* xfree(ai->nodelist); */
        /* ai->nodelist = xstrdup(buf); */
        xfree(opt.nodelist);
        opt.nodelist = buf;
    }

    if (opt.distribution == SLURM_DIST_ARBITRARY) {
        if (count != opt.ntasks) {
            error("You asked for %d tasks but specified %d nodes",
                  opt.ntasks, count);
            goto error;
        }
    }

    if (ai->nnodes == 0) {
        error("No nodes in allocation, can't run job");
        goto error;
    }

    ai->num_cpu_groups = resp->num_cpu_groups;
    ai->cpus_per_node  = resp->cpus_per_node;
    ai->cpu_count_reps = resp->cpu_count_reps;

/*	info("looking for %d nodes out of %s with a must list of %s", */
/*	     ai->nnodes, ai->nodelist, opt.nodelist); */

    /*
     * Create job
     */
    job = _job_create_structure(ai);
error:
    xfree(ai);
    return (job);
}