static int _hostset_from_ranges(const pmix_proc_t *procs, size_t nprocs,
				hostlist_t *hl_out)
{
	int i;
	hostlist_t hl = hostlist_create("");
	pmixp_namespace_t *nsptr = NULL;

	for (i = 0; i < nprocs; i++) {
		char *node = NULL;
		hostlist_t tmp;

		nsptr = pmixp_nspaces_find(procs[i].nspace);
		if (NULL == nsptr) {
			goto err_exit;
		}
		if (procs[i].rank == PMIX_RANK_WILDCARD) {
			tmp = hostlist_copy(nsptr->hl);
		} else {
			tmp = pmixp_nspace_rankhosts(nsptr, &procs[i].rank, 1);
		}
		while (NULL != (node = hostlist_pop(tmp))) {
			hostlist_push(hl, node);
			free(node);
		}
		hostlist_destroy(tmp);
	}
	hostlist_uniq(hl);
	*hl_out = hl;
	return SLURM_SUCCESS;
err_exit:
	hostlist_destroy(hl);
	return SLURM_ERROR;
}
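/*
 * Illustrative sketch (not part of the original source): the dedup step
 * above shown in isolation, assuming the standard Slurm hostlist API
 * from src/common/hostlist.h. Pushing the same host for several ranks
 * and then calling hostlist_uniq() leaves one entry per node, which is
 * what gives _hostset_from_ranges() its set semantics.
 */
#include "src/common/hostlist.h"
#include "src/common/xmalloc.h"

static void _hostset_dedup_sketch(void)
{
	hostlist_t hl = hostlist_create("");
	char *ranged;

	/* two ranks on node1, one rank on node2 (hostnames made up) */
	hostlist_push(hl, "node1");
	hostlist_push(hl, "node2");
	hostlist_push(hl, "node1");

	hostlist_uniq(hl);	/* collapses duplicates, hostlist_count(hl) == 2 */

	ranged = hostlist_ranged_string_xmalloc(hl);	/* "node[1-2]" */
	xfree(ranged);
	hostlist_destroy(hl);
}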
int pmixp_coll_init(pmixp_coll_t *coll, pmixp_coll_type_t type,
		    const pmixp_proc_t *procs, size_t nprocs)
{
	int rc = SLURM_SUCCESS;
	hostlist_t hl;

	coll->seq = 0;
#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->pset.procs = xmalloc(sizeof(*procs) * nprocs);
	coll->pset.nprocs = nprocs;
	memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs);

	if (SLURM_SUCCESS != pmixp_hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		rc = SLURM_ERROR;
		goto exit;
	}
	coll->peers_cnt = hostlist_count(hl);
	coll->my_peerid = hostlist_find(hl, pmixp_info_hostname());
#ifdef PMIXP_COLL_DEBUG
	/* when debugging collectives, store a copy of the full
	 * hostlist to resolve a participant id to its hostname */
	coll->peers_hl = hostlist_copy(hl);
#endif
	switch (type) {
	case PMIXP_COLL_TYPE_FENCE_TREE:
		rc = pmixp_coll_tree_init(coll, &hl);
		break;
	case PMIXP_COLL_TYPE_FENCE_RING:
		rc = pmixp_coll_ring_init(coll, &hl);
		break;
	default:
		PMIXP_ERROR("Unknown coll type");
		rc = SLURM_ERROR;
	}
	hostlist_destroy(hl);
	if (rc) {
		goto exit;
	}
exit:
	return rc;
}
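/*
 * Illustrative sketch (not part of the original source): how a peer id
 * relates to the deduplicated hostlist built above. The id of this node
 * is simply its index in the list (hostlist_find()), and hostlist_nth()
 * maps an id back to a hostname. Assumes the Slurm hostlist API; the
 * hostnames are made up for the example.
 */
#include <stdlib.h>
#include "src/common/hostlist.h"

static void _peerid_sketch(void)
{
	hostlist_t hl = hostlist_create("node[1-4]");
	int my_peerid = hostlist_find(hl, "node3");	/* index 2 */
	char *peer0 = hostlist_nth(hl, 0);		/* "node1" */

	/* peers_cnt in pmixp_coll_init() is just hostlist_count(hl) == 4 */
	(void) my_peerid;
	free(peer0);	/* hostlist_nth() returns malloc'ed memory */
	hostlist_destroy(hl);
}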
/*
 * _set_collectors calls the split_hostlist API on the hostlist of all
 * nodes to pick the node to be used as a collector for unsolicited node
 * aggregation.
 *
 * If this node is a forwarding node (the first node in any hostlist),
 * then its collector and backup are the ControlMachine and its backup.
 *
 * Otherwise, we find the hostlist containing this node. The forwarding
 * node in that hostlist becomes the collector, and the next node that is
 * not this node becomes the backup. That list is then split, and we
 * iterate through the resulting lists searching for one in which this
 * node is the forwarding node. If found, we set the collector and backup;
 * otherwise the process is repeated.
 */
static void _set_collectors(char *this_node_name)
{
	slurm_ctl_conf_t *conf;
	hostlist_t nodes;
	hostlist_t *hll = NULL;
	char *parent = NULL, *backup = NULL;
	char addrbuf[32];
	int i, j, f = -1;
	int hl_count = 0;
	uint16_t parent_port;
	uint16_t backup_port;
	bool found = false;
	bool ctldparent = true;

#ifdef HAVE_FRONT_END
	return; /* on a FrontEnd system this would never be useful. */
#endif

	if (!run_in_daemon("slurmd"))
		return; /* Only compute nodes have collectors */

	/* Set the initial iteration: the collector is the controller,
	 * and the full list is split */
	xassert(this_node_name);

	conf = slurm_conf_lock();
	nodes = _get_all_nodes();
	parent = strdup(conf->control_addr);
	if (conf->backup_addr) {
		backup = strdup(conf->backup_addr);
	}
	parent_port = conf->slurmctld_port;
	backup_port = parent_port;
	slurm_conf_unlock();

	while (!found) {
		if (route_g_split_hostlist(nodes, &hll, &hl_count)) {
			error("unable to split forward hostlist");
			goto clean; /* collector addrs remain NULL */
		}
		/* Find which hostlist contains this node */
		for (i = 0; i < hl_count; i++) {
			f = hostlist_find(hll[i], this_node_name);
			if (f != -1)
				break;
		}
		if (i == hl_count) {
			fatal("ROUTE -- %s not found in node_record_table",
			      this_node_name);
		}

		if (f == 0) {
			/* we are a forwarded-to node,
			 * so our parent is "parent" */
			if (hostlist_count(hll[i]) > 1)
				this_is_collector = true;
			xfree(msg_collect_node);
			msg_collect_node = xmalloc(sizeof(slurm_addr_t));
			if (ctldparent)
				slurm_set_addr(msg_collect_node, parent_port,
					       parent);
			else {
				slurm_conf_get_addr(parent, msg_collect_node);
				msg_collect_node->sin_port =
					htons(parent_port);
			}
			if (debug_flags & DEBUG_FLAG_ROUTE) {
				slurm_print_slurm_addr(msg_collect_node,
						       addrbuf, 32);
				info("ROUTE -- message collector address is %s",
				     addrbuf);
			}
			xfree(msg_collect_backup);
			if (backup) {
				msg_collect_backup =
					xmalloc(sizeof(slurm_addr_t));
				if (ctldparent) {
					slurm_set_addr(msg_collect_backup,
						       backup_port, backup);
				} else {
					slurm_conf_get_addr(backup,
							    msg_collect_backup);
					msg_collect_backup->sin_port =
						htons(backup_port);
				}
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					slurm_print_slurm_addr(
						msg_collect_backup,
						addrbuf, 32);
					info("ROUTE -- message collector backup"
					     " address is %s", addrbuf);
				}
			} else {
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					info("ROUTE -- no message collector "
					     "backup");
				}
			}
			found = true;
			goto clean;
		}

		/* We are not a forwarding node; the first node in this list
		 * will split the forward_list.
		 * We also know that the forwarding node is not a controller.
		 *
		 * clean up parent context */
		ctldparent = false;
		hostlist_destroy(nodes);
		if (parent)
			free(parent);
		if (backup)
			free(backup);
		nodes = hostlist_copy(hll[i]);
		for (j = 0; j < hl_count; j++) {
			hostlist_destroy(hll[j]);
		}
		xfree(hll);

		/* set our parent and backup, then continue the search */
		parent = hostlist_shift(nodes);
		backup = hostlist_nth(nodes, 0);
		if (strcmp(backup, this_node_name) == 0) {
			free(backup);
			backup = NULL;
			if (hostlist_count(nodes) > 1)
				backup = hostlist_nth(nodes, 1);
		}
		parent_port = slurm_conf_get_port(parent);
		if (backup) {
			backup_port = slurm_conf_get_port(backup);
		} else
			backup_port = 0;
	}
clean:
	if (debug_flags & DEBUG_FLAG_ROUTE) {
		if (this_is_collector)
			info("ROUTE -- %s is a collector node",
			     this_node_name);
		else
			info("ROUTE -- %s is a leaf node", this_node_name);
	}
	hostlist_destroy(nodes);
	if (parent)
		free(parent);
	if (backup)
		free(backup);
	for (i = 0; i < hl_count; i++) {
		hostlist_destroy(hll[i]);
	}
	xfree(hll);
}
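/*
 * Illustrative sketch (not part of the original source): the
 * parent/backup selection performed at the bottom of the loop above,
 * shown on a standalone hostlist. The first host of the sub-list that
 * contains us becomes the parent (collector), and the backup is the
 * next host that is not this node. Assumes the Slurm hostlist API;
 * hostnames are made up.
 */
#include <stdlib.h>
#include <string.h>
#include "src/common/hostlist.h"

static void _collector_pick_sketch(const char *this_node_name)
{
	hostlist_t nodes = hostlist_create("node[1-8]");
	char *parent, *backup;

	parent = hostlist_shift(nodes);	  /* "node1" is the collector */
	backup = hostlist_nth(nodes, 0);  /* normally "node2" */
	if (!strcmp(backup, this_node_name)) {
		/* never pick ourselves as our own backup */
		free(backup);
		backup = NULL;
		if (hostlist_count(nodes) > 1)
			backup = hostlist_nth(nodes, 1);
	}

	free(parent);
	free(backup);	/* free(NULL) is a no-op */
	hostlist_destroy(nodes);
}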
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in the PMI2 plugin
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
		    size_t nprocs, pmixp_coll_type_t type)
{
	hostlist_t hl;
	int max_depth, width, depth, i;
	char *p;

#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->state = PMIXP_COLL_SYNC;
	coll->pset.procs = xmalloc(sizeof(*procs) * nprocs);
	coll->pset.nprocs = nprocs;
	memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs);

	if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		goto err_exit;
	}
#ifdef PMIXP_COLL_DEBUG
	/* when debugging collectives, store a copy of the full
	 * hostlist to resolve a participant id to its hostname */
	coll->peers_hl = hostlist_copy(hl);
#endif
	width = slurm_get_tree_width();
	coll->peers_cnt = hostlist_count(hl);
	coll->my_peerid = hostlist_find(hl, pmixp_info_hostname());
	reverse_tree_info(coll->my_peerid, coll->peers_cnt, width,
			  &coll->prnt_peerid, &coll->chldrn_cnt, &depth,
			  &max_depth);

	/* We are only interested in the number of direct children */
	coll->seq = 0;
	coll->contrib_children = 0;
	coll->contrib_local = false;
	coll->chldrn_ids = xmalloc(sizeof(int) * width);
	coll->contrib_chld = xmalloc(sizeof(int) * width);
	coll->chldrn_cnt = reverse_tree_direct_children(coll->my_peerid,
							coll->peers_cnt,
							width, depth,
							coll->chldrn_ids);
	if (coll->prnt_peerid == -1) {
		/* if we are the root of the tree:
		 * - we don't have a parent;
		 * - we keep the full list of children (excluding
		 *   ourselves)
		 */
		coll->prnt_host = NULL;
		coll->all_chldrn_hl = hostlist_copy(hl);
		hostlist_delete_host(coll->all_chldrn_hl,
				     pmixp_info_hostname());
		coll->chldrn_str =
			hostlist_ranged_string_xmalloc(coll->all_chldrn_hl);
	} else {
		/* for all other nodes in the tree we need to know:
		 * - the nodename of our parent;
		 * - we don't need the all-children list or hl anymore
		 */

		/*
		 * setup parent id's
		 */
		p = hostlist_nth(hl, coll->prnt_peerid);
		coll->prnt_host = xstrdup(p);
		free(p);
		/* reset prnt_peerid to the global peer */
		coll->prnt_peerid = pmixp_info_job_hostid(coll->prnt_host);

		/*
		 * setup root id's
		 * (we need this for the SLURM API communication case)
		 */
		p = hostlist_nth(hl, 0);
		coll->root_host = xstrdup(p);
		free(p);
		/* reset root_peerid to the global peer */
		coll->root_peerid = pmixp_info_job_hostid(coll->root_host);

		/* use an empty hostlist here */
		coll->all_chldrn_hl = hostlist_create("");
		coll->chldrn_str = NULL;
	}

	/* fixup children peer ids to the global ones */
	for (i = 0; i < coll->chldrn_cnt; i++) {
		p = hostlist_nth(hl, coll->chldrn_ids[i]);
		coll->chldrn_ids[i] = pmixp_info_job_hostid(p);
		free(p);
	}
	hostlist_destroy(hl);

	/* Collective state */
	coll->ufwd_buf = pmixp_server_buf_new();
	coll->dfwd_buf = pmixp_server_buf_new();
	_reset_coll_ufwd(coll);
	_reset_coll_dfwd(coll);
	coll->cbdata = NULL;
	coll->cbfunc = NULL;

	/* init fine-grained lock */
	slurm_mutex_init(&coll->lock);

	return SLURM_SUCCESS;
err_exit:
	return SLURM_ERROR;
}
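/*
 * Illustrative sketch (not part of the original source): the tree
 * topology math used above, assuming the reverse_tree_info() and
 * reverse_tree_direct_children() helpers that pmixp_coll_init() already
 * calls (argument order taken from that call site; the header name
 * below is assumed). The parent id comes back as -1 only for the root,
 * which is exactly the branch the code above takes.
 */
#include "src/common/reverse_tree_math.h"	/* header name assumed */

static void _tree_topology_sketch(void)
{
	int peers_cnt = 7, width = 2;	/* 7-node collective, binary fanout */
	int my_peerid = 1;
	int prnt_peerid, chldrn_cnt, depth, max_depth;
	int chldrn_ids[2];		/* at most `width` direct children */

	reverse_tree_info(my_peerid, peers_cnt, width,
			  &prnt_peerid, &chldrn_cnt, &depth, &max_depth);
	/* prnt_peerid == -1 would mean this peer is the root */

	chldrn_cnt = reverse_tree_direct_children(my_peerid, peers_cnt,
						  width, depth, chldrn_ids);
	/* chldrn_ids[] now holds the tree-local ids of our children;
	 * the code above then remaps them to job-global host ids */
	(void) chldrn_cnt;
	(void) max_depth;
}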
void pmixp_coll_ring_log(pmixp_coll_t *coll)
{
	int i;
	pmixp_coll_ring_t *ring = &coll->state.ring;
	char *nodename, *next, *prev;
	char *out_str = NULL;

	PMIXP_ERROR("%p: %s state seq=%d", coll,
		    pmixp_coll_type2str(coll->type), coll->seq);
	nodename = pmixp_info_job_host(coll->my_peerid);
	PMIXP_ERROR("my peerid: %d:%s", coll->my_peerid, nodename);
	xfree(nodename);
	next = pmixp_info_job_host(_ring_next_id(coll));
	prev = pmixp_info_job_host(_ring_prev_id(coll));
	xstrfmtcat(out_str, "neighbor id: next %d:%s, prev %d:%s",
		   _ring_next_id(coll), next, _ring_prev_id(coll), prev);
	PMIXP_ERROR("%s", out_str);
	xfree(next);
	xfree(prev);
	xfree(out_str);

	for (i = 0; i < PMIXP_COLL_RING_CTX_NUM; i++) {
		pmixp_coll_ring_ctx_t *coll_ctx = &ring->ctx_array[i];

		PMIXP_ERROR("Context ptr=%p, #%d, in-use=%d",
			    coll_ctx, i, coll_ctx->in_use);

		if (coll_ctx->in_use) {
			int id;
			char *done_contrib, *wait_contrib;
			hostlist_t hl_done_contrib, hl_wait_contrib;

			pmixp_hostset_from_ranges(coll->pset.procs,
						  coll->pset.nprocs,
						  &hl_done_contrib);
			hl_wait_contrib = hostlist_copy(hl_done_contrib);

			PMIXP_ERROR("\t seq=%d contribs: loc=%d/prev=%d/fwd=%d",
				    coll_ctx->seq, coll_ctx->contrib_local,
				    coll_ctx->contrib_prev,
				    coll_ctx->forward_cnt);
			PMIXP_ERROR("\t neighbor contribs [%d]:",
				    coll->peers_cnt);

			for (id = 0; id < coll->peers_cnt; id++) {
				char *nodename = pmixp_info_job_host(id);

				if (coll_ctx->contrib_map[id]) {
					hostlist_delete_host(hl_wait_contrib,
							     nodename);
				} else {
					hostlist_delete_host(hl_done_contrib,
							     nodename);
				}
				xfree(nodename);
			}
			done_contrib = slurm_hostlist_ranged_string_xmalloc(
				hl_done_contrib);
			wait_contrib = slurm_hostlist_ranged_string_xmalloc(
				hl_wait_contrib);
			PMIXP_ERROR("\t done contrib: %s",
				    strlen(done_contrib) ? done_contrib : "-");
			PMIXP_ERROR("\t wait contrib: %s",
				    strlen(wait_contrib) ? wait_contrib : "-");
			PMIXP_ERROR("\t status=%s",
				    pmixp_coll_ring_state2str(coll_ctx->state));
			PMIXP_ERROR("\t buf size=%u, remain=%u",
				    size_buf(coll_ctx->ring_buf),
				    remaining_buf(coll_ctx->ring_buf));
			xfree(done_contrib);
			xfree(wait_contrib);
			hostlist_destroy(hl_done_contrib);
			hostlist_destroy(hl_wait_contrib);
		}
	}
}
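/*
 * Illustrative sketch (not part of the original source): the
 * "partition by deletion" trick used above to build the done/wait
 * reports. Both lists start as the full participant set; each peer is
 * deleted from the list it does not belong to, so the two lists end up
 * complementary. Assumes the Slurm hostlist API; the contribution map
 * and hostnames are made up.
 */
#include <stdbool.h>
#include <stdio.h>
#include "src/common/hostlist.h"

static void _contrib_report_sketch(void)
{
	bool contrib_map[4] = { true, false, true, false };
	hostlist_t hl_done = hostlist_create("node[1-4]");
	hostlist_t hl_wait = hostlist_copy(hl_done);
	char name[16];
	int id;

	for (id = 0; id < 4; id++) {
		snprintf(name, sizeof(name), "node%d", id + 1);
		if (contrib_map[id])
			hostlist_delete_host(hl_wait, name); /* already done */
		else
			hostlist_delete_host(hl_done, name); /* still waiting */
	}
	/* hl_done == "node[1,3]", hl_wait == "node[2,4]" */
	hostlist_destroy(hl_done);
	hostlist_destroy(hl_wait);
}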
/*
 * Create an srun job structure for a step without an allocation response
 * msg (i.e. inside an allocation).
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
	uint32_t job_id = resp->job_id;
	srun_job_t *job = NULL;
	allocation_info_t *ai = xmalloc(sizeof(*ai));
	hostlist_t hl = NULL;
	char *buf = NULL;
	int count = 0;
	uint32_t alloc_count = 0;

	ai->jobid = job_id;
	ai->stepid = NO_VAL;
	ai->nodelist = opt.alloc_nodelist;
	hl = hostlist_create(ai->nodelist);
	hostlist_uniq(hl);
	alloc_count = hostlist_count(hl);
	ai->nnodes = alloc_count;
	hostlist_destroy(hl);

	if (opt.exc_nodes) {
		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
		hostlist_t inc_hl = NULL;
		char *node_name = NULL;

		hl = hostlist_create(ai->nodelist);
		if (opt.nodelist) {
			inc_hl = hostlist_create(opt.nodelist);
		}
		hostlist_uniq(hl);
		//info("using %s or %s", opt.nodelist, ai->nodelist);
		while ((node_name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, node_name);
			if (inx >= 0) {
				debug("excluding node %s", node_name);
				hostlist_delete_nth(hl, inx);
				ai->nnodes--;	/* decrement node count */
			}
			if (inc_hl) {
				inx = hostlist_find(inc_hl, node_name);
				if (inx >= 0) {
					error("Requested node %s is also "
					      "in the excluded list.",
					      node_name);
					error("Job not submitted.");
					hostlist_destroy(exc_hl);
					hostlist_destroy(inc_hl);
					goto error;
				}
			}
			free(node_name);
		}
		hostlist_destroy(exc_hl);

		/* we need to set this here so if there are more nodes
		 * available than we requested we can set it
		 * straight. If there is no exclude list then we set
		 * the vars then. */
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation */
			if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if (!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;

		count = hostlist_count(hl);
		if (!count) {
			error("Hostlist is now nothing! Can't run job.");
			hostlist_destroy(hl);
			goto error;
		}
		if (inc_hl) {
			count = hostlist_count(inc_hl);
			if (count < ai->nnodes) {
				/* add more nodes to get the correct number
				 * for the allocation */
				hostlist_t tmp_hl = hostlist_copy(hl);
				int i = 0;
				int diff = ai->nnodes - count;
				buf = hostlist_ranged_string_xmalloc(inc_hl);
				hostlist_delete(tmp_hl, buf);
				xfree(buf);
				while ((node_name = hostlist_shift(tmp_hl)) &&
				       (i < diff)) {
					hostlist_push(inc_hl, node_name);
					i++;
				}
				hostlist_destroy(tmp_hl);
			}
			buf = hostlist_ranged_string_xmalloc(inc_hl);
			hostlist_destroy(inc_hl);
			xfree(opt.nodelist);
			opt.nodelist = buf;
		} else {
			if (count > ai->nnodes) {
				/* remove more nodes than needed for
				 * the allocation */
				int i = 0;
				for (i = count; i > ai->nnodes; i--)
					hostlist_delete_nth(hl, i);
			}
			xfree(opt.nodelist);
			opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		}

		hostlist_destroy(hl);
	} else {
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation */
			if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if (!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;
		/* Don't reset ai->nodelist because that is the nodelist
		 * we want to say the allocation is under;
		 * opt.nodelist is what is used for the allocation. */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
	}

	/* get the correct number of hosts to run tasks on */
	if (opt.nodelist) {
		hl = hostlist_create(opt.nodelist);
		if (opt.distribution != SLURM_DIST_ARBITRARY)
			hostlist_uniq(hl);
		if (!hostlist_count(hl)) {
			error("Hostlist is now nothing! Can not run job.");
			hostlist_destroy(hl);
			goto error;
		}
		buf = hostlist_ranged_string_xmalloc(hl);
		count = hostlist_count(hl);
		hostlist_destroy(hl);
		/* Don't reset ai->nodelist because that is the nodelist
		 * we want to say the allocation is under;
		 * opt.nodelist is what is used for the allocation. */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
		xfree(opt.nodelist);
		opt.nodelist = buf;
	}

	if (opt.distribution == SLURM_DIST_ARBITRARY) {
		if (count != opt.ntasks) {
			error("You asked for %d tasks but specified %d nodes",
			      opt.ntasks, count);
			goto error;
		}
	}

	if (ai->nnodes == 0) {
		error("No nodes in allocation, can't run job");
		goto error;
	}

	ai->num_cpu_groups = resp->num_cpu_groups;
	ai->cpus_per_node = resp->cpus_per_node;
	ai->cpu_count_reps = resp->cpu_count_reps;

/*	info("looking for %d nodes out of %s with a must list of %s", */
/*	     ai->nnodes, ai->nodelist, opt.nodelist); */

	/*
	 * Create job
	 */
	job = _job_create_structure(ai);
error:
	xfree(ai);
	return (job);
}
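/*
 * Illustrative sketch (not part of the original source): the exclude
 * handling above reduced to its core, assuming the Slurm hostlist API.
 * Each excluded host is looked up in the working list and removed by
 * index, and the node count is decremented in step, just as
 * job_step_create_allocation() does with ai->nnodes.
 */
#include <stdlib.h>
#include "src/common/hostlist.h"

static int _apply_exclusions_sketch(hostlist_t hl, const char *exc_nodes)
{
	hostlist_t exc_hl = hostlist_create(exc_nodes);
	char *node_name;
	int nnodes = hostlist_count(hl);

	while ((node_name = hostlist_shift(exc_hl))) {
		int inx = hostlist_find(hl, node_name);
		if (inx >= 0) {
			hostlist_delete_nth(hl, inx);
			nnodes--;	/* keep the count in sync */
		}
		free(node_name);
	}
	hostlist_destroy(exc_hl);
	return nnodes;	/* nodes left after exclusion */
}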
/*
 * The pack_node_list may not be ordered across multiple components, which can
 * cause problems for some MPI implementations. Put the pack_node_list records
 * in alphabetic order and reorder pack_task_cnts and pack_tids to match.
 */
static void _reorder_pack_recs(char **in_node_list, uint16_t **in_task_cnts,
			       uint32_t ***in_tids, int total_nnodes)
{
	hostlist_t in_hl, out_hl;
	uint16_t *out_task_cnts = NULL;
	uint32_t **out_tids = NULL;
	char *hostname;
	int i, j;

	in_hl = hostlist_create(*in_node_list);
	if (!in_hl) {
		error("%s: Invalid hostlist(%s)", __func__, *in_node_list);
		return;
	}
	out_hl = hostlist_copy(in_hl);
	hostlist_sort(out_hl);
	hostlist_uniq(out_hl);
	i = hostlist_count(out_hl);
	if (i != total_nnodes) {
		error("%s: Invalid hostlist(%s) count(%d)", __func__,
		      *in_node_list, total_nnodes);
		goto fini;
	}

	out_task_cnts = xmalloc(sizeof(uint16_t) * total_nnodes);
	out_tids = xmalloc(sizeof(uint32_t *) * total_nnodes);
	for (i = 0; i < total_nnodes; i++) {
		hostname = hostlist_nth(out_hl, i);
		if (!hostname) {
			error("%s: Invalid hostlist(%s) count(%d)", __func__,
			      *in_node_list, total_nnodes);
			break;
		}
		j = hostlist_find(in_hl, hostname);
		if (j == -1) {
			error("%s: Invalid hostlist(%s) parsing", __func__,
			      *in_node_list);
			free(hostname);
			break;
		}
		out_task_cnts[i] = in_task_cnts[0][j];
		out_tids[i] = in_tids[0][j];
		free(hostname);
	}

	if (i >= total_nnodes) {	/* Success */
		xfree(*in_node_list);
		*in_node_list = hostlist_ranged_string_xmalloc(out_hl);
		xfree(*in_task_cnts);
		*in_task_cnts = out_task_cnts;
		out_task_cnts = NULL;
		xfree(*in_tids);
		*in_tids = out_tids;
		out_tids = NULL;
	}

#if 0
	info("NODE_LIST[%d]:%s", total_nnodes, *in_node_list);
	for (i = 0; i < total_nnodes; i++) {
		info("TASK_CNT[%d]:%u", i, in_task_cnts[0][i]);
		for (j = 0; j < in_task_cnts[0][i]; j++) {
			info("TIDS[%d][%d]: %u", i, j, in_tids[0][i][j]);
		}
	}
#endif

fini:	hostlist_destroy(in_hl);
	hostlist_destroy(out_hl);
	xfree(out_task_cnts);
	xfree(out_tids);
}
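/*
 * Illustrative sketch (not part of the original source): the reordering
 * idea above applied to a single parallel array. A sorted copy of the
 * hostlist defines the new order; for each position the old index is
 * recovered with hostlist_find() against the original list, and the
 * per-node data is copied across. Assumes the Slurm hostlist API; names
 * and counts are made up.
 */
#include <stdint.h>
#include <stdlib.h>
#include "src/common/hostlist.h"

static void _reorder_sketch(void)
{
	uint16_t in_cnts[3]  = { 4, 2, 8 };	/* matches "node3,node1,node2" */
	uint16_t out_cnts[3] = { 0, 0, 0 };
	hostlist_t in_hl = hostlist_create("node3,node1,node2");
	hostlist_t out_hl = hostlist_copy(in_hl);
	int i;

	hostlist_sort(out_hl);	/* new order: node1,node2,node3 */
	for (i = 0; i < 3; i++) {
		char *hostname = hostlist_nth(out_hl, i);
		int j = hostlist_find(in_hl, hostname);	/* old index */
		out_cnts[i] = in_cnts[j];
		free(hostname);
	}
	/* out_cnts == { 2, 8, 4 }, i.e. counts follow their hosts */
	hostlist_destroy(in_hl);
	hostlist_destroy(out_hl);
}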