/*
 * route_g_reconfigure - reset during reconfigure
 *
 * RET: SLURM_SUCCESS - int
 */
extern int route_g_reconfigure(void)
{
	if (route_init(NULL) != SLURM_SUCCESS)
		return SLURM_ERROR;
	debug_flags = slurm_get_debug_flags();
	tree_width = slurm_get_tree_width();

	return (*(ops.reconfigure))();
}
/* ************************************************************************** */
extern int route_init(char *node_name)
{
	int retval = SLURM_SUCCESS;
	char *plugin_type = "route";
	char *type = NULL;

	if (init_run && g_context)
		return retval;

	slurm_mutex_lock(&g_context_lock);

	if (g_context)
		goto done;

	type = slurm_get_route_plugin();
	g_context = plugin_context_create(plugin_type, type, (void **)&ops,
					  syms, sizeof(syms));

	if (!g_context) {
		error("cannot create %s context for %s", plugin_type, type);
		retval = SLURM_ERROR;
		goto done;
	}

	tree_width = slurm_get_tree_width();
	debug_flags = slurm_get_debug_flags();
	init_run = true;

	_set_collectors(node_name);

done:
	slurm_mutex_unlock(&g_context_lock);
	xfree(type);
	return retval;
}
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt + 1) /
				fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout * steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout * steps);
			/* info("now + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0 &&
				  list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt + 1) !=
			   list_count(ret_list)) {
			/* this should never be called since the above
			 * should catch the failed forwards and pipe
			 * them back down, but this is here so we
			 * never have to worry about a locked mutex */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr =
				hostlist_iterator_create(hl);
			error("We shouldn't be here. We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt + 1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						     ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list, tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list, name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close(fd) < 0)
		error("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
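The timeout scaling inside _forward_thread() is easier to see in isolation. The stand-alone sketch below reproduces only that arithmetic under the same logic as the code above; the helper name effective_timeout() and the sample numbers are illustrative, not part of the Slurm API.

/*
 * Minimal sketch of the timeout arithmetic used in _forward_thread():
 * each additional level of the forwarding tree adds one message
 * timeout, plus the caller-supplied start timeout per step.
 * Hypothetical helper, not Slurm code.
 */
#include <stdio.h>

static int effective_timeout(int fwd_cnt, int tree_width,
			     int message_timeout, int start_timeout)
{
	/* number of extra hops below this node, as computed above */
	int steps = (fwd_cnt + 1) / tree_width;
	int timeout = message_timeout * steps;

	steps++;			/* include our own hop */
	timeout += start_timeout * steps;
	return timeout;
}

int main(void)
{
	/* e.g. 100 forwards, width 50, 10 s per message, 10 s base */
	printf("%d ms\n", effective_timeout(100, 50, 10000, 10000));
	return 0;
}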
static int _setup_stepd_tree_info(const stepd_step_rec_t *job, char ***env)
{
	hostlist_t hl;
	char srun_host[64];
	uint16_t port;
	char *p;
	int tree_width;

	/* job info available */

	memset(&tree_info, 0, sizeof(tree_info));

	hl = hostlist_create(job_info.step_nodelist);
	p = hostlist_nth(hl, job_info.nodeid); /* strdup-ed */
	tree_info.this_node = xstrdup(p);
	free(p);

	/* this only controls the upward communication tree width */
	p = getenvp(*env, PMI2_TREE_WIDTH_ENV);
	if (p) {
		tree_width = atoi(p);
		if (tree_width < 2) {
			info("invalid PMI2 tree width value (%d) detected. "
			     "fallback to default value.", tree_width);
			tree_width = slurm_get_tree_width();
		}
	} else {
		tree_width = slurm_get_tree_width();
	}

	/* TODO: cannot launch 0 tasks on node */

	/*
	 * In tree position calculation, root of the tree is srun with id 0.
	 * Stepd's id will be its nodeid plus 1.
	 */
	reverse_tree_info(job_info.nodeid + 1, job_info.nnodes + 1,
			  tree_width, &tree_info.parent_id,
			  &tree_info.num_children, &tree_info.depth,
			  &tree_info.max_depth);
	tree_info.parent_id--;		/* restore real nodeid */
	if (tree_info.parent_id < 0) {	/* parent is srun */
		tree_info.parent_node = NULL;
	} else {
		p = hostlist_nth(hl, tree_info.parent_id);
		tree_info.parent_node = xstrdup(p);
		free(p);
	}
	hostlist_destroy(hl);

	tree_info.pmi_port = 0;		/* not used */

	p = getenvp(*env, "SLURM_SRUN_COMM_HOST");
	if (!p) {
		error("mpi/pmi2: unable to find srun comm ifhn in env");
		return SLURM_ERROR;
	} else {
		strncpy(srun_host, p, 64);
	}
	p = getenvp(*env, PMI2_SRUN_PORT_ENV);
	if (!p) {
		error("mpi/pmi2: unable to find srun pmi2 port in env");
		return SLURM_ERROR;
	} else {
		port = atoi(p);
		unsetenvp(*env, PMI2_SRUN_PORT_ENV);
	}
	tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
	slurm_set_addr(tree_info.srun_addr, port, srun_host);

	/* init kvs seq to 0. TODO: reduce array size */
	tree_info.children_kvs_seq = xmalloc(sizeof(uint32_t) *
					     job_info.nnodes);

	return SLURM_SUCCESS;
}
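The +1/-1 index shift around reverse_tree_info() above is the subtle part: srun occupies tree rank 0, a stepd with nodeid i sits at rank i + 1, and a computed parent rank of 0 therefore maps back to nodeid -1, meaning "the parent is srun". A small illustrative sketch of just that mapping follows; the helper names are hypothetical, not part of the plugin.

/*
 * Illustration only (not Slurm code): the nodeid <-> tree-rank mapping
 * used in _setup_stepd_tree_info(). Rank 0 is srun; stepd nodeid i is
 * at rank i + 1, so rank r maps back to nodeid r - 1 (-1 means srun).
 */
#include <stdio.h>

static int tree_rank(int nodeid)    { return nodeid + 1; }
static int rank_to_nodeid(int rank) { return rank - 1; }	/* -1 => srun */

int main(void)
{
	int parent_rank = 0;	/* example: reverse_tree_info() returned rank 0 */
	int parent_id = rank_to_nodeid(parent_rank);

	if (parent_id < 0)
		printf("parent is srun\n");
	else
		printf("parent is stepd nodeid %d (rank %d)\n",
		       parent_id, tree_rank(parent_id));
	return 0;
}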
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time (NULL);

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time  = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if ((slurmctld_conf.slurmd_timeout == 0) &&
		    (!restart_flag)			 &&
		    (!IS_NODE_UNKNOWN(node_ptr))	 &&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
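The comment inside ping_nodes() estimates how long a full ping sweep can take when many nodes are down, and that estimate is why SlurmdTimeout is effectively extended. A minimal worked version of the same formula, under the assumptions stated in the comment (10-second agent timeout, 10-way agent parallelism), is sketched below; the helper name is illustrative only, not a Slurm function.

/*
 * Worked form of the estimate in the ping_nodes() comment above:
 *   ping_time = down_nodes * agent_timeout / agent_parallelism
 * With a 10 s agent timeout and 10-way parallelism this reduces to
 * roughly one second per down node. Illustrative helper, not Slurm API.
 */
#include <stdio.h>

static int ping_time_estimate(int down_nodes, int agent_timeout_sec,
			      int agent_parallelism)
{
	return down_nodes * agent_timeout_sec / agent_parallelism;
}

int main(void)
{
	/* 300 down nodes, 10 s timeout, 10 concurrent agent threads */
	printf("~%d seconds\n", ping_time_estimate(300, 10, 10));
	return 0;
}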
/*
 * restore_front_end_state - restore frontend node state
 * IN recover - replace job, node and/or partition data with latest
 *              available information depending upon value
 *              0 = use no saved state information, rebuild everything from
 *                  slurm.conf contents
 *              1 = recover saved job and trigger state,
 *                  node DOWN/DRAIN/FAIL state and reason information
 *              2 = recover all saved state
 */
extern void restore_front_end_state(int recover)
{
#ifdef HAVE_FRONT_END
	slurm_conf_frontend_t *slurm_conf_fe_ptr;
	ListIterator iter;
	uint16_t state_base, state_flags, tree_width;
	int i;

	last_front_end_update = time(NULL);
	if (recover == 0)
		purge_front_end_state();
	if (front_end_list == NULL)
		return;		/* No front ends in slurm.conf */

	iter = list_iterator_create(front_end_list);
	while ((slurm_conf_fe_ptr = (slurm_conf_frontend_t *)
				    list_next(iter))) {
		if (slurm_conf_fe_ptr->frontends == NULL) {
			fatal("FrontendName is NULL");
			return;	/* Prevent CLANG false positive */
		}
		for (i = 0; i < front_end_node_cnt; i++) {
			if (strcmp(front_end_nodes[i].name,
				   slurm_conf_fe_ptr->frontends) == 0)
				break;
		}
		if (i >= front_end_node_cnt) {
			front_end_node_cnt++;
			xrealloc(front_end_nodes,
				 sizeof(front_end_record_t) *
				 front_end_node_cnt);
			front_end_nodes[i].name =
				xstrdup(slurm_conf_fe_ptr->frontends);
			front_end_nodes[i].magic = FRONT_END_MAGIC;
		}

		xfree(front_end_nodes[i].allow_gids);
		xfree(front_end_nodes[i].allow_groups);
		if (slurm_conf_fe_ptr->allow_groups) {
			front_end_nodes[i].allow_groups =
				xstrdup(slurm_conf_fe_ptr->allow_groups);
			front_end_nodes[i].allow_gids =
				_xlate_groups(slurm_conf_fe_ptr->allow_groups,
					      "AllowGroups");
		}
		xfree(front_end_nodes[i].allow_uids);
		xfree(front_end_nodes[i].allow_users);
		if (slurm_conf_fe_ptr->allow_users) {
			front_end_nodes[i].allow_users =
				xstrdup(slurm_conf_fe_ptr->allow_users);
			front_end_nodes[i].allow_uids =
				_xlate_users(slurm_conf_fe_ptr->allow_users,
					     "AllowUsers");
		}
		xfree(front_end_nodes[i].deny_gids);
		xfree(front_end_nodes[i].deny_groups);
		if (slurm_conf_fe_ptr->deny_groups) {
			front_end_nodes[i].deny_groups =
				xstrdup(slurm_conf_fe_ptr->deny_groups);
			front_end_nodes[i].deny_gids =
				_xlate_groups(slurm_conf_fe_ptr->deny_groups,
					      "DenyGroups");
		}
		xfree(front_end_nodes[i].deny_uids);
		xfree(front_end_nodes[i].deny_users);
		if (slurm_conf_fe_ptr->deny_users) {
			front_end_nodes[i].deny_users =
				xstrdup(slurm_conf_fe_ptr->deny_users);
			front_end_nodes[i].deny_uids =
				_xlate_users(slurm_conf_fe_ptr->deny_users,
					     "DenyUsers");
		}

		xfree(front_end_nodes[i].comm_name);
		if (slurm_conf_fe_ptr->addresses) {
			front_end_nodes[i].comm_name =
				xstrdup(slurm_conf_fe_ptr->addresses);
		} else {
			front_end_nodes[i].comm_name =
				xstrdup(front_end_nodes[i].name);
		}
		state_base  = front_end_nodes[i].node_state & NODE_STATE_BASE;
		state_flags = front_end_nodes[i].node_state & NODE_STATE_FLAGS;
		if ((state_base == 0) || (state_base == NODE_STATE_UNKNOWN)) {
			front_end_nodes[i].node_state =
				slurm_conf_fe_ptr->node_state | state_flags;
		}
		if ((front_end_nodes[i].reason == NULL) &&
		    (slurm_conf_fe_ptr->reason != NULL)) {
			front_end_nodes[i].reason =
				xstrdup(slurm_conf_fe_ptr->reason);
		}
		if (slurm_conf_fe_ptr->port)
			front_end_nodes[i].port = slurm_conf_fe_ptr->port;
		else
			front_end_nodes[i].port = slurmctld_conf.slurmd_port;
		slurm_set_addr(&front_end_nodes[i].slurm_addr,
			       front_end_nodes[i].port,
			       front_end_nodes[i].comm_name);
	}
	list_iterator_destroy(iter);
	if (front_end_node_cnt == 0)
		fatal("No front end nodes defined");
	tree_width = slurm_get_tree_width();
	if (front_end_node_cnt > tree_width) {
		fatal("front_end_node_cnt > tree_width (%u > %u)",
		      front_end_node_cnt, tree_width);
	}
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FRONT_END)
		log_front_end_state();
#endif
}
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
		    size_t nprocs, pmixp_coll_type_t type)
{
	hostlist_t hl;
	int max_depth, width, depth, i;
	char *p;

#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->state = PMIXP_COLL_SYNC;
	coll->pset.procs = xmalloc(sizeof(*procs) * nprocs);
	coll->pset.nprocs = nprocs;
	memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs);

	if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		goto err_exit;
	}
#ifdef PMIXP_COLL_DEBUG
	/* if we debug collectives - store a copy of the full
	 * hostlist to resolve a participant id to its hostname */
	coll->peers_hl = hostlist_copy(hl);
#endif

	width = slurm_get_tree_width();
	coll->peers_cnt = hostlist_count(hl);
	coll->my_peerid = hostlist_find(hl, pmixp_info_hostname());
	reverse_tree_info(coll->my_peerid, coll->peers_cnt, width,
			  &coll->prnt_peerid, &coll->chldrn_cnt, &depth,
			  &max_depth);

	/* We are interested in the number of direct children */
	coll->seq = 0;
	coll->contrib_children = 0;
	coll->contrib_local = false;
	coll->chldrn_ids = xmalloc(sizeof(int) * width);
	coll->contrib_chld = xmalloc(sizeof(int) * width);
	coll->chldrn_cnt = reverse_tree_direct_children(coll->my_peerid,
							coll->peers_cnt,
							width, depth,
							coll->chldrn_ids);
	if (coll->prnt_peerid == -1) {
		/* if we are the root of the tree:
		 * - we don't have a parent;
		 * - we have a large list of all children (we don't want
		 *   ourselves there)
		 */
		coll->prnt_host = NULL;
		coll->all_chldrn_hl = hostlist_copy(hl);
		hostlist_delete_host(coll->all_chldrn_hl,
				     pmixp_info_hostname());
		coll->chldrn_str =
			hostlist_ranged_string_xmalloc(coll->all_chldrn_hl);
	} else {
		/* for all other nodes in the tree we need to know:
		 * - the nodename of our parent;
		 * - we don't need the list of all children or hl anymore
		 */

		/*
		 * setup parent id's
		 */
		p = hostlist_nth(hl, coll->prnt_peerid);
		coll->prnt_host = xstrdup(p);
		free(p);
		/* reset prnt_peerid to the global peer */
		coll->prnt_peerid = pmixp_info_job_hostid(coll->prnt_host);

		/*
		 * setup root id's
		 * (we need this for the SLURM API communication case)
		 */
		p = hostlist_nth(hl, 0);
		coll->root_host = xstrdup(p);
		free(p);
		/* reset root_peerid to the global peer */
		coll->root_peerid = pmixp_info_job_hostid(coll->root_host);

		/* use empty hostlist here */
		coll->all_chldrn_hl = hostlist_create("");
		coll->chldrn_str = NULL;
	}

	/* fixup children peer ids to the global ones */
	for (i = 0; i < coll->chldrn_cnt; i++) {
		p = hostlist_nth(hl, coll->chldrn_ids[i]);
		coll->chldrn_ids[i] = pmixp_info_job_hostid(p);
		free(p);
	}
	hostlist_destroy(hl);

	/* Collective state */
	coll->ufwd_buf = pmixp_server_buf_new();
	coll->dfwd_buf = pmixp_server_buf_new();
	_reset_coll_ufwd(coll);
	_reset_coll_dfwd(coll);
	coll->cbdata = NULL;
	coll->cbfunc = NULL;

	/* init fine grained lock */
	slurm_mutex_init(&coll->lock);

	return SLURM_SUCCESS;
err_exit:
	return SLURM_ERROR;
}
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
		    size_t nprocs, pmixp_coll_type_t type)
{
	hostlist_t hl;
	uint32_t nodeid = 0, nodes = 0;
	int parent_id, depth, max_depth, tmp;
	int width, my_nspace = -1;
	char *p;
	int i, *ch_nodeids = NULL;

#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->state = PMIXP_COLL_SYNC;
	coll->procs = xmalloc(sizeof(*procs) * nprocs);
	memcpy(coll->procs, procs, sizeof(*procs) * nprocs);
	coll->nprocs = nprocs;
	coll->my_nspace = my_nspace;

	if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		goto err_exit;
	}

	width = slurm_get_tree_width();
	nodes = hostlist_count(hl);
	nodeid = hostlist_find(hl, pmixp_info_hostname());
	reverse_tree_info(nodeid, nodes, width, &parent_id, &tmp, &depth,
			  &max_depth);
	coll->children_cnt = tmp;
	coll->nodeid = nodeid;

	/* We are interested in the number of direct children */
	coll->seq = 0;
	coll->contrib_cntr = 0;
	coll->contrib_local = false;
	ch_nodeids = xmalloc(sizeof(int) * width);
	coll->ch_contribs = xmalloc(sizeof(int) * width);
	coll->children_cnt = reverse_tree_direct_children(nodeid, nodes, width,
							  depth, ch_nodeids);

	/* create the hostlist with the extracted direct children's hostnames */
	coll->ch_hosts = hostlist_create("");
	for (i = 0; i < coll->children_cnt; i++) {
		char *hname = hostlist_nth(hl, ch_nodeids[i]);
		hostlist_push(coll->ch_hosts, hname);
	}
	/* just in case, shouldn't be needed */
	hostlist_uniq(coll->ch_hosts);
	xfree(ch_nodeids);

	if (parent_id == -1) {
		/* if we are the root of the tree:
		 * - we don't have a parent;
		 * - we have a large list of all children (we don't want
		 *   ourselves there)
		 */
		coll->parent_host = NULL;
		hostlist_delete_host(hl, pmixp_info_hostname());
		coll->all_children = hl;
	} else if (parent_id >= 0) {
		/* for all other nodes in the tree we need to know:
		 * - the nodename of our parent;
		 * - we don't need the list of all children or hl anymore
		 */
		p = hostlist_nth(hl, parent_id);
		coll->parent_host = xstrdup(p);
		/* use empty hostlist here */
		coll->all_children = hostlist_create("");
		free(p);
		hostlist_destroy(hl);
	}

	/* Collective data */
	coll->buf = pmixp_server_new_buf();
	coll->serv_offs = get_buf_offset(coll->buf);

	if (SLURM_SUCCESS != _pack_ranges(coll)) {
		PMIXP_ERROR("Cannot pack ranges to coll message header!");
		goto err_exit;
	}

	/* Callback information */
	coll->cbdata = NULL;
	coll->cbfunc = NULL;

	/* init fine grained lock */
	slurm_mutex_init(&coll->lock);

	return SLURM_SUCCESS;
err_exit:
	return SLURM_ERROR;
}