/*
 * Prune from hostlist wcoll every host that nodeupdown reports as down.
 * Each member of wcoll is tested with nodeupdown_is_node_down(); members
 * for which it returns a positive value are removed in place through the
 * iterator.  Supposedly, it doesn't matter whether the canonical name or
 * the altname is passed in.
 */
static void remove_all_down_nodes(hostlist_t wcoll)
{
    nodeupdown_t handle;
    hostlist_iterator_t iter;
    char *name;
    int load_rc;

    handle = nodeupdown_handle_create();
    if (handle == NULL)
        errx("%p: Unable to create nodeupdown handle.\n");

    /* The build selects the 6-argument or 5-argument load_data form. */
#if HAVE_NODEUPDOWN_LOAD_DATA_6
    load_rc = nodeupdown_load_data(handle, NULL, NULL, NULL, 0, 0);
#else
    load_rc = nodeupdown_load_data(handle, NULL, 0, 0, NULL);
#endif
    if (load_rc < 0)
        errx("%p: nodeupdown: %s\n", nodeupdown_errormsg(handle));

    iter = hostlist_iterator_create(wcoll);
    for (name = hostlist_next(iter); name != NULL; name = hostlist_next(iter)) {
        /* hostlist_remove() drops the host the iterator last returned. */
        if (nodeupdown_is_node_down(handle, name) > 0)
            hostlist_remove(iter);
        free(name);
    }
    hostlist_iterator_destroy(iter);

    if (nodeupdown_handle_destroy(handle) < 0)
        err("%p: nodeupdown_handle_destroy: %s\n", nodeupdown_errormsg(handle));
}
/* * find_child_switches creates an array of indexes to the * immediate descendants of switch sw. */ static void _find_child_switches (int sw) { int i; int cldx; /* index into array of child switches */ hostlist_iterator_t hi; hostlist_t swlist; char *swname; swlist = hostlist_create(switch_record_table[sw].switches); switch_record_table[sw].num_switches = hostlist_count(swlist); switch_record_table[sw].switch_index = xmalloc(switch_record_table[sw].num_switches * sizeof(uint16_t)); hi = hostlist_iterator_create(swlist); cldx = 0; while ((swname = hostlist_next(hi))) { /* Find switch whose name is the name of this child. * and add its index to child index array */ for (i=0; i<switch_record_cnt; i++) { if (strcmp(swname, switch_record_table[i].name) == 0) { switch_record_table[sw].switch_index[cldx] = i; switch_record_table[i].parent = sw; cldx++; break; } } free(swname); } hostlist_iterator_destroy(hi); hostlist_destroy(swlist); }
/*
 * hostlist2bitmap - convert a hostlist into a node bitmap
 * IN hl - hostlist to convert
 * IN best_effort - if set, unknown node names are logged but do not make
 *	the call fail
 * OUT bitmap - any previous bitmap is released and replaced by a new one of
 *	node_record_count bits; on error it may not have every bit set
 * RET SLURM_SUCCESS if no error, otherwise EINVAL
 */
extern int hostlist2bitmap (hostlist_t hl, bool best_effort, bitstr_t **bitmap)
{
	int status = SLURM_SUCCESS;
	bitstr_t *result;
	hostlist_iterator_t itr;
	char *host;

	FREE_NULL_BITMAP(*bitmap);
	result = (bitstr_t *) bit_alloc (node_record_count);
	*bitmap = result;

	itr = hostlist_iterator_create(hl);
	for (host = hostlist_next(itr); host != NULL;
	     host = hostlist_next(itr)) {
		struct node_record *rec =
			_find_node_record(host, best_effort, true);

		if (rec == NULL) {
			error ("hostlist2bitmap: invalid node specified %s",
			       host);
			if (!best_effort)
				status = EINVAL;
		} else {
			/* Bit position is the record's offset in the table. */
			bit_set (result,
				 (bitoff_t) (rec - node_record_table_ptr));
		}
		free (host);
	}
	hostlist_iterator_destroy(itr);

	return status;
}
/*
 * Print the status stored in hstatus for every host named in 'nodes'.
 * Hosts without an entry in the hash are reported as "invalid hostname".
 * Exits the process if the hostlist or its iterator cannot be created.
 */
static void _stat(hash_t hstatus, const char *nodes)
{
  hostlist_t hl;
  hostlist_iterator_t itr;
  char *name;
  char *status;

  assert(hstatus);

  hl = hostlist_create(nodes);
  if (!hl) {
    perror("hostlist_create");
    exit(1);
  }

  itr = hostlist_iterator_create(hl);
  if (!itr) {
    perror("hostlist_iterator_create");
    exit(1);
  }

  for (name = hostlist_next(itr); name; name = hostlist_next(itr)) {
    status = hash_find(hstatus, name);
    printf("%s: %s\n", name, status ? status : "invalid hostname");
    free(name);
  }

  hostlist_iterator_destroy(itr);
  hostlist_destroy(hl);
}
/* Send a signal RPC to a list of nodes */ static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal, char *nodelist) { agent_arg_t *agent_args; signal_tasks_msg_t *signal_tasks_msg; hostlist_iterator_t hi; char *host; struct node_record *node_ptr; signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t)); signal_tasks_msg->job_id = job_id; signal_tasks_msg->job_step_id = step_id; signal_tasks_msg->signal = signal; agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SIGNAL_TASKS; agent_args->retry = 1; agent_args->msg_args = signal_tasks_msg; agent_args->hostlist = hostlist_create(nodelist); agent_args->node_count = hostlist_count(agent_args->hostlist); agent_args->protocol_version = SLURM_PROTOCOL_VERSION; hi = hostlist_iterator_create(agent_args->hostlist); while ((host = hostlist_next(hi))) { if ((node_ptr = find_node_record(host)) && (agent_args->protocol_version > node_ptr->protocol_version)) agent_args->protocol_version = node_ptr->protocol_version; free(host); } hostlist_iterator_destroy(hi); agent_queue_request(agent_args); }
/*
 * Checked wrapper around hostlist_iterator_create().  A failed creation is
 * reported through WRAPPERS_ERR_ERRNO; the iterator (or whatever the call
 * produced) is returned to the caller.
 */
hostlist_iterator_t wrap_hostlist_iterator_create(WRAPPERS_ARGS, hostlist_t hl)
{
  hostlist_iterator_t itr;

  assert(file && function && hl);

  itr = hostlist_iterator_create(hl);
  if (!itr)
    WRAPPERS_ERR_ERRNO("hostlist_iterator_create");

  return itr;
}
/*
 * Initialize an alpsc_ev_app_t from a job/step pair.
 *
 * IN/OUT event - event to fill in; apid, uid, app_name, batch_id, state,
 *	nodes and num_nodes are all overwritten
 * IN job_ptr - job supplying job_id and user_id
 * IN step_ptr - step supplying step_id, name and (optionally) step_layout
 * IN state - state stored in the event
 *
 * app_name, batch_id and nodes are allocated here (xstrdup/xmalloc).  If the
 * step has no layout, or its node list cannot be turned into a hostlist and
 * iterator, nodes stays NULL and num_nodes stays 0.
 */
static void _initialize_event(alpsc_ev_app_t *event,
			      struct job_record *job_ptr,
			      struct step_record *step_ptr,
			      alpsc_ev_app_state_e state)
{
	hostlist_t hl;
	hostlist_iterator_t hlit;
	char *node;
	int rv;
	size_t capacity;	/* slots allocated in event->nodes */
	size_t used = 0;	/* slots filled so far */

	event->apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
	event->uid = job_ptr->user_id;
	event->app_name = xstrdup(step_ptr->name);
	event->batch_id = xmalloc(20);	// More than enough to hold max uint32
	snprintf(event->batch_id, 20, "%"PRIu32, job_ptr->job_id);
	event->state = state;
	event->nodes = NULL;
	event->num_nodes = 0;

	// Fill in nodes and num_nodes
	if (step_ptr->step_layout) {
		hl = hostlist_create(step_ptr->step_layout->node_list);
		if (hl == NULL) {
			return;
		}
		hlit = hostlist_iterator_create(hl);
		if (hlit == NULL) {
			hostlist_destroy(hl);
			return;
		}

		capacity = step_ptr->step_layout->node_cnt;
		event->nodes = xmalloc(capacity * sizeof(int32_t));

		while ((node = hostlist_next(hlit)) != NULL) {
			/*
			 * The array is sized from node_cnt but filled from
			 * node_list; never write past the allocation if the
			 * two disagree.
			 */
			if (used >= capacity) {
				debug("%s: more nodes in list than node_cnt, "
				      "ignoring %s and any that follow",
				      __func__, node);
				free(node);
				break;
			}

			rv = sscanf(node, "nid%"SCNd32, &event->nodes[used]);
			/*
			 * Require exactly one conversion: sscanf returns EOF
			 * (non-zero) when input ends before the number, which
			 * must not be counted as a parsed nid.
			 */
			if (rv == 1) {
				used++;
				event->num_nodes++;
			} else {
				debug("%s: couldn't parse node %s, skipping",
				      __func__, node);
			}
			free(node);
		}

		hostlist_iterator_destroy(hlit);
		hostlist_destroy(hl);
	} else {
		// TODO: do we have to worry about batch scripts?
	}
	return;
}
/*
 * Call hostlist_delete_host() on hl for every host listed in dl.
 * Returns the sum of the values those calls returned.
 */
static int _delete_all (hostlist_t hl, hostlist_t dl)
{
    hostlist_iterator_t itr = hostlist_iterator_create (dl);
    char *name;
    int total = 0;

    for (name = hostlist_next (itr); name != NULL; name = hostlist_next (itr)) {
        total += hostlist_delete_host (hl, name);
        free (name);
    }
    hostlist_iterator_destroy (itr);

    return total;
}
/*
 * switch_p_build_jobinfo - populate a step's switch info from its layout
 * IN/OUT switch_job - really a sw_gen_step_info_t; node_cnt and node_array
 *	are filled in
 * IN step_layout - supplies node_list, the hosts allocated to the step
 * IN network - not used by this function
 * RET SLURM_SUCCESS (a node_list that cannot be parsed is fatal)
 *
 * One sw_gen_node_t is allocated per host in node_list.  When _find_node()
 * knows the host, its interface records (address, family, name) are
 * deep-copied into the step's own arrays.
 */
int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
			   slurm_step_layout_t *step_layout, char *network)
{
	sw_gen_step_info_t *gen_step_info = (sw_gen_step_info_t *) switch_job;
	sw_gen_node_info_t *gen_node_info;
	sw_gen_node_t *node_ptr;
	hostlist_t hl = NULL;
	hostlist_iterator_t hi;
	char *host = NULL;
	int i, j;

	if (debug_flags & DEBUG_FLAG_SWITCH)
		info("switch_p_build_jobinfo() starting");
	xassert(gen_step_info);
	xassert(gen_step_info->magic == SW_GEN_STEP_INFO_MAGIC);

	hl = hostlist_create(step_layout->node_list);
	if (!hl)
		fatal("hostlist_create(%s): %m", step_layout->node_list);

	gen_step_info->node_cnt = hostlist_count(hl);
	gen_step_info->node_array =
		xmalloc(sizeof(*gen_step_info->node_array) *
			gen_step_info->node_cnt);

	hi = hostlist_iterator_create(hl);
	for (i = 0; (host = hostlist_next(hi)); i++) {
		node_ptr = xmalloc(sizeof(*node_ptr));
		gen_step_info->node_array[i] = node_ptr;
		node_ptr->node_name = xstrdup(host);

		gen_node_info = _find_node(host);
		if (gen_node_info) {	/* Copy node info to this step */
			node_ptr->ifa_cnt = gen_node_info->ifa_cnt;
			/*
			 * Size both allocations from the pointers being
			 * assigned rather than from sw_gen_node_t, so they
			 * match the interface record type whatever it is.
			 */
			node_ptr->ifa_array =
				xmalloc(sizeof(*node_ptr->ifa_array) *
					node_ptr->ifa_cnt);
			for (j = 0; j < node_ptr->ifa_cnt; j++) {
				node_ptr->ifa_array[j] =
					xmalloc(sizeof(*node_ptr->ifa_array[j]));
				node_ptr->ifa_array[j]->ifa_addr = xstrdup(
					gen_node_info->ifa_array[j]->ifa_addr);
				node_ptr->ifa_array[j]->ifa_family = xstrdup(
					gen_node_info->ifa_array[j]->ifa_family);
				node_ptr->ifa_array[j]->ifa_name = xstrdup(
					gen_node_info->ifa_array[j]->ifa_name);
			}
		}
		free(host);
	}
	hostlist_iterator_destroy(hi);
	hostlist_destroy(hl);

	return SLURM_SUCCESS;
}
/*
 * Set the recorded status of every host named in 'nodes' to 'state'.
 *
 * A host already present in hstatus is acknowledged with OK_STATUS and its
 * entry is replaced by a new one keyed on the freshly returned host string.
 * A host not present is reported as "invalid hostname".  Exits the process
 * if the hostlist, the iterator or the hash insertion fails.
 */
static void _onoff(hash_t hstatus, const char *nodes, const char *state)
{
  hostlist_t hl;
  hostlist_iterator_t itr;
  char *name;

  assert(hstatus);

  hl = hostlist_create(nodes);
  if (!hl) {
    perror("hostlist_create");
    exit(1);
  }

  itr = hostlist_iterator_create(hl);
  if (!itr) {
    perror("hostlist_iterator_create");
    exit(1);
  }

  for (name = hostlist_next(itr); name; name = hostlist_next(itr)) {
    if (!hash_find(hstatus, name)) {
      printf("%s: %s\n", name, "invalid hostname");
      free(name);
      continue;
    }

    printf("%s: %s\n", name, OK_STATUS);
    hash_remove(hstatus, name);
    if (!hash_insert(hstatus, (void *)name, (void *)state)) {
      perror("hash_insert");
      exit(1);
    }
    /* XXX: 'name' is deliberately not freed on this path: it has to stay
     * allocated because it is now the hash key.  It's a mem-leak.
     * Fix later.
     */
  }

  hostlist_iterator_destroy(itr);
  hostlist_destroy(hl);
}
/*
 * setup_cluster_list_with_inx - build the list of cluster event records
 *	that contain at least one of the nodes named in job_cond->used_nodes
 * IN mysql_conn - database connection used for both queries
 * IN/OUT job_cond - job condition; used_nodes and cluster_list are read,
 *	and usage_end is set to the current time when usage_start is set
 *	but usage_end is not
 * OUT curr_cluster - set to the matching record whose end time was 0, or
 *	otherwise to the matching record with the latest end time seen.
 *	It is dereferenced without a NULL check.
 * RET list of local_cluster_t (created with _destroy_local_cluster as its
 *	destructor), or NULL when job_cond/used_nodes is missing, when
 *	cluster_list does not hold exactly one cluster, when a query fails,
 *	or when no record matched
 *
 * NOTE(review): the cluster name from cluster_list is formatted straight
 * into both SQL statements; presumably it is validated upstream - confirm.
 */
extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn,
					slurmdb_job_cond_t *job_cond,
					void **curr_cluster)
{
	List local_cluster_list = NULL;
	time_t now = time(NULL);
	MYSQL_RES *result = NULL;
	MYSQL_ROW row;
	hostlist_t temp_hl = NULL;
	hostlist_iterator_t h_itr = NULL;
	char *query = NULL;
	int dims = 0;

	/* Nothing to do unless the caller asked about specific nodes. */
	if (!job_cond || !job_cond->used_nodes)
		return NULL;

	if (!job_cond->cluster_list
	    || list_count(job_cond->cluster_list) != 1) {
		error("If you are doing a query against nodes "
		      "you must only have 1 cluster "
		      "you are asking for.");
		return NULL;
	}

	/* get the dimensions of this cluster so we know how to deal
	   with the hostlists */
	query = xstrdup_printf("select dimensions, flags from %s where "
			       "name='%s'",
			       cluster_table,
			       (char *)list_peek(job_cond->cluster_list));

	debug4("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);
	if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
		xfree(query);
		return NULL;
	}
	xfree(query);

	if (!(row = mysql_fetch_row(result))) {
		error("Couldn't get the dimensions of cluster '%s'.",
		      (char *)list_peek(job_cond->cluster_list));
		mysql_free_result(result);
		return NULL;
	}

	/* On a Cray System when dealing with hostlists as we are here
	   this always needs to be 1.
	*/
	if (slurm_atoul(row[1]) & CLUSTER_FLAG_CRAY_A)
		dims = 1;
	else
		dims = atoi(row[0]);

	mysql_free_result(result);

	/* Expand the requested nodes; h_itr is reused for every event row. */
	temp_hl = hostlist_create_dims(job_cond->used_nodes, dims);
	if (hostlist_count(temp_hl) <= 0) {
		error("we didn't get any real hosts to look for.");
		goto no_hosts;
	}
	h_itr = hostlist_iterator_create(temp_hl);

	/* Select the event rows that have cluster_nodes set and an empty
	 * node_name. */
	query = xstrdup_printf("select cluster_nodes, time_start, "
			       "time_end from \"%s_%s\" where node_name='' "
			       "&& cluster_nodes !=''",
			       (char *)list_peek(job_cond->cluster_list),
			       event_table);

	/* Restrict to events overlapping the requested usage window. */
	if (job_cond->usage_start) {
		if (!job_cond->usage_end)
			job_cond->usage_end = now;

		xstrfmtcat(query,
			   " && ((time_start < %ld) "
			   "&& (time_end >= %ld || time_end = 0))",
			   job_cond->usage_end, job_cond->usage_start);
	}

	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
		xfree(query);
		goto no_hosts;
	}
	xfree(query);

	local_cluster_list = list_create(_destroy_local_cluster);
	while ((row = mysql_fetch_row(result))) {
		char *host = NULL;
		int loc = 0;
		local_cluster_t *local_cluster =
			xmalloc(sizeof(local_cluster_t));
		local_cluster->hl = hostlist_create_dims(row[0], dims);
		local_cluster->start = slurm_atoul(row[1]);
		local_cluster->end   = slurm_atoul(row[2]);
		/* One bit per node of this event's cluster_nodes list. */
		local_cluster->asked_bitmap =
			bit_alloc(hostlist_count(local_cluster->hl));
		/* Set the bit for every requested host found in this row. */
		while ((host = hostlist_next_dims(h_itr, dims))) {
			if ((loc = hostlist_find(
				     local_cluster->hl, host)) != -1)
				bit_set(local_cluster->asked_bitmap, loc);
			free(host);
		}
		/* Rewind so the next row scans the requested hosts again. */
		hostlist_iterator_reset(h_itr);
		if (bit_ffs(local_cluster->asked_bitmap) != -1) {
			list_append(local_cluster_list, local_cluster);
			if (local_cluster->end == 0) {
				/* End time 0: treat the record as ending
				 * now and make it the current cluster. */
				local_cluster->end = now;
				(*curr_cluster) = local_cluster;
			} else if (!(*curr_cluster)
				   || (((local_cluster_t *)(*curr_cluster))->end
				       < local_cluster->end)) {
				(*curr_cluster) = local_cluster;
			}
		} else
			/* No requested host in this record; discard it. */
			_destroy_local_cluster(local_cluster);
	}
	mysql_free_result(result);

	if (!list_count(local_cluster_list)) {
		FREE_NULL_LIST(local_cluster_list);
		local_cluster_list = NULL;
		goto no_hosts;
	}

no_hosts:
	/* Common exit: h_itr is still NULL when the host expansion failed. */
	hostlist_iterator_destroy(h_itr);
	hostlist_destroy(temp_hl);

	return local_cluster_list;
}
/*
 * setup_cluster_nodes - get cluster record list within requested
 *   time period with used nodes. Used for deciding whether a nodelist is
 *   overlapping with the required nodes.
 *
 * IN pg_conn - database connection; not referenced by name in this body,
 *	presumably consumed by the DEF_QUERY_RET macro - confirm
 * IN/OUT job_cond - job condition; used_nodes and cluster_list are read,
 *	and usage_end is set to the current time when usage_start is set
 *	but usage_end is not
 * RET newly allocated cluster_nodes_t whose cluster_list holds the event
 *	records containing at least one requested node, or NULL when
 *	job_cond/used_nodes is missing, cluster_list does not hold exactly
 *	one cluster, no hosts were parsed, the query failed, or nothing
 *	matched
 *
 * NOTE(review): 'query' and 'result' are not declared here; presumably
 * DEF_VARS declares them - confirm against the macro definitions.
 */
extern cluster_nodes_t *
setup_cluster_nodes(pgsql_conn_t *pg_conn, slurmdb_job_cond_t *job_cond)
{
	DEF_VARS;
	cluster_nodes_t *cnodes = NULL;
	time_t now = time(NULL);
	hostlist_t temp_hl = NULL;
	hostlist_iterator_t h_itr = NULL;

	/* Nothing to do unless the caller asked about specific nodes. */
	if (!job_cond || !job_cond->used_nodes)
		return NULL;

	if (!job_cond->cluster_list ||
	    list_count(job_cond->cluster_list) != 1) {
		error("If you are doing a query against nodes "
		      "you must only have 1 cluster "
		      "you are asking for.");
		return NULL;
	}

	/* NOTE(review): only a count of exactly 0 is rejected here;
	 * confirm what hostlist_count() returns for a NULL hostlist. */
	temp_hl = hostlist_create(job_cond->used_nodes);
	if (!hostlist_count(temp_hl)) {
		error("we didn't get any real hosts to look for.");
		hostlist_destroy(temp_hl);
		return NULL;
	}

	/* Select the event rows that have cluster_nodes set and an empty
	 * node_name. */
	query = xstrdup_printf("SELECT cluster_nodes, time_start, "
			       "time_end FROM %s.%s WHERE node_name='' "
			       "AND cluster_nodes !=''",
			       (char *)list_peek(job_cond->cluster_list),
			       event_table);

	/* Restrict to events overlapping the requested usage window. */
	if (job_cond->usage_start) {
		if (!job_cond->usage_end)
			job_cond->usage_end = now;

		xstrfmtcat(query, " AND ((time_start<%ld) "
			   "AND (time_end>=%ld OR time_end=0))",
			   job_cond->usage_end, job_cond->usage_start);
	}

	result = DEF_QUERY_RET;
	if (!result) {
		hostlist_destroy(temp_hl);
		return NULL;
	}

	/* h_itr walks the requested hosts and is reused for every row. */
	h_itr = hostlist_iterator_create(temp_hl);
	cnodes = xmalloc(sizeof(cluster_nodes_t));
	cnodes->cluster_list = list_create(_destroy_local_cluster);
	FOR_EACH_ROW {
		char *host = NULL;
		int loc = 0;
		local_cluster_t *local_cluster =
			xmalloc(sizeof(local_cluster_t));
		local_cluster->hl = hostlist_create(ROW(0));
		local_cluster->start = atoi(ROW(1));
		local_cluster->end   = atoi(ROW(2));
		/* One bit per node of this event's cluster_nodes list. */
		local_cluster->asked_bitmap =
			bit_alloc(hostlist_count(local_cluster->hl));
		/* Set the bit for every requested host found in this row. */
		while((host = hostlist_next(h_itr))) {
			if ((loc = hostlist_find(
				     local_cluster->hl, host)) != -1)
				bit_set(local_cluster->asked_bitmap, loc);
			free(host);
		}
		/* Rewind so the next row scans the requested hosts again. */
		hostlist_iterator_reset(h_itr);
		if (bit_ffs(local_cluster->asked_bitmap) != -1) {
			list_append(cnodes->cluster_list, local_cluster);
			if (local_cluster->end == 0) {
				/* End time 0: treat the record as ending
				 * now and remember it as current. */
				local_cluster->end = now;
				cnodes->curr_cluster = local_cluster;
			}
		} else
			/* No requested host in this record; discard it. */
			_destroy_local_cluster(local_cluster);
	} END_EACH_ROW;
	PQclear(result);
	hostlist_iterator_destroy(h_itr);
	if (!list_count(cnodes->cluster_list)) {
		destroy_cluster_nodes(cnodes);
		cnodes = NULL;
	}

	hostlist_destroy(temp_hl);
	return cnodes;
}
/*
 * Build the global node list and the IP-keyed lookup table from conf.hosts.
 *
 * For every configured host (optionally written as "host:port") an
 * ipmidetectd_info record is allocated, given a random starting sequence
 * number, assigned one of the sockets in fds, and resolved with
 * gethostbyname().  The record is appended to 'nodes' and inserted into
 * 'nodes_index' under its dotted-quad address string.  Every failure, and a
 * second host resolving to an IP already in the index, goes through
 * err_exit().
 *
 * Preconditions (asserted): fds and fds_count are set, nodes and
 * nodes_index are not yet created, and nodes_count is non-zero.
 */
static void
_nodes_setup (void)
{
  hostlist_iterator_t itr = NULL;
  char *host = NULL;
  int i = 0;

  assert (fds);
  assert (fds_count);
  assert (!nodes);
  assert (nodes_count);
  assert (!nodes_index);

  /* The list is created with free() as the destructor for its records. */
  if (!(nodes = list_create ((ListDelF)free)))
    err_exit ("list_create: %s", strerror (errno));

  /* String-keyed hash; no delete callback is registered. */
  if (!(nodes_index = hash_create (nodes_count,
                                   (hash_key_f)hash_key_string,
                                   (hash_cmp_f)strcmp,
                                   NULL)))
    err_exit ("hash_create: %s", strerror (errno));

  if (!(itr = hostlist_iterator_create (conf.hosts)))
    err_exit ("hostlist_iterator_create: %s", strerror (errno));

  while ((host = hostlist_next (itr)))
    {
      struct ipmidetectd_info *info = NULL;
      struct hostent *h;
      char *tmpstr;
      char *ip;
      int len;
      char *host_copy = NULL;
      char *host_ptr;
      uint16_t port = RMCP_PRIMARY_RMCP_PORT;

      if (!(info = (struct ipmidetectd_info *)malloc (sizeof (struct ipmidetectd_info))))
        err_exit ("malloc: %s", strerror (errno));
      memset (info, '\0', sizeof (struct ipmidetectd_info));

      /* "host:port" form: split a private copy at the colon so the
       * original string stays intact for the error message below.
       */
      if (strchr (host, ':'))
        {
          char *ptr;

          if (!(host_copy = strdup (host)))
            err_exit ("strdup: %s", strerror (errno));

          if ((ptr = strchr (host_copy, ':')))
            {
              char *endptr;
              int tmp;

              *ptr = '\0';
              ptr++;

              /* Base 0: decimal, octal and hex spellings are accepted. */
              errno = 0;
              tmp = strtol (ptr, &endptr, 0);
              if (errno
                  || endptr[0] != '\0'
                  || tmp <= 0
                  || tmp > USHRT_MAX)
                err_exit ("invalid port specified: %s", host);

              port = tmp;
            }
          host_ptr = host_copy;
        }
      else
        host_ptr = host;

      if (!(info->hostname = strdup (host_ptr)))
        err_exit ("strdup: %s", strerror (errno));

      /* Use random number for starting sequence number to avoid probability of
       * duplicates and "hanging" BMC issue.
       */
      if ((len = ipmi_get_random (&(info->sequence_number),
                                  sizeof (info->sequence_number))) < 0)
        err_exit ("ipmi_get_random: %s", strerror (errno));
      if (len != sizeof (info->sequence_number))
        err_exit ("ipmi_get_random: invalid len returned");

      /* Hosts share sockets: IPMIDETECTD_NODES_PER_SOCKET consecutive
       * hosts map to each entry of fds.
       */
      info->fd = fds[i/IPMIDETECTD_NODES_PER_SOCKET];

      if (!(h = gethostbyname (host_ptr)))
        {
#if HAVE_HSTRERROR
          err_exit ("gethostbyname: %s", hstrerror (h_errno));
#else /* !HAVE_HSTRERROR */
          err_exit ("gethostbyname: h_errno = %d", h_errno);
#endif /* !HAVE_HSTRERROR */
        }

      info->destaddr.sin_family = AF_INET;
      info->destaddr.sin_addr = *((struct in_addr *)h->h_addr);
      info->destaddr.sin_port = htons (port);

      /* Both name strings are no longer needed once the address is set;
       * host_copy is NULL when no port was given.
       */
      free (host_copy);
      free (host);

      if (!list_append (nodes, info))
        err_exit ("list_append: %s", strerror (errno));

      if (!(tmpstr = inet_ntoa (info->destaddr.sin_addr)))
        err_exit ("inet_ntoa: %s", strerror (errno)); /* strerror? */

      /* Duplicate the address string to use as the hash key. */
      if (!(ip = strdup (tmpstr)))
        err_exit ("strdup: %s", strerror (errno));

      if (hash_find (nodes_index, ip))
        err_exit ("Duplicate host ip: %s", ip);

      if (!hash_insert (nodes_index, ip, info))
        err_exit ("hash_insert: %s", strerror (errno));

      i++;
    }

  hostlist_iterator_destroy (itr);
}
static void _prompt_loop(void) { char buf[128]; char bufnode[128]; hash_t hstatus = NULL; hostlist_t hl = NULL; hostlist_iterator_t hlitr = NULL; char *node; assert(hostname); if (!(hstatus = hash_create(HASH_SIZE, (hash_key_f)hash_key_string, (hash_cmp_f)strcmp, (hash_del_f)NULL))) { perror("hash_create"); exit(1); } if (!(hl = hostlist_create(hostname))) { perror("hostlist_create"); exit(1); } if (!(hlitr = hostlist_iterator_create(hl))) { perror("hostlist_iterator"); exit(1); } /* all nodes begin as off */ while ((node = hostlist_next(hlitr))) { if (!hash_insert(hstatus, (void *)node, OFF_STATUS)) { perror("hash_insert"); exit(1); } /* XXX: Don't free 'node' here, it needs to be alloc'd for * the hash key. It's a mem-leak. Fix later. */ } hostlist_iterator_destroy(hlitr); hostlist_destroy(hl); while (1) { if (xreadline(CMD_PROMPT, buf, sizeof(buf)) == NULL) { break; } else if (strlen(buf) == 0) { continue; } else if (!strcmp(buf, "quit")) { break; } else if (!strcmp(buf, "stat")) { _stat(hstatus, hostname); } else if (sscanf(buf, "stat %s", bufnode) == 1) { _stat(hstatus, bufnode); } else if (!strcmp(buf, "on")) { _onoff(hstatus, hostname, ON_STATUS); } else if (sscanf(buf, "on %s", bufnode) == 1) { _onoff(hstatus, bufnode, ON_STATUS); } else if (!strcmp(buf, "off")) { _onoff(hstatus, hostname, OFF_STATUS); } else if (sscanf(buf, "off %s", bufnode) == 1) { _onoff(hstatus, bufnode, OFF_STATUS); } else printf("unknown command - type \"help\"\n"); } hash_destroy(hstatus); }
int main (int argc, char *argv[]) { char *dir = NULL; char *spec, *host; char *nspec = NULL; int c, i; int nopt = 0; int vopt = 0; int fopt = 0; int aopt = 0; int dopt = 0; int rfd = -1, wfd = -1; Opt o; diod_log_init (argv[0]); o = opt_create (); opterr = 0; while ((c = GETOPT (argc, argv, OPTIONS, longopts)) != -1) { switch (c) { case 'f': /* --fake-mount */ fopt = 1; break; case 'n': /* --no-mtab */ nopt = 1; break; case 'v': /* --verbose */ vopt++; break; case 'o': /* --options OPT[,OPT]... */ opt_addf (o, "%s", optarg); break; case 'a': /* --9nbd-attach */ aopt++; break; case 'd': /* --9nbd-detach */ dopt++; break; default: usage (); } } /* Take care of 9nbd operations and exit. */ if (aopt) { _nbd_attach (o, argc - optind, argv + optind, nopt, vopt); exit (0); } if (dopt) { _nbd_detach (o, argc - optind, argv + optind, nopt, vopt); exit (0); } if (optind != argc - 2) usage (); if (geteuid () != 0) msg_exit ("you must be root"); spec = argv[optind++]; dir = argv[optind++]; host = _parse_spec (spec, o); _verify_mountpoint (dir); /* Remount - only pass mount flags into the VFS for an existing mount. * Take care of it here and exit. */ if (opt_find (o, "remount")) { if (opt_check_allowed_csv (o, "ro,rw,aname,remount")) msg_exit ("-oremount can only be used with ro,rw"); _diod_remount (o, spec, dir, vopt, fopt); goto done; } /* Ensure uname and access are set, and to diod-compatible values. * The uname user becomes the euid which will be used by munge auth. */ _parse_uname_access (o); if (seteuid (_uname2uid (opt_find (o, "uname"))) < 0) err_exit ("seteuid"); /* We require -otrans=fd because auth occurs in user space, then live fd * is passed to the kernel via -orfdno,wfdno. */ if (!opt_find (o, "trans")) opt_addf (o, "trans=%s", "fd"); else if (!opt_find (o, "trans=fd")) msg_exit ("only -otrans=fd transport is supported"); /* Set msize if not already set. Validate it later. 
*/ if (!opt_find (o, "msize")) opt_addf (o, "msize=%d", DIOD_DEFAULT_MSIZE); /* Only .L version is supported. */ if (!opt_find (o, "version")) opt_addf (o, "version=%s", "9p2000.L"); else if (!opt_find (o, "version=9p2000.L")) msg_exit ("only -oversion=9p2000.L is supported (little p, big L)"); /* Set debug level. */ if (!opt_find (o, "debug")) opt_addf (o, "debug=%d", 0x1); /* send errors to dmesg */ /* Set rwdepth (number of concurrent reads with buffer > msize). * N.B. this option is not upstream yet but unknown options are ignored. */ if (!opt_find (o, "rwdepth")) opt_addf (o, "rwdepth=%d", 1); /* Server is on an inherited file descriptor. * For testing, we start server on a socketpair duped to fd 0. */ if (opt_find (o, "rfdno") || opt_find (o, "wfdno")) { if (!opt_scanf (o, "rfdno=%d", &rfd) || !opt_scanf (o, "wfdno=%d",&wfd)) msg_exit ("-orfdno,wfdno must be used together"); nopt = 1; /* force no mtab */ /* Connect to server on UNIX domain socket */ } else if (host[0] == '/') { if (opt_find (o, "port")) msg_exit ("-oport won't work with UNIX domain socket"); if ((rfd = diod_sock_connect_unix (host, 0)) < 0) exit (1); wfd = rfd; opt_addf (o, "rfdno=%d", rfd); opt_addf (o, "wfdno=%d", wfd); /* Connect to server on IANA port (or user-specified) and host. */ } else { char *port = opt_find (o, "port"); hostlist_iterator_t hi; hostlist_t hl; char *h; if (!port) port = "564"; if (!(hl = hostlist_create (host))) msg_exit ("error parsing host string: %s", host); if (!(hi = hostlist_iterator_create (hl))) msg_exit ("out of memory"); while ((h = hostlist_next (hi))) { if (vopt) msg ("trying to connect to %s:%s", h, port); if ((rfd = diod_sock_connect_inet (h, port, DIOD_SOCK_QUIET)) >= 0) break; } if (h) { /* create new 'spec' string identifying successful host */ char *p = strchr (spec , ':'); int len = strlen (h) + (p ? strlen (p) : 0) + 1; if (!(nspec = malloc (len))) msg_exit ("out of memory"); snprintf (nspec, len, "%s%s", h, p ? 
p : ""); } hostlist_destroy (hl); if (rfd < 0) msg_exit ("could not connect to server(s), giving up"); wfd = rfd; opt_delete (o, "port"); opt_addf (o, "rfdno=%d", rfd); opt_addf (o, "wfdno=%d", wfd); } NP_ASSERT (opt_find (o, "trans=fd")); NP_ASSERT (opt_scanf (o, "msize=%d", &i)); NP_ASSERT (opt_find (o, "version=9p2000.L")); NP_ASSERT (opt_scanf (o, "debug=%d", &i) || opt_scanf (o, "debug=%x", &i)); NP_ASSERT (opt_scanf (o, "wfdno=%d", &i) && opt_scanf (o, "rfdno=%d", &i)); NP_ASSERT ((opt_find (o, "access=user") && opt_find(o, "uname=root")) || (opt_scanf (o, "access=%d", &i) && opt_find(o, "uname"))); NP_ASSERT (!opt_find (o, "port")); _diod_mount (o, rfd, wfd, nspec ? nspec : spec, dir, vopt, fopt, nopt); done: opt_destroy (o); exit (0); }
/*
 * pstdout_launch - run pstdout_func once per host named in 'hostnames'.
 *
 * hostnames    - host expression; NULL runs pstdout_func a single time
 *                with a NULL hostname in the calling thread
 * pstdout_func - function to execute; must not be NULL
 * arg          - opaque pointer passed through to pstdout_func
 *
 * When the expression expands to exactly one host, pstdout_func is called
 * directly in the calling thread with the original 'hostnames' string.
 * Otherwise one detached thread is created per host, never running more
 * than pstdout_fanout at a time, and the call waits until
 * pstdout_threadcount drops to zero.
 *
 * Returns the exit code of the direct call, or for the threaded case the
 * largest exit code recorded by any thread (never below 0).  Returns -1
 * with pstdout_errnum set on error.
 *
 * NOTE(review): pstdout_threadcount_mutex is locked before the final wait
 * loop and no matching unlock is visible in this function; several error
 * paths also jump to cleanup while it is held.  Confirm whether it is
 * released elsewhere.
 */
int
pstdout_launch(const char *hostnames, Pstdout_Thread pstdout_func, void *arg)
{
  struct pstdout_thread_data **tdata = NULL;
  struct pstdout_state pstate;
  unsigned int pstate_init = 0;
  hostlist_iterator_t hitr = NULL;
  hostlist_t h = NULL;
  int h_count = 0;
  char *host = NULL;
  int exit_code = -1;
  sighandler_t sighandler_save = NULL;
  int sighandler_set = 0;
  int rc;
  int i;

  if (!pstdout_initialized)
    {
      pstdout_errnum = PSTDOUT_ERR_UNINITIALIZED;
      return -1;
    }

  if (!pstdout_func)
    {
      pstdout_errnum = PSTDOUT_ERR_PARAMETERS;
      return -1;
    }

  /* NOTE(review): on failure this jumps to cleanup, which unlocks
   * pstdout_launch_mutex even though the lock was not obtained.
   */
  if ((rc = pthread_mutex_lock(&pstdout_launch_mutex)))
    {
      if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
        fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
      pstdout_errnum = PSTDOUT_ERR_INTERNAL;
      goto cleanup;
    }

  /* Special case */
  if (!hostnames)
    {
      if (_pstdout_state_init(&pstate, NULL) < 0)
        goto cleanup;
      pstate_init++;

      exit_code = pstdout_func(&pstate, NULL, arg);
      pstdout_errnum = PSTDOUT_ERR_SUCCESS;
      goto cleanup;
    }

  if (!(h = hostlist_create(hostnames)))
    {
      pstdout_errnum = PSTDOUT_ERR_OUTMEM;
      goto cleanup;
    }
  h_count = hostlist_count(h);

  /* Sanity check */
  if (h_count <= 0)
    {
      if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
        fprintf(stderr, "h_count = %d\n", h_count);
      pstdout_errnum = PSTDOUT_ERR_INTERNAL;
      goto cleanup;
    }

  /* Special case */
  if (h_count == 1)
    {
      if (_pstdout_state_init(&pstate, hostnames) < 0)
        goto cleanup;
      pstate_init++;

      exit_code = pstdout_func(&pstate, hostnames, arg);
      pstdout_errnum = PSTDOUT_ERR_SUCCESS;
      goto cleanup;
    }

  /* Install the SIGINT handler for the duration of the threaded run; the
   * previous handler is restored at the end of cleanup.
   */
  if ((sighandler_save = signal(SIGINT, _pstdout_sigint)) == SIG_ERR)
    {
      if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
        fprintf(stderr, "signal\n");
      pstdout_errnum = PSTDOUT_ERR_INTERNAL;
      goto cleanup;
    }
  sighandler_set++;

  if (!(hitr = hostlist_iterator_create(h)))
    {
      pstdout_errnum = PSTDOUT_ERR_OUTMEM;
      goto cleanup;
    }

  /* One pointer slot per host, zeroed so cleanup can tell which entries
   * were actually allocated.
   */
  if (!(tdata = (struct pstdout_thread_data **)malloc(sizeof(struct pstdout_thread_data *) * h_count)))
    {
      pstdout_errnum = PSTDOUT_ERR_OUTMEM;
      goto cleanup;
    }
  memset(tdata, '\0', sizeof(struct pstdout_thread_data *) * h_count);

  /* Prepare per-thread data.  On any failure 'host' is still set and is
   * freed in cleanup.
   * NOTE(review): cleanup calls pthread_attr_destroy() on every allocated
   * entry, including one whose pthread_attr_init() was never reached.
   */
  i = 0;
  while ((host = hostlist_next(hitr)))
    {
      if (!(tdata[i] = (struct pstdout_thread_data *)malloc(sizeof(struct pstdout_thread_data))))
        {
          pstdout_errnum = PSTDOUT_ERR_OUTMEM;
          goto cleanup;
        }
      memset(tdata[i], '\0', sizeof(struct pstdout_thread_data));

      if (!(tdata[i]->hostname = strdup(host)))
        {
          pstdout_errnum = PSTDOUT_ERR_OUTMEM;
          goto cleanup;
        }
      tdata[i]->pstdout_func = pstdout_func;
      tdata[i]->arg = arg;

      if ((rc = pthread_attr_init(&(tdata[i]->attr))))
        {
          if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_attr_init: %s\n", strerror(rc));
          pstdout_errnum = PSTDOUT_ERR_INTERNAL;
          goto cleanup;
        }

      /* Threads are detached; completion is tracked through
       * pstdout_threadcount rather than pthread_join().
       */
      if ((rc = pthread_attr_setdetachstate(&(tdata[i]->attr), PTHREAD_CREATE_DETACHED)))
        {
          if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_attr_setdetachstate: %s\n", strerror(rc));
          pstdout_errnum = PSTDOUT_ERR_INTERNAL;
          goto cleanup;
        }

      free(host);
      i++;
    }
  host = NULL;

  /* The hostlist is no longer needed; NULL the handles so cleanup does not
   * destroy them a second time.
   */
  hostlist_iterator_destroy(hitr);
  hitr = NULL;

  hostlist_destroy(h);
  h = NULL;

  /* Launch threads up to fanout */
  for (i = 0; i < h_count; i++)
    {
      if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex)))
        {
          if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
          pstdout_errnum = PSTDOUT_ERR_INTERNAL;
          goto cleanup;
        }

      /* NOTE(review): the wait is guarded by 'if', not 'while', so the
       * count is not re-checked after a wakeup - confirm this is intended.
       */
      if (pstdout_threadcount == pstdout_fanout)
        {
          if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex)))
            {
              if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc));
              pstdout_errnum = PSTDOUT_ERR_INTERNAL;
              goto cleanup;
            }
        }

      if ((rc = pthread_create(&(tdata[i]->tid),
                               &(tdata[i]->attr),
                               _pstdout_func_entry,
                               (void *) tdata[i])))
        {
          if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_create: %s\n", strerror(rc));
          pstdout_errnum = PSTDOUT_ERR_INTERNAL;
          goto cleanup;
        }

      pstdout_threadcount++;

      if ((rc = pthread_mutex_unlock(&pstdout_threadcount_mutex)))
        {
          if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
          pstdout_errnum = PSTDOUT_ERR_INTERNAL;
          goto cleanup;
        }
    }

  /* Wait for Threads to finish */

  if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex)))
    {
      if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
        fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
      pstdout_errnum = PSTDOUT_ERR_INTERNAL;
      goto cleanup;
    }

  while (pstdout_threadcount > 0)
    {
      if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex)))
        {
          if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc));
          pstdout_errnum = PSTDOUT_ERR_INTERNAL;
          goto cleanup;
        }
    }

  if (_pstdout_output_consolidated_finish() < 0)
    goto cleanup;

  /* Determine exit code */
  exit_code = 0;
  for (i = 0; i < h_count; i++)
    {
      if (tdata[i]->exit_code > exit_code)
        exit_code = tdata[i]->exit_code;
    }

 cleanup:
  /* Cannot pass NULL for key, so just pass dummy key */
  list_delete_all(pstdout_consolidated_stdout, _pstdout_consolidated_data_delete_all, "");
  list_delete_all(pstdout_consolidated_stderr, _pstdout_consolidated_data_delete_all, "");
  if (pstate_init)
    _pstdout_state_cleanup(&pstate);
  if (tdata)
    {
      /* Entries past the point of failure are still NULL from the memset. */
      for (i = 0; i < h_count; i++)
        {
          if (tdata[i])
            {
              free(tdata[i]->hostname);
              pthread_attr_destroy(&(tdata[i]->attr));
              free(tdata[i]);
            }
        }
      free(tdata);
    }
  if (hitr)
    hostlist_iterator_destroy(hitr);
  if (h)
    hostlist_destroy(h);
  /* Non-NULL only when the setup loop bailed out part-way through. */
  free(host);
  if ((rc = pthread_mutex_unlock(&pstdout_launch_mutex)))
    {
      if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
        fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
      /* Don't change error code, just move on */
    }
  if (sighandler_set)
    signal(SIGINT, sighandler_save);
  return exit_code;
}
int main(int argc, char *argv[]) { extern int optind; extern char *optarg; int c; PlugList pl = NULL; char *hwplugs = NULL; char *nodelist = NULL; char *pluglist = NULL; char *findplug = NULL; err_init(basename(argv[0])); while ((c = getopt(argc, argv, "p:f:")) != EOF) { switch (c) { case 'p': hwplugs = optarg; break; case 'f': findplug = optarg; break; default: usage(); } } if (argc - optind == 0) usage(); nodelist = argv[optind++]; if (argc - optind == 1) pluglist = argv[optind++]; if (argc - optind != 0) usage(); if (hwplugs) { hostlist_t hl = hostlist_create(hwplugs); hostlist_iterator_t itr = hostlist_iterator_create(hl); List l = list_create((ListDelF)xfree); char *plug; while ((plug = hostlist_next(itr))) list_append(l, xstrdup(plug)); hostlist_iterator_destroy(itr); hostlist_destroy(hl); pl = pluglist_create(l); list_destroy(l); } else pl = pluglist_create(NULL); switch (pluglist_map(pl, nodelist, pluglist)) { case EPL_DUPNODE: fprintf(stderr, "duplicate node\n"); break; case EPL_UNKPLUG: fprintf(stderr, "unknown plug\n"); break; case EPL_DUPPLUG: fprintf(stderr, "duplicate plug\n"); break; case EPL_NOPLUGS: fprintf(stderr, "more nodes than plugs\n"); break; case EPL_NONODES: fprintf(stderr, "more plugs than nodes\n"); break; case EPL_SUCCESS: break; } if (findplug) { Plug *plug = pluglist_find(pl, findplug); if (plug) printf("plug=%s node=%s\n", plug->name, plug->node ? plug->node : "NULL"); else printf("plug %s: not found\n", findplug); } else { PlugListIterator itr = pluglist_iterator_create(pl); Plug *plug; while ((plug = pluglist_next(itr))) { printf("plug=%s node=%s\n", plug->name, plug->node ? plug->node : "NULL"); } pluglist_iterator_destroy(itr); } exit(0); }
struct ipmipower_connection * ipmipower_connection_array_create(const char *hostname, unsigned int *len) { char *str = NULL; int index = 0; hostlist_t hl = NULL; hostlist_iterator_t itr = NULL; struct ipmipower_connection *ics; int size = sizeof(struct ipmipower_connection); int hl_count; int errcount = 0; int emfilecount = 0; assert(hostname && len); *len = 0; if (!(hl = hostlist_create(hostname))) { ipmipower_output(MSG_TYPE_HOSTNAME_INVALID, hostname); return NULL; } if (!(itr = hostlist_iterator_create(hl))) ierr_exit("hostlist_iterator_create() error"); hostlist_uniq(hl); hl_count = hostlist_count(hl); ics = (struct ipmipower_connection *)Malloc(size * hl_count); memset(ics, '\0', (size * hl_count)); while ((str = hostlist_next(itr))) { ics[index].ipmi_fd = -1; ics[index].ping_fd = -1; /* cleanup only at the end, gather all error outputs for * later */ if (_connection_setup(&ics[index], str) < 0) { if (errno == EMFILE && !emfilecount) { cbuf_printf(ttyout, "file descriptor limit reached\n"); emfilecount++; } errcount++; } free(str); index++; } hostlist_iterator_destroy(itr); hostlist_destroy(hl); if (errcount) { int i; for (i = 0; i < hl_count; i++) { close(ics[i].ipmi_fd); close(ics[i].ping_fd); if (ics[i].ipmi_in) cbuf_destroy(ics[i].ipmi_in); if (ics[i].ipmi_out) cbuf_destroy(ics[i].ipmi_out); if (ics[i].ping_in) cbuf_destroy(ics[i].ping_in); if (ics[i].ping_out) cbuf_destroy(ics[i].ping_out); } Free(ics); return NULL; } *len = hl_count; return ics; }
int main (int argc, char *argv[]) { char *server = NULL; int msize = 65536; uid_t uid = geteuid (); int topt = 0; Npcfsys *fs = NULL; Npcfid *fid, *afid, *root; int c, fd; char buf[80], *host, *p; hostlist_t hl; hostlist_iterator_t itr; int lopt = 0; diod_log_init (argv[0]); opterr = 0; while ((c = GETOPT (argc, argv, OPTIONS, longopts)) != -1) { switch (c) { case 's': /* --server HOST[:PORT] or /path/to/socket */ server = optarg; break; case 'm': /* --msize SIZE */ msize = strtoul (optarg, NULL, 10); break; case 'u': /* --uid UID */ uid = strtoul (optarg, NULL, 10); break; case 't': /* --timeout SECS */ topt = strtoul (optarg, NULL, 10); break; case 'l': /* --long */ lopt = 1; break; default: usage (); } } if (signal (SIGPIPE, SIG_IGN) == SIG_ERR) err_exit ("signal"); if (signal (SIGALRM, sigalarm) == SIG_ERR) err_exit ("signal"); if (topt > 0) alarm (topt); if ((fd = diod_sock_connect (server, 0)) < 0) exit (1); if (!(fs = npc_start (fd, fd, msize, 0))) errn_exit (np_rerror (), "error negotiating protocol with server"); if (!(afid = npc_auth (fs, "ctl", uid, diod_auth)) && np_rerror () != 0) errn_exit (np_rerror (), "error authenticating to server"); if (!(root = npc_attach (fs, afid, "ctl", uid))) errn_exit (np_rerror (), "error attaching to aname=ctl"); if (!(fid = npc_open_bypath (root, "connections", O_RDONLY))) errn_exit (np_rerror (), "open connections"); if (!(hl = hostlist_create (NULL))) err_exit ("hostlist_create"); while (npc_gets (fid, buf, sizeof(buf))) { if ((p = strchr (buf, ' '))) *p = '\0'; if (!lopt && (p = strchr (buf, '.'))) *p = '\0'; if (!hostlist_push_host (hl, buf)) err_exit ("hostlist_push_host"); } hostlist_uniq (hl); if (lopt) { if (!(itr = hostlist_iterator_create (hl))) err_exit ("hostlist_iterator_create"); while ((host = hostlist_next (itr))) printf ("%s\n", host); hostlist_iterator_destroy (itr); } else { char s[1024]; if (hostlist_ranged_string (hl, sizeof (s), s) < 0) msg_exit ("hostlist output would be too long (use -l)"); 
printf ("%s\n", s); } hostlist_destroy (hl); if (npc_clunk (fid) < 0) errn_exit (np_rerror (), "clunk connections"); if (npc_clunk (root) < 0) errn_exit (np_rerror (), "error clunking ctl"); if (npc_clunk (afid) < 0) errn_exit (np_rerror (), "error clunking afid"); npc_finish (fs); exit(0); }
/*
 * Lay out tasks for an "arbitrary" distribution: one task is placed on a
 * node for every occurrence of that node in arbitrary_nodes, and a task's
 * id is its position in arbitrary_nodes.
 *
 * IN/OUT step_layout   - node_list and task_cnt are read; tasks[] and
 *                        tids[] are filled in per node
 * IN     arbitrary_nodes - hostlist expression with one entry per task
 * RET SLURM_SUCCESS, or SLURM_ERROR when no list is given, when the list
 *     length differs from task_cnt, or when not every task could be placed
 *
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	hostlist_t job_hl = NULL;
	hostlist_t step_hl = NULL;
	hostlist_iterator_t job_itr = NULL;
	hostlist_iterator_t step_itr = NULL;
	struct node_record **step_nodes = NULL;
	struct node_record *node = NULL;
	char *name = NULL;
	int node_inx = 0;	/* next slot in tasks[] / tids[] */
	int placed = 0;		/* tasks placed so far, over all nodes */
	int step_cnt = 0;
	int k, slot, tid;

	debug2("job list is %s", step_layout->node_list);
	if (arbitrary_nodes == NULL) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_hl = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_hl) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist. "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_hl),
		      hostlist_count(step_hl));
		hostlist_destroy(step_hl);
		return SLURM_ERROR;
	}

	job_hl = hostlist_create(step_layout->node_list);
	job_itr = hostlist_iterator_create(job_hl);
	step_itr = hostlist_iterator_create(step_hl);

	/*
	 * Resolve every step host to its node record once, so the matching
	 * below compares pointers instead of comparing names repeatedly.
	 */
	step_cnt = hostlist_count(step_hl);
	step_nodes = xmalloc(sizeof(struct node_record *) * step_cnt);
	for (k = 0; (name = hostlist_next(step_itr)) != NULL; k++) {
		step_nodes[k] = find_node_record_no_alias(name);
		free(name);
	}

	for (;;) {
		name = hostlist_next(job_itr);
		if (name == NULL)
			break;

		node = find_node_record(name);

		/* Pass 1: count how many step entries name this node. */
		step_layout->tasks[node_inx] = 0;
		for (k = 0; k < step_cnt; k++) {
			if (step_nodes[k] == node) {
				step_layout->tasks[node_inx]++;
				placed++;
			}
			if (placed >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", name, step_layout->tasks[node_inx]);

		/*
		 * Pass 2: record the task ids.  A node that received no
		 * task does not consume a slot, so node_inx only advances
		 * for nodes that got at least one.
		 */
		if (step_layout->tasks[node_inx] != 0) {
			step_layout->tids[node_inx] =
				xmalloc(sizeof(uint32_t) *
					step_layout->tasks[node_inx]);
			slot = 0;
			for (tid = 0; tid < step_cnt; tid++) {
				if (step_nodes[tid] == node) {
					step_layout->tids[node_inx][slot] = tid;
					slot++;
				}
				if (slot >= step_layout->tasks[node_inx])
					break;
			}
			node_inx++;
		}

		free(name);
		if (node_inx > step_layout->task_cnt)
			break;
	}

	hostlist_iterator_destroy(job_itr);
	hostlist_iterator_destroy(step_itr);
	hostlist_destroy(job_hl);
	hostlist_destroy(step_hl);
	xfree(step_nodes);

	if (placed == step_layout->task_cnt)
		return SLURM_SUCCESS;

	error("Asked for %u tasks but placed %d. Check your nodelist",
	      step_layout->task_cnt, placed);
	return SLURM_ERROR;
}
/*
 * Lay out tasks for an "arbitrary" distribution: each node of the job
 * gets one task for every time its name appears in arbitrary_nodes, and a
 * task's id is its position in arbitrary_nodes.
 *
 * IN/OUT step_layout   - node_list and task_cnt are read; tasks[] and
 *                        tids[] are filled in per node
 * IN     arbitrary_nodes - hostlist expression with one entry per task
 * RET SLURM_SUCCESS, or SLURM_ERROR when no list is given, when the list
 *     length differs from task_cnt, or when not every task could be placed
 *
 * Nothing is allocated before the argument checks, and the step hostlist
 * is destroyed on the count-mismatch path, so the early returns do not
 * leak hostlists or iterators.
 *
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i=0, j, taskid = 0, task_cnt=0;
	hostlist_iterator_t itr = NULL, itr_task = NULL;
	char *host = NULL;
	char *host_task = NULL;
	hostlist_t job_alloc_hosts = NULL;
	hostlist_t step_alloc_hosts = NULL;

	debug2("job list is %s", step_layout->node_list);
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist. "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		hostlist_destroy(step_alloc_hosts);
		return SLURM_ERROR;
	}

	/* Created only after validation so the early returns own nothing. */
	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr = hostlist_iterator_create(job_alloc_hosts);
	itr_task = hostlist_iterator_create(step_alloc_hosts);

	while((host = hostlist_next(itr))) {
		/* Pass 1: count the step entries that name this host. */
		step_layout->tasks[i] = 0;
		while((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			free(host_task);
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		/* A host with no task does not consume a tasks[]/tids[] slot. */
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;

		/* Pass 2: rescan from the start to record the task ids. */
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		taskid = 0;
		j = 0;
		hostlist_iterator_reset(itr_task);
		while((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			free(host_task);
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		/* Rewind the step list for the next job host. */
		hostlist_iterator_reset(itr_task);
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}
	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);

	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
/*
 * _forward_thread - thread entry point that forwards one message.
 *
 * IN arg - a forward_msg_t *; it is xfree()d before returning
 * RET always NULL
 *
 * Hosts are shifted off the hostlist built from header.forward.nodelist
 * one at a time.  The message is sent to the first host that can be
 * resolved, connected to and written to; the hosts still left in the list
 * are passed along in the header's forward.nodelist / forward.cnt.  Hosts
 * that fail are recorded in fwd_struct->ret_list via
 * mark_as_failed_forward() and the next host is tried.
 *
 * Locking: every path that reaches the "cleanup" label arrives with
 * fwd_struct->forward_mutex held; cleanup signals fwd_struct->notify and
 * then releases the mutex.  Paths that "continue" release the mutex first.
 */
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	/* timeout the caller asked for, before it is scaled below */
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		/* Step 1: resolve the host's address. */
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			/* no hosts left: leave with the mutex still held */
			goto cleanup;
		}
		/* Step 2: open a connection to it. */
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* Step 3: the hosts still in hl become the forward list that
		 * travels in the header to the host being contacted.
		 */
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				/* start over with a fresh buffer: the old one
				 * already holds a packed header and payload
				 */
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			/* drain the rest of the list, one entry per host */
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		/* Scale the receive timeout when further hosts are being
		 * forwarded to:
		 *   steps   = (cnt + 1) / tree_width
		 *   timeout = message_timeout * steps
		 *             + start_timeout * (steps + 1)
		 */
		if (fwd_msg->header.forward.cnt > 0) {
			/* static: slurm_get_msg_timeout() is only queried
			 * while the cached value is still negative
			 */
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			/* a zero tree_width is replaced before dividing */
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
				fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/* steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now + %d*%d = %d", start_timeout, */
			/* steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/* fwd_msg->header.forward.cnt, list_count(ret_list)); */

		/* No reply at all, or at most one reply although other hosts
		 * were forwarded to: count this host as failed and retry with
		 * the next one.
		 */
		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			   != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			/* NOTE(review): forward_mutex is not held in this
			 * branch while mark_as_failed_forward() updates
			 * fwd_struct->ret_list - confirm this is safe.
			 */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here. We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			/* mark every forwarded host that has no reply */
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					/* a reply without a node name is
					 * attributed to the host we contacted
					 */
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						     ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		/* the message was delivered; stop trying further hosts */
		break;
	}
	/* Reached after the break above or when the hostlist ran out; the
	 * mutex is taken here so cleanup can release it unconditionally.
	 */
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		/* move every reply onto the shared return list */
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	/* forward_mutex is held on entry to this label (see header) */
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
static int eliminate_nodes (char **hosts) { hostlist_t hl = NULL; hostlist_t hlnew = NULL; hostlist_iterator_t hitr = NULL; ipmidetect_t id = NULL; char *host = NULL; char hostbuf[HOSTLIST_BUFLEN + 1]; int rv = -1; assert (hosts); assert (*hosts); if (!(id = ipmidetect_handle_create ())) { fprintf (stderr, "ipmidetect_handle_create\n"); goto cleanup; } if (ipmidetect_load_data (id, NULL, 0, 0) < 0) { if (ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT || ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT_TIMEOUT) fprintf (stderr, "Error connecting to ipmidetect daemon\n"); else fprintf (stderr, "ipmidetect_load_data: %s\n", ipmidetect_errormsg (id)); goto cleanup; } if (!(hl = hostlist_create (*hosts))) { fprintf (stderr, "hostlist_create: %s\n", strerror (errno)); goto cleanup; } if (!(hlnew = hostlist_create (*hosts))) { fprintf (stderr, "hostlist_create: %s\n", strerror (errno)); goto cleanup; } if (!(hitr = hostlist_iterator_create (hl))) { fprintf (stderr, "hostlist_iterator_create: %s\n", strerror (errno)); goto cleanup; } while ((host = hostlist_next (hitr))) { int ret; if ((ret = ipmidetect_is_node_detected (id, host)) < 0) { if (ipmidetect_errnum (id) == IPMIDETECT_ERR_NOTFOUND) fprintf (stderr, "Node '%s' unrecognized by ipmidetect\n", host); else fprintf (stderr, "ipmidetect_is_node_detected: %s\n", ipmidetect_errormsg (id)); goto cleanup; } if (!ret) hostlist_delete (hlnew, host); free (host); } host = NULL; if (!hostlist_count (hlnew)) { rv = 0; goto cleanup; } memset (hostbuf, '\0', HOSTLIST_BUFLEN + 1); if (hostlist_ranged_string (hlnew, HOSTLIST_BUFLEN, hostbuf) < 0) { fprintf (stderr, "hostlist_ranged_string: truncation\n"); goto cleanup; } free (*hosts); if (!(*hosts = strdup (hostbuf))) { fprintf (stderr, "strdup: %s\n", strerror (errno)); goto cleanup; } rv = hostlist_count (hlnew); cleanup: if (id) ipmidetect_handle_destroy (id); if (hitr) hostlist_iterator_destroy (hitr); if (hl) hostlist_destroy (hl); if (hlnew) hostlist_destroy (hlnew); free 
(host); return (rv); }