Beispiel #1
0
/*
 * _find_child_switches - record the immediate descendants of switch sw.
 *
 * Expands switch_record_table[sw].switches into individual switch names,
 * allocates switch_record_table[sw].switch_index with one uint16_t slot per
 * name, and stores the table index of every name that matches a record in
 * switch_record_table. Each matched child has its parent field set to sw.
 * Names without a matching record are skipped and consume no slot.
 */
static void _find_child_switches (int sw)
{
	hostlist_t children;
	hostlist_iterator_t child_itr;
	char *child_name;
	int next_slot = 0;	/* next free entry in the child index array */
	int idx;

	children = hostlist_create(switch_record_table[sw].switches);
	switch_record_table[sw].num_switches = hostlist_count(children);
	switch_record_table[sw].switch_index =
			xmalloc(switch_record_table[sw].num_switches
				* sizeof(uint16_t));

	child_itr = hostlist_iterator_create(children);
	while ((child_name = hostlist_next(child_itr)) != NULL) {
		/* Linear scan of the switch table; the first record whose
		 * name equals this child's name wins. */
		idx = 0;
		while (idx < switch_record_cnt) {
			if (!strcmp(child_name,
				    switch_record_table[idx].name)) {
				switch_record_table[sw].switch_index[next_slot]
					= idx;
				switch_record_table[idx].parent = sw;
				next_slot++;
				break;
			}
			idx++;
		}
		free(child_name);
	}
	hostlist_iterator_destroy(child_itr);
	hostlist_destroy(children);
}
Beispiel #2
0
/*
 * Send a signal RPC to a specific node.
 *
 * IN job_id    - job the signalled tasks belong to
 * IN step_id   - job step the signalled tasks belong to
 * IN signal    - signal number to deliver
 * IN node_name - name of the single target node
 * IN node_addr - not referenced by this function
 *
 * Builds a kill_tasks_msg_t and an agent_arg_t addressed to node_name and
 * hands them to agent_queue_request(). When the node has a node record, its
 * protocol_version is used for the request; otherwise the field keeps the
 * value it had when agent_args was allocated.
 */
static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal,
		char *node_name, slurm_addr_t node_addr)
{
	agent_arg_t *agent_args;
	kill_tasks_msg_t *kill_tasks_msg;
	struct node_record *node_ptr;

	kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t));
	kill_tasks_msg->job_id		= job_id;
	kill_tasks_msg->job_step_id	= step_id;
	kill_tasks_msg->signal		= signal;

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type		= REQUEST_SIGNAL_TASKS;
	agent_args->retry		= 1;
	agent_args->msg_args		= kill_tasks_msg;
	agent_args->hostlist = hostlist_create(node_name);
	agent_args->node_count		= 1;

	/* Talk to the node using the protocol version it registered with */
	if ((node_ptr = find_node_record(node_name)))
		agent_args->protocol_version = node_ptr->protocol_version;

	/* No hostlist iterator is created here, so there is none to destroy */
	agent_queue_request(agent_args);
}
Beispiel #3
0
/*
 *  Drop from hostlist wcoll every host for which
 *    nodeupdown_is_node_down() returns a positive value.
 *    Supposedly, it doesn't matter whether the canonical name or the
 *    altname is passed in.
 */
static void
remove_all_down_nodes(hostlist_t wcoll)
{
    nodeupdown_t        handle;
    hostlist_iterator_t itr;
    char               *name;
    int                 load_rc;

    handle = nodeupdown_handle_create();
    if (handle == NULL)
        errx("%p: Unable to create nodeupdown handle.\n");

    /* The argument list of nodeupdown_load_data() is selected at build
     * time by HAVE_NODEUPDOWN_LOAD_DATA_6. */
#if HAVE_NODEUPDOWN_LOAD_DATA_6
    load_rc = nodeupdown_load_data(handle, NULL, NULL, NULL, 0, 0);
#else
    load_rc = nodeupdown_load_data(handle, NULL, 0, 0, NULL);
#endif
    if (load_rc < 0)
        errx("%p: nodeupdown: %s\n", nodeupdown_errormsg(handle));

    itr = hostlist_iterator_create(wcoll);
    for (name = hostlist_next(itr); name != NULL; name = hostlist_next(itr)) {
        /* hostlist_remove() acts on the entry the iterator just returned */
        if (nodeupdown_is_node_down(handle, name) > 0)
            hostlist_remove(itr);
        free(name);
    }
    hostlist_iterator_destroy(itr);

    if (nodeupdown_handle_destroy(handle) < 0)
        err("%p: nodeupdown_handle_destroy: %s\n", nodeupdown_errormsg(handle));
}
Beispiel #4
0
/*
 * hostlist2bitmap - given a hostlist, build a bitmap representation
 * IN hl          - hostlist
 * IN best_effort - if set don't return an error on invalid node name entries
 * OUT bitmap     - any previous bitmap is released and replaced by a new
 *                  one of node_record_count bits; bits of unknown nodes are
 *                  simply left clear, so it may be incomplete on error
 * RET 0 if no error, otherwise EINVAL
 */
extern int hostlist2bitmap (hostlist_t hl, bool best_effort, bitstr_t **bitmap)
{
	hostlist_iterator_t host_itr;
	bitstr_t *node_bits;
	char *host;
	int result = SLURM_SUCCESS;

	FREE_NULL_BITMAP(*bitmap);
	node_bits = (bitstr_t *) bit_alloc (node_record_count);
	*bitmap = node_bits;

	host_itr = hostlist_iterator_create(hl);
	for (host = hostlist_next(host_itr); host != NULL;
	     host = hostlist_next(host_itr)) {
		struct node_record *node_ptr =
			_find_node_record(host, best_effort, true);

		if (!node_ptr) {
			error ("hostlist2bitmap: invalid node specified %s",
			       host);
			if (!best_effort)
				result = EINVAL;
		} else {
			/* Bit position is the record's offset in the
			 * node record table */
			bit_set (node_bits, (bitoff_t) (node_ptr -
							node_record_table_ptr));
		}
		free (host);
	}
	hostlist_iterator_destroy(host_itr);

	return result;
}
Beispiel #5
0
/*
 * Print "<node>: <status>" for every node named in the hostlist string
 * 'nodes', looking each status up in the hash 'hstatus'. Nodes missing from
 * the hash are reported as "invalid hostname". Exits the process if the
 * hostlist or its iterator cannot be created.
 */
static void
_stat(hash_t hstatus, const char *nodes)
{
    hostlist_t hl;
    hostlist_iterator_t itr;
    char *name;
    char *status;

    assert(hstatus);

    hl = hostlist_create(nodes);
    if (hl == NULL) {
        perror("hostlist_create");
        exit(1);
    }

    itr = hostlist_iterator_create(hl);
    if (itr == NULL) {
        perror("hostlist_iterator_create");
        exit(1);
    }

    for (name = hostlist_next(itr); name != NULL; name = hostlist_next(itr)) {
        status = hash_find(hstatus, name);
        printf("%s: %s\n", name, status ? status : "invalid hostname");
        free(name);
    }

    hostlist_iterator_destroy(itr);
    hostlist_destroy(hl);
}
Beispiel #6
0
/* Send a signal RPC to a list of nodes */
static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal,
		      char *nodelist)
{
	agent_arg_t *agent_args;
	signal_tasks_msg_t *signal_tasks_msg;
	hostlist_iterator_t hi;
	char *host;
	struct node_record *node_ptr;

	signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t));
	signal_tasks_msg->job_id		= job_id;
	signal_tasks_msg->job_step_id	= step_id;
	signal_tasks_msg->signal		= signal;

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type		= REQUEST_SIGNAL_TASKS;
	agent_args->retry		= 1;
	agent_args->msg_args		= signal_tasks_msg;
	agent_args->hostlist		= hostlist_create(nodelist);
	agent_args->node_count		= hostlist_count(agent_args->hostlist);
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	hi = hostlist_iterator_create(agent_args->hostlist);
	while ((host = hostlist_next(hi))) {
		if ((node_ptr = find_node_record(host)) &&
		    (agent_args->protocol_version > node_ptr->protocol_version))
			agent_args->protocol_version =
				node_ptr->protocol_version;
		free(host);
	}
	hostlist_iterator_destroy(hi);

	agent_queue_request(agent_args);
}
/*
 * Checked wrapper around hostlist_iterator_destroy().
 *
 * Asserts that the caller-location arguments supplied through WRAPPERS_ARGS
 * (file and function) are set, reports an invalid-parameter error through
 * WRAPPERS_ERR_INVALID_PARAMETERS when the iterator is NULL, and then
 * destroys the iterator.
 */
void
wrap_hostlist_iterator_destroy(WRAPPERS_ARGS, hostlist_iterator_t i)
{
  assert(file && function);

  /* Name the wrapped function correctly so the diagnostic is searchable */
  if (!i)
    WRAPPERS_ERR_INVALID_PARAMETERS("hostlist_iterator_destroy");

  hostlist_iterator_destroy(i);
  return;
}
Beispiel #8
0
/*
 * Initialize an alpsc_ev_app_t
 *
 * IN/OUT event - event to fill in. app_name, batch_id and nodes are newly
 *                allocated here (xstrdup/xmalloc).
 * IN job_ptr   - job supplying the job id and user id
 * IN step_ptr  - step supplying the step id, name and node layout
 * IN state     - state stored in the event
 *
 * event->nodes / event->num_nodes are filled from the step layout's node
 * list: every host name of the form "nid<number>" contributes its number.
 * Names that do not parse are logged and skipped. If the step has no layout,
 * or the hostlist or its iterator cannot be created, nodes stays NULL and
 * num_nodes stays 0.
 */
static void _initialize_event(alpsc_ev_app_t *event,
			      struct job_record *job_ptr,
			      struct step_record *step_ptr,
			      alpsc_ev_app_state_e state)
{
	hostlist_t hl;
	hostlist_iterator_t hlit;
	char *node;
	int rv;

	event->apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
	event->uid = job_ptr->user_id;
	event->app_name = xstrdup(step_ptr->name);
	event->batch_id = xmalloc(20);	// More than enough to hold max uint32
	snprintf(event->batch_id, 20, "%"PRIu32, job_ptr->job_id);
	event->state = state;
	event->nodes = NULL;
	event->num_nodes = 0;

	// Fill in nodes and num_nodes
	if (step_ptr->step_layout) {
		hl = hostlist_create(step_ptr->step_layout->node_list);
		if (hl == NULL) {
			return;
		}
		hlit = hostlist_iterator_create(hl);
		if (hlit == NULL) {
			hostlist_destroy(hl);
			return;
		}

		event->nodes = xmalloc(step_ptr->step_layout->node_cnt
				       * sizeof(int32_t));

		while ((node = hostlist_next(hlit)) != NULL) {
			rv = sscanf(node, "nid%"SCNd32,
				    &event->nodes[event->num_nodes]);
			/* sscanf() returns EOF (non-zero) when the input runs
			 * out before the conversion, so only a return of
			 * exactly 1 means a node id was stored. */
			if (rv == 1) {
				event->num_nodes++;
			} else {
				debug("%s: couldn't parse node %s, skipping",
				      __func__, node);
			}
			free(node);
		}

		hostlist_iterator_destroy(hlit);
		hostlist_destroy(hl);
	} else {
		// TODO: do we have to worry about batch scripts?
	}
	return;
}
Beispiel #9
0
/*
 * Call hostlist_delete_host() on 'hl' for every host contained in 'dl'.
 * Returns the sum of the values returned by hostlist_delete_host().
 */
static int
_delete_all (hostlist_t hl, hostlist_t dl)
{
    hostlist_iterator_t itr = hostlist_iterator_create (dl);
    char *name;
    int deleted = 0;

    for (name = hostlist_next (itr); name != NULL; name = hostlist_next (itr)) {
        deleted += hostlist_delete_host (hl, name);
        free (name);
    }
    hostlist_iterator_destroy (itr);

    return deleted;
}
Beispiel #10
0
/*
 * switch_p_build_jobinfo - populate a step's switch info from its node list.
 * IN/OUT switch_job - treated as a sw_gen_step_info_t; its node_cnt and
 *                     node_array are filled in
 * IN step_layout    - supplies node_list, the hosts to record
 * IN network        - not referenced by this function
 * RET SLURM_SUCCESS (hostlist creation failure is fatal)
 *
 * One sw_gen_node_t is allocated per host. When _find_node() knows the host,
 * its interface records (address, family, name) are deep-copied into the
 * step's node entry.
 */
int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
			   slurm_step_layout_t *step_layout, char *network)
{
	sw_gen_step_info_t *gen_step_info = (sw_gen_step_info_t *) switch_job;
	sw_gen_node_info_t *gen_node_info;
	sw_gen_node_t *node_ptr;
	hostlist_t hl = NULL;
	hostlist_iterator_t hi;
	char *host = NULL;
	int i, j;

	if (debug_flags & DEBUG_FLAG_SWITCH)
		info("switch_p_build_jobinfo() starting");
	xassert(gen_step_info);
	xassert(gen_step_info->magic == SW_GEN_STEP_INFO_MAGIC);
	hl = hostlist_create(step_layout->node_list);
	if (!hl)
		fatal("hostlist_create(%s): %m", step_layout->node_list);
	gen_step_info->node_cnt = hostlist_count(hl);
	/* Allocation sizes are derived from the pointed-to objects so they
	 * always match the element type actually stored. */
	gen_step_info->node_array =
		xmalloc(sizeof(*gen_step_info->node_array) *
			gen_step_info->node_cnt);
	hi = hostlist_iterator_create(hl);
	for (i = 0; (host = hostlist_next(hi)); i++) {
		node_ptr = xmalloc(sizeof(*node_ptr));
		gen_step_info->node_array[i] = node_ptr;
		node_ptr->node_name = xstrdup(host);
		gen_node_info = _find_node(host);
		if (gen_node_info) {	/* Copy node info to this step */
			node_ptr->ifa_cnt = gen_node_info->ifa_cnt;
			node_ptr->ifa_array =
				xmalloc(sizeof(*node_ptr->ifa_array) *
					node_ptr->ifa_cnt);
			for (j = 0; j < node_ptr->ifa_cnt; j++) {
				node_ptr->ifa_array[j] =
					xmalloc(sizeof(*node_ptr->ifa_array[j]));
				node_ptr->ifa_array[j]->ifa_addr = xstrdup(
					gen_node_info->ifa_array[j]->ifa_addr);
				node_ptr->ifa_array[j]->ifa_family = xstrdup(
					gen_node_info->ifa_array[j]->ifa_family);
				node_ptr->ifa_array[j]->ifa_name = xstrdup(
					gen_node_info->ifa_array[j]->ifa_name);
			}
		}
		free(host);
	}
	hostlist_iterator_destroy(hi);
	hostlist_destroy(hl);

	return SLURM_SUCCESS;
}
Beispiel #11
0
/*
 * Set the status of every node named in the hostlist string 'nodes' to
 * 'state' in the hash 'hstatus'. For each node already present in the hash
 * the line "<node>: OK_STATUS" is printed and its entry is replaced; nodes
 * not present are reported as "invalid hostname". Exits the process on
 * hostlist, iterator or hash insertion failure.
 */
static void
_onoff(hash_t hstatus, const char *nodes, const char *state)
{
    hostlist_t hl;
    hostlist_iterator_t itr;
    char *name;

    assert(hstatus);

    hl = hostlist_create(nodes);
    if (hl == NULL) {
        perror("hostlist_create");
        exit(1);
    }

    itr = hostlist_iterator_create(hl);
    if (itr == NULL) {
        perror("hostlist_iterator_create");
        exit(1);
    }

    while ((name = hostlist_next(itr)) != NULL) {
        if (!hash_find(hstatus, name)) {
            printf("%s: %s\n", name, "invalid hostname");
            free(name);
            continue;
        }

        printf("%s: %s\n", name, OK_STATUS);
        hash_remove(hstatus, name);
        if (!hash_insert(hstatus, (void *)name, (void *)state)) {
            perror("hash_insert");
            exit(1);
        }
        /* XXX: 'name' is deliberately not freed on this path: it is now
         * the hash key and must stay allocated.  It's a mem-leak.  Fix
         * later.
         */
    }

    hostlist_iterator_destroy(itr);
    hostlist_destroy(hl);
}
/*
 * setup_cluster_list_with_inx - build the list of cluster node-set records
 *	that contain at least one of the hosts in job_cond->used_nodes.
 * IN mysql_conn - connection used for the two queries issued here
 * IN/OUT job_cond - must have used_nodes set and a cluster_list with exactly
 *	one entry. usage_end is set to the current time when usage_start is
 *	set and usage_end is zero.
 * IN/OUT curr_cluster - set to an appended record whose time_end was 0, or
 *	to an appended record that ends later than the one it currently
 *	references (or when it references none)
 * RET list of local_cluster_t (created with _destroy_local_cluster as the
 *	delete function), or NULL on any error or when no record matches
 */
extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn,
					slurmdb_job_cond_t *job_cond,
					void **curr_cluster)
{
	List local_cluster_list = NULL;
	time_t now = time(NULL);
	MYSQL_RES *result = NULL;
	MYSQL_ROW row;
	hostlist_t temp_hl = NULL;
	hostlist_iterator_t h_itr = NULL;
	char *query = NULL;
	int dims = 0;

	/* Nothing to do unless specific nodes were asked for */
	if (!job_cond || !job_cond->used_nodes)
		return NULL;

	if (!job_cond->cluster_list
	    || list_count(job_cond->cluster_list) != 1) {
		error("If you are doing a query against nodes "
		      "you must only have 1 cluster "
		      "you are asking for.");
		return NULL;
	}

	/* get the dimensions of this cluster so we know how to deal
	   with the hostlists */
	query = xstrdup_printf("select dimensions, flags from %s where "
			       "name='%s'",
			       cluster_table,
			       (char *)list_peek(job_cond->cluster_list));

	debug4("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);
	if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
		xfree(query);
		return NULL;
	}
	xfree(query);

	/* Only the first returned row is used: row[0] = dimensions,
	   row[1] = flags */
	if (!(row = mysql_fetch_row(result))) {
		error("Couldn't get the dimensions of cluster '%s'.",
		      (char *)list_peek(job_cond->cluster_list));
		mysql_free_result(result);
		return NULL;
	}

	/* On a Cray System when dealing with hostlists as we are here
	   this always needs to be 1.
	*/
	if (slurm_atoul(row[1]) & CLUSTER_FLAG_CRAY_A)
		dims = 1;
	else
		dims = atoi(row[0]);

	mysql_free_result(result);

	/* Expand the requested nodes once; the iterator over them is
	   re-used (and reset) for every event row below */
	temp_hl = hostlist_create_dims(job_cond->used_nodes, dims);
	if (hostlist_count(temp_hl) <= 0) {
		error("we didn't get any real hosts to look for.");
		goto no_hosts;
	}
	h_itr = hostlist_iterator_create(temp_hl);

	/* Select the cluster-wide event rows: empty node_name and a
	   non-empty cluster_nodes */
	query = xstrdup_printf("select cluster_nodes, time_start, "
			       "time_end from \"%s_%s\" where node_name='' "
			       "&& cluster_nodes !=''",
			       (char *)list_peek(job_cond->cluster_list),
			       event_table);

	/* Restrict to rows overlapping [usage_start, usage_end); a zero
	   usage_end defaults to the current time */
	if (job_cond->usage_start) {
		if (!job_cond->usage_end)
			job_cond->usage_end = now;

		xstrfmtcat(query,
			   " && ((time_start < %ld) "
			   "&& (time_end >= %ld || time_end = 0))",
			   job_cond->usage_end, job_cond->usage_start);
	}

	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
		xfree(query);
		goto no_hosts;
	}
	xfree(query);

	local_cluster_list = list_create(_destroy_local_cluster);
	while ((row = mysql_fetch_row(result))) {
		char *host = NULL;
		int loc = 0;
		local_cluster_t *local_cluster =
			xmalloc(sizeof(local_cluster_t));
		local_cluster->hl = hostlist_create_dims(row[0], dims);
		local_cluster->start = slurm_atoul(row[1]);
		local_cluster->end   = slurm_atoul(row[2]);
		/* One bit per host of this record's node set; a set bit
		   marks a host that was asked for */
		local_cluster->asked_bitmap =
			bit_alloc(hostlist_count(local_cluster->hl));
		while ((host = hostlist_next_dims(h_itr, dims))) {
			if ((loc = hostlist_find(
				     local_cluster->hl, host)) != -1)
				bit_set(local_cluster->asked_bitmap, loc);
			free(host);
		}
		/* Rewind so the next row sees all requested hosts again */
		hostlist_iterator_reset(h_itr);
		/* Keep the record only if at least one requested host is
		   part of it */
		if (bit_ffs(local_cluster->asked_bitmap) != -1) {
			list_append(local_cluster_list, local_cluster);
			if (local_cluster->end == 0) {
				/* time_end of 0 is replaced by "now" */
				local_cluster->end = now;
				(*curr_cluster) = local_cluster;
			} else if (!(*curr_cluster)
				   || (((local_cluster_t *)(*curr_cluster))->end
				       < local_cluster->end)) {
				(*curr_cluster) = local_cluster;
			}
		} else
			_destroy_local_cluster(local_cluster);
	}
	mysql_free_result(result);

	/* An empty result list is reported to the caller as NULL */
	if (!list_count(local_cluster_list)) {
		FREE_NULL_LIST(local_cluster_list);
		local_cluster_list = NULL;
		goto no_hosts;
	}

	/* Common exit: h_itr may still be NULL when reached from the
	   empty-hostlist check above */
no_hosts:

	hostlist_iterator_destroy(h_itr);
	hostlist_destroy(temp_hl);

	return local_cluster_list;
}
Beispiel #13
0
/*
 * Interactive command loop.
 *
 * Builds a status hash with one OFF_STATUS entry per host in the global
 * 'hostname' hostlist string, then reads commands with xreadline() until end
 * of input or "quit". Recognized commands are "stat", "on" and "off", each
 * optionally followed by a node list; without one, the command applies to
 * 'hostname'. Exits the process if any of the setup steps fails.
 */
static void
_prompt_loop(void)
{
    char line[128];
    char target[128];
    hash_t status_tab;
    hostlist_t all_hosts;
    hostlist_iterator_t itr;
    char *name;

    assert(hostname);

    status_tab = hash_create(HASH_SIZE,
                             (hash_key_f)hash_key_string,
                             (hash_cmp_f)strcmp,
                             (hash_del_f)NULL);
    if (status_tab == NULL) {
        perror("hash_create");
        exit(1);
    }

    all_hosts = hostlist_create(hostname);
    if (all_hosts == NULL) {
        perror("hostlist_create");
        exit(1);
    }

    itr = hostlist_iterator_create(all_hosts);
    if (itr == NULL) {
        perror("hostlist_iterator");
        exit(1);
    }

    /* all nodes begin as off */
    for (name = hostlist_next(itr); name != NULL; name = hostlist_next(itr)) {
        /* XXX: 'name' is not freed: it needs to stay alloc'd as the
         * hash key.  It's a mem-leak.  Fix later.
         */
        if (!hash_insert(status_tab, (void *)name, OFF_STATUS)) {
            perror("hash_insert");
            exit(1);
        }
    }
    hostlist_iterator_destroy(itr);
    hostlist_destroy(all_hosts);

    for (;;) {
        if (xreadline(CMD_PROMPT, line, sizeof(line)) == NULL)
            break;
        if (line[0] == '\0')
            continue;
        if (strcmp(line, "quit") == 0)
            break;

        /* The bare command is tested before its "<cmd> <nodes>" form */
        if (strcmp(line, "stat") == 0)
            _stat(status_tab, hostname);
        else if (sscanf(line, "stat %s", target) == 1)
            _stat(status_tab, target);
        else if (strcmp(line, "on") == 0)
            _onoff(status_tab, hostname, ON_STATUS);
        else if (sscanf(line, "on %s", target) == 1)
            _onoff(status_tab, target, ON_STATUS);
        else if (strcmp(line, "off") == 0)
            _onoff(status_tab, hostname, OFF_STATUS);
        else if (sscanf(line, "off %s", target) == 1)
            _onoff(status_tab, target, OFF_STATUS);
        else
            printf("unknown command - type \"help\"\n");
    }

    hash_destroy(status_tab);
}
Beispiel #14
0
/*
 * setup_cluster_nodes - get cluster record list within requested
 *   time period with used nodes. Used for deciding whether a nodelist is
 *   overlapping with the required nodes.
 * IN pg_conn - database connection (not referenced directly in this body;
 *   presumably used by the DEF_QUERY_RET macro - TODO confirm)
 * IN/OUT job_cond - must have used_nodes set and a cluster_list with exactly
 *   one entry. usage_end is set to the current time when usage_start is set
 *   and usage_end is zero.
 * RET newly allocated cluster_nodes_t whose cluster_list holds one
 *   local_cluster_t per event row containing at least one requested host,
 *   or NULL on error or when no row matches
 */
extern cluster_nodes_t *
setup_cluster_nodes(pgsql_conn_t *pg_conn, slurmdb_job_cond_t *job_cond)
{
	/* NOTE(review): DEF_VARS presumably declares 'query' and 'result',
	 * which are used below without a visible declaration - confirm */
	DEF_VARS;
	cluster_nodes_t *cnodes = NULL;
	time_t now = time(NULL);
	hostlist_t temp_hl = NULL;
	hostlist_iterator_t h_itr = NULL;

	/* Nothing to do unless specific nodes were asked for */
	if (!job_cond || !job_cond->used_nodes)
		return NULL;

	if (!job_cond->cluster_list || list_count(job_cond->cluster_list) != 1) {
		error("If you are doing a query against nodes "
		      "you must only have 1 cluster "
		      "you are asking for.");
		return NULL;
	}

	temp_hl = hostlist_create(job_cond->used_nodes);
	if (!hostlist_count(temp_hl)) {
		error("we didn't get any real hosts to look for.");
		hostlist_destroy(temp_hl);
		return NULL;
	}

	/* Select the cluster-wide event rows: empty node_name and a
	 * non-empty cluster_nodes */
	query = xstrdup_printf("SELECT cluster_nodes, time_start, "
			       "time_end FROM %s.%s WHERE node_name='' "
			       "AND cluster_nodes !=''",
			       (char *)list_peek(job_cond->cluster_list),
			       event_table);

	/* Restrict to rows overlapping [usage_start, usage_end); a zero
	 * usage_end defaults to the current time */
	if (job_cond->usage_start) {
		if (!job_cond->usage_end)
			job_cond->usage_end = now;

		xstrfmtcat(query, " AND ((time_start<%ld) "
			   "AND (time_end>=%ld OR time_end=0))",
			   job_cond->usage_end, job_cond->usage_start);
	}

	/* NOTE(review): DEF_QUERY_RET presumably runs 'query' and releases
	 * it; query is not freed anywhere in this function - confirm */
	result = DEF_QUERY_RET;
	if (!result) {
		hostlist_destroy(temp_hl);
		return NULL;
	}

	/* One iterator over the requested hosts, reset after every row */
	h_itr = hostlist_iterator_create(temp_hl);
	cnodes = xmalloc(sizeof(cluster_nodes_t));
	cnodes->cluster_list = list_create(_destroy_local_cluster);
	FOR_EACH_ROW {
		char *host = NULL;
		int loc = 0;
		local_cluster_t *local_cluster =
			xmalloc(sizeof(local_cluster_t));
		local_cluster->hl = hostlist_create(ROW(0));
		local_cluster->start = atoi(ROW(1));
		local_cluster->end   = atoi(ROW(2));
		/* One bit per host of this row's node set; a set bit marks
		 * a host that was asked for */
		local_cluster->asked_bitmap =
			bit_alloc(hostlist_count(local_cluster->hl));
		while((host = hostlist_next(h_itr))) {
			if ((loc = hostlist_find(
				    local_cluster->hl, host)) != -1)
				bit_set(local_cluster->asked_bitmap, loc);
			free(host);
		}
		hostlist_iterator_reset(h_itr);
		/* Keep the record only if at least one requested host is
		 * part of it */
		if (bit_ffs(local_cluster->asked_bitmap) != -1) {
			list_append(cnodes->cluster_list, local_cluster);
			if (local_cluster->end == 0) {
				/* time_end of 0 is replaced by "now" and the
				 * record becomes the current cluster */
				local_cluster->end = now;
				cnodes->curr_cluster = local_cluster;
			}
		} else
			_destroy_local_cluster(local_cluster);
	} END_EACH_ROW;
	PQclear(result);
	hostlist_iterator_destroy(h_itr);
	/* An empty result is reported to the caller as NULL */
	if (!list_count(cnodes->cluster_list)) {
		destroy_cluster_nodes(cnodes);
		cnodes = NULL;
	}

	hostlist_destroy(temp_hl);
	return cnodes;
}
Beispiel #15
0
/*
 * Build the global 'nodes' list and the 'nodes_index' hash from conf.hosts.
 *
 * For every configured host (optionally written as "host:port") an
 * ipmidetectd_info record is allocated holding the hostname, a random
 * starting sequence number, the socket it is assigned to and its resolved
 * IPv4 destination address. The record is appended to 'nodes' and indexed in
 * 'nodes_index' by the dotted-quad IP string. Any failure, an invalid port,
 * or a duplicate IP terminates the program through err_exit().
 */
static void
_nodes_setup (void)
{
  hostlist_iterator_t itr = NULL;
  char *host = NULL;
  int i = 0;

  assert (fds);
  assert (fds_count);
  assert (!nodes);
  assert (nodes_count);
  assert (!nodes_index);

  if (!(nodes = list_create ((ListDelF)free)))
    err_exit ("list_create: %s", strerror (errno));

  if (!(nodes_index = hash_create (nodes_count,
                                   (hash_key_f)hash_key_string,
                                   (hash_cmp_f)strcmp,
                                   NULL)))
    err_exit ("hash_create: %s", strerror (errno));

  if (!(itr = hostlist_iterator_create (conf.hosts)))
    err_exit ("hostlist_iterator_create: %s", strerror (errno));

  while ((host = hostlist_next (itr)))
    {
      struct ipmidetectd_info *info = NULL;
      struct hostent *h;
      char *tmpstr;
      char *ip;
      int len;
      char *host_copy = NULL;
      char *host_ptr;
      uint16_t port = RMCP_PRIMARY_RMCP_PORT;

      if (!(info = (struct ipmidetectd_info *)malloc (sizeof (struct ipmidetectd_info))))
        err_exit ("malloc: %s", strerror (errno));
      memset (info, '\0', sizeof (struct ipmidetectd_info));

      /* "host:port" form: split on a private copy so the original string
       * is still intact for error messages */
      if (strchr (host, ':'))
        {
          char *ptr;

          if (!(host_copy = strdup (host)))
            err_exit ("strdup: %s", strerror (errno));

          if ((ptr = strchr (host_copy, ':')))
            {
              char *endptr;
              long tmp;

              *ptr = '\0';
              ptr++;

              /* Keep the full strtol() result in a long: narrowing to int
               * before the range check could wrap an out-of-range value
               * into the valid port range. */
              errno = 0;
              tmp = strtol (ptr, &endptr, 0);
              if (errno
                  || endptr[0] != '\0'
                  || tmp <= 0
                  || tmp > USHRT_MAX)
                err_exit ("invalid port specified: %s", host);

              port = (uint16_t) tmp;
            }

          host_ptr = host_copy;
        }
      else
        host_ptr = host;

      if (!(info->hostname = strdup (host_ptr)))
        err_exit ("strdup: %s", strerror (errno));

      /* Use random number for starting sequence number to avoid probability of
       * duplicates and "hanging" BMC issue.
       */
      if ((len = ipmi_get_random (&(info->sequence_number),
                                  sizeof (info->sequence_number))) < 0)
        err_exit ("ipmi_get_random: %s", strerror (errno));
      if (len != sizeof (info->sequence_number))
        err_exit ("ipmi_get_random: invalid len returned");

      /* Nodes are spread over the sockets in groups of
       * IPMIDETECTD_NODES_PER_SOCKET */
      info->fd = fds[i/IPMIDETECTD_NODES_PER_SOCKET];

      if (!(h = gethostbyname (host_ptr)))
        {
#if HAVE_HSTRERROR
          err_exit ("gethostbyname: %s", hstrerror (h_errno));
#else /* !HAVE_HSTRERROR */
          err_exit ("gethostbyname: h_errno = %d", h_errno);
#endif /* !HAVE_HSTRERROR */
        }

      info->destaddr.sin_family = AF_INET;
      info->destaddr.sin_addr = *((struct in_addr *)h->h_addr);
      info->destaddr.sin_port = htons (port);
      /* host_ptr aliases one of these two; neither is used past here */
      free (host_copy);
      free (host);

      if (!list_append (nodes, info))
        err_exit ("list_append: %s", strerror (errno));

      if (!(tmpstr = inet_ntoa (info->destaddr.sin_addr)))
        err_exit ("inet_ntoa: %s", strerror (errno)); /* strerror? */

      /* inet_ntoa() returns a static buffer, so the hash key needs its
       * own copy */
      if (!(ip = strdup (tmpstr)))
        err_exit ("strdup: %s", strerror (errno));

      if (hash_find (nodes_index, ip))
        err_exit ("Duplicate host ip: %s", ip);

      if (!hash_insert (nodes_index, ip, info))
        err_exit ("hash_insert: %s", strerror (errno));

      i++;
    }

  hostlist_iterator_destroy (itr);
}
/*
 * Create one ipmipower_connection per unique host in 'hostname'.
 *
 * hostname - hostlist string naming the hosts to connect to
 * len      - set to the number of connections on success, 0 otherwise
 *
 * Returns the array of connections, or NULL if the hostlist cannot be
 * parsed or setting up any connection fails. On setup failure every slot is
 * torn down (descriptors closed, cbufs destroyed) before the array is freed;
 * a descriptor-limit message is printed at most once.
 */
struct ipmipower_connection *
ipmipower_connection_array_create(const char *hostname, unsigned int *len) 
{
  struct ipmipower_connection *conns;
  hostlist_t hosts = NULL;
  hostlist_iterator_t host_itr = NULL;
  char *host = NULL;
  int conn_size = sizeof(struct ipmipower_connection);
  int nhosts;
  int slot;
  int failures = 0;
  int emfile_reported = 0;

  assert(hostname && len); 

  *len = 0;

  hosts = hostlist_create(hostname);
  if (!hosts)
    {
      ipmipower_output(MSG_TYPE_HOSTNAME_INVALID, hostname);
      return NULL;
    }

  host_itr = hostlist_iterator_create(hosts);
  if (!host_itr)
    ierr_exit("hostlist_iterator_create() error"); 

  hostlist_uniq(hosts);
  nhosts = hostlist_count(hosts);

  conns = (struct ipmipower_connection *)Malloc(conn_size * nhosts);
  memset(conns, '\0', (conn_size * nhosts));

  for (slot = 0; (host = hostlist_next(host_itr)) != NULL; slot++)
    {
      conns[slot].ipmi_fd = -1;
      conns[slot].ping_fd = -1;

      /* Failures are only counted here; cleanup happens once, after
       * every host has been attempted, so all error output is gathered
       * first.
       */
      if (_connection_setup(&conns[slot], host) < 0)
        {
          if (errno == EMFILE && !emfile_reported)
            {
              cbuf_printf(ttyout, "file descriptor limit reached\n");
              emfile_reported = 1;
            }
          failures++;
        }

      free(host);
    }

  hostlist_iterator_destroy(host_itr);
  hostlist_destroy(hosts);

  if (!failures)
    {
      *len = nhosts;
      return conns;
    }

  /* At least one setup failed: tear everything down */
  for (slot = 0; slot < nhosts; slot++)
    {
      close(conns[slot].ipmi_fd);
      close(conns[slot].ping_fd);
      if (conns[slot].ipmi_in)
        cbuf_destroy(conns[slot].ipmi_in);
      if (conns[slot].ipmi_out)
        cbuf_destroy(conns[slot].ipmi_out);
      if (conns[slot].ping_in)
        cbuf_destroy(conns[slot].ping_in);
      if (conns[slot].ping_out)
        cbuf_destroy(conns[slot].ping_out);
    }
  Free(conns);
  return NULL;
}
Beispiel #17
0
/*
 * Run pstdout_func once per host in 'hostnames' and return the largest exit
 * code reported.
 *
 * hostnames    - hostlist string; NULL or a single host runs pstdout_func
 *                directly in the calling thread
 * pstdout_func - function to run for each host
 * arg          - opaque argument passed through to pstdout_func
 *
 * With more than one host, one detached thread is created per host, with at
 * most pstdout_fanout threads outstanding; the call returns after all
 * threads have finished. SIGINT is routed to _pstdout_sigint while threads
 * are in use and restored before returning. Calls are serialized through
 * pstdout_launch_mutex.
 *
 * Returns the exit code, or -1 with pstdout_errnum set on error.
 *
 * NOTE(review): on error paths taken after threads have been created, the
 * per-thread data is freed in cleanup while those detached threads may still
 * be running - confirm this is acceptable for callers.
 */
int
pstdout_launch(const char *hostnames, Pstdout_Thread pstdout_func, void *arg)
{
    struct pstdout_thread_data **tdata = NULL;
    struct pstdout_state pstate;
    unsigned int pstate_init = 0;
    hostlist_iterator_t hitr = NULL;
    hostlist_t h = NULL;
    int h_count = 0;
    char *host = NULL;
    int exit_code = -1;
    sighandler_t sighandler_save = NULL;
    int sighandler_set = 0;
    int threadcount_locked = 0; /* set while we hold pstdout_threadcount_mutex */
    int rc;
    int i;

    if (!pstdout_initialized)
    {
        pstdout_errnum = PSTDOUT_ERR_UNINITIALIZED;
        return -1;
    }

    if (!pstdout_func)
    {
        pstdout_errnum = PSTDOUT_ERR_PARAMETERS;
        return -1;
    }

    if ((rc = pthread_mutex_lock(&pstdout_launch_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }

    /* Special case: no hosts, run in the calling thread */
    if (!hostnames)
    {
        if (_pstdout_state_init(&pstate, NULL) < 0)
            goto cleanup;
        pstate_init++;

        exit_code = pstdout_func(&pstate, NULL, arg);
        pstdout_errnum = PSTDOUT_ERR_SUCCESS;
        goto cleanup;
    }

    if (!(h = hostlist_create(hostnames)))
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }
    h_count = hostlist_count(h);

    /* Sanity check */
    if (h_count <= 0)
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "h_count = %d\n", h_count);
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }

    /* Special case: a single host, run in the calling thread */
    if (h_count == 1)
    {
        if (_pstdout_state_init(&pstate, hostnames) < 0)
            goto cleanup;
        pstate_init++;

        exit_code = pstdout_func(&pstate, hostnames, arg);
        pstdout_errnum = PSTDOUT_ERR_SUCCESS;
        goto cleanup;
    }

    if ((sighandler_save = signal(SIGINT, _pstdout_sigint)) == SIG_ERR)
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "signal\n");
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }
    sighandler_set++;

    if (!(hitr = hostlist_iterator_create(h)))
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }

    if (!(tdata = (struct pstdout_thread_data **)malloc(sizeof(struct pstdout_thread_data *) * h_count)))
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }
    /* Zeroed so cleanup can tell which slots were populated */
    memset(tdata, '\0', sizeof(struct pstdout_thread_data *) * h_count);

    /* Prepare per-host thread data; 'host' is freed in cleanup if we bail
     * out of this loop early */
    i = 0;
    while ((host = hostlist_next(hitr)))
    {
        if (!(tdata[i] = (struct pstdout_thread_data *)malloc(sizeof(struct pstdout_thread_data))))
        {
            pstdout_errnum = PSTDOUT_ERR_OUTMEM;
            goto cleanup;
        }
        memset(tdata[i], '\0', sizeof(struct pstdout_thread_data));

        if (!(tdata[i]->hostname = strdup(host)))
        {
            pstdout_errnum = PSTDOUT_ERR_OUTMEM;
            goto cleanup;
        }
        tdata[i]->pstdout_func = pstdout_func;
        tdata[i]->arg = arg;

        if ((rc = pthread_attr_init(&(tdata[i]->attr))))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_attr_init: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        if ((rc = pthread_attr_setdetachstate(&(tdata[i]->attr), PTHREAD_CREATE_DETACHED)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_attr_setdetachstate: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        free(host);
        i++;
    }
    host = NULL;

    hostlist_iterator_destroy(hitr);
    hitr = NULL;

    hostlist_destroy(h);
    h = NULL;

    /* Launch threads up to fanout */
    for (i = 0; i < h_count; i++)
    {
        if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }
        threadcount_locked = 1;

        /* Re-test the predicate after every wakeup: pthread_cond_wait()
         * may return spuriously. */
        while (pstdout_threadcount == pstdout_fanout)
        {
            if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex)))
            {
                if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                    fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc));
                pstdout_errnum = PSTDOUT_ERR_INTERNAL;
                goto cleanup;
            }
        }

        if ((rc = pthread_create(&(tdata[i]->tid),
                                 &(tdata[i]->attr),
                                 _pstdout_func_entry,
                                 (void *) tdata[i])))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_create: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        pstdout_threadcount++;

        threadcount_locked = 0;
        if ((rc = pthread_mutex_unlock(&pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }
    }

    /* Wait for Threads to finish */

    if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }
    threadcount_locked = 1;

    while (pstdout_threadcount > 0)
    {
        if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }
    }

    /* Release the count mutex; otherwise the next launch would block
     * forever trying to lock it again. */
    threadcount_locked = 0;
    if ((rc = pthread_mutex_unlock(&pstdout_threadcount_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }

    if (_pstdout_output_consolidated_finish() < 0)
        goto cleanup;

    /* Determine exit code */
    exit_code = 0;
    for (i = 0; i < h_count; i++)
    {
        if (tdata[i]->exit_code > exit_code)
            exit_code = tdata[i]->exit_code;
    }

cleanup:
    /* An error path left the count mutex held: release it so threads that
     * are still running can update the count. */
    if (threadcount_locked)
    {
        if ((rc = pthread_mutex_unlock(&pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
            /* Don't change error code, just move on */
        }
    }
    /* Cannot pass NULL for key, so just pass dummy key */
    list_delete_all(pstdout_consolidated_stdout, _pstdout_consolidated_data_delete_all, "");
    list_delete_all(pstdout_consolidated_stderr, _pstdout_consolidated_data_delete_all, "");
    if (pstate_init)
        _pstdout_state_cleanup(&pstate);
    if (tdata)
    {
        for (i = 0; i < h_count; i++)
        {
            if (tdata[i])
            {
                free(tdata[i]->hostname);
                pthread_attr_destroy(&(tdata[i]->attr));
                free(tdata[i]);
            }
        }
        free(tdata);
    }
    if (hitr)
        hostlist_iterator_destroy(hitr);
    if (h)
        hostlist_destroy(h);
    free(host);
    if ((rc = pthread_mutex_unlock(&pstdout_launch_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
        /* Don't change error code, just move on */
    }
    if (sighandler_set)
        signal(SIGINT, sighandler_save);
    return exit_code;
}
Beispiel #18
0
/*
 * Test driver for the plug list code.
 *
 * Options:
 *   -p hwplugs   hostlist expression; every name it expands to is copied
 *                into a List that is passed to pluglist_create().  Without
 *                -p, pluglist_create() is called with NULL.
 *   -f plug      look up and print only this plug instead of dumping the
 *                whole plug list.
 * Positional arguments: nodelist (required), then pluglist (optional).
 *
 * A non-success result from pluglist_map() is reported on stderr; the
 * plug table is printed afterwards regardless and the exit status is 0.
 */
int main(int argc, char *argv[])
{
	extern int optind;
	extern char *optarg;
	char *hw_spec = NULL;
	char *wanted = NULL;
	char *nodes = NULL;
	char *plugs = NULL;
	const char *map_err = NULL;
	PlugList plist = NULL;
	int opt;

	err_init(basename(argv[0]));

	/* Option parsing. */
	for (;;) {
		opt = getopt(argc, argv, "p:f:");
		if (opt == EOF)
			break;

		if (opt == 'p')
			hw_spec = optarg;
		else if (opt == 'f')
			wanted = optarg;
		else
			usage();
	}

	/* One or two positional arguments must remain. */
	if (optind == argc)
		usage();
	nodes = argv[optind++];
	if (argc - optind == 1)
		plugs = argv[optind++];
	if (argc != optind)
		usage();

	/* Build the plug list, seeded with hardware plug names if given. */
	if (!hw_spec) {
		plist = pluglist_create(NULL);
	} else {
		hostlist_t hw_hosts = hostlist_create(hw_spec);
		hostlist_iterator_t hw_itr = hostlist_iterator_create(hw_hosts);
		List names = list_create((ListDelF)xfree);
		char *name;

		for (name = hostlist_next(hw_itr); name != NULL;
		     name = hostlist_next(hw_itr))
			list_append(names, xstrdup(name));

		hostlist_iterator_destroy(hw_itr);
		hostlist_destroy(hw_hosts);
		plist = pluglist_create(names);
		list_destroy(names);
	}

	/* Map nodes onto plugs and translate any failure into a message. */
	switch (pluglist_map(plist, nodes, plugs)) {
	case EPL_DUPNODE:
		map_err = "duplicate node\n";
		break;
	case EPL_UNKPLUG:
		map_err = "unknown plug\n";
		break;
	case EPL_DUPPLUG:
		map_err = "duplicate plug\n";
		break;
	case EPL_NOPLUGS:
		map_err = "more nodes than plugs\n";
		break;
	case EPL_NONODES:
		map_err = "more plugs than nodes\n";
		break;
	case EPL_SUCCESS:
	default:
		break;
	}
	if (map_err != NULL)
		fputs(map_err, stderr);

	/* Report either the single requested plug or the whole table. */
	if (wanted) {
		Plug *hit = pluglist_find(plist, wanted);

		if (!hit)
			printf("plug %s: not found\n", wanted);
		else
			printf("plug=%s node=%s\n", hit->name,
			       hit->node ? hit->node : "NULL");
	} else {
		PlugListIterator pitr = pluglist_iterator_create(plist);
		Plug *cur;

		for (cur = pluglist_next(pitr); cur != NULL;
		     cur = pluglist_next(pitr))
			printf("plug=%s node=%s\n", cur->name,
			       cur->node ? cur->node : "NULL");

		pluglist_iterator_destroy(pitr);
	}

	exit(0);
}
Beispiel #19
0
/*
 * Lay out tasks for an arbitrary distribution.  Every entry of
 * arbitrary_nodes stands for one task (the task id is the entry's position
 * in that list); a task lands on the job node whose node record matches the
 * entry's node record.
 * XXX: Need to handle over-subscribe.
 *
 * step_layout     - node_list and task_cnt are read; tasks[] and tids[] are
 *                   filled consecutively, one slot per job node that
 *                   received at least one task
 * arbitrary_nodes - hostlist string with one entry per task
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when no list was given, the list
 * length differs from task_cnt, or not every task could be placed.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	hostlist_t job_hosts = NULL;
	hostlist_t task_hosts = NULL;
	hostlist_iterator_t job_itr = NULL;
	hostlist_iterator_t task_itr = NULL;
	struct node_record **task_nodes = NULL;
	struct node_record *node = NULL;
	char *name = NULL;
	int listed;		/* entries in arbitrary_nodes */
	int placed = 0;		/* tasks assigned so far */
	int slot = 0;		/* next index into tasks[] / tids[] */
	int fill;		/* next index into tids[slot][] */
	int k;

	debug2("job list is %s", step_layout->node_list);
	if (arbitrary_nodes == NULL) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	task_hosts = hostlist_create(arbitrary_nodes);
	listed = hostlist_count(task_hosts);
	if (listed != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist.  "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt, listed, listed);
		hostlist_destroy(task_hosts);
		return SLURM_ERROR;
	}

	job_hosts = hostlist_create(step_layout->node_list);
	job_itr   = hostlist_iterator_create(job_hosts);
	task_itr  = hostlist_iterator_create(task_hosts);

	/*
	 * Resolve every task entry to its node record once, so the matching
	 * below is a pointer comparison instead of a string comparison.
	 */
	task_nodes = xmalloc(sizeof(struct node_record *) * listed);
	for (k = 0; (name = hostlist_next(task_itr)); k++) {
		task_nodes[k] = find_node_record_no_alias(name);
		free(name);
	}

	while ((name = hostlist_next(job_itr))) {
		node = find_node_record(name);
		step_layout->tasks[slot] = 0;

		/* Pass 1: count the tasks that belong on this node. */
		for (k = 0; k < listed; k++) {
			if (task_nodes[k] == node) {
				step_layout->tasks[slot]++;
				placed++;
			}
			if (placed >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", name, step_layout->tasks[slot]);

		/* Nodes without tasks do not consume a slot. */
		if (step_layout->tasks[slot] != 0) {
			step_layout->tids[slot] =
				xmalloc(sizeof(uint32_t)
					* step_layout->tasks[slot]);

			/* Pass 2: record the task ids (list positions). */
			fill = 0;
			for (k = 0; k < listed; k++) {
				if (task_nodes[k] == node) {
					step_layout->tids[slot][fill] = k;
					fill++;
				}
				if (fill >= step_layout->tasks[slot])
					break;
			}
			slot++;
		}

		free(name);
		if (slot > step_layout->task_cnt)
			break;
	}

	hostlist_iterator_destroy(job_itr);
	hostlist_iterator_destroy(task_itr);
	hostlist_destroy(job_hosts);
	hostlist_destroy(task_hosts);
	xfree(task_nodes);

	if (placed != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, placed);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
Beispiel #20
0
/*
 * List the hosts that currently have connections to a diod server.
 *
 * Connects to the server, attaches to aname "ctl", opens the "connections"
 * file and collects the first space-delimited field of every line into a
 * hostlist, which is then de-duplicated and printed.
 *
 * Options (see the switch below):
 *   --server   HOST[:PORT] or /path/to/socket
 *   --msize    protocol message size (default 65536)
 *   --uid      uid to authenticate/attach as (default: effective uid)
 *   --timeout  seconds before alarm() fires (0 = no alarm)
 *   --long     keep full host names and print one per line; without it,
 *              names are cut at the first '.' and printed as one ranged
 *              hostlist string
 *
 * Exits 0 on success; every failure path exits through one of the
 * err_exit/errn_exit/msg_exit helpers or exit (1).
 */
int
main (int argc, char *argv[])
{
    char *server = NULL;
    int msize = 65536;
    uid_t uid = geteuid ();
    int topt = 0;
    Npcfsys *fs = NULL;
    Npcfid *fid, *afid, *root;
    int c, fd;
    char buf[80], *host, *p;
    hostlist_t hl;
    hostlist_iterator_t itr;
    int lopt = 0;

    diod_log_init (argv[0]);

    opterr = 0;
    while ((c = GETOPT (argc, argv, OPTIONS, longopts)) != -1) {
        switch (c) {
        case 's':   /* --server HOST[:PORT] or /path/to/socket */
            server = optarg;
            break;
        case 'm':   /* --msize SIZE */
            msize = strtoul (optarg, NULL, 10);
            break;
        case 'u':   /* --uid UID */
            uid = strtoul (optarg, NULL, 10);
            break;
        case 't':   /* --timeout SECS */
            topt = strtoul (optarg, NULL, 10);
            break;
        case 'l':   /* --long */
            lopt = 1;
            break;
        default:
            usage ();
        }
    }

    if (signal (SIGPIPE, SIG_IGN) == SIG_ERR)
        err_exit ("signal");
    if (signal (SIGALRM, sigalarm) == SIG_ERR)
        err_exit ("signal");

    if (topt > 0)
        alarm (topt);

    if ((fd = diod_sock_connect (server, 0)) < 0)
        exit (1);

    if (!(fs = npc_start (fd, fd, msize, 0)))
        errn_exit (np_rerror (), "error negotiating protocol with server");
    /* A NULL afid is only fatal when an error was recorded; NULL with
     * np_rerror () == 0 is accepted and passed on to npc_attach ().
     */
    if (!(afid = npc_auth (fs, "ctl", uid, diod_auth)) && np_rerror () != 0)
        errn_exit (np_rerror (), "error authenticating to server");
    if (!(root = npc_attach (fs, afid, "ctl", uid)))
        errn_exit (np_rerror (), "error attaching to aname=ctl");
    if (!(fid = npc_open_bypath (root, "connections", O_RDONLY)))
        errn_exit (np_rerror (), "open connections");

    if (!(hl = hostlist_create (NULL)))
        err_exit ("hostlist_create");
    while (npc_gets (fid, buf, sizeof(buf))) {
        /* Keep only the first field of the line ... */
        if ((p = strchr (buf, ' ')))
            *p = '\0';
        /* ... and, unless --long, only the part before the first dot. */
        if (!lopt && (p = strchr (buf, '.')))
            *p = '\0';
        if (!hostlist_push_host (hl, buf))
            err_exit ("hostlist_push_host");
    }
    hostlist_uniq (hl);
    if (lopt) {
        if (!(itr = hostlist_iterator_create (hl)))
            err_exit ("hostlist_iterator_create");
        /* NOTE(review): some hostlist implementations return malloc'd
         * memory from hostlist_next () that the caller must free; confirm
         * against the hostlist.h in use.  Harmless here, since the process
         * exits shortly.
         */
        while ((host = hostlist_next (itr)))
            printf ("%s\n", host);
        hostlist_iterator_destroy (itr);
    } else {
        char s[1024];

        if (hostlist_ranged_string (hl, sizeof (s), s) < 0)
            msg_exit ("hostlist output would be too long (use -l)");
        printf ("%s\n", s);
    }
    hostlist_destroy (hl);

    if (npc_clunk (fid) < 0)
        errn_exit (np_rerror (), "clunk connections");
    if (npc_clunk (root) < 0)
        errn_exit (np_rerror (), "error clunking ctl");
    /* afid may be NULL (see npc_auth above); only clunk a real fid. */
    if (afid && npc_clunk (afid) < 0)
        errn_exit (np_rerror (), "error clunking afid");
    npc_finish (fs);

    exit(0);
}
Beispiel #21
0
/*
 * Lay out tasks for an arbitrary distribution.  Every entry of
 * arbitrary_nodes stands for one task (the task id is the entry's position
 * in that list); a task lands on the job node with the same host name.
 * XXX: Need to handle over-subscribe.
 *
 * step_layout     - node_list and task_cnt are read; tasks[] and tids[] are
 *                   filled consecutively, one slot per job node that
 *                   received at least one task
 * arbitrary_nodes - hostlist string with one entry per task
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when no list was given, the list
 * length differs from task_cnt, or not every task could be placed.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i=0, j, taskid = 0, task_cnt=0;
	hostlist_iterator_t itr = NULL, itr_task = NULL;
	char *host = NULL;
	char *host_task = NULL;
	hostlist_t job_alloc_hosts = NULL;
	hostlist_t step_alloc_hosts = NULL;

	debug2("job list is %s", step_layout->node_list);

	/* Validate the arguments before allocating anything, so the early
	 * error returns below have nothing (or only step_alloc_hosts) to
	 * release. */
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist.  "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		hostlist_destroy(step_alloc_hosts);
		return SLURM_ERROR;
	}

	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr = hostlist_iterator_create(job_alloc_hosts);
	itr_task = hostlist_iterator_create(step_alloc_hosts);
	while((host = hostlist_next(itr))) {
		step_layout->tasks[i] = 0;
		/* Pass 1: count the tasks that name this host. */
		while((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			free(host_task);
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		/* Hosts without tasks do not consume a slot in tasks[]/tids[]. */
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		taskid = 0;
		j = 0;
		/* Pass 2: rescan from the start and record the task ids
		 * (positions in the task list) that belong to this host. */
		hostlist_iterator_reset(itr_task);
		while((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			free(host_task);
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		/* Rewind the task iterator for the next job host. */
		hostlist_iterator_reset(itr_task);
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}
	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);
	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
Beispiel #22
0
/*
 * _forward_thread - thread body that forwards one message to a list of hosts
 *
 * arg is a forward_msg_t *.  The hosts in fwd_msg->header.forward.nodelist
 * are tried one at a time: the message is sent to the first host that can be
 * reached, with the remaining hosts packed into the header as that host's
 * own forward list.  Hosts that fail are recorded through
 * mark_as_failed_forward() and the next host is tried.  Responses (or
 * failure records) end up in fwd_struct->ret_list.
 *
 * Locking: every path reaches the "cleanup" label with
 * fwd_struct->forward_mutex held; cleanup signals fwd_struct->notify and
 * then releases the mutex.  Paths that "continue" the loop unlock first.
 *
 * The thread owns fwd_msg and frees it (xfree) before returning.
 * Always returns NULL.
 */
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	/* caller-supplied timeout, kept because fwd_msg->timeout is
	 * recomputed below */
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		/* Step 1: resolve the address of the next candidate host. */
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			/* no hosts left: leave with the mutex still held */
			goto cleanup;
		}
		/* Step 2: open a connection to it. */
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		/* Step 3: the hosts still in hl become the forward list that
		 * is packed into the header sent to "name". */
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		/* grow the buffer first if the payload does not fit */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				/* start over with a fresh buffer and a closed
				 * socket before trying the next host */
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			/* drain hl, adding one entry per remaining host */
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		/* Step 4: when hosts are being forwarded to, scale the
		 * receive timeout with the number of steps:
		 *   steps   = (cnt + 1) / tree_width
		 *   timeout = message_timeout * steps
		 *             + start_timeout * (steps + 1)
		 */
		if (fwd_msg->header.forward.cnt > 0) {
			/* computed once and cached for all later calls;
			 * the "* 1000" is presumably a seconds-to-ms
			 * conversion - TODO confirm */
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		/* Step 5: collect the responses. */
		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		/* Treated as failure of "name": nothing came back at all,
		 * or hosts were forwarded to and at most one response came
		 * back. */
		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			/* NOTE(review): the mark_as_failed_forward() calls
			 * in this branch touch fwd_struct->ret_list before
			 * forward_mutex is taken (it is only locked after
			 * the loop) - confirm this is intended. */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			/* for every forwarded host, look for a matching
			 * response; hosts without one are marked failed */
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					/* a response without a node name is
					 * attributed to "name", the host we
					 * sent to directly */
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		/* message delivered and responses collected: stop trying */
		break;
	}
	/* Move whatever was received onto the shared result list; the mutex
	 * taken here is released in cleanup. */
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	/* forward_mutex is held on every path that gets here */
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
/*
 * eliminate_nodes - drop the hosts that ipmidetect reports as not detected
 *
 * hosts - in/out: address of a heap-allocated hostlist string (must be
 *         non-NULL and point to a non-NULL string).  When at least one host
 *         is detected, the old string is freed and *hosts is replaced by a
 *         freshly strdup'd ranged string of the detected hosts.  If that
 *         strdup fails, *hosts is left NULL.
 *
 * Returns the number of detected hosts, 0 when none is detected (*hosts is
 * then left untouched), or -1 on any error (a message is printed to stderr).
 */
static int
eliminate_nodes (char **hosts)
{
  char ranged[HOSTLIST_BUFLEN + 1];
  ipmidetect_t handle = NULL;
  hostlist_t all_hosts = NULL;
  hostlist_t kept_hosts = NULL;
  hostlist_iterator_t iter = NULL;
  char *cur = NULL;
  int detected;
  int result = -1;

  assert (hosts);
  assert (*hosts);

  handle = ipmidetect_handle_create ();
  if (!handle)
    {
      fprintf (stderr, "ipmidetect_handle_create\n");
      goto cleanup;
    }

  if (ipmidetect_load_data (handle, NULL, 0, 0) < 0)
    {
      if (ipmidetect_errnum (handle) != IPMIDETECT_ERR_CONNECT
          && ipmidetect_errnum (handle) != IPMIDETECT_ERR_CONNECT_TIMEOUT)
        fprintf (stderr,
                 "ipmidetect_load_data: %s\n",
                 ipmidetect_errormsg (handle));
      else
        fprintf (stderr, "Error connecting to ipmidetect daemon\n");
      goto cleanup;
    }

  /* Two copies of the list: one to walk, one to prune while walking. */
  all_hosts = hostlist_create (*hosts);
  if (!all_hosts)
    {
      fprintf (stderr, "hostlist_create: %s\n", strerror (errno));
      goto cleanup;
    }

  kept_hosts = hostlist_create (*hosts);
  if (!kept_hosts)
    {
      fprintf (stderr, "hostlist_create: %s\n", strerror (errno));
      goto cleanup;
    }

  iter = hostlist_iterator_create (all_hosts);
  if (!iter)
    {
      fprintf (stderr, "hostlist_iterator_create: %s\n", strerror (errno));
      goto cleanup;
    }

  /* cur is NULL again once the loop runs to completion; on the error exit
   * it still holds the current name, which cleanup frees. */
  for (cur = hostlist_next (iter); cur; cur = hostlist_next (iter))
    {
      detected = ipmidetect_is_node_detected (handle, cur);
      if (detected < 0)
        {
          if (ipmidetect_errnum (handle) != IPMIDETECT_ERR_NOTFOUND)
            fprintf (stderr,
                     "ipmidetect_is_node_detected: %s\n",
                     ipmidetect_errormsg (handle));
          else
            fprintf (stderr,
                     "Node '%s' unrecognized by ipmidetect\n",
                     cur);
          goto cleanup;
        }

      if (detected == 0)
        hostlist_delete (kept_hosts, cur);

      free (cur);
    }

  if (hostlist_count (kept_hosts) == 0)
    {
      result = 0;
      goto cleanup;
    }

  memset (ranged, '\0', sizeof (ranged));

  if (hostlist_ranged_string (kept_hosts, HOSTLIST_BUFLEN, ranged) < 0)
    {
      fprintf (stderr, "hostlist_ranged_string: truncation\n");
      goto cleanup;
    }

  /* Swap the caller's string for the pruned one. */
  free (*hosts);
  *hosts = strdup (ranged);
  if (*hosts == NULL)
    {
      fprintf (stderr, "strdup: %s\n", strerror (errno));
      goto cleanup;
    }

  result = hostlist_count (kept_hosts);

 cleanup:
  if (handle)
    ipmidetect_handle_destroy (handle);
  if (iter)
    hostlist_iterator_destroy (iter);
  if (all_hosts)
    hostlist_destroy (all_hosts);
  if (kept_hosts)
    hostlist_destroy (kept_hosts);
  free (cur);
  return (result);
}