Example 1
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = false;

	inv = get_full_inventory(version);
	if (inv == NULL)
		/* FIXME: should retry here if the condition is transient */
		fatal("failed to get BASIL %s ranking", bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate to a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else
			node_ptr->node_rank = inv->nodes_total - rank_count++;
		sprintf(tmp, "nid%05u", node->node_id);
		hostlist_push(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		hostlist_sort(hl);
		char *name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system.  Here are the nodes alps knows "
		     "about\n%s", name);
		xfree(name);
	}
	hostlist_destroy(hl);

	return SLURM_SUCCESS;
}
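
A side note on the NO_VAL ranking described in the comment above: assigning NO_VAL to unranked nodes only pushes them to the back because node_rank is compared as an unsigned value. The following self-contained sketch (not plugin code) demonstrates that ordering; it assumes NO_VAL follows the usual SLURM convention of (uint32_t)-2, i.e. 0xfffffffe.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NO_VAL 0xfffffffeU	/* assumed to match the slurm.h sentinel */

static int cmp_rank(const void *a, const void *b)
{
	uint32_t ra = *(const uint32_t *)a;
	uint32_t rb = *(const uint32_t *)b;

	if (ra < rb)
		return -1;
	return ra > rb;
}

int main(void)
{
	/* two ranked nodes plus one left at NO_VAL by the ranking loop */
	uint32_t ranks[] = { 512, NO_VAL, 511 };
	int i;

	qsort(ranks, 3, sizeof(ranks[0]), cmp_rank);
	for (i = 0; i < 3; i++)
		printf("%u\n", ranks[i]);	/* prints 511, 512, 4294967294 */
	return 0;
}
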
Example 2
/**
 * basil_get_initial_state  -  set SLURM initial node state from ALPS.
 *
 * The logic is identical to basil_inventory(), except that this function is
 * called before valid bitmaps exist, from select_g_node_init(). It depends on
 * two other pieces of code:
 * - reset_job_bitmaps(), which rebuilds the node_bitmap fields,
 * - _sync_nodes_to_jobs(), which
 *   o kills active jobs on nodes now marked DOWN,
 *   o resets node state to ALLOCATED if it has been marked IDLE here (an
 *     error case, since there is no longer an ALPS reservation for the job;
 *     this is caught by the subsequent basil_inventory()).
 * Return: SLURM_SUCCESS if ok, non-zero on error.
 */
static int basil_get_initial_state(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INITIAL INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL)
			continue;

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		node_ptr->node_state &= NODE_STATE_FLAGS;
		if (reason) {
			if (node_ptr->reason) {
				debug3("Initial DOWN node %s - %s",
					node_ptr->name, node_ptr->reason);
			} else {
				debug("Initial DOWN node %s - %s",
					node_ptr->name, reason);
				node_ptr->reason = xstrdup(reason);
			}
			node_ptr->node_state |= NODE_STATE_DOWN;
		} else {
			if (node_is_allocated(node))
				node_ptr->node_state |= NODE_STATE_ALLOCATED;
			else
				node_ptr->node_state |= NODE_STATE_IDLE;
			xfree(node_ptr->reason);
		}
	}
	free_inv(inv);
	return SLURM_SUCCESS;
}
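
The line "node_ptr->node_state &= NODE_STATE_FLAGS;" above is the heart of this function: the base state is re-derived from ALPS on every pass, while sticky flag bits (DRAIN and friends) survive the mask. The sketch below isolates that masking pattern; the constants are stand-ins chosen for illustration and are assumptions, not the real slurm.h values.

#include <stdint.h>
#include <stdio.h>

/* Assumed layout for illustration: low bits = base state, high bits = flags. */
#define NODE_STATE_BASE   0x000fU
#define NODE_STATE_FLAGS  0xfff0U
#define NODE_STATE_DOWN   0x0001U
#define NODE_STATE_IDLE   0x0002U
#define NODE_STATE_DRAIN  0x0200U	/* example of a flag that must survive */

int main(void)
{
	uint32_t state = NODE_STATE_IDLE | NODE_STATE_DRAIN;

	state &= NODE_STATE_FLAGS;	/* drop the old base state, keep flags */
	state |= NODE_STATE_DOWN;	/* base state now derives from ALPS */

	printf("DRAIN preserved: %s\n",
	       (state & NODE_STATE_DRAIN) ? "yes" : "no");
	printf("base state DOWN: %s\n",
	       ((state & NODE_STATE_BASE) == NODE_STATE_DOWN) ? "yes" : "no");
	return 0;
}
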
Example 3
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		if (reason) {
			if (!IS_NODE_DOWN(node_ptr)) {
				xfree(node_ptr->reason);
				debug("MARKING %s DOWN (%s)",
				      node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down(node_ptr->name, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		if (job_iter == NULL)
			fatal("list_iterator_create: malloc failure");

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {

			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		if (job_ptr == NULL) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			basil_safe_release(rsvn->rsvn_id, inv);
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch)
		/* ALPS will take some time, do not schedule now. */
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	return rc;
}
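
_find_node_by_basil_id() is called throughout these examples but never shown. The self-contained sketch below illustrates the kind of lookup it has to perform: format the ALPS node id with the same "nid%05u" convention used for the hostlist, then search the slurm.conf node names for a match. The toy node table and find_node_by_nid() are hypothetical stand-ins for the real slurmctld node records, written only for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const char *conf_nodes[] = { "nid00001", "nid00002", "nid00042" };

static const char *find_node_by_nid(uint32_t node_id)
{
	char name[16];
	size_t i;

	snprintf(name, sizeof(name), "nid%05u", node_id);
	for (i = 0; i < sizeof(conf_nodes) / sizeof(conf_nodes[0]); i++)
		if (strcmp(conf_nodes[i], name) == 0)
			return conf_nodes[i];
	return NULL;	/* not in slurm.conf: caller logs an error and skips it */
}

int main(void)
{
	printf("nid 42 -> %s\n", find_node_by_nid(42) ? "found" : "missing");
	printf("nid  7 -> %s\n", find_node_by_nid(7) ? "found" : "missing");
	return 0;
}
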
Example 4
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = false;

	node_rank_inv = 1;
	/*
	 * When obtaining the initial configuration, we can not allow ALPS to
	 * fail. If there is a problem at this stage it is better to restart
	 * SLURM completely, after investigating (and/or fixing) the cause.
	 */
	inv = get_full_inventory(version);
	if (inv == NULL)
		fatal("failed to get BASIL %s ranking", bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");
	else if (inv->batch_total < node_cnt)
		info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
		     "check DownNodes", inv->batch_total, node_cnt);

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate to a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else if ((slurmctld_conf.fast_schedule != 2)
			   && (node->cpu_count != node_ptr->config_ptr->cpus)) {
			fatal("slurm.conf: node %s has %u cpus "
			      "but configured as CPUs=%u in your slurm.conf",
			      node_ptr->name, node->cpu_count,
			      node_ptr->config_ptr->cpus);
		} else if ((slurmctld_conf.fast_schedule != 2)
			   && (node->mem_size
			       != node_ptr->config_ptr->real_memory)) {
			fatal("slurm.conf: node %s has RealMemory=%u "
			      "but configured as RealMemory=%u in your "
			      "slurm.conf",
			      node_ptr->name, node->mem_size,
			      node_ptr->config_ptr->real_memory);
		} else {
			node_ptr->node_rank = inv->nodes_total - rank_count++;
			/*
			 * Convention: since we are using SLURM in
			 *             frontend-mode, we use
			 *             NodeHostName as follows.
			 *
			 * NodeHostName:  c#-#c#s#n# using the  NID convention
			 *                <cabinet>-<row><chassis><slot><node>
			 * - each cabinet can accommodate 3 chassis (c1..c3)
			 * - each chassis has 8 slots               (s0..s7)
			 * - each slot contains 2 or 4 nodes        (n0..n3)
			 *   o either 2 service nodes (n0/n3)
			 *   o or 4 compute nodes     (n0..n3)
			 *   o or 2 gemini chips      (g0/g1 serving n0..n3)
			 *
			 * Example: c0-0c1s0n1
			 *          - c0- = cabinet 0
			 *          - 0   = row     0
			 *          - c1  = chassis 1
			 *          - s0  = slot    0
			 *          - n1  = node    1
			 */
			xfree(node_ptr->node_hostname);
			node_ptr->node_hostname = xstrdup(node->name);
		}

		sprintf(tmp, "nid%05u", node->node_id);
		hostlist_push_host(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		hostlist_sort(hl);
		char *name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system.  Here are the nodes alps knows "
		     "about\n%s", name);
		xfree(name);
	}
	hostlist_destroy(hl);
	node_rank_inv = 0;

	return SLURM_SUCCESS;
}
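
The NodeHostName convention documented in the comment above can be decomposed mechanically. The standalone sketch below splits a c#-#c#s#n# name into cabinet, row, chassis, slot and node; parse_cname() is a hypothetical helper written only to illustrate the format and is not something the plugin provides.

#include <stdio.h>

static int parse_cname(const char *name, unsigned int *cabinet,
		       unsigned int *row, unsigned int *chassis,
		       unsigned int *slot, unsigned int *node)
{
	/* "c0-0c1s0n1" -> cabinet 0, row 0, chassis 1, slot 0, node 1 */
	return (sscanf(name, "c%u-%uc%us%un%u",
		       cabinet, row, chassis, slot, node) == 5) ? 0 : -1;
}

int main(void)
{
	unsigned int cab, row, chassis, slot, node;

	if (parse_cname("c0-0c1s0n1", &cab, &row, &chassis, &slot, &node) == 0)
		printf("cabinet %u, row %u, chassis %u, slot %u, node %u\n",
		       cab, row, chassis, slot, node);
	return 0;
}
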
Example 5
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	static bool logged_sync_timeout = false;
	static time_t last_inv_run = 0;

	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Do not check inv->batch_avail here: with gang scheduling, a fully
	 * allocated system is normal, and returning an error for it would
	 * mean the slurmctld schedule() function never runs.
	 */
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		char *reason = NULL;

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Changed to ignore reservations for "UNKNOWN" batch
		 * ids (e.g. the interactive region) (Chris North)
		 */

		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until waiting at least
		 * SyncTimeout seconds. */
		if (slurm_alps_mismatch_time == 0) {
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}
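
The SyncTimeout handling at the end of basil_inventory() is a small state machine driven by two static variables. The standalone sketch below restates that control flow with hypothetical names (defer_scheduling, mismatch_since) so the cases are easier to follow: the clock starts on the first mismatch, scheduling is deferred while the mismatch is younger than sync_timeout, the give-up message is logged once, and a sync_timeout of 0 falls through without deferring or logging, mirroring the code above.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Returns true if the caller should defer scheduling for now. */
static bool defer_scheduling(bool mismatch, unsigned int sync_timeout)
{
	static time_t mismatch_since = 0;
	static bool logged = false;
	time_t now = time(NULL);

	if (!mismatch) {
		mismatch_since = 0;	/* back in sync: reset the clock */
		logged = false;
		return false;
	}
	if (mismatch_since == 0) {
		mismatch_since = now;	/* first mismatch: start the clock */
	} else if (sync_timeout == 0) {
		/* SyncTimeout of 0: fall through, neither defer nor log */
	} else if (difftime(now, mismatch_since) < sync_timeout) {
		return true;		/* still inside the grace period */
	} else if (!logged) {
		fprintf(stderr, "not in sync for %u seconds, "
			"proceeding with scheduling\n", sync_timeout);
		logged = true;
	}
	return false;
}

int main(void)
{
	printf("first call:  defer=%d\n", defer_scheduling(true, 60));	/* 0 */
	printf("second call: defer=%d\n", defer_scheduling(true, 60));	/* 1 */
	return 0;
}
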
Example 6
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
    enum basil_version version = get_basil_version();
    struct basil_inventory *inv;
    struct basil_node *node;
    int rank_count = 0, i;
    hostlist_t hl = hostlist_create(NULL);
    bool bad_node = false;

    /*
     * When obtaining the initial configuration, we can not allow ALPS to
     * fail. If there is a problem at this stage it is better to restart
     * SLURM completely, after investigating (and/or fixing) the cause.
     */
    inv = get_full_inventory(version);
    if (inv == NULL)
        fatal("failed to get BASIL %s ranking", bv_names_long[version]);
    else if (!inv->batch_total)
        fatal("system has no usable batch compute nodes");
    else if (inv->batch_total < node_cnt)
        info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
             "check DownNodes", inv->batch_total, node_cnt);

    debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
          bv_names_long[version], inv->batch_avail, inv->batch_total);

    /*
     * Node ranking is based on a subset of the inventory: only nodes in
     * batch allocation mode which are up and not allocated. Assign a
     * 'NO_VAL' rank to all other nodes, which will translate to a very
     * high value, (unsigned)-2, to put those nodes last in the ranking.
     * The rest of the code must ensure that those nodes are never chosen.
     */
    for (i = 0; i < node_cnt; i++)
        node_array[i].node_rank = NO_VAL;

    for (node = inv->f->node_head; node; node = node->next) {
        struct node_record *node_ptr;
        char tmp[50];

        /* This will ignore interactive nodes when iterating through
         * the apbasil inventory.  If we don't do this, SLURM is
         * unable to resolve the ID to a nidXXX name since it's not in
         * the slurm.conf file.  (Chris North)
         */
        if (node->role == BNR_INTER)
            continue;

        node_ptr = _find_node_by_basil_id(node->node_id);
        if (node_ptr == NULL) {
            error("nid%05u (%s node in state %s) not in slurm.conf",
                  node->node_id, nam_noderole[node->role],
                  nam_nodestate[node->state]);
            bad_node = 1;
        } else
            node_ptr->node_rank = inv->nodes_total - rank_count++;
        sprintf(tmp, "nid%05u", node->node_id);
        hostlist_push(hl, tmp);
    }
    free_inv(inv);
    if (bad_node) {
        hostlist_sort(hl);
        char *name = hostlist_ranged_string_xmalloc(hl);
        info("It appears your slurm.conf nodelist doesn't "
             "match the alps system.  Here are the nodes alps knows "
             "about\n%s", name);
        xfree(name);
    }
    hostlist_destroy(hl);

    return SLURM_SUCCESS;
}