Example #1
/**
 * basil_reserve  -  wrapper around rsvn_new.
 * @user:       owner of the reservation
 * @batch_id:   (numeric) job ID
 * @width:      mppwidth (aprun -n)
 * @depth:      mppdepth (aprun -d)
 * @nppn:       mppnppn  (aprun -N)
 * @mem_mb:     mppmem   (aprun -m)
 * @nppcu:      number of processors to use per compute unit
 * @ns_head:    list of requested mppnodes (will be freed if not NULL)
 * @accel_head: optional accelerator parameters
 * Returns reservation ID > 0 if ok, negative %enum basil_error on error.
 */
long basil_reserve(const char *user, const char *batch_id,
		   uint32_t width, uint32_t depth, uint32_t nppn,
		   uint32_t mem_mb, uint32_t nppcu, struct nodespec *ns_head,
		   struct basil_accel_param *accel_head)
{
	struct basil_reservation *rsvn;
	struct basil_parse_data bp = {0};
	/* do not free mppnodes; it is stored/freed in the rsvn struct */
	char *mppnodes = ns_to_string(ns_head);
	long rc;

	free_nodespec(ns_head);
	rsvn = _rsvn_new(user, batch_id, width, depth, nppn, mem_mb,
			 nppcu, mppnodes, accel_head);
	if (rsvn == NULL)
		return -BE_INTERNAL;

	bp.method    = BM_reserve;
	bp.mdata.res = rsvn;
	bp.version   = BV_1_0;
	/*
	 * Rule:
	 * - if *res->batch_id is set, we are using Basil 1.1
	 * - if *res->batch_id == '\0' we have to fall back to Basil 1.0
	 */
	if (batch_id && *batch_id)
		bp.version = get_basil_version();

	rc = basil_request(&bp);
	if (rc >= 0)
		rc = rsvn->rsvn_id;
	free_rsvn(rsvn);
	return rc;
}
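
A minimal caller sketch (not from the original source) illustrating the return convention documented above: a positive result is the new reservation ID, a negative result is a basil_error code. The helper name and all parameter values are purely illustrative.

/* Hedged sketch only: not part of the ALPS plugin. */
static long reserve_for_job(const char *user, const char *batch_id)
{
	long rc = basil_reserve(user, batch_id,
				64,	/* width  (aprun -n), illustrative */
				1,	/* depth  (aprun -d) */
				0,	/* nppn   (aprun -N) */
				0,	/* mem_mb (aprun -m) */
				0,	/* nppcu             */
				NULL,	/* ns_head           */
				NULL);	/* accel_head        */

	if (rc < 0)
		error("ALPS RESERVE for job %s failed: error %ld",
		      batch_id, -rc);
	else
		debug("ALPS reservation %ld created for job %s", rc, batch_id);
	return rc;
}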
Example #2
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = 0;

	inv = get_full_inventory(version);
	if (inv == NULL)
		/* FIXME: should retry here if the condition is transient */
		fatal("failed to get BASIL %s ranking", bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate as a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else
			node_ptr->node_rank = inv->nodes_total - rank_count++;
		sprintf(tmp, "nid%05u", node->node_id);
		hostlist_push(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		hostlist_sort(hl);
		char *name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system.  Here are the nodes alps knows "
		     "about\n%s", name);
		xfree(name);
	}
	hostlist_destroy(hl);

	return SLURM_SUCCESS;
}
Example #3
static int rsvn_release(struct basil_reservation *res)
{
	struct basil_parse_data bp = {0};

	bp.method    = BM_release;
	bp.mdata.res = res;
	bp.version   = get_basil_version();
	/* NOTE - for simplicity we could use BV_1_0 here */

	return basil_request(&bp);
}
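
rsvn_release() expects a full struct basil_reservation, although only the reservation ID should matter for the RELEASE method. A hedged sketch of a by-ID wrapper (hypothetical helper, not from the original source), using the same stack-allocation pattern that basil_switch() uses in the next example:

/* Hedged sketch only: release an ALPS reservation given just its ID. */
static int release_by_id(uint32_t rsvn_id)
{
	struct basil_reservation rsvn = {0};

	rsvn.rsvn_id = rsvn_id;
	return rsvn_release(&rsvn);
}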
Example #4
/**
 * basil_switch - suspend/resume an existing reservation
 * @rsvn_id:    the reservation id
 * @suspend:    to suspend or not to suspend
 * Returns 0 if ok, a negative %basil_error otherwise.
 */
int basil_switch(uint32_t rsvn_id, bool suspend)
{
	struct basil_reservation rsvn = {0};
	struct basil_parse_data bp = {0};

	rsvn.rsvn_id = rsvn_id;
	rsvn.suspended = suspend;

	bp.method    = BM_switch;
	bp.mdata.res = &rsvn;
	bp.version   = get_basil_version();
	/* NOTE - for simplicity we could use BV_1_0 here */

	return basil_request(&bp);
}
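
Since basil_switch() takes just the reservation ID and the desired direction, a suspend/resume cycle is two calls. A hedged usage sketch (hypothetical helper, not from the original source):

/* Hedged sketch only: suspend a reservation, do something else, then
 * resume it.  Error handling follows the sign convention above. */
static int suspend_resume_cycle(uint32_t rsvn_id)
{
	int rc = basil_switch(rsvn_id, true);	/* suspend */

	if (rc < 0)
		return rc;
	/* ... higher-priority work would run here ... */
	return basil_switch(rsvn_id, false);	/* resume */
}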
Example #5
/**
 * basil_signal_apids  -  send a signal to all APIDs of a given ALPS reservation
 * @rsvn_id:	reservation ID to target
 * @signal:	signal number
 * @inv:	recent Basil Inventory, or NULL to generate internally
 * Returns 0 if ok, a negative %basil_error otherwise.
 */
int basil_signal_apids(int32_t rsvn_id, int signal, struct basil_inventory *inv)
{
	struct basil_inventory *new_inv = inv;
	uint64_t *apid, *apids;
	char cmd[512];

	if (access(cray_conf->apkill, X_OK) < 0) {
		error("FATAL: can not execute the apkill command '%s'",
		      cray_conf->apkill);
		return -BE_SYSTEM;
	}

	if (inv == NULL)
		new_inv = get_full_inventory(get_basil_version());
	if (new_inv == NULL) {
		error("can not obtain a BASIL inventory to get APID list");
		return -(BE_INTERNAL | BE_TRANSIENT);
	}

	apids = basil_get_rsvn_aprun_apids(new_inv, rsvn_id);
	if (apids) {
		for (apid = apids; *apid; apid++) {
			debug2("ALPS resId %u, running apkill -%d %llu",
				rsvn_id, signal, (unsigned long long)*apid);
			snprintf(cmd, sizeof(cmd), "%s -%d %llu",
				 cray_conf->apkill, signal,
				 (unsigned long long)*apid);
			if (system(cmd) < 0)
				error("system(%s) failed", cmd);
		}
		xfree(apids);
	}
	if (inv == NULL)
		free_inv(new_inv);
	return BE_NONE;
}
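
A hedged sketch of how a caller might drive basil_signal_apids() when tearing a job down, escalating from SIGTERM to SIGKILL; the helper name and the grace period are assumptions, not taken from the original source:

#include <signal.h>
#include <unistd.h>

/* Hedged sketch only: signal all apruns of a reservation twice, letting
 * basil_signal_apids() fetch its own inventory each time (inv == NULL). */
static int terminate_reservation_apids(int32_t rsvn_id)
{
	int rc = basil_signal_apids(rsvn_id, SIGTERM, NULL);

	if (rc < 0)
		return rc;
	sleep(2);		/* illustrative grace period only */
	return basil_signal_apids(rsvn_id, SIGKILL, NULL);
}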
Example #6
/**
 * basil_get_initial_state  -  set SLURM initial node state from ALPS.
 *
 * The logic is identical to basil_inventory(), with the difference that this
 * is called before valid bitmaps exist, from select_g_node_init(). It relies
 * on the following other parts:
 * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
 * - it relies on _sync_nodes_to_jobs() to
 *   o kill active jobs on nodes now marked DOWN,
 *   o reset node state to ALLOCATED if it has been marked IDLE here (which is
 *     an error case, since there is no longer an ALPS reservation for the job;
 *     this is caught by the subsequent basil_inventory()).
 * Return: SLURM_SUCCESS if ok, non-zero on error.
 */
static int basil_get_initial_state(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INITIAL INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL)
			continue;

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		node_ptr->node_state &= NODE_STATE_FLAGS;
		if (reason) {
			if (node_ptr->reason) {
				debug3("Initial DOWN node %s - %s",
					node_ptr->name, node_ptr->reason);
			} else {
				debug("Initial DOWN node %s - %s",
					node_ptr->name, reason);
				node_ptr->reason = xstrdup(reason);
			}
			node_ptr->node_state |= NODE_STATE_DOWN;
		} else {
			if (node_is_allocated(node))
				node_ptr->node_state |= NODE_STATE_ALLOCATED;
			else
				node_ptr->node_state |= NODE_STATE_IDLE;
			xfree(node_ptr->reason);
		}
	}
	free_inv(inv);
	return SLURM_SUCCESS;
}
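
The state-to-reason if/else chain above reappears verbatim in the inventory and geometry examples below; it can be read as a pure lookup. A hedged sketch of such a helper (hypothetical, not part of the original source), assuming only the BNS_*/BNR_*/BNA_* enumerators already used above:

/* Hedged sketch only: map an ALPS inventory entry to the DOWN reason
 * used by these examples, or NULL if the node is usable for batch work. */
static const char *alps_down_reason(const struct basil_node *node)
{
	if (node->state == BNS_DOWN)
		return "ALPS marked it DOWN";
	if (node->state == BNS_UNAVAIL)
		return "node is UNAVAILABLE";
	if (node->state == BNS_ROUTE)
		return "node does ROUTING";
	if (node->state == BNS_SUSPECT)
		return "entered SUSPECT mode";
	if (node->state == BNS_ADMINDOWN)
		return "node is ADMINDOWN";
	if (node->state != BNS_UP)
		return "state not UP";
	if (node->role != BNR_BATCH)
		return "mode not BATCH";
	if (node->arch != BNA_XT)
		return "arch not XT/XE";
	return NULL;
}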
Example #7
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		if (reason) {
			if (!IS_NODE_DOWN(node_ptr)) {
				xfree(node_ptr->reason);
				debug("MARKING %s DOWN (%s)",
				      node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down(node_ptr->name, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		if (job_iter == NULL)
			fatal("list_iterator_create: malloc failure");

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {

			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		if (job_ptr == NULL) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			basil_safe_release(rsvn->rsvn_id, inv);
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch)
		/* ALPS will take some time, do not schedule now. */
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	return rc;
}
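
The job-list scan inside the orphaned-reservation loop can be isolated as a lookup. A hedged sketch (hypothetical helper, not from the original source), reusing only the list API and the _get_select_jobinfo() accessor already shown above:

/* Hedged sketch only: return the SLURM job owning ALPS reservation
 * rsvn_id, or NULL if none exists (i.e. the reservation is orphaned). */
static struct job_record *find_job_by_resv_id(uint32_t rsvn_id)
{
	ListIterator job_iter = list_iterator_create(job_list);
	struct job_record *job_ptr;
	uint32_t resv_id;

	while ((job_ptr = (struct job_record *)list_next(job_iter))) {
		if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_RESV_ID,
					&resv_id) == SLURM_SUCCESS &&
		    resv_id == rsvn_id)
			break;
	}
	list_iterator_destroy(job_iter);
	return job_ptr;
}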
Example #8
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = 0;

	node_rank_inv = 1;
	/*
	 * When obtaining the initial configuration, we can not allow ALPS to
	 * fail. If there is a problem at this stage it is better to restart
	 * SLURM completely, after investigating (and/or fixing) the cause.
	 */
	inv = get_full_inventory(version);
	if (inv == NULL)
		fatal("failed to get BASIL %s ranking", bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");
	else if (inv->batch_total < node_cnt)
		info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
		     "check DownNodes", inv->batch_total, node_cnt);

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate as a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else if ((slurmctld_conf.fast_schedule != 2)
			   && (node->cpu_count != node_ptr->config_ptr->cpus)) {
			fatal("slurm.conf: node %s has %u cpus "
			      "but configured as CPUs=%u in your slurm.conf",
			      node_ptr->name, node->cpu_count,
			      node_ptr->config_ptr->cpus);
		} else if ((slurmctld_conf.fast_schedule != 2)
			   && (node->mem_size
			       != node_ptr->config_ptr->real_memory)) {
			fatal("slurm.conf: node %s has RealMemory=%u "
			      "but configured as RealMemory=%u in your "
			      "slurm.conf",
			      node_ptr->name, node->mem_size,
			      node_ptr->config_ptr->real_memory);
		} else {
			node_ptr->node_rank = inv->nodes_total - rank_count++;
			/*
			 * Convention: since we are using SLURM in
			 *             frontend-mode, we use
			 *             NodeHostName as follows.
			 *
			 * NodeHostName:  c#-#c#s#n# using the  NID convention
			 *                <cabinet>-<row><chassis><slot><node>
			 * - each cabinet can accommodate 3 chassis (c1..c3)
			 * - each chassis has 8 slots               (s0..s7)
			 * - each slot contains 2 or 4 nodes        (n0..n3)
			 *   o either 2 service nodes (n0/n3)
			 *   o or 4 compute nodes     (n0..n3)
			 *   o or 2 gemini chips      (g0/g1 serving n0..n3)
			 *
			 * Example: c0-0c1s0n1
			 *          - c0- = cabinet 0
			 *          - 0   = row     0
			 *          - c1  = chassis 1
			 *          - s0  = slot    0
			 *          - n1  = node    1
			 */
			xfree(node_ptr->node_hostname);
			node_ptr->node_hostname = xstrdup(node->name);
		}

		sprintf(tmp, "nid%05u", node->node_id);
		hostlist_push_host(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		hostlist_sort(hl);
		char *name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system.  Here are the nodes alps knows "
		     "about\n%s", name);
		xfree(name);
	}
	hostlist_destroy(hl);
	node_rank_inv = 0;

	return SLURM_SUCCESS;
}
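
The NodeHostName convention documented in the comment above (c#-#c#s#n#) is built explicitly from SDB columns in Example #11 below. A hedged sketch of just that formatting step (hypothetical wrapper), assuming the xstrdup_printf() helper used there:

/* Hedged sketch only: compose the Cray NodeHostName from the physical
 * location fields (cabinet, row, chassis/cage, slot, node). */
static char *make_node_hostname(unsigned int cab, unsigned int row,
				unsigned int cage, unsigned int slot,
				unsigned int cpu)
{
	/* e.g. cab=0, row=0, cage=1, slot=0, cpu=1  ->  "c0-0c1s0n1" */
	return xstrdup_printf("c%u-%uc%us%un%u", cab, row, cage, slot, cpu);
}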
Example #9
/**
 * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates.
 *
 * Checks both SDB database and ALPS inventory for consistency. The inventory
 * part is identical to basil_inventory(), with the difference of being called
 * before valid bitmaps exist, from select_g_node_init().
 * Its dependencies are:
 * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
 * - it relies on _sync_nodes_to_jobs() to
 *   o kill active jobs on nodes now marked DOWN,
 *   o reset node state to ALLOCATED if it has been marked IDLE here (which is
 *     an error case, since there is no longer an ALPS reservation for the job;
 *     this is caught by the subsequent basil_inventory()).
 */
extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt)
{
	struct node_record *node_ptr, *end = node_ptr_array + node_cnt;
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;

	/* General mySQL */
	MYSQL		*handle;
	MYSQL_STMT	*stmt = NULL;
	/* Input parameters */
	unsigned int	node_id;
	/*
	 * Use a left outer join here since the attributes table may not be
	 * populated for a given nodeid (e.g. when the node has been disabled
	 * on the SMW via 'xtcli disable').
	 * The processor table has more authoritative information, if a nodeid
	 * is not listed there, it does not exist.
	 */
	const char query[] =	"SELECT x_coord, y_coord, z_coord, "
		"processor_type FROM processor WHERE processor_id = ? ";
	const int	PARAM_COUNT = 1;	/* node id */
	MYSQL_BIND	params[PARAM_COUNT];

	int		x_coord, y_coord, z_coord;
	char		proc_type[BASIL_STRING_SHORT];
	MYSQL_BIND	bind_cols[COLUMN_COUNT];
	my_bool		is_null[COLUMN_COUNT];
	my_bool		is_error[COLUMN_COUNT];
	int		is_gemini, i;
	time_t		now = time(NULL);

	memset(params, 0, sizeof(params));
	params[0].buffer_type = MYSQL_TYPE_LONG;
	params[0].is_unsigned = true;
	params[0].is_null     = (my_bool *)0;
	params[0].buffer      = (char *)&node_id;

	memset(bind_cols, 0, sizeof(bind_cols));
	for (i = 0; i < COLUMN_COUNT; i ++) {
		bind_cols[i].is_null = &is_null[i];
		bind_cols[i].error   = &is_error[i];

		if (i == COL_TYPE) {
			bind_cols[i].buffer_type   = MYSQL_TYPE_STRING;
			bind_cols[i].buffer_length = sizeof(proc_type);
			bind_cols[i].buffer	   = proc_type;
		} else {
			bind_cols[i].buffer_type   = MYSQL_TYPE_LONG;
			bind_cols[i].is_unsigned   = (i >= COL_TYPE);
		}
	}
	bind_cols[COL_X].buffer	     = (char *)&x_coord;
	bind_cols[COL_Y].buffer	     = (char *)&y_coord;
	bind_cols[COL_Z].buffer	     = (char *)&z_coord;

	inv = get_full_inventory(version);
	if (inv == NULL)
		fatal("failed to get initial BASIL inventory");

	info("BASIL %s initial INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	handle = cray_connect_sdb();
	if (handle == NULL)
		fatal("can not connect to XTAdmin database on the SDB");

	is_gemini = cray_is_gemini_system(handle);
	if (is_gemini < 0)
		fatal("can not determine Cray XT/XE system type");

	stmt = prepare_stmt(handle, query, params, PARAM_COUNT,
			    bind_cols, COLUMN_COUNT);
	if (stmt == NULL)
		fatal("can not prepare statement to resolve Cray coordinates");

	for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) {
		struct basil_node *node;
		char *reason = NULL;

		if ((node_ptr->name == NULL) ||
		    (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) {
			error("can not read basil_node_id from %s",
				node_ptr->name);
			continue;
		}

		if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0)
			fatal("can not resolve %s coordinates", node_ptr->name);

		if (fetch_stmt(stmt) == 0) {
#if _DEBUG
			info("proc_type:%s xyz:%u:%u:%u",
			     proc_type, x_coord, y_coord, z_coord);
#endif
			if (xstrcmp(proc_type, "compute") != 0) {
				/*
				 * Switching a compute node to be a service node
				 * can not happen at runtime: requires a reboot.
				 */
				fatal("Node '%s' is a %s node. "
				      "Only compute nodes can appear in slurm.conf.",
					node_ptr->name, proc_type);
			} else if (is_null[COL_X] || is_null[COL_Y]
				   || is_null[COL_Z]) {
				/*
				 * Similar case to the one above, observed when
				 * a blade has been removed. Node will not
				 * likely show up in ALPS.
				 */
				x_coord = y_coord = z_coord = 0;
				reason = "unknown coordinates - hardware failure?";
			}

		} else if (is_gemini) {
Example #10
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	static bool logged_sync_timeout = false;
	static time_t last_inv_run = 0;

	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/* Avoid checking for inv->batch_avail here since, if we are
	   gang scheduling, returning an error for a full system is
	   probably the wrong thing to do (the schedule() function
	   in the slurmctld would never run ;)).
	*/
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		char *reason = NULL;

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Changed to ignore reservations for "UNKNOWN" batch
		 * ids (e.g. the interactive region) (Chris North)
		 */

		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until waiting at least
		 * SyncTimeout seconds. */
		if (slurm_alps_mismatch_time == 0) {
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}
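
The SyncTimeout handling at the end decides between three outcomes: start the mismatch clock, keep deferring, or give up and log once. A hedged sketch of the deferral test in isolation (hypothetical helper), assuming the cray_conf->sync_timeout field used above:

/* Hedged sketch only: should scheduling still be deferred while SLURM
 * and ALPS disagree?  A sync_timeout of 0 means wait indefinitely. */
static bool defer_for_alps_sync(time_t mismatch_since, time_t now)
{
	if (cray_conf->sync_timeout == 0)
		return true;
	return difftime(now, mismatch_since) < cray_conf->sync_timeout;
}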
Example #11
/**
 * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates.
 *
 * Checks both SDB database and ALPS inventory for consistency. The inventory
 * part is identical to basil_inventory(), with the difference of being called
 * before valid bitmaps exist, from select_g_node_init().
 * Its dependencies are:
 * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
 * - it relies on _sync_nodes_to_jobs() to
 *   o kill active jobs on nodes now marked DOWN,
 *   o reset node state to ALLOCATED if it has been marked IDLE here (which is
 *     an error case, since there is no longer an ALPS reservation for the job;
 *     this is caught by the subsequent basil_inventory()).
 */
extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt)
{
    struct node_record *node_ptr, *end = node_ptr_array + node_cnt;
    enum basil_version version = get_basil_version();
    struct basil_inventory *inv;

    /* General mySQL */
    MYSQL		*handle;
    MYSQL_STMT	*stmt = NULL;
    /* Input parameters */
    unsigned int	node_id;
    /*
     * Use a left outer join here since the attributes table may not be
     * populated for a given nodeid (e.g. when the node has been disabled
     * on the SMW via 'xtcli disable').
     * The processor table has more authoritative information, if a nodeid
     * is not listed there, it does not exist.
     */
    const char query[] =	"SELECT x_coord, y_coord, z_coord,"
                            "       cab_position, cab_row, cage, slot, cpu,"
                            "	LOG2(coremask+1), availmem, "
                            "       processor_type  "
                            "FROM  processor LEFT JOIN attributes "
                            "ON    processor_id = nodeid "
                            "WHERE processor_id = ? ";
    const int	PARAM_COUNT = 1;	/* node id */
    MYSQL_BIND	params[PARAM_COUNT];

    int		x_coord, y_coord, z_coord;
    int		cab, row, cage, slot, cpu;
    unsigned int	node_cpus, node_mem;
    char		proc_type[BASIL_STRING_SHORT];
    MYSQL_BIND	bind_cols[COLUMN_COUNT];
    my_bool		is_null[COLUMN_COUNT];
    my_bool		is_error[COLUMN_COUNT];
    int		is_gemini, i;

    memset(params, 0, sizeof(params));
    params[0].buffer_type = MYSQL_TYPE_LONG;
    params[0].is_unsigned = true;
    params[0].is_null     = (my_bool *)0;
    params[0].buffer      = (char *)&node_id;

    memset(bind_cols, 0, sizeof(bind_cols));
    for (i = 0; i < COLUMN_COUNT; i ++) {
        bind_cols[i].is_null = &is_null[i];
        bind_cols[i].error   = &is_error[i];

        if (i == COL_TYPE) {
            bind_cols[i].buffer_type   = MYSQL_TYPE_STRING;
            bind_cols[i].buffer_length = sizeof(proc_type);
            bind_cols[i].buffer	   = proc_type;
        } else {
            bind_cols[i].buffer_type   = MYSQL_TYPE_LONG;
            bind_cols[i].is_unsigned   = (i >= COL_CORES);
        }
    }
    bind_cols[COL_X].buffer	     = (char *)&x_coord;
    bind_cols[COL_Y].buffer	     = (char *)&y_coord;
    bind_cols[COL_Z].buffer	     = (char *)&z_coord;
    bind_cols[COL_CAB].buffer    = (char *)&cab;
    bind_cols[COL_ROW].buffer    = (char *)&row;
    bind_cols[COL_CAGE].buffer   = (char *)&cage;
    bind_cols[COL_SLOT].buffer   = (char *)&slot;
    bind_cols[COL_CPU].buffer    = (char *)&cpu;
    bind_cols[COL_CORES].buffer  = (char *)&node_cpus;
    bind_cols[COL_MEMORY].buffer = (char *)&node_mem;

    inv = get_full_inventory(version);
    if (inv == NULL)
        fatal("failed to get initial BASIL inventory");

    info("BASIL %s initial INVENTORY: %d/%d batch nodes available",
         bv_names_long[version], inv->batch_avail, inv->batch_total);

    handle = cray_connect_sdb();
    if (handle == NULL)
        fatal("can not connect to XTAdmin database on the SDB");

    is_gemini = cray_is_gemini_system(handle);
    if (is_gemini < 0)
        fatal("can not determine Cray XT/XE system type");

    stmt = prepare_stmt(handle, query, params, PARAM_COUNT,
                        bind_cols, COLUMN_COUNT);
    if (stmt == NULL)
        fatal("can not prepare statement to resolve Cray coordinates");

    for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) {
        struct basil_node *node;
        char *reason = NULL;

        if ((node_ptr->name == NULL) ||
                (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) {
            error("can not read basil_node_id from %s",
                  node_ptr->name);
            continue;
        }

        if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0)
            fatal("can not resolve %s coordinates", node_ptr->name);

        if (fetch_stmt(stmt) == 0) {
#if _DEBUG
            info("proc_type:%s cpus:%u memory:%u",
                 proc_type, node_cpus, node_mem);
            info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u",
                 row, cage, slot, cpu, x_coord, y_coord, z_coord);
#endif
            if (strcmp(proc_type, "compute") != 0) {
                /*
                 * Switching a compute node to be a service node
                 * can not happen at runtime: requires a reboot.
                 */
                fatal("Node '%s' is a %s node. "
                      "Only compute nodes can appear in slurm.conf.",
                      node_ptr->name, proc_type);
            } else if (is_null[COL_CORES] || is_null[COL_MEMORY]) {
                /*
                 * This can happen if a node has been disabled
                 * on the SMW (using 'xtcli disable <nid>'). The
                 * node will still be listed in the 'processor'
                 * table, but have no 'attributes' entry (NULL
                 * values for CPUs/memory). Also, the node will
                 * be invisible to ALPS, which is why we need to
                 * set it down here already.
                 */
                node_cpus = node_mem = 0;
                reason = "node data unknown - disabled on SMW?";
            } else if (is_null[COL_X] || is_null[COL_Y]
                       || is_null[COL_Z]) {
                /*
                 * Similar case to the one above, observed when
                 * a blade has been removed. Node will not
                 * likely show up in ALPS.
                 */
                x_coord = y_coord = z_coord = 0;
                reason = "unknown coordinates - hardware failure?";
            } else if (node_cpus < node_ptr->config_ptr->cpus) {
                /*
                 * FIXME: Might reconsider this policy.
                 *
                 * FastSchedule is ignored here, it requires the
                 * slurm.conf to be consistent with hardware.
                 *
                 * Assumption is that CPU/Memory do not change
                 * at runtime (Cray has no hot-swappable parts).
                 *
                 * Hence checking it in basil_inventory() would
                 * mean a lot of runtime overhead.
                 */
                fatal("slurm.conf: node %s has only Procs=%d",
                      node_ptr->name, node_cpus);
            } else if (node_mem < node_ptr->config_ptr->real_memory) {
                fatal("slurm.conf: node %s has RealMemory=%d",
                      node_ptr->name, node_mem);
            }

        } else if (is_gemini) {
            fatal("Non-existing Gemini node '%s' in slurm.conf",
                  node_ptr->name);
        } else {
            fatal("Non-existing SeaStar node '%s' in slurm.conf",
                  node_ptr->name);
        }

        if (!is_gemini) {
            /*
             * SeaStar: each node has unique coordinates
             */
            if (node_ptr->arch == NULL)
                node_ptr->arch = xstrdup("XT");
        } else {
            /*
             * Gemini: each 2 nodes share the same network
             * interface (i.e., nodes 0/1 and 2/3 each have
             * the same coordinates).
             */
            if (node_ptr->arch == NULL)
                node_ptr->arch = xstrdup("XE");
        }

        xfree(node_ptr->node_hostname);
        xfree(node_ptr->comm_name);
        /*
         * Convention: since we are using SLURM in frontend-mode,
         *             we use Node{Addr,HostName} as follows.
         *
         * NodeAddr:      <X><Y><Z> coordinates in base-36 encoding
         *
         * NodeHostName:  c#-#c#s#n# using the  NID convention
         *                <cabinet>-<row><chassis><slot><node>
         * - each cabinet can accommodate 3 chassis (c1..c3)
         * - each chassis has 8 slots               (s0..s7)
         * - each slot contains 2 or 4 nodes        (n0..n3)
         *   o either 2 service nodes (n0/n3)
         *   o or 4 compute nodes     (n0..n3)
         *   o or 2 gemini chips      (g0/g1 serving n0..n3)
         *
         * Example: c0-0c1s0n1
         *          - c0- = cabinet 0
         *          - 0   = row     0
         *          - c1  = chassis 1
         *          - s0  = slot    0
         *          - n1  = node    1
         */
        node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u", cab,
                                  row, cage, slot, cpu);
        node_ptr->comm_name = xstrdup_printf("%c%c%c",
                                             _enc_coord(x_coord),
                                             _enc_coord(y_coord),
                                             _enc_coord(z_coord));
        dim_size[0] = MAX(dim_size[0], (x_coord - 1));
        dim_size[1] = MAX(dim_size[1], (y_coord - 1));
        dim_size[2] = MAX(dim_size[2], (z_coord - 1));
#if _DEBUG
        info("%s  %s  %s  cpus=%u, mem=%u", node_ptr->name,
             node_ptr->node_hostname, node_ptr->comm_name,
             node_cpus, node_mem);
#endif
        /*
         * Check the current state reported by ALPS inventory, unless it
         * is already evident that the node has some other problem.
         */
        if (reason == NULL) {
            for (node = inv->f->node_head; node; node = node->next)
                if (node->node_id == node_id)
                    break;
            if (node == NULL) {
                reason = "not visible to ALPS - check hardware";
            } else if (node->state == BNS_DOWN) {
                reason = "ALPS marked it DOWN";
            } else if (node->state == BNS_UNAVAIL) {
                reason = "node is UNAVAILABLE";
            } else if (node->state == BNS_ROUTE) {
                reason = "node does ROUTING";
            } else if (node->state == BNS_SUSPECT) {
                reason = "entered SUSPECT mode";
            } else if (node->state == BNS_ADMINDOWN) {
                reason = "node is ADMINDOWN";
            } else if (node->state != BNS_UP) {
                reason = "state not UP";
            } else if (node->role != BNR_BATCH) {
                reason = "mode not BATCH";
            } else if (node->arch != BNA_XT) {
                reason = "arch not XT/XE";
            }
        }

        /* Base state entirely derives from ALPS */
        node_ptr->node_state &= NODE_STATE_FLAGS;
        if (reason) {
            if (node_ptr->reason) {
                debug("Initial DOWN node %s - %s",
                      node_ptr->name, node_ptr->reason);
            } else {
                info("Initial DOWN node %s - %s",
                     node_ptr->name, reason);
                node_ptr->reason = xstrdup(reason);
            }
            node_ptr->node_state |= NODE_STATE_DOWN;
        } else {
            if (node_is_allocated(node))
                node_ptr->node_state |= NODE_STATE_ALLOCATED;
            else
                node_ptr->node_state |= NODE_STATE_IDLE;
            xfree(node_ptr->reason);
        }

        free_stmt_result(stmt);
    }

    if (stmt_close(stmt))
        error("error closing statement: %s", mysql_stmt_error(stmt));
    cray_close_sdb(handle);
    free_inv(inv);

    return SLURM_SUCCESS;
}
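
_enc_coord() is called above but not listed in these examples; the comment only states that NodeAddr packs each coordinate into one base-36 digit. A hedged sketch of what such an encoder could look like under that assumption (digits 0-9 then lowercase letters; the real implementation may differ, e.g. in letter case or range handling):

/* Hedged sketch only: encode one torus coordinate as a single base-36
 * character, per the "<X><Y><Z> coordinates in base-36 encoding" note. */
static char enc_coord_sketch(int coord)
{
    if (coord < 0 || coord > 35)
        return '?';    /* not representable in one digit */
    return coord < 10 ? '0' + coord : 'a' + (coord - 10);
}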
Example #12
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
    enum basil_version version = get_basil_version();
    struct basil_inventory *inv;
    struct basil_node *node;
    int rank_count = 0, i;
    hostlist_t hl = hostlist_create(NULL);
    bool bad_node = 0;

    /*
     * When obtaining the initial configuration, we can not allow ALPS to
     * fail. If there is a problem at this stage it is better to restart
     * SLURM completely, after investigating (and/or fixing) the cause.
     */
    inv = get_full_inventory(version);
    if (inv == NULL)
        fatal("failed to get BASIL %s ranking", bv_names_long[version]);
    else if (!inv->batch_total)
        fatal("system has no usable batch compute nodes");
    else if (inv->batch_total < node_cnt)
        info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
             "check DownNodes", inv->batch_total, node_cnt);

    debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
          bv_names_long[version], inv->batch_avail, inv->batch_total);

    /*
     * Node ranking is based on a subset of the inventory: only nodes in
     * batch allocation mode which are up and not allocated. Assign a
     * 'NO_VAL' rank to all other nodes, which will translate as a very
     * high value, (unsigned)-2, to put those nodes last in the ranking.
     * The rest of the code must ensure that those nodes are never chosen.
     */
    for (i = 0; i < node_cnt; i++)
        node_array[i].node_rank = NO_VAL;

    for (node = inv->f->node_head; node; node = node->next) {
        struct node_record *node_ptr;
        char tmp[50];

        /* This will ignore interactive nodes when iterating through
         * the apbasil inventory.  If we don't do this, SLURM is
         * unable to resolve the ID to a nidXXX name since it's not in
         * the slurm.conf file.  (Chris North)
         */
        if (node->role == BNR_INTER)
            continue;

        node_ptr = _find_node_by_basil_id(node->node_id);
        if (node_ptr == NULL) {
            error("nid%05u (%s node in state %s) not in slurm.conf",
                  node->node_id, nam_noderole[node->role],
                  nam_nodestate[node->state]);
            bad_node = 1;
        } else
            node_ptr->node_rank = inv->nodes_total - rank_count++;
        sprintf(tmp, "nid%05u", node->node_id);
        hostlist_push(hl, tmp);
    }
    free_inv(inv);
    if (bad_node) {
        hostlist_sort(hl);
        char *name = hostlist_ranged_string_xmalloc(hl);
        info("It appears your slurm.conf nodelist doesn't "
             "match the alps system.  Here are the nodes alps knows "
             "about\n%s", name);
        xfree(name);
    }
    hostlist_destroy(hl);

    return SLURM_SUCCESS;
}