/** * basil_reserve - wrapper around rsvn_new. * @user: owner of the reservation * @batch_id: (numeric) job ID * @width: mppwidth (aprun -n) * @depth: mppdepth (aprun -d) * @nppn: mppnppn (aprun -N) * @mem_mb: mppmem (aprun -m) * @ns_head: list of requested mppnodes (will be freed if not NULL) * @accel_head: optional accelerator parameters * Returns reservation ID > 0 if ok, negative %enum basil_error on error. */ long basil_reserve(const char *user, const char *batch_id, uint32_t width, uint32_t depth, uint32_t nppn, uint32_t mem_mb, uint32_t nppcu, struct nodespec *ns_head, struct basil_accel_param *accel_head) { struct basil_reservation *rsvn; struct basil_parse_data bp = {0}; /* do not free mppnodes it is stored/freed in the rsvn struct */ char *mppnodes = ns_to_string(ns_head); long rc; free_nodespec(ns_head); rsvn = _rsvn_new(user, batch_id, width, depth, nppn, mem_mb, nppcu, mppnodes, accel_head); if (rsvn == NULL) return -BE_INTERNAL; bp.method = BM_reserve; bp.mdata.res = rsvn; bp.version = BV_1_0; /* * Rule: * - if *res->batch_id is set, we are using Basil 1.1 * - if *res->batch_id == '\0' we have to fall back to Basil 1.0 */ if (batch_id && *batch_id) bp.version = get_basil_version(); rc = basil_request(&bp); if (rc >= 0) rc = rsvn->rsvn_id; free_rsvn(rsvn); return rc; }
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; inv = get_full_inventory(version); if (inv == NULL) /* FIXME: should retry here if the condition is transient */ fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else node_ptr->node_rank = inv->nodes_total - rank_count++; sprintf(tmp, "nid%05u", node->node_id); hostlist_push(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); return SLURM_SUCCESS; }
static int rsvn_release(struct basil_reservation *res) { struct basil_parse_data bp = {0}; bp.method = BM_release; bp.mdata.res = res; bp.version = get_basil_version(); /* NOTE - for simplicity we could use BV_1_0 here */ return basil_request(&bp); }
/** * basil_switch - suspend/resume an existing reservation * @rsvn_id: the reservation id * @suspend: to suspend or not to suspend * Returns 0 if ok, a negative %basil_error otherwise. * */ int basil_switch(uint32_t rsvn_id, bool suspend) { struct basil_reservation rsvn = {0}; struct basil_parse_data bp = {0}; rsvn.rsvn_id = rsvn_id; rsvn.suspended = suspend; bp.method = BM_switch; bp.mdata.res = &rsvn; bp.version = get_basil_version(); /* NOTE - for simplicity we could use BV_1_0 here */ return basil_request(&bp); }
/** * basil_signal_apids - send a signal to all APIDs of a given ALPS reservation * @rsvn_id: reservation ID to target * @signal: signal number * @inv: recent Basil Inventory, or NULL to generate internally * Returns 0 if ok, a negative %basil_error otherwise. */ int basil_signal_apids(int32_t rsvn_id, int signal, struct basil_inventory *inv) { struct basil_inventory *new_inv = inv; uint64_t *apid, *apids; char cmd[512]; if (access(cray_conf->apkill, X_OK) < 0) { error("FATAL: can not execute the apkill command '%s'", cray_conf->apkill); return -BE_SYSTEM; } if (inv == NULL) new_inv = get_full_inventory(get_basil_version()); if (new_inv == NULL) { error("can not obtain a BASIL inventory to get APID list"); return -(BE_INTERNAL | BE_TRANSIENT); } apids = basil_get_rsvn_aprun_apids(new_inv, rsvn_id); if (apids) { for (apid = apids; *apid; apid++) { debug2("ALPS resId %u, running apkill -%d %llu", rsvn_id, signal, (unsigned long long)*apid); snprintf(cmd, sizeof(cmd), "%s -%d %llu", cray_conf->apkill, signal, (unsigned long long)*apid); if (system(cmd) < 0) error("system(%s) failed", cmd); } xfree(apids); } if (inv == NULL) free_inv(new_inv); return BE_NONE; }
/** * basil_get_initial_state - set SLURM initial node state from ALPS. * * The logic is identical to basil_inventory(), with the difference that this * is called before valid bitmaps exist, from select_g_node_init(). It relies * on the following other parts: * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields, * - it relies on _sync_nodes_to_jobs() to * o kill active jobs on nodes now marked DOWN, * o reset node state to ALLOCATED if it has been marked IDLE here (which is * an error case, since there is no longer an ALPS reservation for the job, * this is caught by the subsequent basil_inventory()). * Return: SLURM_SUCCESS if ok, non-zero on error. */ static int basil_get_initial_state(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INITIAL INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char *reason = NULL; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) continue; if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } /* Base state entirely derives from ALPS */ node_ptr->node_state &= NODE_STATE_FLAGS; if (reason) { if (node_ptr->reason) { debug3("Initial DOWN node %s - %s", node_ptr->name, node_ptr->reason); } else { debug("Initial DOWN node %s - %s", node_ptr->name, reason); node_ptr->reason = xstrdup(reason); } node_ptr->node_state |= NODE_STATE_DOWN; } else { if (node_is_allocated(node)) node_ptr->node_state |= NODE_STATE_ALLOCATED; else node_ptr->node_state |= NODE_STATE_IDLE; xfree(node_ptr->reason); } } free_inv(inv); return SLURM_SUCCESS; }
/** * basil_inventory - Periodic node-state query via ALPS XML-RPC. * This should be run immediately before each scheduling cycle. * Returns non-SLURM_SUCCESS if * - INVENTORY method failed (error) * - no nodes are available (no point in scheduling) * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) */ extern int basil_inventory(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; struct basil_rsvn *rsvn; int slurm_alps_mismatch = 0; int rc = SLURM_SUCCESS; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total) rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char *reason = NULL; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); continue; } if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) { /* * ALPS still hangs on to the node while SLURM considers * it already unallocated. Possible causes are partition * cleanup taking too long (can be 10sec ... minutes), * and orphaned ALPS reservations (caught below). * * The converse case (SLURM hanging on to the node while * ALPS has already freed it) happens frequently during * job completion: select_g_job_fini() is called before * make_node_comp(). Rely on SLURM logic for this case. */ slurm_alps_mismatch++; } if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } if (reason) { if (!IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); debug("MARKING %s DOWN (%s)", node_ptr->name, reason); /* set_node_down also kills any running jobs */ set_node_down(node_ptr->name, reason); } } else if (IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); /* Reset state, make_node_idle figures out the rest */ node_ptr->node_state &= NODE_STATE_FLAGS; node_ptr->node_state |= NODE_STATE_UNKNOWN; make_node_idle(node_ptr, NULL); } } if (slurm_alps_mismatch) debug("ALPS: %d node(s) still held", slurm_alps_mismatch); /* * Check that each ALPS reservation corresponds to a SLURM job. * Purge orphaned reservations, which may result from stale or * messed up system state, or are indicative of ALPS problems * (stuck in pending cancel calls). */ for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { ListIterator job_iter = list_iterator_create(job_list); struct job_record *job_ptr; uint32_t resv_id; if (job_iter == NULL) fatal("list_iterator_create: malloc failure"); while ((job_ptr = (struct job_record *)list_next(job_iter))) { if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) == SLURM_SUCCESS && resv_id == rsvn->rsvn_id) break; } list_iterator_destroy(job_iter); if (job_ptr == NULL) { error("orphaned ALPS reservation %u, trying to remove", rsvn->rsvn_id); basil_safe_release(rsvn->rsvn_id, inv); slurm_alps_mismatch = true; } } free_inv(inv); if (slurm_alps_mismatch) /* ALPS will take some time, do not schedule now. */ return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; return rc; }
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; node_rank_inv = 1; /* * When obtaining the initial configuration, we can not allow ALPS to * fail. If there is a problem at this stage it is better to restart * SLURM completely, after investigating (and/or fixing) the cause. */ inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); else if (inv->batch_total < node_cnt) info("Warning: ALPS sees only %d/%d slurm.conf nodes, " "check DownNodes", inv->batch_total, node_cnt); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. (Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else if ((slurmctld_conf.fast_schedule != 2) && (node->cpu_count != node_ptr->config_ptr->cpus)) { fatal("slurm.conf: node %s has %u cpus " "but configured as CPUs=%u in your slurm.conf", node_ptr->name, node->cpu_count, node_ptr->config_ptr->cpus); } else if ((slurmctld_conf.fast_schedule != 2) && (node->mem_size != node_ptr->config_ptr->real_memory)) { fatal("slurm.conf: node %s has RealMemory=%u " "but configured as RealMemory=%u in your " "slurm.conf", node_ptr->name, node->mem_size, node_ptr->config_ptr->real_memory); } else { node_ptr->node_rank = inv->nodes_total - rank_count++; /* * Convention: since we are using SLURM in * frontend-mode, we use * NodeHostName as follows. * * NodeHostName: c#-#c#s#n# using the NID convention * <cabinet>-<row><chassis><slot><node> * - each cabinet can accommodate 3 chassis (c1..c3) * - each chassis has 8 slots (s0..s7) * - each slot contains 2 or 4 nodes (n0..n3) * o either 2 service nodes (n0/n3) * o or 4 compute nodes (n0..n3) * o or 2 gemini chips (g0/g1 serving n0..n3) * * Example: c0-0c1s0n1 * - c0- = cabinet 0 * - 0 = row 0 * - c1 = chassis 1 * - s0 = slot 0 * - n1 = node 1 */ xfree(node_ptr->node_hostname); node_ptr->node_hostname = xstrdup(node->name); } sprintf(tmp, "nid%05u", node->node_id); hostlist_push_host(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); node_rank_inv = 0; return SLURM_SUCCESS; }
/** * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates. * * Checks both SDB database and ALPS inventory for consistency. The inventory * part is identical to basil_inventory(), with the difference of being called * before valid bitmaps exist, from select_g_node_init(). * Its dependencies are: * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields, * - it relies on _sync_nodes_to_jobs() to * o kill active jobs on nodes now marked DOWN, * o reset node state to ALLOCATED if it has been marked IDLE here (which is * an error case, since there is no longer an ALPS reservation for the job, * this is caught by the subsequent basil_inventory()). */ extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt) { struct node_record *node_ptr, *end = node_ptr_array + node_cnt; enum basil_version version = get_basil_version(); struct basil_inventory *inv; /* General mySQL */ MYSQL *handle; MYSQL_STMT *stmt = NULL; /* Input parameters */ unsigned int node_id; /* * Use a left outer join here since the attributes table may not be * populated for a given nodeid (e.g. when the node has been disabled * on the SMW via 'xtcli disable'). * The processor table has more authoritative information, if a nodeid * is not listed there, it does not exist. */ const char query[] = "SELECT x_coord, y_coord, z_coord, " "processor_type FROM processor WHERE processor_id = ? "; const int PARAM_COUNT = 1; /* node id */ MYSQL_BIND params[PARAM_COUNT]; int x_coord, y_coord, z_coord; char proc_type[BASIL_STRING_SHORT]; MYSQL_BIND bind_cols[COLUMN_COUNT]; my_bool is_null[COLUMN_COUNT]; my_bool is_error[COLUMN_COUNT]; int is_gemini, i; time_t now = time(NULL); memset(params, 0, sizeof(params)); params[0].buffer_type = MYSQL_TYPE_LONG; params[0].is_unsigned = true; params[0].is_null = (my_bool *)0; params[0].buffer = (char *)&node_id; memset(bind_cols, 0, sizeof(bind_cols)); for (i = 0; i < COLUMN_COUNT; i ++) { bind_cols[i].is_null = &is_null[i]; bind_cols[i].error = &is_error[i]; if (i == COL_TYPE) { bind_cols[i].buffer_type = MYSQL_TYPE_STRING; bind_cols[i].buffer_length = sizeof(proc_type); bind_cols[i].buffer = proc_type; } else { bind_cols[i].buffer_type = MYSQL_TYPE_LONG; bind_cols[i].is_unsigned = (i >= COL_TYPE); } } bind_cols[COL_X].buffer = (char *)&x_coord; bind_cols[COL_Y].buffer = (char *)&y_coord; bind_cols[COL_Z].buffer = (char *)&z_coord; inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get initial BASIL inventory"); info("BASIL %s initial INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); handle = cray_connect_sdb(); if (handle == NULL) fatal("can not connect to XTAdmin database on the SDB"); is_gemini = cray_is_gemini_system(handle); if (is_gemini < 0) fatal("can not determine Cray XT/XE system type"); stmt = prepare_stmt(handle, query, params, PARAM_COUNT, bind_cols, COLUMN_COUNT); if (stmt == NULL) fatal("can not prepare statement to resolve Cray coordinates"); for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) { struct basil_node *node; char *reason = NULL; if ((node_ptr->name == NULL) || (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) { error("can not read basil_node_id from %s", node_ptr->name); continue; } if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0) fatal("can not resolve %s coordinates", node_ptr->name); if (fetch_stmt(stmt) == 0) { #if _DEBUG info("proc_type:%s xyz:%u:%u:%u", proc_type, x_coord, y_coord, z_coord #endif if (xstrcmp(proc_type, "compute") != 0) { /* * Switching a compute node to be a service node * can not happen at runtime: requires a reboot. */ fatal("Node '%s' is a %s node. " "Only compute nodes can appear in slurm.conf.", node_ptr->name, proc_type); } else if (is_null[COL_X] || is_null[COL_Y] || is_null[COL_Z]) { /* * Similar case to the one above, observed when * a blade has been removed. Node will not * likely show up in ALPS. */ x_coord = y_coord = z_coord = 0; reason = "unknown coordinates - hardware failure?"; } } else if (is_gemini) {
/** * basil_inventory - Periodic node-state query via ALPS XML-RPC. * This should be run immediately before each scheduling cycle. * Returns non-SLURM_SUCCESS if * - INVENTORY method failed (error) * - no nodes are available (no point in scheduling) * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) */ extern int basil_inventory(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; struct basil_rsvn *rsvn; int slurm_alps_mismatch = 0; int rc = SLURM_SUCCESS; int rel_rc; time_t now = time(NULL); static time_t slurm_alps_mismatch_time = (time_t) 0; static bool logged_sync_timeout = false; static time_t last_inv_run = 0; if ((now - last_inv_run) < inv_interval) return SLURM_SUCCESS; last_inv_run = now; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); /* Avoid checking for inv->batch_avail here since if we are gang scheduling returning an error for a full system is probably the wrong thing to do. (the schedule() function in the slurmctld will never run ;)). */ if (!inv->f->node_head || !inv->batch_total) rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; for (node = inv->f->node_head; node; node = node->next) { int node_inx; struct node_record *node_ptr; char *reason = NULL; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. (Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); continue; } node_inx = node_ptr - node_record_table_ptr; if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) { /* * ALPS still hangs on to the node while SLURM considers * it already unallocated. Possible causes are partition * cleanup taking too long (can be 10sec ... minutes), * and orphaned ALPS reservations (caught below). * * The converse case (SLURM hanging on to the node while * ALPS has already freed it) happens frequently during * job completion: select_g_job_fini() is called before * make_node_comp(). Rely on SLURM logic for this case. */ slurm_alps_mismatch++; } if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } /* Base state entirely derives from ALPS */ if (reason) { if (node_ptr->down_time == 0) node_ptr->down_time = now; if (IS_NODE_DOWN(node_ptr)) { /* node still down */ } else if ((slurmctld_conf.slurmd_timeout == 0) || ((now - node_ptr->down_time) < slurmctld_conf.slurmd_timeout)) { node_ptr->node_state |= NODE_STATE_NO_RESPOND; bit_clear(avail_node_bitmap, node_inx); } else { xfree(node_ptr->reason); info("MARKING %s DOWN (%s)", node_ptr->name, reason); /* set_node_down also kills any running jobs */ set_node_down_ptr(node_ptr, reason); } } else if (IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); node_ptr->down_time = 0; info("MARKING %s UP", node_ptr->name); /* Reset state, make_node_idle figures out the rest */ node_ptr->node_state &= NODE_STATE_FLAGS; node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); node_ptr->node_state |= NODE_STATE_UNKNOWN; make_node_idle(node_ptr, NULL); if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) { xfree(node_ptr->reason); node_ptr->reason_time = 0; node_ptr->reason_uid = NO_VAL; clusteracct_storage_g_node_up( acct_db_conn, node_ptr, now); } } else if (IS_NODE_NO_RESPOND(node_ptr)) { node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) { bit_set(avail_node_bitmap, node_inx); } } } if (slurm_alps_mismatch) debug("ALPS: %d node(s) still held", slurm_alps_mismatch); /* * Check that each ALPS reservation corresponds to a SLURM job. * Purge orphaned reservations, which may result from stale or * messed up system state, or are indicative of ALPS problems * (stuck in pending cancel calls). */ for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { ListIterator job_iter = list_iterator_create(job_list); struct job_record *job_ptr; uint32_t resv_id; while ((job_ptr = (struct job_record *)list_next(job_iter))) { if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) == SLURM_SUCCESS && resv_id == rsvn->rsvn_id) break; } list_iterator_destroy(job_iter); /* * Changed to ignore reservations for "UNKNOWN" batch * ids (e.g. the interactive region) (Chris North) */ if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) { error("orphaned ALPS reservation %u, trying to remove", rsvn->rsvn_id); rel_rc = basil_safe_release(rsvn->rsvn_id, inv); if (rel_rc) { error("ALPS reservation %u removal FAILED: %s", rsvn->rsvn_id, basil_strerror(rel_rc)); } else { debug("ALPS reservation %u removed", rsvn->rsvn_id); } slurm_alps_mismatch = true; } } free_inv(inv); if (slurm_alps_mismatch) { /* If SLURM and ALPS state are not in synchronization, * do not schedule any more jobs until waiting at least * SyncTimeout seconds. */ if (slurm_alps_mismatch_time == 0) { slurm_alps_mismatch_time = now; } else if (cray_conf->sync_timeout == 0) { /* Wait indefinitely */ } else if (difftime(now, slurm_alps_mismatch_time) < cray_conf->sync_timeout) { return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; } else if (!logged_sync_timeout) { error("Could not synchronize SLURM with ALPS for %u " "seconds, proceeding with job scheduling", cray_conf->sync_timeout); logged_sync_timeout = true; } } else { slurm_alps_mismatch_time = 0; logged_sync_timeout = false; } return rc; }
/** * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates. * * Checks both SDB database and ALPS inventory for consistency. The inventory * part is identical to basil_inventory(), with the difference of being called * before valid bitmaps exist, from select_g_node_init(). * Its dependencies are: * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields, * - it relies on _sync_nodes_to_jobs() to * o kill active jobs on nodes now marked DOWN, * o reset node state to ALLOCATED if it has been marked IDLE here (which is * an error case, since there is no longer an ALPS reservation for the job, * this is caught by the subsequent basil_inventory()). */ extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt) { struct node_record *node_ptr, *end = node_ptr_array + node_cnt; enum basil_version version = get_basil_version(); struct basil_inventory *inv; /* General mySQL */ MYSQL *handle; MYSQL_STMT *stmt = NULL; /* Input parameters */ unsigned int node_id; /* * Use a left outer join here since the attributes table may not be * populated for a given nodeid (e.g. when the node has been disabled * on the SMW via 'xtcli disable'). * The processor table has more authoritative information, if a nodeid * is not listed there, it does not exist. */ const char query[] = "SELECT x_coord, y_coord, z_coord," " cab_position, cab_row, cage, slot, cpu," " LOG2(coremask+1), availmem, " " processor_type " "FROM processor LEFT JOIN attributes " "ON processor_id = nodeid " "WHERE processor_id = ? "; const int PARAM_COUNT = 1; /* node id */ MYSQL_BIND params[PARAM_COUNT]; int x_coord, y_coord, z_coord; int cab, row, cage, slot, cpu; unsigned int node_cpus, node_mem; char proc_type[BASIL_STRING_SHORT]; MYSQL_BIND bind_cols[COLUMN_COUNT]; my_bool is_null[COLUMN_COUNT]; my_bool is_error[COLUMN_COUNT]; int is_gemini, i; memset(params, 0, sizeof(params)); params[0].buffer_type = MYSQL_TYPE_LONG; params[0].is_unsigned = true; params[0].is_null = (my_bool *)0; params[0].buffer = (char *)&node_id; memset(bind_cols, 0, sizeof(bind_cols)); for (i = 0; i < COLUMN_COUNT; i ++) { bind_cols[i].is_null = &is_null[i]; bind_cols[i].error = &is_error[i]; if (i == COL_TYPE) { bind_cols[i].buffer_type = MYSQL_TYPE_STRING; bind_cols[i].buffer_length = sizeof(proc_type); bind_cols[i].buffer = proc_type; } else { bind_cols[i].buffer_type = MYSQL_TYPE_LONG; bind_cols[i].is_unsigned = (i >= COL_CORES); } } bind_cols[COL_X].buffer = (char *)&x_coord; bind_cols[COL_Y].buffer = (char *)&y_coord; bind_cols[COL_Z].buffer = (char *)&z_coord; bind_cols[COL_CAB].buffer = (char *)&cab; bind_cols[COL_ROW].buffer = (char *)&row; bind_cols[COL_CAGE].buffer = (char *)&cage; bind_cols[COL_SLOT].buffer = (char *)&slot; bind_cols[COL_CPU].buffer = (char *)&cpu; bind_cols[COL_CORES].buffer = (char *)&node_cpus; bind_cols[COL_MEMORY].buffer = (char *)&node_mem; inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get initial BASIL inventory"); info("BASIL %s initial INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); handle = cray_connect_sdb(); if (handle == NULL) fatal("can not connect to XTAdmin database on the SDB"); is_gemini = cray_is_gemini_system(handle); if (is_gemini < 0) fatal("can not determine Cray XT/XE system type"); stmt = prepare_stmt(handle, query, params, PARAM_COUNT, bind_cols, COLUMN_COUNT); if (stmt == NULL) fatal("can not prepare statement to resolve Cray coordinates"); for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) { struct basil_node *node; char *reason = NULL; if ((node_ptr->name == NULL) || (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) { error("can not read basil_node_id from %s", node_ptr->name); continue; } if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0) fatal("can not resolve %s coordinates", node_ptr->name); if (fetch_stmt(stmt) == 0) { #if _DEBUG info("proc_type:%s cpus:%u memory:%u", proc_type, node_cpus, node_mem); info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u", row, cage, slot, cpu, x_coord, y_coord, z_coord); #endif if (strcmp(proc_type, "compute") != 0) { /* * Switching a compute node to be a service node * can not happen at runtime: requires a reboot. */ fatal("Node '%s' is a %s node. " "Only compute nodes can appear in slurm.conf.", node_ptr->name, proc_type); } else if (is_null[COL_CORES] || is_null[COL_MEMORY]) { /* * This can happen if a node has been disabled * on the SMW (using 'xtcli disable <nid>'). The * node will still be listed in the 'processor' * table, but have no 'attributes' entry (NULL * values for CPUs/memory). Also, the node will * be invisible to ALPS, which is why we need to * set it down here already. */ node_cpus = node_mem = 0; reason = "node data unknown - disabled on SMW?"; } else if (is_null[COL_X] || is_null[COL_Y] || is_null[COL_Z]) { /* * Similar case to the one above, observed when * a blade has been removed. Node will not * likely show up in ALPS. */ x_coord = y_coord = z_coord = 0; reason = "unknown coordinates - hardware failure?"; } else if (node_cpus < node_ptr->config_ptr->cpus) { /* * FIXME: Might reconsider this policy. * * FastSchedule is ignored here, it requires the * slurm.conf to be consistent with hardware. * * Assumption is that CPU/Memory do not change * at runtime (Cray has no hot-swappable parts). * * Hence checking it in basil_inventory() would * mean a lot of runtime overhead. */ fatal("slurm.conf: node %s has only Procs=%d", node_ptr->name, node_cpus); } else if (node_mem < node_ptr->config_ptr->real_memory) { fatal("slurm.conf: node %s has RealMemory=%d", node_ptr->name, node_mem); } } else if (is_gemini) { fatal("Non-existing Gemini node '%s' in slurm.conf", node_ptr->name); } else { fatal("Non-existing SeaStar node '%s' in slurm.conf", node_ptr->name); } if (!is_gemini) { /* * SeaStar: each node has unique coordinates */ if (node_ptr->arch == NULL) node_ptr->arch = xstrdup("XT"); } else { /* * Gemini: each 2 nodes share the same network * interface (i.e., nodes 0/1 and 2/3 each have * the same coordinates). */ if (node_ptr->arch == NULL) node_ptr->arch = xstrdup("XE"); } xfree(node_ptr->node_hostname); xfree(node_ptr->comm_name); /* * Convention: since we are using SLURM in frontend-mode, * we use Node{Addr,HostName} as follows. * * NodeAddr: <X><Y><Z> coordinates in base-36 encoding * * NodeHostName: c#-#c#s#n# using the NID convention * <cabinet>-<row><chassis><slot><node> * - each cabinet can accommodate 3 chassis (c1..c3) * - each chassis has 8 slots (s0..s7) * - each slot contains 2 or 4 nodes (n0..n3) * o either 2 service nodes (n0/n3) * o or 4 compute nodes (n0..n3) * o or 2 gemini chips (g0/g1 serving n0..n3) * * Example: c0-0c1s0n1 * - c0- = cabinet 0 * - 0 = row 0 * - c1 = chassis 1 * - s0 = slot 0 * - n1 = node 1 */ node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u", cab, row, cage, slot, cpu); node_ptr->comm_name = xstrdup_printf("%c%c%c", _enc_coord(x_coord), _enc_coord(y_coord), _enc_coord(z_coord)); dim_size[0] = MAX(dim_size[0], (x_coord - 1)); dim_size[1] = MAX(dim_size[1], (y_coord - 1)); dim_size[2] = MAX(dim_size[2], (z_coord - 1)); #if _DEBUG info("%s %s %s cpus=%u, mem=%u", node_ptr->name, node_ptr->node_hostname, node_ptr->comm_name, node_cpus, node_mem); #endif /* * Check the current state reported by ALPS inventory, unless it * is already evident that the node has some other problem. */ if (reason == NULL) { for (node = inv->f->node_head; node; node = node->next) if (node->node_id == node_id) break; if (node == NULL) { reason = "not visible to ALPS - check hardware"; } else if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } } /* Base state entirely derives from ALPS */ node_ptr->node_state &= NODE_STATE_FLAGS; if (reason) { if (node_ptr->reason) { debug("Initial DOWN node %s - %s", node_ptr->name, node_ptr->reason); } else { info("Initial DOWN node %s - %s", node_ptr->name, reason); node_ptr->reason = xstrdup(reason); } node_ptr->node_state |= NODE_STATE_DOWN; } else { if (node_is_allocated(node)) node_ptr->node_state |= NODE_STATE_ALLOCATED; else node_ptr->node_state |= NODE_STATE_IDLE; xfree(node_ptr->reason); } free_stmt_result(stmt); } if (stmt_close(stmt)) error("error closing statement: %s", mysql_stmt_error(stmt)); cray_close_sdb(handle); free_inv(inv); return SLURM_SUCCESS; }
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; /* * When obtaining the initial configuration, we can not allow ALPS to * fail. If there is a problem at this stage it is better to restart * SLURM completely, after investigating (and/or fixing) the cause. */ inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); else if (inv->batch_total < node_cnt) info("Warning: ALPS sees only %d/%d slurm.conf nodes, " "check DownNodes", inv->batch_total, node_cnt); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. (Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else node_ptr->node_rank = inv->nodes_total - rank_count++; sprintf(tmp, "nid%05u", node->node_id); hostlist_push(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); return SLURM_SUCCESS; }