/**
 * basil_get_initial_state - set SLURM initial node state from ALPS.
 *
 * The logic is identical to basil_inventory(), with the difference that this
 * is called before valid bitmaps exist, from select_g_node_init(). It relies
 * on the following other parts:
 * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
 * - it relies on _sync_nodes_to_jobs() to
 *   o kill active jobs on nodes now marked DOWN,
 *   o reset node state to ALLOCATED if it has been marked IDLE here (which is
 *     an error case, since there is no longer an ALPS reservation for the job;
 *     this is caught by the subsequent basil_inventory()).
 * Return: SLURM_SUCCESS if ok, non-zero on error.
 */
static int basil_get_initial_state(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INITIAL INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL)
			continue;

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		node_ptr->node_state &= NODE_STATE_FLAGS;
		if (reason) {
			if (node_ptr->reason) {
				debug3("Initial DOWN node %s - %s",
				       node_ptr->name, node_ptr->reason);
			} else {
				debug("Initial DOWN node %s - %s",
				      node_ptr->name, reason);
				node_ptr->reason = xstrdup(reason);
			}
			node_ptr->node_state |= NODE_STATE_DOWN;
		} else {
			if (node_is_allocated(node))
				node_ptr->node_state |= NODE_STATE_ALLOCATED;
			else
				node_ptr->node_state |= NODE_STATE_IDLE;
			xfree(node_ptr->reason);
		}
	}
	free_inv(inv);

	return SLURM_SUCCESS;
}
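/*
 * Note on _find_node_by_basil_id(): the helper used above and in
 * basil_inventory() below is defined elsewhere in this plugin. A minimal
 * sketch of what it is assumed to do - map an ALPS node id to the
 * corresponding node_record via the "nid%05u" naming convention expected
 * in slurm.conf (the exact implementation may differ):
 *
 *	static struct node_record *_find_node_by_basil_id(uint32_t node_id)
 *	{
 *		char name[16];
 *
 *		snprintf(name, sizeof(name), "nid%05u", node_id);
 *		return find_node_record(name);
 *	}
 */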
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - the INVENTORY method failed (error),
 * - no nodes are available (no point in scheduling),
 * - an orphaned ALPS reservation exists (wait until ALPS resynchronizes).
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	static bool logged_sync_timeout = false;
	static time_t last_inv_run = 0;

	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Do not check inv->batch_avail here: when gang scheduling,
	 * returning an error for a fully allocated system would be the
	 * wrong thing to do (the schedule() function in the slurmctld
	 * would never run).
	 */
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		char *reason = NULL;

		/*
		 * Ignore interactive nodes when iterating through the
		 * apbasil inventory. Otherwise SLURM is unable to resolve
		 * the ID to a nidXXX name, since it is not in the
		 * slurm.conf file. (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;
			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed-up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Ignore reservations with an "UNKNOWN" batch id
		 * (e.g. the interactive region). (Chris North)
		 */
		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/*
		 * If SLURM and ALPS state are not in synchronization, do
		 * not schedule any more jobs until at least SyncTimeout
		 * seconds have passed.
		 */
		if (slurm_alps_mismatch_time == 0) {
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}

	return rc;
}
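/*
 * Illustrative caller pattern (an assumption, not part of this file): since
 * basil_inventory() is meant to run immediately before each scheduling
 * cycle, a non-zero return value is expected to make the caller skip the
 * current pass and retry on the next cycle, roughly:
 *
 *	if (basil_inventory() != SLURM_SUCCESS)
 *		return;		(nothing worth scheduling right now)
 */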
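/*
 * basil_geometry() below binds the columns of its SDB query through COL_*
 * indices that are defined elsewhere in this plugin. A sketch of the
 * assumed layout (one enumerator per SELECT column, in query order; the
 * enum name is hypothetical):
 *
 *	enum query_columns {
 *		COL_X, COL_Y, COL_Z,		x_coord, y_coord, z_coord
 *		COL_CAB, COL_ROW,		cab_position, cab_row
 *		COL_CAGE, COL_SLOT, COL_CPU,	cage, slot, cpu
 *		COL_CORES,			LOG2(coremask+1)
 *		COL_MEMORY,			availmem
 *		COL_TYPE,			processor_type
 *		COLUMN_COUNT
 *	};
 */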
/**
 * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates.
 *
 * Checks both the SDB database and the ALPS inventory for consistency. The
 * inventory part is identical to basil_inventory(), with the difference of
 * being called before valid bitmaps exist, from select_g_node_init().
 * Its dependencies are:
 * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
 * - it relies on _sync_nodes_to_jobs() to
 *   o kill active jobs on nodes now marked DOWN,
 *   o reset node state to ALLOCATED if it has been marked IDLE here (which is
 *     an error case, since there is no longer an ALPS reservation for the job;
 *     this is caught by the subsequent basil_inventory()).
 */
extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt)
{
	struct node_record *node_ptr, *end = node_ptr_array + node_cnt;
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;

	/* General mySQL */
	MYSQL *handle;
	MYSQL_STMT *stmt = NULL;
	/* Input parameters */
	unsigned int node_id;
	/*
	 * Use a left outer join here since the attributes table may not be
	 * populated for a given nodeid (e.g. when the node has been disabled
	 * on the SMW via 'xtcli disable').
	 * The processor table has more authoritative information: if a nodeid
	 * is not listed there, it does not exist.
	 */
	const char query[] =	"SELECT x_coord, y_coord, z_coord,"
				"       cab_position, cab_row, cage, slot, cpu,"
				"       LOG2(coremask+1), availmem, "
				"       processor_type "
				"FROM  processor LEFT JOIN attributes "
				"ON    processor_id = nodeid "
				"WHERE processor_id = ? ";
	const int	PARAM_COUNT = 1;	/* node id */
	MYSQL_BIND	params[PARAM_COUNT];

	int		x_coord, y_coord, z_coord;
	int		cab, row, cage, slot, cpu;
	unsigned int	node_cpus, node_mem;
	char		proc_type[BASIL_STRING_SHORT];
	MYSQL_BIND	bind_cols[COLUMN_COUNT];
	my_bool		is_null[COLUMN_COUNT];
	my_bool		is_error[COLUMN_COUNT];
	int		is_gemini, i;

	memset(params, 0, sizeof(params));
	params[0].buffer_type = MYSQL_TYPE_LONG;
	params[0].is_unsigned = true;
	params[0].is_null     = (my_bool *)0;
	params[0].buffer      = (char *)&node_id;

	memset(bind_cols, 0, sizeof(bind_cols));
	for (i = 0; i < COLUMN_COUNT; i++) {
		bind_cols[i].is_null = &is_null[i];
		bind_cols[i].error   = &is_error[i];

		if (i == COL_TYPE) {
			bind_cols[i].buffer_type   = MYSQL_TYPE_STRING;
			bind_cols[i].buffer_length = sizeof(proc_type);
			bind_cols[i].buffer        = proc_type;
		} else {
			bind_cols[i].buffer_type   = MYSQL_TYPE_LONG;
			bind_cols[i].is_unsigned   = (i >= COL_CORES);
		}
	}
	bind_cols[COL_X].buffer      = (char *)&x_coord;
	bind_cols[COL_Y].buffer      = (char *)&y_coord;
	bind_cols[COL_Z].buffer      = (char *)&z_coord;
	bind_cols[COL_CAB].buffer    = (char *)&cab;
	bind_cols[COL_ROW].buffer    = (char *)&row;
	bind_cols[COL_CAGE].buffer   = (char *)&cage;
	bind_cols[COL_SLOT].buffer   = (char *)&slot;
	bind_cols[COL_CPU].buffer    = (char *)&cpu;
	bind_cols[COL_CORES].buffer  = (char *)&node_cpus;
	bind_cols[COL_MEMORY].buffer = (char *)&node_mem;

	inv = get_full_inventory(version);
	if (inv == NULL)
		fatal("failed to get initial BASIL inventory");

	info("BASIL %s initial INVENTORY: %d/%d batch nodes available",
	     bv_names_long[version], inv->batch_avail, inv->batch_total);

	handle = cray_connect_sdb();
	if (handle == NULL)
		fatal("can not connect to XTAdmin database on the SDB");

	is_gemini = cray_is_gemini_system(handle);
	if (is_gemini < 0)
		fatal("can not determine Cray XT/XE system type");

	stmt = prepare_stmt(handle, query, params, PARAM_COUNT,
			    bind_cols, COLUMN_COUNT);
	if (stmt == NULL)
		fatal("can not prepare statement to resolve Cray coordinates");

	for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) {
		struct basil_node *node;
		char *reason = NULL;

		if ((node_ptr->name == NULL) ||
		    (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) {
			error("can not read basil_node_id from %s",
			      node_ptr->name);
			continue;
		}

		if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0)
			fatal("can not resolve %s coordinates",
			      node_ptr->name);

		if (fetch_stmt(stmt) == 0) {
#if _DEBUG
			info("proc_type:%s cpus:%u memory:%u",
			     proc_type, node_cpus, node_mem);
			info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u",
			     row, cage, slot, cpu, x_coord, y_coord, z_coord);
#endif
			if (strcmp(proc_type, "compute") != 0) {
				/*
				 * Switching a compute node to be a service
				 * node can not happen at runtime: it requires
				 * a reboot.
				 */
				fatal("Node '%s' is a %s node. "
				      "Only compute nodes can appear in slurm.conf.",
				      node_ptr->name, proc_type);
			} else if (is_null[COL_CORES] || is_null[COL_MEMORY]) {
				/*
				 * This can happen if a node has been disabled
				 * on the SMW (using 'xtcli disable <nid>'). The
				 * node will still be listed in the 'processor'
				 * table, but have no 'attributes' entry (NULL
				 * values for CPUs/memory). Also, the node will
				 * be invisible to ALPS, which is why we need to
				 * set it down here already.
				 */
				node_cpus = node_mem = 0;
				reason = "node data unknown - disabled on SMW?";
			} else if (is_null[COL_X] || is_null[COL_Y] ||
				   is_null[COL_Z]) {
				/*
				 * Similar case to the one above, observed when
				 * a blade has been removed. The node will
				 * likely not show up in ALPS.
				 */
				x_coord = y_coord = z_coord = 0;
				reason = "unknown coordinates - hardware failure?";
			} else if (node_cpus < node_ptr->config_ptr->cpus) {
				/*
				 * FIXME: Might reconsider this policy.
				 *
				 * FastSchedule is ignored here: it requires the
				 * slurm.conf to be consistent with hardware.
				 *
				 * The assumption is that CPU/Memory do not
				 * change at runtime (Cray has no hot-swappable
				 * parts).
				 *
				 * Hence checking it in basil_inventory() would
				 * mean a lot of runtime overhead.
				 */
				fatal("slurm.conf: node %s has only Procs=%d",
				      node_ptr->name, node_cpus);
			} else if (node_mem < node_ptr->config_ptr->real_memory) {
				fatal("slurm.conf: node %s has RealMemory=%d",
				      node_ptr->name, node_mem);
			}
		} else if (is_gemini) {
			fatal("Non-existing Gemini node '%s' in slurm.conf",
			      node_ptr->name);
		} else {
			fatal("Non-existing SeaStar node '%s' in slurm.conf",
			      node_ptr->name);
		}

		if (!is_gemini) {
			/*
			 * SeaStar: each node has unique coordinates.
			 */
			if (node_ptr->arch == NULL)
				node_ptr->arch = xstrdup("XT");
		} else {
			/*
			 * Gemini: every two nodes share the same network
			 * interface (i.e., nodes 0/1 and 2/3 each have
			 * the same coordinates).
			 */
			if (node_ptr->arch == NULL)
				node_ptr->arch = xstrdup("XE");
		}

		xfree(node_ptr->node_hostname);
		xfree(node_ptr->comm_name);
		/*
		 * Convention: since we are using SLURM in frontend-mode,
		 * we use Node{Addr,HostName} as follows.
		 *
		 * NodeAddr:     <X><Y><Z> coordinates in base-36 encoding
		 *
		 * NodeHostName: c#-#c#s#n# using the NID convention
		 *               <cabinet>-<row><chassis><slot><node>
		 * - each cabinet can accommodate 3 chassis (c1..c3)
		 * - each chassis has 8 slots (s0..s7)
		 * - each slot contains 2 or 4 nodes (n0..n3)
		 *   o either 2 service nodes (n0/n3)
		 *   o or 4 compute nodes (n0..n3)
		 *   o or 2 gemini chips (g0/g1 serving n0..n3)
		 *
		 * Example: c0-0c1s0n1
		 *          - c0- = cabinet 0
		 *          - 0   = row     0
		 *          - c1  = chassis 1
		 *          - s0  = slot    0
		 *          - n1  = node    1
		 */
		node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u",
							 cab, row, cage,
							 slot, cpu);
		node_ptr->comm_name = xstrdup_printf("%c%c%c",
						     _enc_coord(x_coord),
						     _enc_coord(y_coord),
						     _enc_coord(z_coord));
		dim_size[0] = MAX(dim_size[0], (x_coord - 1));
		dim_size[1] = MAX(dim_size[1], (y_coord - 1));
		dim_size[2] = MAX(dim_size[2], (z_coord - 1));
#if _DEBUG
		info("%s %s %s cpus=%u, mem=%u", node_ptr->name,
		     node_ptr->node_hostname, node_ptr->comm_name,
		     node_cpus, node_mem);
#endif
		/*
		 * Check the current state reported by ALPS inventory, unless
		 * it is already evident that the node has some other problem.
		 */
		if (reason == NULL) {
			for (node = inv->f->node_head; node; node = node->next)
				if (node->node_id == node_id)
					break;
			if (node == NULL) {
				reason = "not visible to ALPS - check hardware";
			} else if (node->state == BNS_DOWN) {
				reason = "ALPS marked it DOWN";
			} else if (node->state == BNS_UNAVAIL) {
				reason = "node is UNAVAILABLE";
			} else if (node->state == BNS_ROUTE) {
				reason = "node does ROUTING";
			} else if (node->state == BNS_SUSPECT) {
				reason = "entered SUSPECT mode";
			} else if (node->state == BNS_ADMINDOWN) {
				reason = "node is ADMINDOWN";
			} else if (node->state != BNS_UP) {
				reason = "state not UP";
			} else if (node->role != BNR_BATCH) {
				reason = "mode not BATCH";
			} else if (node->arch != BNA_XT) {
				reason = "arch not XT/XE";
			}
		}

		/* Base state entirely derives from ALPS */
		node_ptr->node_state &= NODE_STATE_FLAGS;
		if (reason) {
			if (node_ptr->reason) {
				debug("Initial DOWN node %s - %s",
				      node_ptr->name, node_ptr->reason);
			} else {
				info("Initial DOWN node %s - %s",
				     node_ptr->name, reason);
				node_ptr->reason = xstrdup(reason);
			}
			node_ptr->node_state |= NODE_STATE_DOWN;
		} else {
			if (node_is_allocated(node))
				node_ptr->node_state |= NODE_STATE_ALLOCATED;
			else
				node_ptr->node_state |= NODE_STATE_IDLE;
			xfree(node_ptr->reason);
		}
		free_stmt_result(stmt);
	}

	if (stmt_close(stmt))
		error("error closing statement: %s", mysql_stmt_error(stmt));
	cray_close_sdb(handle);
	free_inv(inv);

	return SLURM_SUCCESS;
}
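/*
 * For reference: _enc_coord(), used above to build the NodeAddr string, is
 * defined elsewhere in this plugin. A minimal sketch of the assumed base-36
 * encoding (digits for 0..9, letters for 10 and above; the letter case is an
 * assumption):
 *
 *	static char _enc_coord(uint8_t coord)
 *	{
 *		return coord + (coord < 10 ? '0' : 'A' - 10);
 *	}
 *
 * Under this scheme a comm_name of "0A3" would correspond to X=0, Y=10, Z=3.
 */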