/* * assign_front_end - assign a front end node for starting a job * job_ptr IN - job to assign a front end node (tests access control lists) * RET pointer to the front end node to use or NULL if none found */ extern front_end_record_t *assign_front_end(struct job_record *job_ptr) { #ifdef HAVE_FRONT_END front_end_record_t *front_end_ptr, *best_front_end = NULL; uint32_t state_flags; int i; if (!job_ptr->batch_host && (job_ptr->batch_flag == 0) && (front_end_ptr = find_front_end_record(job_ptr->alloc_node))) { /* Use submit host for interactive job */ if (!IS_NODE_DOWN(front_end_ptr) && !IS_NODE_DRAIN(front_end_ptr) && !IS_NODE_NO_RESPOND(front_end_ptr) && _front_end_access(front_end_ptr, job_ptr)) { best_front_end = front_end_ptr; } else { info("%s: front-end node %s not available for job %u", __func__, job_ptr->alloc_node, job_ptr->job_id); return NULL; } } else { for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { if (job_ptr->batch_host) { /* Find specific front-end */ if (xstrcmp(job_ptr->batch_host, front_end_ptr->name)) continue; if (!_front_end_access(front_end_ptr, job_ptr)) break; } else { /* Find a usable front-end node */ if (IS_NODE_DOWN(front_end_ptr) || IS_NODE_DRAIN(front_end_ptr) || IS_NODE_NO_RESPOND(front_end_ptr)) continue; if (!_front_end_access(front_end_ptr, job_ptr)) continue; } if ((best_front_end == NULL) || (front_end_ptr->job_cnt_run < best_front_end->job_cnt_run)) best_front_end = front_end_ptr; } } if (best_front_end) { state_flags = best_front_end->node_state & NODE_STATE_FLAGS; best_front_end->node_state = NODE_STATE_ALLOCATED | state_flags; best_front_end->job_cnt_run++; return best_front_end; } else if (job_ptr->batch_host) { /* Find specific front-end node */ error("assign_front_end: front end node %s not found", job_ptr->batch_host); } else { /* Find some usable front-end node */ error("assign_front_end: no available front end nodes found"); } #endif return NULL; }
/*
 * sync_front_end_state - rebuild each job's front end node pointer and each
 *	front end node's job counters, then make the node states agree with
 *	those counters
 */
extern void sync_front_end_state(void)
{
#ifdef HAVE_FRONT_END
	ListIterator iter;
	struct job_record *job;
	front_end_record_t *fe;
	uint16_t flags;
	int n;

	/* Start from a clean slate: counters are recomputed from job_list */
	for (n = 0; n < front_end_node_cnt; n++) {
		front_end_nodes[n].job_cnt_comp = 0;
		front_end_nodes[n].job_cnt_run = 0;
	}

	iter = list_iterator_create(job_list);
	while ((job = (struct job_record *) list_next(iter))) {
		if (!job->batch_host) {
			job->front_end_ptr = NULL;
			continue;
		}

		job->front_end_ptr = find_front_end_record(job->batch_host);
		fe = job->front_end_ptr;
		if (fe == NULL) {
			/* The job's front end node is no longer known */
			if (IS_JOB_RUNNING(job)) {
				error("front end node %s has vanished, "
				      "killing job %u",
				      job->batch_host, job->job_id);
				job->job_state = JOB_NODE_FAIL |
						 JOB_COMPLETING;
			} else {
				info("front end node %s has vanished",
				     job->batch_host);
			}
		} else if (IS_JOB_COMPLETING(job)) {
			fe->job_cnt_comp++;
		} else if (IS_JOB_RUNNING(job)) {
			fe->job_cnt_run++;
		}
	}
	list_iterator_destroy(iter);

	for (n = 0; n < front_end_node_cnt; n++) {
		fe = &front_end_nodes[n];

		/* Base state must follow the running job count */
		if (fe->job_cnt_run != 0) {
			if (IS_NODE_IDLE(fe) || IS_NODE_UNKNOWN(fe)) {
				flags = fe->node_state & NODE_STATE_FLAGS;
				fe->node_state = NODE_STATE_ALLOCATED | flags;
			}
		} else if (IS_NODE_ALLOCATED(fe)) {
			flags = fe->node_state & NODE_STATE_FLAGS;
			fe->node_state = NODE_STATE_IDLE | flags;
		}

		/* COMPLETING flag must follow the completing job count */
		if (fe->job_cnt_comp == 0) {
			if (IS_NODE_COMPLETING(fe))
				fe->node_state &= (~NODE_STATE_COMPLETING);
		} else if (!IS_NODE_COMPLETING(fe)) {
			fe->node_state |= NODE_STATE_COMPLETING;
		}
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FRONT_END)
		log_front_end_state();
#endif
}
/*
 * load_all_front_end_state - Load the front_end node state from file, recover
 *	on slurmctld restart. Execute this after loading the configuration
 *	file data. Data goes into common storage.
 * IN state_only - if true, overwrite only front_end node state and reason
 *	Use this to overwrite the "UNKNOWN" state typically used in slurm.conf
 * RET 0 on success, ENOENT if the state file could not be opened, EFAULT if
 *	the file's version string is not recognized or its contents are
 *	truncated. Always 0 when built without HAVE_FRONT_END.
 * NOTE: READ lock_slurmctld config before entry
 */
extern int load_all_front_end_state(bool state_only)
{
#ifdef HAVE_FRONT_END
	char *node_name = NULL, *reason = NULL, *data = NULL, *state_file;
	int data_allocated, data_read = 0, error_code = 0, node_cnt = 0;
	uint16_t node_state;
	uint32_t data_size = 0, name_len;
	uint32_t reason_uid = NO_VAL;
	time_t reason_time = 0;
	front_end_record_t *front_end_ptr;
	int state_fd;
	time_t time_stamp;
	Buf buffer;
	char *ver_str = NULL;
	uint16_t protocol_version = (uint16_t) NO_VAL;

	/* read the file */
	lock_state_files();
	state_fd = _open_front_end_state_file(&state_file);
	if (state_fd < 0) {
		/* Nothing to recover. Return now rather than parsing an
		 * empty buffer, which would log a bogus "incomplete
		 * checkpoint" error and replace ENOENT with EFAULT. */
		info ("No node state file (%s) to recover", state_file);
		xfree(state_file);
		unlock_state_files();
		return ENOENT;
	}

	data_allocated = BUF_SIZE;
	data = xmalloc(data_allocated);
	while (1) {
		data_read = read(state_fd, &data[data_size], BUF_SIZE);
		if (data_read < 0) {
			if (errno == EINTR)
				continue;
			/* Whatever was read so far is still parsed below */
			error ("Read error on %s: %m", state_file);
			break;
		} else if (data_read == 0) {	/* eof */
			break;
		}
		/* Grow so there is always BUF_SIZE of room past data_size */
		data_size      += data_read;
		data_allocated += data_read;
		xrealloc(data, data_allocated);
	}
	close(state_fd);
	xfree(state_file);
	unlock_state_files();

	buffer = create_buf(data, data_size);

	safe_unpackstr_xmalloc(&ver_str, &name_len, buffer);
	if (ver_str) {
		/* Only format the string when one was actually unpacked */
		debug3("Version string in front_end_state header is %s",
		       ver_str);
		if (!strcmp(ver_str, FRONT_END_STATE_VERSION))
			protocol_version = SLURM_PROTOCOL_VERSION;
	}

	if (protocol_version == (uint16_t) NO_VAL) {
		error("*****************************************************");
		error("Can not recover front_end state, version incompatible");
		error("*****************************************************");
		xfree(ver_str);
		free_buf(buffer);
		return EFAULT;
	}
	xfree(ver_str);

	safe_unpack_time(&time_stamp, buffer);

	while (remaining_buf(buffer) > 0) {
		uint16_t base_state;

		if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) {
			safe_unpackstr_xmalloc(&node_name, &name_len, buffer);
			safe_unpack16(&node_state, buffer);
			safe_unpackstr_xmalloc(&reason, &name_len, buffer);
			safe_unpack_time(&reason_time, buffer);
			safe_unpack32(&reason_uid, buffer);
			base_state = node_state & NODE_STATE_BASE;
		} else
			goto unpack_error;

		/* validity test as possible */

		/* find record and perform update */
		front_end_ptr = find_front_end_record(node_name);
		if (front_end_ptr == NULL) {
			error("Front_end node %s has vanished from "
			      "configuration", node_name);
		} else if (state_only) {
			uint16_t orig_flags;

			orig_flags = front_end_ptr->node_state &
				     NODE_STATE_FLAGS;
			node_cnt++;
			/* Saved DOWN/DRAIN/FAIL is only applied to a node
			 * whose current base state is UNKNOWN */
			if (IS_NODE_UNKNOWN(front_end_ptr)) {
				if (base_state == NODE_STATE_DOWN) {
					orig_flags &= (~NODE_STATE_COMPLETING);
					front_end_ptr->node_state =
						NODE_STATE_DOWN | orig_flags;
				}
				if (node_state & NODE_STATE_DRAIN) {
					front_end_ptr->node_state |=
						NODE_STATE_DRAIN;
				}
				if (node_state & NODE_STATE_FAIL) {
					front_end_ptr->node_state |=
						NODE_STATE_FAIL;
				}
			}
			/* Keep any reason already set on the record */
			if (front_end_ptr->reason == NULL) {
				front_end_ptr->reason = reason;
				reason = NULL;	/* Nothing to free */
				front_end_ptr->reason_time = reason_time;
				front_end_ptr->reason_uid = reason_uid;
			}
		} else {
			/* Full recovery: saved values replace current ones */
			node_cnt++;
			front_end_ptr->node_state = node_state;
			xfree(front_end_ptr->reason);
			front_end_ptr->reason = reason;
			reason = NULL;	/* Nothing to free */
			front_end_ptr->reason_time = reason_time;
			front_end_ptr->reason_uid = reason_uid;
			front_end_ptr->last_response = (time_t) 0;
		}

		xfree(node_name);
		xfree(reason);
	}

fini:	info("Recovered state of %d front_end nodes", node_cnt);
	free_buf(buffer);
	return error_code;

unpack_error:
	error("Incomplete front_end node data checkpoint file");
	error_code = EFAULT;
	xfree(node_name);
	xfree(reason);
	goto fini;
#else
	return 0;
#endif
}