/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		if (reason) {
			if (!IS_NODE_DOWN(node_ptr)) {
				xfree(node_ptr->reason);
				debug("MARKING %s DOWN (%s)",
				      node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down(node_ptr->name, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		if (job_iter == NULL)
			fatal("list_iterator_create: malloc failure");

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		if (job_ptr == NULL) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			basil_safe_release(rsvn->rsvn_id, inv);
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch)
		/* ALPS will take some time, do not schedule now. */
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	return rc;
}
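/*
 * Illustration only: a minimal sketch of a caller honoring the contract
 * documented above ("run immediately before each scheduling cycle").
 * The wrapper name and call site are assumptions, not the actual
 * select/cray plugin code path; schedule() is the slurmctld scheduler
 * entry point seen elsewhere in this file set.
 */
static void _pre_schedule_sketch(void)
{
	/* Skip this cycle while ALPS is inconsistent or no batch
	 * nodes are available; ALPS needs time to resynchronize. */
	if (basil_inventory() != SLURM_SUCCESS)
		return;
	(void) schedule(0);
}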
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time (NULL);

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time  = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(node_ptr))		&&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down(node_ptr->name, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
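/*
 * Illustration only: the rotating registration window from ping_nodes(),
 * extracted into a standalone program. The constants below (tree width 10,
 * 35 node records, MAX_REG_FREQUENCY 14) are made-up demo values; only the
 * offset arithmetic mirrors the code above. Most cycles the window lies
 * beyond the node table, so each node is asked to register roughly once
 * every MAX_REG_FREQUENCY pings.
 */
#include <stdio.h>

#define MAX_REG_FREQUENCY 14	/* assumed demo value */

int main(void)
{
	int max_reg_threads = 10;  /* stand-in for slurm_get_tree_width() */
	int node_record_count = 35;
	int offset = 0, cycle;

	for (cycle = 0; cycle < 16; cycle++) {
		offset += max_reg_threads;
		if ((offset > node_record_count) &&
		    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
			offset = 0;
		/* Node i registers this cycle iff
		 * (i >= offset) && (i < offset + max_reg_threads) */
		printf("cycle %2d: window [%d, %d)\n",
		       cycle, offset, offset + max_reg_threads);
	}
	return 0;
}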
void IO_Manager::process_write_stripe (uint32_t request_id,
                                       uint32_t replica_request_id,
                                       uint32_t *chunks_written,
                                       uint32_t *replica_chunks_written,
                                       uint32_t file_id, char *pathname,
                                       uint32_t stripe_id, uint32_t stripe_size,
                                       uint32_t chunk_size, const void *buf,
                                       int offset, size_t count) {
  uint32_t chunk_id, bytes_written = 0, write_size = 0;
  int chunk_offset, node_id, replica_node_id, write_result;

  assert (((int)count - offset) <= (int)stripe_size);
  printf ("\n(BARISTA) Process Write Stripe\n");

  get_first_chunk (&chunk_id, chunk_size, &chunk_offset, offset);

  while (bytes_written < count) {
    struct file_chunk cur_chunk = {file_id, stripe_id, chunk_id};

    // If the chunk does not exist, create it
    if (!chunk_exists (cur_chunk)) {
      node_id = put_chunk (file_id, pathname, stripe_id, chunk_id);
      printf ("\tchunk doesn't exist. Preparing to send chunk to node %d\n",
              node_id);
      chunk_to_node[cur_chunk] = node_id;
    }

    // If the replica does not exist, create it
    if (!chunk_replica_exists (cur_chunk)) {
      replica_node_id = put_replica (file_id, pathname, stripe_id, chunk_id);
      printf ("\tchunk replica doesn't exist. Preparing to send chunk "
              "replica to node %d\n", replica_node_id);
      chunk_to_replica_node[cur_chunk] = replica_node_id;
    }

    // Ensure that we have the proper node and replica ids to send data to
    node_id = chunk_to_node[cur_chunk];
    replica_node_id = chunk_to_replica_node[cur_chunk];

    // Determine the size of the write
    if (count - bytes_written > chunk_size - chunk_offset) {
      write_size = chunk_size - chunk_offset;
    } else {
      write_size = count - bytes_written;
    }

    // Send the write to the node
    // ADD FD HERE
    printf ("\tprocessing chunk %d (sending to node %d)\n", chunk_id, node_id);
    write_result = process_write_chunk (request_id, 0, file_id, node_id,
                                        stripe_id, chunk_id, chunk_offset,
                                        (uint8_t *)buf + bytes_written,
                                        write_size);
    printf ("\t\treceived %d from network call.\n", write_result);

    // If the write failed
    if (write_result == NODE_FAILURE) {
      // Set the node to "down"; the counters are left untouched so the
      // next loop iteration retries this chunk
      set_node_down (node_id);
    } else {
      // Send the write to the replica node
      // ADD FD HERE
      printf ("\tprocessing chunk replica %d (sending to node %d)\n",
              chunk_id, replica_node_id);
      write_result = process_write_chunk (replica_request_id, 0, file_id,
                                          replica_node_id, stripe_id, chunk_id,
                                          chunk_offset,
                                          (uint8_t *)buf + bytes_written,
                                          write_size);

      // If the replica write failed
      if (write_result == NODE_FAILURE) {
        // Set the node to "down"
        set_node_down (replica_node_id);
        // Choose a different replica
        replica_node_id = put_replica (file_id, pathname, stripe_id, chunk_id);
        // Re-write the data
        process_write_chunk (replica_request_id, 0, file_id, replica_node_id,
                             stripe_id, chunk_id, chunk_offset,
                             (uint8_t *)buf + bytes_written, write_size);
      }

      // Update counters
      chunk_offset = 0;
      bytes_written += write_size;
      chunk_id++;
      (*chunks_written)++;
      (*replica_chunks_written)++;
    }
  }
}
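/*
 * Illustration only: a plausible implementation of the get_first_chunk()
 * helper called above, assuming chunk ids start at 0 within a stripe and
 * the starting byte offset is split by plain division and modulo. The real
 * DecaFS helper may differ (e.g. a nonzero base chunk id); this sketch only
 * shows the arithmetic the write loop appears to rely on.
 */
static void get_first_chunk_sketch (uint32_t *chunk_id, uint32_t chunk_size,
                                    int *chunk_offset, int offset) {
  *chunk_id = offset / chunk_size;      /* chunk the transfer starts in */
  *chunk_offset = offset % chunk_size;  /* position inside that chunk */
}
/* Example: offset = 2500, chunk_size = 1024 -> chunk_id 2, chunk_offset 452 */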
static void _notify_slurmctld_nodes(agent_info_t *agent_ptr,
				    int no_resp_cnt, int retry_cnt)
{
	ListIterator itr = NULL;
	ret_data_info_t *ret_data_info = NULL;
	state_t state;
	int is_ret_list = 1;
	/* Locks: Read config, write job, write node */
	slurmctld_lock_t node_write_lock =
		{ READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
	thd_t *thread_ptr = agent_ptr->thread_struct;
	int i;

	/* Notify slurmctld of non-responding nodes */
	if (no_resp_cnt) {
		/* Update node table data for non-responding nodes */
		lock_slurmctld(node_write_lock);
		if (agent_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
			/* Requeue the request */
			batch_job_launch_msg_t *launch_msg_ptr =
				*agent_ptr->msg_args_pptr;
			uint32_t job_id = launch_msg_ptr->job_id;
			job_complete(job_id, 0, true, false, 0);
		}
		unlock_slurmctld(node_write_lock);
	}
	if (retry_cnt && agent_ptr->retry)
		_queue_agent_retry(agent_ptr, retry_cnt);

	/* Update last_response on responding nodes */
	lock_slurmctld(node_write_lock);
	for (i = 0; i < agent_ptr->thread_count; i++) {
		char *down_msg, *node_names;
		if (!thread_ptr[i].ret_list) {
			state = thread_ptr[i].state;
			is_ret_list = 0;
			goto switch_on_state;
		}
		is_ret_list = 1;

		itr = list_iterator_create(thread_ptr[i].ret_list);
		while ((ret_data_info = list_next(itr))) {
			state = ret_data_info->err;
		switch_on_state:
			switch (state) {
			case DSH_NO_RESP:
				if (!is_ret_list) {
					node_not_resp(thread_ptr[i].nodelist,
						      thread_ptr[i].start_time);
				} else {
					node_not_resp(ret_data_info->node_name,
						      thread_ptr[i].start_time);
				}
				break;
			case DSH_FAILED:
				if (is_ret_list)
					node_names = ret_data_info->node_name;
				else
					node_names = thread_ptr[i].nodelist;
#ifdef HAVE_FRONT_END
				down_msg = "";
#else
				set_node_down(node_names,
					      "Prolog/Epilog failure");
				down_msg = ", set to state DOWN";
#endif
				error("Prolog/Epilog failure on nodes %s%s",
				      node_names, down_msg);
				break;
			case DSH_DONE:
				if (!is_ret_list)
					node_did_resp(thread_ptr[i].nodelist);
				else
					node_did_resp(ret_data_info->node_name);
				break;
			default:
				if (!is_ret_list) {
					error("unknown state returned for %s",
					      thread_ptr[i].nodelist);
				} else {
					error("unknown state returned for %s",
					      ret_data_info->node_name);
				}
				break;
			}
			if (!is_ret_list)
				goto finished;
		}
		list_iterator_destroy(itr);
finished:	;
	}
	unlock_slurmctld(node_write_lock);

	if (run_scheduler) {
		run_scheduler = false;
		/* below functions all have their own locking */
		if (schedule(0)) {
			schedule_job_save();
			schedule_node_save();
		}
	}

	if ((agent_ptr->msg_type == REQUEST_PING) ||
	    (agent_ptr->msg_type == REQUEST_HEALTH_CHECK) ||
	    (agent_ptr->msg_type == REQUEST_NODE_REGISTRATION_STATUS))
		ping_end();
}
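/*
 * Illustration only: the switch_on_state goto above lets one switch body
 * serve both a per-thread state (no ret_list) and per-node results from a
 * ret_list. The same dispatch could be factored into a helper, sketched
 * below; the helper name is hypothetical, and DSH_FAILED (set_node_down
 * plus the HAVE_FRONT_END special case) is omitted for brevity.
 */
static void _handle_node_state_sketch(state_t state, char *node_names,
				      time_t start_time)
{
	switch (state) {
	case DSH_NO_RESP:
		node_not_resp(node_names, start_time);
		break;
	case DSH_DONE:
		node_did_resp(node_names);
		break;
	default:
		error("unknown state returned for %s", node_names);
		break;
	}
}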
uint32_t IO_Manager::process_read_stripe (uint32_t request_id, uint32_t file_id,
                                          char *pathname, uint32_t stripe_id,
                                          uint32_t stripe_size,
                                          uint32_t chunk_size, const void *buf,
                                          int offset, size_t count) {
  uint32_t chunk_id, bytes_read = 0, read_size = 0, num_chunks = 0;
  int chunk_offset, chunk_result, node_id;

  assert (((int)count - offset) <= (int)stripe_size);
  printf ("\n(BARISTA) Process Read Stripe\n");

  get_first_chunk (&chunk_id, chunk_size, &chunk_offset, offset);

  while (bytes_read < count) {
    struct file_chunk cur_chunk = {file_id, stripe_id, chunk_id};

    if (!chunk_exists (cur_chunk)) {
      // Current chunk does not exist. Report an error and stop the read.
      fprintf (stderr, "Could only read %d bytes (out of %d requested).\n",
               (int)bytes_read, (int)count);
      break;
    }

    // The chunk exists, so set the node_id
    node_id = chunk_to_node[cur_chunk];

    // If the node isn't up, switch to the replica
    if (!is_node_up (node_id)) {
      assert (chunk_replica_exists (cur_chunk));
      node_id = chunk_to_replica_node[cur_chunk];
    }

    // Determine how much data to read from the current chunk
    if (count - bytes_read > chunk_size - chunk_offset) {
      read_size = chunk_size - chunk_offset;
    } else {
      read_size = count - bytes_read;
    }

    printf ("\tprocessing chunk %d (sending to node %d)\n", chunk_id, node_id);
    printf ("\t\toffset: %d, size: %d\n", chunk_offset, read_size);

    // Send the read to the node
    // ADD FD HERE
    chunk_result = process_read_chunk (request_id, 0, file_id, node_id,
                                       stripe_id, chunk_id, chunk_offset,
                                       (uint8_t *)buf + bytes_read, read_size);
    printf ("\t\treceived %d from network call.\n", chunk_result);

    // If the node cannot be read from
    if (chunk_result == NODE_FAILURE) {
      // Mark the node as "down"
      set_node_down (node_id);
    }
    // The read succeeded, so move on
    else {
      // Update counters
      chunk_offset = 0;
      bytes_read += read_size;
      chunk_id++;
      num_chunks++;
    }
  }
  return num_chunks;
}
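/*
 * Illustration only: a worked example of the chunk-splitting arithmetic
 * shared by process_read_stripe() and process_write_stripe(). With a
 * 1024-byte chunk_size, a 3000-byte request at byte offset 500 touches
 * four chunks with transfer sizes 524, 1024, 1024 and 428. The
 * division/modulo split below is an assumption about what
 * get_first_chunk() computes.
 */
#include <stdio.h>
#include <stdint.h>

int main (void)
{
  uint32_t chunk_size = 1024, count = 3000, offset = 500;
  uint32_t chunk_id = offset / chunk_size;
  uint32_t chunk_offset = offset % chunk_size;
  uint32_t bytes_read = 0, read_size;

  while (bytes_read < count) {
    // Same size computation as the stripe loops above
    if (count - bytes_read > chunk_size - chunk_offset)
      read_size = chunk_size - chunk_offset;
    else
      read_size = count - bytes_read;
    printf ("chunk %u: offset %u, size %u\n", chunk_id, chunk_offset,
            read_size);
    chunk_offset = 0;  /* subsequent chunks start at byte 0 */
    bytes_read += read_size;
    chunk_id++;
  }
  return 0;
}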