Example #1
0
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		if (reason) {
			if (!IS_NODE_DOWN(node_ptr)) {
				xfree(node_ptr->reason);
				debug("MARKING %s DOWN (%s)",
				      node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down(node_ptr->name, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		if (job_iter == NULL)
			fatal("list_iterator_create: malloc failure");

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {

			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		if (job_ptr == NULL) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			basil_safe_release(rsvn->rsvn_id, inv);
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch)
		/* ALPS will take some time, do not schedule now. */
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	return rc;
}
Example #2
0
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 *
 * Builds up to two agent requests per call and queues each one that ended
 * up with at least one host:
 *  - REQUEST_PING for nodes that have not responded recently, and
 *  - REQUEST_NODE_REGISTRATION_STATUS for nodes in UNKNOWN state, for all
 *    nodes on the first call (restart_flag), and for a sliding window of
 *    max_reg_threads node indexes starting at `offset`.
 * Nodes whose last response is older than the computed dead time and that
 * are not already DOWN are set DOWN and reported in a single error() line.
 *
 * State kept across calls (function-local statics): restart_flag, offset,
 * max_reg_threads and last_ping_time. Per the comment on `offset`, callers
 * are expected to hold the node table write lock on entry.
 *
 * With HAVE_FRONT_END defined the front_end_nodes table is scanned,
 * otherwise node_record_table_ptr is scanned; the filtering logic is the
 * same except that the non-front-end path additionally skips FUTURE and
 * POWER_SAVE nodes.
 */
void ping_nodes (void)
{
    static bool restart_flag = true;	/* system just restarted */
    static int offset = 0;	/* mutex via node table write lock on entry */
    static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
    int i;
    time_t now, still_live_time, node_dead_time;
    static time_t last_ping_time = (time_t) 0;
    hostlist_t down_hostlist = NULL;
    char *host_str = NULL;
    agent_arg_t *ping_agent_args = NULL;
    agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
    front_end_record_t *front_end_ptr = NULL;
#else
    struct node_record *node_ptr = NULL;
#endif

    now = time (NULL);

    /* Both requests are prepared up front with empty host lists; the ones
     * that stay empty are released again at the end.
     * NOTE(review): node_count is only ever incremented here, never
     * initialized; presumably xmalloc() zero-fills - confirm. */
    ping_agent_args = xmalloc (sizeof (agent_arg_t));
    ping_agent_args->msg_type = REQUEST_PING;
    ping_agent_args->retry = 0;
    ping_agent_args->hostlist = hostlist_create("");

    reg_agent_args = xmalloc (sizeof (agent_arg_t));
    reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
    reg_agent_args->retry = 0;
    reg_agent_args->hostlist = hostlist_create("");

    /*
     * If there are a large number of down nodes, the node ping
     * can take a long time to complete:
     *  ping_time = down_nodes * agent_timeout / agent_parallelism
     *  ping_time = down_nodes * 10_seconds / 10
     *  ping_time = down_nodes (seconds)
     * Because of this, we extend the SlurmdTimeout by the
     * time needed to complete a ping of all nodes.
     *
     * Concretely: the dead-time threshold is measured back from the
     * PREVIOUS ping (last_ping_time), not from now. A threshold of zero
     * (timeout disabled, or very first call) means no node can be
     * declared dead in this pass.
     */
    if ((slurmctld_conf.slurmd_timeout == 0) ||
            (last_ping_time == (time_t) 0)) {
        node_dead_time = (time_t) 0;
    } else {
        node_dead_time = last_ping_time -
                         slurmctld_conf.slurmd_timeout;
    }
    /* Nodes heard from within the last third of SlurmdTimeout are
     * considered alive and are not pinged. */
    still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
    last_ping_time  = now;

    /* Lazily size the registration window from the tree width (at least 1). */
    if (max_reg_threads == 0) {
        max_reg_threads = MAX(slurm_get_tree_width(), 1);
    }
    /* Slide the registration window; wrap to 0 only once it is past the
     * end of the node table AND at least MAX_REG_FREQUENCY windows have
     * elapsed. */
    offset += max_reg_threads;
    if ((offset > node_record_count) &&
            (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
        offset = 0;

#ifdef HAVE_FRONT_END
    for (i = 0, front_end_ptr = front_end_nodes;
            i < front_end_node_cnt; i++, front_end_ptr++) {
        /* With the timeout disabled, only bother with nodes that are
         * UNKNOWN or flagged not-responding (or everything right after
         * a restart). */
        if ((slurmctld_conf.slurmd_timeout == 0)	&&
                (!restart_flag)				&&
                (!IS_NODE_UNKNOWN(front_end_ptr))		&&
                (!IS_NODE_NO_RESPOND(front_end_ptr)))
            continue;

        /* Silent for too long and not yet DOWN: set it DOWN and collect
         * its name for the summary message at the end. */
        if ((front_end_ptr->last_response != (time_t) 0)     &&
                (front_end_ptr->last_response <= node_dead_time) &&
                (!IS_NODE_DOWN(front_end_ptr))) {
            if (down_hostlist)
                (void) hostlist_push_host(down_hostlist,
                                          front_end_ptr->name);
            else {
                down_hostlist =
                    hostlist_create(front_end_ptr->name);
                if (down_hostlist == NULL)
                    fatal("hostlist_create: malloc error");
            }
            set_front_end_down(front_end_ptr, "Not responding");
            front_end_ptr->not_responding = false;
            continue;
        }

        /* On the first pass after a restart, seed last_response from the
         * configuration's last_update time. */
        if (restart_flag) {
            front_end_ptr->last_response =
                slurmctld_conf.last_update;
        }

        /* Request a node registration if its state is UNKNOWN or
         * on a periodic basis (about every MAX_REG_FREQUENCY ping,
         * this mechanism avoids an additional (per node) timer or
         * counter and gets updated configuration information
         * once in a while). We limit these requests since they
         * can generate a flood of incoming RPCs. */
        if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
                ((i >= offset) && (i < (offset + max_reg_threads)))) {
            hostlist_push(reg_agent_args->hostlist,
                          front_end_ptr->name);
            reg_agent_args->node_count++;
            continue;
        }

        /* Responding and heard from recently: nothing to do. */
        if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
                (front_end_ptr->last_response >= still_live_time))
            continue;

        /* Do not keep pinging down nodes since this can induce
         * huge delays in hierarchical communication fail-over */
        if (IS_NODE_NO_RESPOND(front_end_ptr) &&
                IS_NODE_DOWN(front_end_ptr))
            continue;

        hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
        ping_agent_args->node_count++;
    }
#else
    for (i=0, node_ptr=node_record_table_ptr;
            i<node_record_count; i++, node_ptr++) {
        /* FUTURE and POWER_SAVE nodes are never contacted. */
        if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
            continue;
        /* With the timeout disabled, only bother with nodes that are
         * UNKNOWN or flagged not-responding (or everything right after
         * a restart). */
        if ((slurmctld_conf.slurmd_timeout == 0) &&
                (!restart_flag)			 &&
                (!IS_NODE_UNKNOWN(node_ptr))         &&
                (!IS_NODE_NO_RESPOND(node_ptr)))
            continue;

        /* Silent for too long and not yet DOWN: set it DOWN and collect
         * its name for the summary message at the end. */
        if ((node_ptr->last_response != (time_t) 0)     &&
                (node_ptr->last_response <= node_dead_time) &&
                (!IS_NODE_DOWN(node_ptr))) {
            if (down_hostlist)
                (void) hostlist_push_host(down_hostlist,
                                          node_ptr->name);
            else {
                down_hostlist =
                    hostlist_create(node_ptr->name);
                if (down_hostlist == NULL)
                    fatal("hostlist_create: malloc error");
            }
            set_node_down(node_ptr->name, "Not responding");
            node_ptr->not_responding = false;  /* logged below */
            continue;
        }

        /* On the first pass after a restart, seed last_response from the
         * configuration's last_update time. */
        if (restart_flag)
            node_ptr->last_response = slurmctld_conf.last_update;

        /* Request a node registration if its state is UNKNOWN or
         * on a periodic basis (about every MAX_REG_FREQUENCY ping,
         * this mechanism avoids an additional (per node) timer or
         * counter and gets updated configuration information
         * once in a while). We limit these requests since they
         * can generate a flood of incoming RPCs. */
        if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
                ((i >= offset) && (i < (offset + max_reg_threads)))) {
            hostlist_push(reg_agent_args->hostlist,
                          node_ptr->name);
            reg_agent_args->node_count++;
            continue;
        }

        /* Responding and heard from recently: nothing to do. */
        if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
                (node_ptr->last_response >= still_live_time))
            continue;

        /* Do not keep pinging down nodes since this can induce
         * huge delays in hierarchical communication fail-over */
        if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
            continue;

        hostlist_push(ping_agent_args->hostlist, node_ptr->name);
        ping_agent_args->node_count++;
    }
#endif

    /* The "just restarted" special handling applies to the first call only. */
    restart_flag = false;
    /* Empty request: release it here. Otherwise log the target hosts and
     * queue it; the args are not freed in this function after queuing
     * (presumably the agent takes ownership - confirm). */
    if (ping_agent_args->node_count == 0) {
        hostlist_destroy(ping_agent_args->hostlist);
        xfree (ping_agent_args);
    } else {
        hostlist_uniq(ping_agent_args->hostlist);
        host_str = hostlist_ranged_string_xmalloc(
                       ping_agent_args->hostlist);
        debug("Spawning ping agent for %s", host_str);
        xfree(host_str);
        ping_begin();
        agent_queue_request(ping_agent_args);
    }

    /* Same treatment for the registration request. */
    if (reg_agent_args->node_count == 0) {
        hostlist_destroy(reg_agent_args->hostlist);
        xfree (reg_agent_args);
    } else {
        hostlist_uniq(reg_agent_args->hostlist);
        host_str = hostlist_ranged_string_xmalloc(
                       reg_agent_args->hostlist);
        debug("Spawning registration agent for %s %d hosts",
              host_str, reg_agent_args->node_count);
        xfree(host_str);
        ping_begin();
        agent_queue_request(reg_agent_args);
    }

    /* One consolidated error line for every node set DOWN in this pass. */
    if (down_hostlist) {
        hostlist_uniq(down_hostlist);
        host_str = hostlist_ranged_string_xmalloc(down_hostlist);
        error("Nodes %s not responding, setting DOWN", host_str);
        xfree(host_str);
        hostlist_destroy(down_hostlist);
    }
}
Example #3
0
/**
 * Write `count` bytes from `buf` into a single stripe, one chunk at a time,
 * sending every chunk first to its primary node and then to its replica node.
 *
 * @param request_id              request ID passed along with primary writes
 * @param replica_request_id      request ID passed along with replica writes
 * @param chunks_written          out: incremented once per chunk completed
 * @param replica_chunks_written  out: incremented once per chunk completed
 * @param file_id                 file the stripe belongs to
 * @param pathname                forwarded to put_chunk() / put_replica()
 * @param stripe_id               stripe being written
 * @param stripe_size             only used by the sanity assert below
 * @param chunk_size              size of one chunk, in bytes
 * @param buf                     source data; read starting at byte 0
 * @param offset                  offset within the stripe, used by
 *                                get_first_chunk() to find the first chunk
 *                                and the offset inside it
 * @param count                   number of bytes to write
 *
 * Chunks (and replicas) that are not yet mapped to a node are assigned one
 * via put_chunk() / put_replica() and recorded in chunk_to_node /
 * chunk_to_replica_node.
 *
 * Failure handling:
 *  - primary write returns NODE_FAILURE: the node is marked down and the same
 *    chunk is attempted again on the next loop iteration (no counters move).
 *    NOTE(review): this only terminates if the retry ends up on a different
 *    node, e.g. because chunk_exists() stops reporting chunks on down nodes -
 *    confirm against chunk_exists()/set_node_down().
 *  - replica write returns NODE_FAILURE: the replica node is marked down, a
 *    new replica node is chosen, recorded in chunk_to_replica_node, and the
 *    data is sent once more. The result of that second attempt is not
 *    checked.
 */
void IO_Manager::process_write_stripe (uint32_t request_id,
                                       uint32_t replica_request_id,
                                       uint32_t *chunks_written,
                                       uint32_t *replica_chunks_written,
                                       uint32_t file_id, char *pathname,
                                       uint32_t stripe_id, uint32_t stripe_size,
                                       uint32_t chunk_size, const void *buf,
                                       int offset, size_t count) {
  uint32_t chunk_id, bytes_written = 0, write_size = 0;
  int chunk_offset, node_id, replica_node_id, write_result;

  assert (((int)count - offset) <= (int)stripe_size);
  printf ("\n(BARISTA) Process Write Stripe\n");
  
  get_first_chunk (&chunk_id, chunk_size, &chunk_offset, offset);

  while (bytes_written < count) {
    struct file_chunk cur_chunk = {file_id, stripe_id, chunk_id};
    
    // If the chunk does not exist, create it
    if (!chunk_exists (cur_chunk)) {
      node_id = put_chunk (file_id, pathname, stripe_id, chunk_id);
      printf ("\tchunk doesn't exist. Preparing to send chunk to node %d\n", node_id);
      chunk_to_node[cur_chunk] = node_id;
    }

    // If the replica does not exist, create it
    if (!chunk_replica_exists (cur_chunk)) {
      replica_node_id = put_replica (file_id, pathname, stripe_id,
                                     chunk_id);
      printf ("\tchunk replica doesn't exist. Preparing to send chunk replica to node %d\n", 
                 replica_node_id);
      chunk_to_replica_node[cur_chunk] = replica_node_id;
    }

    // Ensure that we have the proper node and replica id's to send data to
    node_id = chunk_to_node[cur_chunk];
    replica_node_id = chunk_to_replica_node[cur_chunk];

    // Determine the size of the write: whatever is left of the request,
    // capped at what still fits into the current chunk
    if (count - bytes_written > chunk_size - chunk_offset) {
      write_size = chunk_size - chunk_offset;
    }
    else {
      write_size = count - bytes_written;
    }

    // Send the write to the node
                        // ADD FD HERE
    printf ("\tprocessing chunk %d (sending to node %d)\n", chunk_id, node_id);
    write_result = process_write_chunk (request_id, 0, file_id, node_id, stripe_id,
                                        chunk_id, chunk_offset, (uint8_t *)buf
                                        + bytes_written, write_size);
    printf ("\t\treceived %d from network call.\n", write_result);
    // If the write failed
    if (write_result == NODE_FAILURE) {
      // Set the node to "down" and try again (counters are left untouched so
      // the next iteration re-processes this same chunk)
      set_node_down (node_id);
    }
    else {
      // Send the write to the replica node
                          // ADD FD HERE
      printf ("\tprocessing chunk replica %d (sending to node %d)\n", chunk_id, 
                 replica_node_id);
      write_result = process_write_chunk (replica_request_id, 0, file_id, replica_node_id, stripe_id,
                                          chunk_id, chunk_offset, (uint8_t *)buf
                                          + bytes_written, write_size);
      // if the replica write failed
      if (write_result == NODE_FAILURE) {
        // Set the node to "down"
        set_node_down (replica_node_id);
        // Choose a different replica
        replica_node_id = put_replica (file_id, pathname, stripe_id,
                                       chunk_id);
        // Record the replacement, exactly as done when a replica is first
        // created above; otherwise the map keeps pointing at the node that
        // was just marked down and later lookups for this chunk go there
        chunk_to_replica_node[cur_chunk] = replica_node_id;
        // Re-write the data
        process_write_chunk (replica_request_id, 0, file_id, replica_node_id, stripe_id,
                             chunk_id, chunk_offset, (uint8_t *)buf
                             + bytes_written, write_size);
      }
      // update counters; only the first chunk can start at a non-zero offset
      chunk_offset = 0;
      bytes_written += write_size;
      chunk_id++;
      (*chunks_written)++;
      (*replica_chunks_written)++;
    }
  }
}
Example #4
0
/*
 * _notify_slurmctld_nodes - feed the outcome of an agent request back into
 *	the controller's job/node state.
 * IN agent_ptr   - agent whose per-thread results are processed
 * IN no_resp_cnt - when non-zero and the request was a batch job launch,
 *		    job_complete() is called for that job
 * IN retry_cnt   - when non-zero and agent_ptr->retry is set, the request is
 *		    handed to _queue_agent_retry()
 *
 * For every agent thread the per-node result (or, if the thread has no
 * ret_list, the thread-level state applied to the thread's nodelist) is
 * dispatched:
 *	DSH_NO_RESP -> node_not_resp()
 *	DSH_FAILED  -> set_node_down() (not under HAVE_FRONT_END) + error()
 *	DSH_DONE    -> node_did_resp()
 *	other       -> error()
 * Afterwards the scheduler is run if run_scheduler was set, and ping_end()
 * is called for ping, health-check and registration requests.
 */
static void _notify_slurmctld_nodes(agent_info_t *agent_ptr,
				    int no_resp_cnt, int retry_cnt)
{
	ListIterator itr = NULL;
	ret_data_info_t *ret_data_info = NULL;
	state_t state;
	/* 1: results come from a thread's ret_list (ret_data_info is valid);
	 * 0: thread has no ret_list, use thread_ptr[i].state / .nodelist */
	int is_ret_list = 1;
	/* Locks: Read config, write job, write node */
	slurmctld_lock_t node_write_lock =
	    { READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
	thd_t *thread_ptr = agent_ptr->thread_struct;
	int i;

	/* Notify slurmctld of non-responding nodes */
	if (no_resp_cnt) {
		/* Under the job/node write locks: a batch launch that got no
		 * response has its job completed via job_complete().
		 * NOTE(review): the `true` argument is presumably the
		 * requeue flag - confirm against job_complete(). */
		lock_slurmctld(node_write_lock);
		if (agent_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
			/* Requeue the request */
			batch_job_launch_msg_t *launch_msg_ptr =
					*agent_ptr->msg_args_pptr;
			uint32_t job_id = launch_msg_ptr->job_id;
			job_complete(job_id, 0, true, false, 0);
		}
		unlock_slurmctld(node_write_lock);
	}
	/* Retries are only queued when the request itself allows them. */
	if (retry_cnt && agent_ptr->retry)
		_queue_agent_retry(agent_ptr, retry_cnt);

	/* Update last_response on responding nodes */
	lock_slurmctld(node_write_lock);
	for (i = 0; i < agent_ptr->thread_count; i++) {
		char *down_msg, *node_names;
		if (!thread_ptr[i].ret_list) {
			/* No per-node results: take the thread-level state
			 * and jump straight INTO the loop body below so the
			 * same switch handles it. No iterator is created on
			 * this path. */
			state = thread_ptr[i].state;
			is_ret_list = 0;
			goto switch_on_state;
		}
		is_ret_list = 1;

		itr = list_iterator_create(thread_ptr[i].ret_list);
		while ((ret_data_info = list_next(itr))) {
			state = ret_data_info->err;
		switch_on_state:
			/* Reached either by normal iteration (is_ret_list
			 * == 1) or by the goto above (is_ret_list == 0);
			 * every case picks its node name(s) accordingly. */
			switch(state) {
			case DSH_NO_RESP:
				if (!is_ret_list) {
					node_not_resp(thread_ptr[i].nodelist,
						      thread_ptr[i].
						      start_time);
				} else {
					node_not_resp(ret_data_info->node_name,
						      thread_ptr[i].start_time);
				}
				break;
			case DSH_FAILED:
				if (is_ret_list)
					node_names = ret_data_info->node_name;
				else
					node_names = thread_ptr[i].nodelist;
				/* With front-end nodes the failure is only
				 * logged; otherwise the nodes are also set
				 * DOWN. */
#ifdef HAVE_FRONT_END
				down_msg = "";
#else
				set_node_down(node_names,
					      "Prolog/Epilog failure");
				down_msg = ", set to state DOWN";
#endif
				error("Prolog/Epilog failure on nodes %s%s",
				      node_names, down_msg);
				break;
			case DSH_DONE:
				if (!is_ret_list)
					node_did_resp(thread_ptr[i].nodelist);
				else
					node_did_resp(ret_data_info->node_name);
				break;
			default:
				if (!is_ret_list) {
					error("unknown state returned for %s",
					      thread_ptr[i].nodelist);
				} else {
					error("unknown state returned for %s",
					      ret_data_info->node_name);
				}
				break;
			}
			/* Entered via goto: there is no list to continue
			 * iterating and no iterator to destroy, so leave the
			 * loop past list_iterator_destroy(). */
			if (!is_ret_list)
				goto finished;
		}
		list_iterator_destroy(itr);
finished:	;
	}
	unlock_slurmctld(node_write_lock);
	/* run_scheduler is declared outside this function; it is cleared
	 * here before the scheduler is invoked. */
	if (run_scheduler) {
		run_scheduler = false;
		/* below functions all have their own locking */
		if (schedule(0))	{
			schedule_job_save();
			schedule_node_save();
		}
	}
	/* Ping-type requests signal their completion via ping_end(). */
	if ((agent_ptr->msg_type == REQUEST_PING) ||
	    (agent_ptr->msg_type == REQUEST_HEALTH_CHECK) ||
	    (agent_ptr->msg_type == REQUEST_NODE_REGISTRATION_STATUS))
		ping_end();
}
Example #5
0
/**
 * Read `count` bytes of a single stripe into `buf`, one chunk at a time.
 *
 * @param request_id   request ID passed along with every chunk read
 * @param file_id      file the stripe belongs to
 * @param pathname     not used by this function
 * @param stripe_id    stripe being read
 * @param stripe_size  only used by the sanity assert below
 * @param chunk_size   size of one chunk, in bytes
 * @param buf          destination; filled starting at byte 0. Declared
 *                     `const void *` but written through after a cast.
 * @param offset       offset within the stripe, used by get_first_chunk()
 *                     to find the first chunk and the offset inside it
 * @param count        number of bytes to read
 * @return number of chunks that were read successfully
 *
 * Each chunk is read from its primary node, or from its replica node when
 * the primary is not up. If a chunk does not exist the read stops early
 * with a message on stderr. If a read returns NODE_FAILURE the node is
 * marked down and the same chunk is attempted again.
 * NOTE(review): if primary and replica both keep failing, nothing visible
 * here ends the loop - confirm how is_node_up()/chunk_exists() behave.
 */
uint32_t IO_Manager::process_read_stripe (uint32_t request_id, uint32_t file_id,
                                          char *pathname, uint32_t stripe_id,
                                          uint32_t stripe_size, uint32_t chunk_size,
                                          const void *buf, int offset,
                                          size_t count) {
  uint32_t chunk_id;
  uint32_t bytes_read = 0;
  uint32_t num_chunks = 0;
  int chunk_offset;

  assert (((int)count - offset) <= (int)stripe_size);

  printf ("\n(BARISTA) Process Read Stripe\n");

  // Locate the chunk, and the position inside it, where the read starts
  get_first_chunk (&chunk_id, chunk_size, &chunk_offset, offset);

  while (bytes_read < count) {
    struct file_chunk cur_chunk = {file_id, stripe_id, chunk_id};

    // A missing chunk ends the read; report how far we got
    if (!chunk_exists (cur_chunk)) {
      fprintf (stderr, "Could only read %d bytes (out of %d requested.\n",
                  (int)bytes_read, (int)count);
      break;
    }

    // Prefer the primary node, fall back to the replica when it is down
    int node_id = chunk_to_node[cur_chunk];
    if (!is_node_up (node_id)) {
      assert (chunk_replica_exists (cur_chunk));
      node_id = chunk_to_replica_node[cur_chunk];
    }

    // Read whatever is left of the request, capped at the rest of this chunk
    const size_t bytes_left = count - bytes_read;
    const uint32_t room_in_chunk = chunk_size - chunk_offset;
    const uint32_t read_size =
        (bytes_left > room_in_chunk) ? room_in_chunk : (uint32_t)bytes_left;

    printf ("\tprocessing chunk %d (sending to node %d)\n", chunk_id, node_id);
    printf ("\t\toffset: %d, size: %d\n", chunk_offset, read_size);

    // Send the read to the node
                   // ADD FD HERE
    const int chunk_result =
        process_read_chunk (request_id, 0, file_id, node_id, stripe_id,
                            chunk_id, chunk_offset,
                            (uint8_t *)buf + bytes_read,
                            read_size);

    printf ("\t\treceived %d from network call.\n", chunk_result);

    // Node could not be read from: mark it down and retry the same chunk
    if (chunk_result == NODE_FAILURE) {
      set_node_down (node_id);
      continue;
    }

    // Success: advance to the start of the next chunk
    chunk_offset = 0;
    bytes_read += read_size;
    chunk_id++;
    num_chunks++;
  }

  return num_chunks;
}