extern void *
crypto_read_public_key(const char *path)
{
	munge_ctx_t ctx;
	char *socket;
	int auth_ttl, rc;

	/*
	 * Get slurm user id once. We use it later to verify credentials.
	 */
	slurm_user = slurm_get_slurm_user_id();

	ctx = munge_ctx_create();

	socket = _auth_opts_to_socket();
	if (socket) {
		rc = munge_ctx_set(ctx, MUNGE_OPT_SOCKET, socket);
		xfree(socket);
		if (rc != EMUNGE_SUCCESS) {
			error("munge_ctx_set failure");
			munge_ctx_destroy(ctx);
			return NULL;
		}
	}

	auth_ttl = slurm_get_auth_ttl();
	if (auth_ttl)
		(void) munge_ctx_set(ctx, MUNGE_OPT_TTL, auth_ttl);

	return (void *) ctx;
}
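
The context built above is evidently cached along with the file-scope slurm_user uid for later credential checks. A minimal sketch of how the two might be used together, assuming the standard libmunge munge_decode() API (the helper name is ours, not Slurm's):

/* Hypothetical helper: decode a credential with the context created
 * above and accept only root or the configured SlurmUser as sender.
 * munge_decode() is the standard libmunge call; passing NULL for the
 * payload buffer and length discards any embedded payload. */
static int
_verify_cred(munge_ctx_t ctx, const char *cred)
{
	uid_t uid;
	gid_t gid;

	if (munge_decode(cred, ctx, NULL, NULL, &uid, &gid) !=
	    EMUNGE_SUCCESS)
		return -1;
	if ((uid != 0) && (uid != slurm_user))
		return -1;	/* unauthorized sender */
	return 0;
}
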
Example n. 2
/* process RPC from slurmctld
 * IN msg: message received
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, 0 otherwise */
static int
_handle_msg(slurm_msg_t *msg, resource_allocation_response_msg_t **resp)
{
	uid_t req_uid   = g_slurm_auth_get_uid(msg->auth_cred, NULL);
	uid_t uid       = getuid();
	uid_t slurm_uid = (uid_t) slurm_get_slurm_user_id();
	int rc = 0;

	if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
		error ("Security violation, slurm message from uid %u",
			(unsigned int) req_uid);
		return 0;
	}

	switch (msg->msg_type) {
		case RESPONSE_RESOURCE_ALLOCATION:
			debug2("resource allocation response received");
			slurm_send_rc_msg(msg, SLURM_SUCCESS);
			*resp = msg->data;
			rc = 1;
			break;
		case SRUN_JOB_COMPLETE:
			info("Job has been cancelled");
			break;
		default:
			error("received spurious message type: %d",
			      msg->msg_type);
	}
	return rc;
}
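
The three-way uid test above (SlurmUser, root, or the calling user) recurs in Examples n. 3 and n. 5 below; a small sketch of the check factored into a helper makes the intent explicit (the helper name is hypothetical, and C99 stdbool.h is assumed):

#include <stdbool.h>

/* Hypothetical helper: true if req_uid may send us control messages,
 * i.e. it is root, the configured SlurmUser, or our own uid. */
static bool
_req_uid_authorized(uid_t req_uid)
{
	return (req_uid == 0) ||
	       (req_uid == (uid_t) slurm_get_slurm_user_id()) ||
	       (req_uid == getuid());
}
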
Example n. 3
/* process RPC from slurmctld
 * IN msg: message received
 * OUT resp: resource allocation response message or List of them
 * RET 1 if resp is filled in, 0 otherwise */
static int
_handle_msg(slurm_msg_t *msg, uint16_t msg_type, void **resp)
{
	char *auth_info = slurm_get_auth_info();
	uid_t req_uid;
	uid_t uid       = getuid();
	uid_t slurm_uid = (uid_t) slurm_get_slurm_user_id();
	int rc = 0;

	req_uid = g_slurm_auth_get_uid(msg->auth_cred, auth_info);
	xfree(auth_info);

	if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
		error ("Security violation, slurm message from uid %u",
			(unsigned int) req_uid);
		return 0;
	}

	if (msg->msg_type == msg_type) {
		debug2("resource allocation response received");
		slurm_send_rc_msg(msg, SLURM_SUCCESS);
		*resp = msg->data;    /* transfer payload to response */
		msg->data = NULL;
		rc = 1;
	} else if (msg->msg_type == SRUN_JOB_COMPLETE) {
		info("Job has been cancelled");
	} else {
		error("%s: received spurious message type: %u",
		      __func__, msg->msg_type);
	}
	return rc;
}
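
Note the difference from Example n. 2: msg->data is cleared after the hand-off, so freeing the message later cannot double-free the payload. A hedged sketch of a caller driving this handler; the receive loop and slurm_receive_msg() usage are assumptions for illustration, not taken from a specific Slurm version:

/* Hypothetical caller: block until one allocation response arrives.
 * Assumes slurm_receive_msg(fd, msg, timeout) fills msg on success. */
static resource_allocation_response_msg_t *
_wait_for_alloc_resp(int fd, int timeout)
{
	slurm_msg_t msg;
	void *resp = NULL;

	slurm_msg_t_init(&msg);
	while (slurm_receive_msg(fd, &msg, timeout) == SLURM_SUCCESS) {
		if (_handle_msg(&msg, RESPONSE_RESOURCE_ALLOCATION, &resp))
			break;
	}
	return (resource_allocation_response_msg_t *) resp;
}
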
Example n. 4
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called.  Put global initialization here.
 */
extern int init ( void )
{
	static int first = 1;
	char *log_file = NULL;
	int 		rc = SLURM_SUCCESS;
	mode_t		prot = 0600;
	struct stat	statbuf;

	if (slurmdbd_conf) {
		fatal("The filetxt plugin should not "
		      "be run from the slurmdbd.  "
		      "Please use a database plugin");
	}

	/* This check for the slurm user id is a quick and dirty patch
	 * to see if the controller is calling this; since we open the
	 * file in append mode, stat() could fail on it if the file
	 * isn't world writable.
	 */
	if (first && (getuid() == slurm_get_slurm_user_id())) {
		debug2("slurmdb_init() called");
		log_file = slurm_get_accounting_storage_loc();
		if (!log_file)
			log_file = xstrdup(DEFAULT_STORAGE_LOC);
		slurm_mutex_lock( &logfile_lock );
		if (LOGFILE)
			fclose(LOGFILE);

		if (*log_file != '/')
			fatal("AccountingStorageLoc must specify an "
			      "absolute pathname");
		if (stat(log_file, &statbuf) == 0)	/* preserve current file mode */
			prot = statbuf.st_mode;
		LOGFILE = fopen(log_file, "a");
		if (LOGFILE == NULL) {
			error("open %s: %m", log_file);
			storage_init = 0;
			xfree(log_file);
			slurm_mutex_unlock( &logfile_lock );
			return SLURM_ERROR;
		} else
			chmod(log_file, prot);

		xfree(log_file);

		if (setvbuf(LOGFILE, NULL, _IOLBF, 0))
			error("setvbuf() failed");
		LOGFILE_FD = fileno(LOGFILE);
		slurm_mutex_unlock( &logfile_lock );
		storage_init = 1;
		/* since this can be loaded from many different places
		   only tell us once. */
		verbose("%s loaded", plugin_name);
		first = 0;
	} else {
		debug4("%s loaded", plugin_name);
	}
	return rc;
}
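
Once init() has opened LOGFILE in line-buffered append mode, later writers in the plugin only need the mutex and a newline-terminated record. A minimal sketch of such a writer, reusing the globals above (the helper name is ours):

/* Hypothetical writer: append one record to the accounting log.
 * Relies on init() having set LOGFILE, storage_init and line
 * buffering, so the trailing newline flushes the record. */
static int
_append_record(const char *line)
{
	int rc = SLURM_SUCCESS;

	slurm_mutex_lock(&logfile_lock);
	if (!storage_init || !LOGFILE ||
	    (fprintf(LOGFILE, "%s\n", line) < 0))
		rc = SLURM_ERROR;
	slurm_mutex_unlock(&logfile_lock);
	return rc;
}
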
Example n. 5
static void
_handle_msg(slurm_msg_t *msg)
{
	static uint32_t slurm_uid = NO_VAL;
	char *auth_info = slurm_get_auth_info();
	uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, auth_info);
	uid_t uid = getuid();
	job_step_kill_msg_t *ss;
	srun_user_msg_t *um;

	xfree(auth_info);	/* free the string returned by slurm_get_auth_info() */
	if (slurm_uid == NO_VAL)
		slurm_uid = slurm_get_slurm_user_id();
	if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
		error ("Security violation, slurm message from uid %u",
		       (unsigned int) req_uid);
		return;
	}

	switch (msg->msg_type) {
	case SRUN_PING:
		debug3("slurmctld ping received");
		slurm_send_rc_msg(msg, SLURM_SUCCESS);
		slurm_free_srun_ping_msg(msg->data);
		break;
	case SRUN_JOB_COMPLETE:
		debug("received job step complete message");
		runjob_signal(SIGKILL);
		slurm_free_srun_job_complete_msg(msg->data);
		break;
	case SRUN_USER_MSG:
		um = msg->data;
		info("%s", um->msg);
		slurm_free_srun_user_msg(msg->data);
		break;
	case SRUN_TIMEOUT:
		debug("received job step timeout message");
		_handle_timeout(msg->data);
		slurm_free_srun_timeout_msg(msg->data);
		break;
	case SRUN_STEP_SIGNAL:
		ss = msg->data;
		debug("received step signal %u RPC", ss->signal);
		if (ss->signal)
			runjob_signal(ss->signal);
		slurm_free_job_step_kill_msg(msg->data);
		break;
	default:
		debug("received spurious message type: %u",
		      msg->msg_type);
		break;
	}
	return;
}
Example n. 6
static void _load_slurm_config(void)
{
	acct_storage_backup_host = slurm_get_accounting_storage_backup_host();
	acct_storage_host = slurm_get_accounting_storage_host();
	acct_storage_loc  = slurm_get_accounting_storage_loc();
	acct_storage_pass = slurm_get_accounting_storage_pass();
	acct_storage_port = slurm_get_accounting_storage_port();
	acct_storage_type = slurm_get_accounting_storage_type();
	acct_storage_user = slurm_get_accounting_storage_user();
	auth_type = slurm_get_auth_type();
	msg_timeout = slurm_get_msg_timeout();
	plugin_dir = slurm_get_plugin_dir();
	private_data = slurm_get_private_data();
	slurm_user_id = slurm_get_slurm_user_id();
	track_wckey = slurm_get_track_wckey();
}
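
Each slurm_get_*() string accessor returns an allocated copy, so a loader like this is normally paired with a teardown that releases them; a sketch of the matching free routine, assuming the same file-scope globals (the scalar fields need no cleanup):

static void _free_slurm_config(void)
{
	xfree(acct_storage_backup_host);
	xfree(acct_storage_host);
	xfree(acct_storage_loc);
	xfree(acct_storage_pass);
	xfree(acct_storage_type);
	xfree(acct_storage_user);
	xfree(auth_type);
	xfree(plugin_dir);
}
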
Example n. 7
extern int clusteracct_storage_g_node_up(void *db_conn,
					 struct node_record *node_ptr,
					 time_t event_time)
{
	if (slurm_acct_storage_init(NULL) < 0)
		return SLURM_ERROR;

	/* on some systems we need to make sure we don't say something
	   is completely up if there are cpus in an error state */
	if (node_ptr->select_nodeinfo) {
		uint16_t err_cpus = 0;
		select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
					     SELECT_NODEDATA_SUBCNT,
					     NODE_STATE_ERROR,
					     &err_cpus);
		if (err_cpus) {
			char *reason = "Setting partial node down.";
			struct node_record send_node;
			struct config_record config_rec;
			uint16_t cpu_cnt = 0;
			select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
						&cpu_cnt);
			err_cpus *= cpu_cnt;
			memset(&send_node, 0, sizeof(struct node_record));
			memset(&config_rec, 0, sizeof(struct config_record));
			send_node.name = node_ptr->name;
			send_node.config_ptr = &config_rec;
			send_node.cpus = err_cpus;
			config_rec.cpus = err_cpus;

			send_node.node_state = NODE_STATE_ERROR;

			return (*(g_acct_storage_context->ops.node_down))
				(db_conn, &send_node,
				 event_time, reason, slurm_get_slurm_user_id());
		}
	}

 	return (*(g_acct_storage_context->ops.node_up))
		(db_conn, node_ptr, event_time);
}
Example n. 8
/*
 * set_front_end_down - make the specified front end node's state DOWN and
 *	kill jobs as needed
 * IN front_end_pt - pointer to the front end node
 * IN reason - why the node is DOWN
 */
extern void set_front_end_down (front_end_record_t *front_end_ptr,
				char *reason)
{
#ifdef HAVE_FRONT_END
	time_t now = time(NULL);
	uint16_t state_flags = front_end_ptr->node_state & NODE_STATE_FLAGS;

	state_flags &= (~NODE_STATE_COMPLETING);
	front_end_ptr->node_state = NODE_STATE_DOWN | state_flags;
	trigger_front_end_down(front_end_ptr);
	(void) kill_job_by_front_end_name(front_end_ptr->name);
	if ((front_end_ptr->reason == NULL) ||
	    (strncmp(front_end_ptr->reason, "Not responding", 14) == 0)) {
		xfree(front_end_ptr->reason);
		front_end_ptr->reason = xstrdup(reason);
		front_end_ptr->reason_time = now;
		front_end_ptr->reason_uid = slurm_get_slurm_user_id();
	}
	last_front_end_update = now;
#endif
}
Example n. 9
static void _layout_conf_dbd(GtkTreeStore *treestore)
{
	ListIterator itr = NULL;
	GtkTreeIter iter;
	config_key_pair_t *key_pair;
	int update = 0;
	time_t now = time(NULL);
	char tmp_str[128], *user_name = NULL;
	List dbd_config_list = NULL;

	/* first load accounting parms from slurm.conf */
	char *acct_storage_backup_host =
		slurm_get_accounting_storage_backup_host();
	char *acct_storage_host = slurm_get_accounting_storage_host();
	char *acct_storage_loc  = slurm_get_accounting_storage_loc();
	char *acct_storage_pass = slurm_get_accounting_storage_pass();
	uint32_t acct_storage_port = slurm_get_accounting_storage_port();
	char *acct_storage_type = slurm_get_accounting_storage_type();
	char *acct_storage_user = slurm_get_accounting_storage_user();
	char *auth_type = slurm_get_auth_type();
	uint16_t msg_timeout = slurm_get_msg_timeout();
	char *plugin_dir = slurm_get_plugin_dir();
	uint16_t private_data = slurm_get_private_data();
	uint32_t slurm_user_id = slurm_get_slurm_user_id();
	uint16_t track_wckey = slurm_get_track_wckey();

	slurm_make_time_str(&now, tmp_str, sizeof(tmp_str));
	add_display_treestore_line_with_font(
		update, treestore, &iter,
		"SLURM Configuration data as of", tmp_str, "bold");

	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageBackupHost",
				   acct_storage_backup_host);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageHost", acct_storage_host);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageLoc", acct_storage_loc);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStoragePass", acct_storage_pass);
	sprintf(tmp_str, "%u", acct_storage_port);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStoragePort", tmp_str);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageType", acct_storage_type);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageUser", acct_storage_user);
	add_display_treestore_line(update, treestore, &iter,
				   "AuthType", auth_type);
	sprintf(tmp_str, "%u sec", msg_timeout);
	add_display_treestore_line(update, treestore, &iter,
				   "MessageTimeout", tmp_str);
	add_display_treestore_line(update, treestore, &iter,
				   "PluginDir", plugin_dir);
	private_data_string(private_data, tmp_str, sizeof(tmp_str));
	add_display_treestore_line(update, treestore, &iter,
				   "PrivateData", tmp_str);
	user_name = uid_to_string(slurm_user_id);
	/* user_name can be arbitrarily long, so bound the write */
	snprintf(tmp_str, sizeof(tmp_str), "%s(%u)",
		 user_name, slurm_user_id);
	xfree(user_name);
	add_display_treestore_line(update, treestore, &iter,
				   "SlurmUserId", tmp_str);
	add_display_treestore_line(update, treestore, &iter,
				   "SLURM_CONF", default_slurm_config_file);
	add_display_treestore_line(update, treestore, &iter,
				   "SLURM_VERSION", SLURM_VERSION_STRING);
	sprintf(tmp_str, "%u", track_wckey);
	add_display_treestore_line(update, treestore, &iter,
				   "TrackWCKey", tmp_str);

	xfree(acct_storage_backup_host);
	xfree(acct_storage_host);
	xfree(acct_storage_loc);
	xfree(acct_storage_pass);
	xfree(acct_storage_type);
	xfree(acct_storage_user);
	xfree(auth_type);
	xfree(plugin_dir);

	/* now load accounting parms from slurmdbd.conf */

	/* second load slurmdbd.conf parms */
	if (!(dbd_config_list = slurmdb_config_get(NULL)))
		return;

	add_display_treestore_line_with_font(
		update, treestore, &iter,
		"\nSlurmDBD Configuration:", NULL, "bold");

	itr = list_iterator_create(dbd_config_list);
	while ((key_pair = list_next(itr))) {
		add_display_treestore_line(update, treestore, &iter,
					   key_pair->name,
					   key_pair->value);
	}
	list_iterator_destroy(itr);
}
Example n. 10
extern int sacctmgr_dump_cluster (int argc, char **argv)
{
	slurmdb_user_cond_t user_cond;
	slurmdb_user_rec_t *user = NULL;
	slurmdb_hierarchical_rec_t *slurmdb_hierarchical_rec = NULL;
	slurmdb_assoc_rec_t *assoc = NULL;
	slurmdb_assoc_cond_t assoc_cond;
	List assoc_list = NULL;
	List acct_list = NULL;
	List user_list = NULL;
	List slurmdb_hierarchical_rec_list = NULL;
	char *cluster_name = NULL;
	char *file_name = NULL;
	char *user_name = NULL;
	char *line = NULL;
	int i, command_len = 0;
	FILE *fd = NULL;
	char *class_str = NULL;

	for (i = 0; i < argc; i++) {
		int end = parse_option_end(argv[i]);

		if (!end)
			command_len = strlen(argv[i]);
		else {
			command_len = end - 1;
			if (argv[i][end] == '=') {
				end++;
			}
		}
		if (!end || !strncasecmp(argv[i], "Cluster",
					 MAX(command_len, 1))) {
			if (cluster_name) {
				exit_code = 1;
				fprintf(stderr,
					" Can only do one cluster at a time.  "
					"Already doing %s\n", cluster_name);
				continue;
			}
			cluster_name = xstrdup(argv[i]+end);
		} else if (!strncasecmp(argv[i], "File",
					MAX(command_len, 1))) {
			if (file_name) {
				exit_code = 1;
				fprintf(stderr,
					" File name already set to %s\n",
					file_name);
				continue;
			}
			file_name = xstrdup(argv[i]+end);
		} else {
			exit_code = 1;
			fprintf(stderr, " Unknown option: %s\n", argv[i]);
		}
	}

	if (!cluster_name) {
		exit_code = 1;
		fprintf(stderr, " We need a cluster to dump.\n");
		xfree(file_name);
		return SLURM_ERROR;
	} else {
		List temp_list = NULL;
		slurmdb_cluster_cond_t cluster_cond;
		slurmdb_cluster_rec_t *cluster_rec = NULL;

		slurmdb_init_cluster_cond(&cluster_cond, 0);
		cluster_cond.cluster_list = list_create(NULL);
		list_push(cluster_cond.cluster_list, cluster_name);

		temp_list = acct_storage_g_get_clusters(db_conn, my_uid,
							&cluster_cond);
		FREE_NULL_LIST(cluster_cond.cluster_list);
		if (!temp_list) {
			exit_code = 1;
			fprintf(stderr,
				" Problem getting clusters from database.  "
				"Contact your admin.\n");
			xfree(cluster_name);
			xfree(file_name);
			return SLURM_ERROR;
		}

		cluster_rec = list_peek(temp_list);
		if (!cluster_rec) {
			exit_code = 1;
			fprintf(stderr, " Cluster %s doesn't exist.\n",
				cluster_name);
			xfree(cluster_name);
			xfree(file_name);
			FREE_NULL_LIST(temp_list);
			return SLURM_ERROR;
		}
		class_str = get_classification_str(cluster_rec->classification);
		FREE_NULL_LIST(temp_list);
	}

	if (!file_name) {
		file_name = xstrdup_printf("./%s.cfg", cluster_name);
		printf(" No filename given, using %s.\n", file_name);
	}

	memset(&user_cond, 0, sizeof(slurmdb_user_cond_t));
	user_cond.with_coords = 1;
	user_cond.with_wckeys = 1;
	user_cond.with_assocs = 1;

	memset(&assoc_cond, 0, sizeof(slurmdb_assoc_cond_t));
	assoc_cond.without_parent_limits = 1;
	assoc_cond.with_raw_qos = 1;
	assoc_cond.cluster_list = list_create(NULL);
	list_append(assoc_cond.cluster_list, cluster_name);
	/* this is needed for getting the correct wckeys */
	user_cond.assoc_cond = &assoc_cond;

	user_list = acct_storage_g_get_users(db_conn, my_uid, &user_cond);
	/* If not running with the DBD, assoc_cond.user_list can be set,
	 * which will mess other things up.
	 */
	if (assoc_cond.user_list) {
		FREE_NULL_LIST(assoc_cond.user_list);
		assoc_cond.user_list = NULL;
	}

	/* make sure this person running is an admin */
	user_name = uid_to_string_cached(my_uid);
	if (!(user = sacctmgr_find_user_from_list(user_list, user_name))) {
		exit_code = 1;
		fprintf(stderr, " Your uid (%u) is not in the "
			"accounting system, can't dump cluster.\n", my_uid);
		FREE_NULL_LIST(assoc_cond.cluster_list);
		xfree(cluster_name);
		xfree(file_name);
		FREE_NULL_LIST(user_list);
		return SLURM_ERROR;

	} else {
		if (my_uid != slurm_get_slurm_user_id() && my_uid != 0
		    && user->admin_level < SLURMDB_ADMIN_SUPER_USER) {
			exit_code = 1;
			fprintf(stderr, " Your user does not have sufficient "
				"privileges to dump clusters.\n");
			FREE_NULL_LIST(assoc_cond.cluster_list);
			xfree(cluster_name);
			xfree(file_name);
			FREE_NULL_LIST(user_list);
			return SLURM_ERROR;
		}
	}
	xfree(user_name);

	/* assoc_cond is set up above */
	assoc_list = acct_storage_g_get_assocs(db_conn, my_uid,
						     &assoc_cond);
	FREE_NULL_LIST(assoc_cond.cluster_list);
	if (!assoc_list) {
		exit_code = 1;
		fprintf(stderr, " Problem with query.\n");
		xfree(cluster_name);
		xfree(file_name);
		return SLURM_ERROR;
	} else if (!list_count(assoc_list)) {
		exit_code = 1;
		fprintf(stderr, " Cluster %s returned nothing.\n",
			cluster_name);
		FREE_NULL_LIST(assoc_list);
		xfree(cluster_name);
		xfree(file_name);
		return SLURM_ERROR;
	}

	slurmdb_hierarchical_rec_list = slurmdb_get_acct_hierarchical_rec_list(
		assoc_list);

	acct_list = acct_storage_g_get_accounts(db_conn, my_uid, NULL);

	if ((fd = fopen(file_name,"w")) == NULL) {
		fprintf(stderr, "Can't open file %s, %s\n", file_name,
			slurm_strerror(errno));
		FREE_NULL_LIST(acct_list);
		FREE_NULL_LIST(assoc_list);
		xfree(cluster_name);
		xfree(file_name);
		FREE_NULL_LIST(slurmdb_hierarchical_rec_list);
		return SLURM_ERROR;
	}

	/* Add header */
	if (fprintf(fd,
		    "# To edit this file start with a cluster line "
		    "for the new cluster\n"
		    "# Cluster - 'cluster_name':MaxNodesPerJob=50\n"
		    "# Followed by Accounts you want in this fashion "
		    "(root is created by default)...\n"
		    "# Parent - 'root'\n"
		    "# Account - 'cs':MaxNodesPerJob=5:MaxJobs=4:"
		    "MaxTRESMins=cpu=20:FairShare=399:"
		    "MaxWallDuration=40:Description='Computer Science':"
		    "Organization='LC'\n"
		    "# Any of the options after a ':' can be left out and "
		    "they can be in any order.\n"
		    "# If you want to add any sub accounts just list the "
		    "Parent THAT HAS ALREADY \n"
		    "# BEEN CREATED before the account line in this "
		    "fashion...\n"
		    "# Parent - 'cs'\n"
		    "# Account - 'test':MaxNodesPerJob=1:MaxJobs=1:"
		    "MaxTRESMins=cpu=1:FairShare=1:"
		    "MaxWallDuration=1:"
		    "Description='Test Account':Organization='Test'\n"
		    "# To add users to a account add a line like this after a "
		    "Parent - 'line'\n"
		    "# User - 'lipari':MaxNodesPerJob=2:MaxJobs=3:"
		    "MaxTRESMins=cpu=4:FairShare=1:"
		    "MaxWallDurationPerJob=1\n") < 0) {
		exit_code = 1;
		fprintf(stderr, "Can't write to file");
		FREE_NULL_LIST(acct_list);
		FREE_NULL_LIST(assoc_list);
		xfree(cluster_name);
		xfree(file_name);
		FREE_NULL_LIST(slurmdb_hierarchical_rec_list);
		return SLURM_ERROR;
	}

	line = xstrdup_printf("Cluster - '%s'", cluster_name);

	if (class_str)
		xstrfmtcat(line, ":Classification='%s'", class_str);

	slurmdb_hierarchical_rec = list_peek(slurmdb_hierarchical_rec_list);
	assoc = slurmdb_hierarchical_rec->assoc;
	if (xstrcmp(assoc->acct, "root")) {
		fprintf(stderr, "Root association not on the top it was %s\n",
			assoc->acct);
	} else
		print_file_add_limits_to_line(&line, assoc);

	if (fprintf(fd, "%s\n", line) < 0) {
		exit_code = 1;
		fprintf(stderr, " Can't write to file");
		FREE_NULL_LIST(acct_list);
		FREE_NULL_LIST(assoc_list);
		xfree(cluster_name);
		xfree(file_name);
		xfree(line);
		FREE_NULL_LIST(slurmdb_hierarchical_rec_list);
		return SLURM_ERROR;
	}
	info("%s", line);
	xfree(line);

	print_file_slurmdb_hierarchical_rec_list(
		fd, slurmdb_hierarchical_rec_list, user_list, acct_list);

	FREE_NULL_LIST(acct_list);
	FREE_NULL_LIST(assoc_list);
	xfree(cluster_name);
	xfree(file_name);
	FREE_NULL_LIST(slurmdb_hierarchical_rec_list);
	fclose(fd);

	return SLURM_SUCCESS;
}
Example n. 11
/**
 * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates.
 *
 * Checks both SDB database and ALPS inventory for consistency. The inventory
 * part is identical to basil_inventory(), with the difference of being called
 * before valid bitmaps exist, from select_g_node_init().
 * Its dependencies are:
 * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields,
 * - it relies on _sync_nodes_to_jobs() to
 *   o kill active jobs on nodes now marked DOWN,
 *   o reset node state to ALLOCATED if it has been marked IDLE here (which is
 *     an error case, since there is no longer an ALPS reservation for the job,
 *     this is caught by the subsequent basil_inventory()).
 */
extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt)
{
    struct node_record *node_ptr, *end = node_ptr_array + node_cnt;
    enum basil_version version = get_basil_version();
    struct basil_inventory *inv;

    /* General MySQL */
    MYSQL		*handle;
    MYSQL_STMT	*stmt = NULL;
    /* Input parameters */
    unsigned int	node_id;
    /*
     * Use a left outer join here since the attributes table may not be
     * populated for a given nodeid (e.g. when the node has been disabled
     * on the SMW via 'xtcli disable').
     * The processor table has more authoritative information, if a nodeid
     * is not listed there, it does not exist.
     */
    const char query[] =	"SELECT x_coord, y_coord, z_coord,"
                            "       cab_position, cab_row, cage, slot, cpu,"
                            "	LOG2(coremask+1), availmem, "
                            "       processor_type  "
                            "FROM  processor LEFT JOIN attributes "
                            "ON    processor_id = nodeid "
                            "WHERE processor_id = ? ";
    const int	PARAM_COUNT = 1;	/* node id */
    MYSQL_BIND	params[PARAM_COUNT];

    int		x_coord, y_coord, z_coord;
    int		cab, row, cage, slot, cpu;
    unsigned int	node_cpus, node_mem;
    char		proc_type[BASIL_STRING_SHORT];
    MYSQL_BIND	bind_cols[COLUMN_COUNT];
    my_bool		is_null[COLUMN_COUNT];
    my_bool		is_error[COLUMN_COUNT];
    int		is_gemini, i;
    time_t		now = time(NULL);

    memset(params, 0, sizeof(params));
    params[0].buffer_type = MYSQL_TYPE_LONG;
    params[0].is_unsigned = true;
    params[0].is_null     = (my_bool *)0;
    params[0].buffer      = (char *)&node_id;

    memset(bind_cols, 0, sizeof(bind_cols));
    for (i = 0; i < COLUMN_COUNT; i ++) {
        bind_cols[i].is_null = &is_null[i];
        bind_cols[i].error   = &is_error[i];

        if (i == COL_TYPE) {
            bind_cols[i].buffer_type   = MYSQL_TYPE_STRING;
            bind_cols[i].buffer_length = sizeof(proc_type);
            bind_cols[i].buffer	   = proc_type;
        } else {
            bind_cols[i].buffer_type   = MYSQL_TYPE_LONG;
            bind_cols[i].is_unsigned   = (i >= COL_CORES);
        }
    }
    bind_cols[COL_X].buffer	     = (char *)&x_coord;
    bind_cols[COL_Y].buffer	     = (char *)&y_coord;
    bind_cols[COL_Z].buffer	     = (char *)&z_coord;
    bind_cols[COL_CAB].buffer    = (char *)&cab;
    bind_cols[COL_ROW].buffer    = (char *)&row;
    bind_cols[COL_CAGE].buffer   = (char *)&cage;
    bind_cols[COL_SLOT].buffer   = (char *)&slot;
    bind_cols[COL_CPU].buffer    = (char *)&cpu;
    bind_cols[COL_CORES].buffer  = (char *)&node_cpus;
    bind_cols[COL_MEMORY].buffer = (char *)&node_mem;

    inv = get_full_inventory(version);
    if (inv == NULL)
        fatal("failed to get initial BASIL inventory");

    info("BASIL %s initial INVENTORY: %d/%d batch nodes available",
         bv_names_long[version], inv->batch_avail, inv->batch_total);

    handle = cray_connect_sdb();
    if (handle == NULL)
        fatal("can not connect to XTAdmin database on the SDB");

    is_gemini = cray_is_gemini_system(handle);
    if (is_gemini < 0)
        fatal("can not determine Cray XT/XE system type");

    stmt = prepare_stmt(handle, query, params, PARAM_COUNT,
                        bind_cols, COLUMN_COUNT);
    if (stmt == NULL)
        fatal("can not prepare statement to resolve Cray coordinates");

    for (node_ptr = node_record_table_ptr; node_ptr < end; node_ptr++) {
        struct basil_node *node;
        char *reason = NULL;

        if ((node_ptr->name == NULL) ||
                (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) {
            error("can not read basil_node_id from %s",
                  node_ptr->name);
            continue;
        }

        if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0)
            fatal("can not resolve %s coordinates", node_ptr->name);

        if (fetch_stmt(stmt) == 0) {
#if _DEBUG
            info("proc_type:%s cpus:%u memory:%u",
                 proc_type, node_cpus, node_mem);
            info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u",
                 row, cage, slot, cpu, x_coord, y_coord, z_coord);
#endif
            if (strcmp(proc_type, "compute") != 0) {
                /*
                 * Switching a compute node to be a service node
                 * can not happen at runtime: requires a reboot.
                 */
                fatal("Node '%s' is a %s node. "
                      "Only compute nodes can appear in slurm.conf.",
                      node_ptr->name, proc_type);
            } else if (is_null[COL_CORES] || is_null[COL_MEMORY]) {
                /*
                 * This can happen if a node has been disabled
                 * on the SMW (using 'xtcli disable <nid>'). The
                 * node will still be listed in the 'processor'
                 * table, but have no 'attributes' entry (NULL
                 * values for CPUs/memory). Also, the node will
                 * be invisible to ALPS, which is why we need to
                 * set it down here already.
                 */
                node_cpus = node_mem = 0;
                reason = "node data unknown - disabled on SMW?";
            } else if (is_null[COL_X] || is_null[COL_Y]
                       || is_null[COL_Z]) {
                /*
                 * Similar case to the one above, observed when
                 * a blade has been removed. Node will not
                 * likely show up in ALPS.
                 */
                x_coord = y_coord = z_coord = 0;
                reason = "unknown coordinates - hardware failure?";
            } else if (node_cpus < node_ptr->config_ptr->cpus) {
                /*
                 * FIXME: Might reconsider this policy.
                 *
                 * FastSchedule is ignored here, it requires the
                 * slurm.conf to be consistent with hardware.
                 *
                 * Assumption is that CPU/Memory do not change
                 * at runtime (Cray has no hot-swappable parts).
                 *
                 * Hence checking it in basil_inventory() would
                 * mean a lot of runtime overhead.
                 */
                fatal("slurm.conf: node %s has only Procs=%d",
                      node_ptr->name, node_cpus);
            } else if (node_mem < node_ptr->config_ptr->real_memory) {
                fatal("slurm.conf: node %s has RealMemory=%d",
                      node_ptr->name, node_mem);
            }

        } else if (is_gemini) {
            fatal("Non-existing Gemini node '%s' in slurm.conf",
                  node_ptr->name);
        } else {
            fatal("Non-existing SeaStar node '%s' in slurm.conf",
                  node_ptr->name);
        }

        if (!is_gemini) {
            /*
             * SeaStar: each node has unique coordinates
             */
            if (node_ptr->arch == NULL)
                node_ptr->arch = xstrdup("XT");
        } else {
            /*
             * Gemini: each 2 nodes share the same network
             * interface (i.e., nodes 0/1 and 2/3 each have
             * the same coordinates).
             */
            if (node_ptr->arch == NULL)
                node_ptr->arch = xstrdup("XE");
        }

        xfree(node_ptr->node_hostname);
        xfree(node_ptr->comm_name);
        /*
         * Convention: since we are using SLURM in frontend-mode,
         *             we use Node{Addr,HostName} as follows.
         *
         * NodeAddr:      <X><Y><Z> coordinates in base-36 encoding
         *
         * NodeHostName:  c#-#c#s#n# using the  NID convention
         *                <cabinet>-<row><chassis><slot><node>
         * - each cabinet can accommodate 3 chassis (c1..c3)
         * - each chassis has 8 slots               (s0..s7)
         * - each slot contains 2 or 4 nodes        (n0..n3)
         *   o either 2 service nodes (n0/n3)
         *   o or 4 compute nodes     (n0..n3)
         *   o or 2 gemini chips      (g0/g1 serving n0..n3)
         *
         * Example: c0-0c1s0n1
         *          - c0- = cabinet 0
         *          - 0   = row     0
         *          - c1  = chassis 1
         *          - s0  = slot    0
         *          - n1  = node    1
         */
        node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u", cab,
                                  row, cage, slot, cpu);
        node_ptr->comm_name = xstrdup_printf("%c%c%c",
                                             _enc_coord(x_coord),
                                             _enc_coord(y_coord),
                                             _enc_coord(z_coord));
        dim_size[0] = MAX(dim_size[0], (x_coord - 1));
        dim_size[1] = MAX(dim_size[1], (y_coord - 1));
        dim_size[2] = MAX(dim_size[2], (z_coord - 1));
#if _DEBUG
        info("%s  %s  %s  cpus=%u, mem=%u reason=%s", node_ptr->name,
             node_ptr->node_hostname, node_ptr->comm_name,
             node_cpus, node_mem, reason);
#endif
        /*
         * Check the current state reported by ALPS inventory, unless it
         * is already evident that the node has some other problem.
         */
        if (reason == NULL) {
            for (node = inv->f->node_head; node; node = node->next)
                if (node->node_id == node_id)
                    break;
            if (node == NULL) {
                reason = "not visible to ALPS - check hardware";
            } else if (node->state == BNS_DOWN) {
                reason = "ALPS marked it DOWN";
            } else if (node->state == BNS_UNAVAIL) {
                reason = "node is UNAVAILABLE";
            } else if (node->state == BNS_ROUTE) {
                reason = "node does ROUTING";
            } else if (node->state == BNS_SUSPECT) {
                reason = "entered SUSPECT mode";
            } else if (node->state == BNS_ADMINDOWN) {
                reason = "node is ADMINDOWN";
            } else if (node->state != BNS_UP) {
                reason = "state not UP";
            } else if (node->role != BNR_BATCH) {
                reason = "mode not BATCH";
            } else if (node->arch != BNA_XT) {
                reason = "arch not XT/XE";
            }
        }

        /* Base state entirely derives from ALPS
         * NOTE: The node bitmaps are not defined when this code is
         * initially executed. */
        node_ptr->node_state &= NODE_STATE_FLAGS;
        if (reason) {
            if (node_ptr->down_time == 0)
                node_ptr->down_time = now;
            if (IS_NODE_DOWN(node_ptr)) {
                /* node still down */
                debug("Initial DOWN node %s - %s",
                      node_ptr->name, node_ptr->reason);
            } else if (slurmctld_conf.slurmd_timeout &&
                       ((now - node_ptr->down_time) <
                        slurmctld_conf.slurmd_timeout)) {
                node_ptr->node_state |= NODE_STATE_NO_RESPOND;
            } else {
                info("Initial DOWN node %s - %s",
                     node_ptr->name, reason);
                node_ptr->reason = xstrdup(reason);
                /* Node state flags preserved above */
                node_ptr->node_state |= NODE_STATE_DOWN;
                clusteracct_storage_g_node_down(acct_db_conn,
                                                node_ptr,
                                                now, NULL,
                                                slurm_get_slurm_user_id());
            }
        } else {
            bool node_up_flag = IS_NODE_DOWN(node_ptr) &&
                                !IS_NODE_DRAIN(node_ptr) &&
                                !IS_NODE_FAIL(node_ptr);
            node_ptr->down_time = 0;
            if (node_is_allocated(node))
                node_ptr->node_state |= NODE_STATE_ALLOCATED;
            else
                node_ptr->node_state |= NODE_STATE_IDLE;
            node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
            xfree(node_ptr->reason);
            if (node_up_flag) {
                info("ALPS returned node %s to service",
                     node_ptr->name);
                clusteracct_storage_g_node_up(acct_db_conn,
                                              node_ptr, now);
            }
        }

        free_stmt_result(stmt);
    }

    if (stmt_close(stmt))
        error("error closing statement: %s", mysql_stmt_error(stmt));
    cray_close_sdb(handle);
    free_inv(inv);

    return SLURM_SUCCESS;
}
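
The comm_name built above packs each (X,Y,Z) coordinate into a single character using a base-36 convention (0-9, then A-Z). _enc_coord() is not shown in this excerpt; presumably it looks like the following sketch:

/* Presumed encoding: map 0..35 onto '0'..'9' then 'A'..'Z' so one
 * coordinate fits in a single NodeAddr character. */
static char _enc_coord(int coord)
{
	return coord + (coord < 10 ? '0' : 'A' - 10);
}
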
Example n. 12
static uint32_t _get_wckeyid(mysql_conn_t *mysql_conn, char **name,
			     uid_t uid, char *cluster, uint32_t associd)
{
	uint32_t wckeyid = 0;

	if (slurm_get_track_wckey()) {
		/* Here we are looking for the wckeyid if it doesn't
		 * exist we will create one.  We don't need to check
		 * if it is good or not.  Right now this is the only
		 * place things are created. We do this only on a job
		 * start, not on a job submit since we don't want to
		 * slow down getting the db_index back to the
		 * controller.
		 */
		slurmdb_wckey_rec_t wckey_rec;
		char *user = NULL;

		/* since we are unable to rely on uids here (someone could
		   not have their uid in the system yet) we must
		   first get the user name from the associd */
		if (!(user = _get_user_from_associd(
			      mysql_conn, cluster, associd))) {
			error("No user for associd %u", associd);
			goto no_wckeyid;
		}
		/* get the default key */
		if (!*name) {
			slurmdb_user_rec_t user_rec;
			memset(&user_rec, 0, sizeof(slurmdb_user_rec_t));
			user_rec.uid = NO_VAL;
			user_rec.name = user;
			if (assoc_mgr_fill_in_user(mysql_conn, &user_rec,
						   1, NULL) != SLURM_SUCCESS) {
				error("No user by name of %s assoc %u",
				      user, associd);
				xfree(user);
				goto no_wckeyid;
			}

			if (user_rec.default_wckey)
				*name = xstrdup_printf("*%s",
						       user_rec.default_wckey);
			else
				*name = xstrdup_printf("*");
		}

		memset(&wckey_rec, 0, sizeof(slurmdb_wckey_rec_t));
		wckey_rec.name = (*name);
		wckey_rec.uid = NO_VAL;
		wckey_rec.user = user;
		wckey_rec.cluster = cluster;
		if (assoc_mgr_fill_in_wckey(mysql_conn, &wckey_rec,
					    ACCOUNTING_ENFORCE_WCKEYS,
					    NULL) != SLURM_SUCCESS) {
			List wckey_list = NULL;
			slurmdb_wckey_rec_t *wckey_ptr = NULL;

			wckey_list = list_create(slurmdb_destroy_wckey_rec);

			wckey_ptr = xmalloc(sizeof(slurmdb_wckey_rec_t));
			wckey_ptr->name = xstrdup((*name));
			wckey_ptr->user = xstrdup(user);
			wckey_ptr->cluster = xstrdup(cluster);
			list_append(wckey_list, wckey_ptr);
			/* info("adding wckey '%s' '%s' '%s'", */
			/* 	     wckey_ptr->name, wckey_ptr->user, */
			/* 	     wckey_ptr->cluster); */
			/* we have already checked to make
			   sure this was the slurm user before
			   calling this */
			if (as_mysql_add_wckeys(mysql_conn,
						slurm_get_slurm_user_id(),
						wckey_list)
			    == SLURM_SUCCESS)
				acct_storage_p_commit(mysql_conn, 1);
			/* If that worked lets get it */
			assoc_mgr_fill_in_wckey(mysql_conn, &wckey_rec,
						ACCOUNTING_ENFORCE_WCKEYS,
						NULL);

			list_destroy(wckey_list);
		}
		xfree(user);
		/* info("got wckeyid of %d", wckey_rec.id); */
		wckeyid = wckey_rec.id;
	}
no_wckeyid:
	return wckeyid;
}
Example n. 13
/* Find the specified BlueGene node ID and drain it from SLURM */
static void _configure_node_down(rm_bp_id_t bp_id, my_bluegene_t *my_bg)
{
	int bp_num, i, rc;
	rm_bp_id_t bpid;
	rm_BP_t *my_bp;
	rm_location_t bp_loc;
	rm_BP_state_t bp_state;
	char bg_down_node[128];

	if ((rc = bridge_get_data(my_bg, RM_BPNum, &bp_num)) != SLURM_SUCCESS) {
		error("bridge_get_data(RM_BPNum): %s", bg_err_str(rc));
		bp_num = 0;
	}

	for (i=0; i<bp_num; i++) {
		if (i) {
			if ((rc = bridge_get_data(my_bg, RM_NextBP, &my_bp))
			    != SLURM_SUCCESS) {
				error("bridge_get_data(RM_NextBP): %s",
				      bg_err_str(rc));
				continue;
			}
		} else {
			if ((rc = bridge_get_data(my_bg, RM_FirstBP, &my_bp))
			    !=
			    SLURM_SUCCESS) {
				error("bridge_get_data(RM_FirstBP): %s",
				      bg_err_str(rc));
				continue;
			}
		}

		if ((rc = bridge_get_data(my_bp, RM_BPID, &bpid))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_BPID): %s",
			      bg_err_str(rc));
			continue;
		}

		if (!bpid) {
			error("No BPID was returned from database");
			continue;
		}

		if (strcmp(bp_id, bpid) != 0) {	/* different midplane */
			free(bpid);
			continue;
		}
		free(bpid);

		if ((rc = bridge_get_data(my_bp, RM_BPState, &bp_state))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_BPState): %s",
			      bg_err_str(rc));
			continue;
		}
		if  (bp_state != RM_BP_UP) 		/* already down */
			continue;

		if ((rc = bridge_get_data(my_bp, RM_BPLoc, &bp_loc))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_BPLoc): %s",
			      bg_err_str(rc));
			continue;
		}

		/* make sure we have this midplane in the system */
		if (bp_loc.X >= DIM_SIZE[X]
		    || bp_loc.Y >= DIM_SIZE[Y]
		    || bp_loc.Z >= DIM_SIZE[Z]) {
			debug4("node %s%c%c%c isn't configured",
			       bg_conf->slurm_node_prefix,
			       alpha_num[bp_loc.X], alpha_num[bp_loc.Y],
			       alpha_num[bp_loc.Z]);
			continue;
		}

		snprintf(bg_down_node, sizeof(bg_down_node), "%s%c%c%c",
			 bg_conf->slurm_node_prefix,
			 alpha_num[bp_loc.X], alpha_num[bp_loc.Y],
			 alpha_num[bp_loc.Z]);


		if (node_already_down(bg_down_node))
			break;

		error("switch for node %s is bad", bg_down_node);
		slurm_drain_nodes(bg_down_node,
				  "select_bluegene: MMCS switch not UP",
				  slurm_get_slurm_user_id());
		break;
	}
}
Example n. 14
/*
 * This could potentially take the node lock in the slurmctld via
 * slurm_drain_nodes() or slurm_fail_job(), so if slurmctld_locked is set we
 * call the functions that do not take the locks again.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* this is here as a sanity check to make sure we don't core on
	   these bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);

		}
		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on.
		*/
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record ||
		    (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node.
		*/
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){
			if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this nodecard.  "
		      "Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(mp_name, reason,
						  slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state
		*/
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		switch(smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap.  If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		switch(create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			break;
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			if (!node_already_down(mp_name)) {
				char *reason = "select_bluegene: nodecard down";
				if (slurmctld_locked)
					drain_nodes(mp_name, reason,
						    slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
			break;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}


	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively this is the only way to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;

}
Example n. 15
/* block_state_mutex should be locked before calling */
static int _check_all_blocks_error(int node_inx, time_t event_time,
				   char *reason)
{
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL;
	struct node_record send_node, *node_ptr;
	struct config_record config_rec;
	int total_cpus = 0;
	int rc = SLURM_SUCCESS;

	xassert(node_inx < node_record_count);
	node_ptr = &node_record_table_ptr[node_inx];

	/* only do this if the node isn't in the DRAINED state.
	   DRAINING is ok */
	if (IS_NODE_DRAINED(node_ptr))
		return rc;

	memset(&send_node, 0, sizeof(struct node_record));
	memset(&config_rec, 0, sizeof(struct config_record));
	send_node.name = xstrdup(node_ptr->name);
	send_node.config_ptr = &config_rec;

	/* here we need to check if there are any other blocks on this
	   midplane and adjust things correctly */
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		/* only look at other nodes in error state */
		if (!(bg_record->state & BG_BLOCK_ERROR_FLAG))
			continue;
		if (!bit_test(bg_record->mp_bitmap, node_inx))
			continue;
		if (bg_record->cpu_cnt >= bg_conf->cpus_per_mp) {
			total_cpus = bg_conf->cpus_per_mp;
			break;
		} else
			total_cpus += bg_record->cpu_cnt;
	}
	list_iterator_destroy(itr);

	send_node.cpus = total_cpus;
	config_rec.cpus = total_cpus;

	if (send_node.cpus) {
		if (!reason)
			reason = "update block: setting partial node down.";
		if (!node_ptr->reason)
			node_ptr->reason = xstrdup(reason);
		node_ptr->reason_time = event_time;
		node_ptr->reason_uid = slurm_get_slurm_user_id();

		send_node.node_state = NODE_STATE_ERROR;
		rc = clusteracct_storage_g_node_down(acct_db_conn,
						     &send_node, event_time,
						     reason,
						     node_ptr->reason_uid);
	} else {
		if (node_ptr->reason)
			xfree(node_ptr->reason);
		node_ptr->reason_time = 0;
		node_ptr->reason_uid = NO_VAL;

		send_node.node_state = NODE_STATE_IDLE;
		rc = clusteracct_storage_g_node_up(acct_db_conn,
						   &send_node, event_time);
	}

	xfree(send_node.name);

	return rc;
}