Пример #1
0
/*
 * Close a persistent connection and then open it again.
 *
 * IN/OUT persist_conn - connection to close and reopen
 * IN with_init - true to reopen through slurm_persist_conn_open(),
 *	false to reopen through slurm_persist_conn_open_without_init()
 * RET the return code of whichever open routine was called
 */
extern int slurm_persist_conn_reopen(slurm_persist_conn_t *persist_conn,
				     bool with_init)
{
	int rc;

	/* Always drop the current connection state before reopening. */
	slurm_persist_conn_close(persist_conn);

	if (!with_init)
		rc = slurm_persist_conn_open_without_init(persist_conn);
	else
		rc = slurm_persist_conn_open(persist_conn);

	return rc;
}
Пример #2
0
/*
 * Open the outgoing persistent connection (cluster->fed.send) to a sibling
 * cluster, creating the connection record on first use.
 *
 * IN/OUT cluster - sibling cluster record; fed.send is allocated here if it
 *	is not already set
 * IN locked - when false this function locks cluster->lock for the duration
 *	of the work and unlocks it before returning; when true the lock is
 *	not touched (presumably the caller already holds it)
 * RET SLURM_ERROR if cluster is fed_mgr_cluster_rec or has no control
 *	host/port, otherwise the return code of slurm_persist_conn_open()
 */
static int _open_controller_conn(slurmdb_cluster_rec_t *cluster, bool locked)
{
	/* Computed once from slurm_get_msg_timeout() and then reused. */
	static int timeout = -1;
	slurm_persist_conn_t *conn;
	int rc = SLURM_ERROR;

	if (timeout < 0)
		timeout = slurm_get_msg_timeout() * 1000;

	/* Refuse to open a sibling connection to our own cluster record. */
	if (cluster == fed_mgr_cluster_rec) {
		info("%s: hey! how did we get here with ourselves?", __func__);
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&cluster->lock);

	/* Without a controller host and port there is nothing to connect to. */
	if (!(cluster->control_host && cluster->control_host[0] &&
	      cluster->control_port)) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
			info("%s: Sibling cluster %s doesn't appear up yet, skipping",
			     __func__, cluster->name);
		goto unlock;
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
		info("opening sibling conn to %s", cluster->name);

	conn = cluster->fed.send;
	if (conn) {
		/*
		 * Existing record: a backup controller may have taken over,
		 * so discard the old host before refreshing it below.
		 */
		xfree(conn->rem_host);
	} else {
		conn = xmalloc(sizeof(slurm_persist_conn_t));
		cluster->fed.send = conn;

		/* This connection originates from us, so identify as us. */
		conn->cluster_name = xstrdup(slurmctld_cluster_name);
		conn->my_port = slurmctld_conf.slurmctld_port;
		conn->shutdown = &slurmctld_config.shutdown_time;
		/* Do not set this to 0, it could cause deadlock. */
		conn->timeout = timeout;
	}

	/* New or reused, always point at the current controller host/port. */
	conn->rem_host = xstrdup(cluster->control_host);
	conn->rem_port = cluster->control_port;

	rc = slurm_persist_conn_open(conn);
	if (rc == SLURM_SUCCESS) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
			info("opened sibling conn to %s:%d",
			     cluster->name, conn->fd);
	} else {
		error("fed_mgr: Unable to open connection to cluster %s using host %s(%u)",
		      cluster->name,
		      conn->rem_host, conn->rem_port);
	}

unlock:
	if (!locked)
		slurm_mutex_unlock(&cluster->lock);

	return rc;
}
Пример #3
0
/*
 * Open a connection to the Slurm DBD and store it in slurmdbd_conn.
 *
 * If slurmdbd_conn already has a valid fd nothing is opened; errno is
 * cleared and the function returns. Otherwise the connection record is
 * allocated on first use, its remote host is refreshed from the accounting
 * storage configuration, and an open is attempted. If that open fails, one
 * further attempt is made against the accounting storage backup host, when
 * one is returned.
 *
 * IN need_db - when false, an ESLURM_DB_CONNECTION result from the open is
 *	handled like success (errno cleared, connection not closed)
 */
static void _open_slurmdbd_conn(bool need_db)
{
	bool connected, db_unavailable;
	int rc;

	if (slurmdbd_conn && (slurmdbd_conn->fd >= 0)) {
		debug("Attempt to re-open slurmdbd socket");
		/* clear errno (callers check it afterwards for errors) */
		errno = 0;
		return;
	}

	slurm_persist_conn_close(slurmdbd_conn);

	if (!slurmdbd_conn) {
		/* First use: build the connection record. */
		slurmdbd_conn = xmalloc(sizeof(slurm_persist_conn_t));
		slurmdbd_conn->flags =
			PERSIST_FLAG_DBD | PERSIST_FLAG_RECONNECT;
		slurmdbd_conn->persist_type = PERSIST_TYPE_DBD;

		if (!slurmdbd_cluster)
			slurmdbd_cluster = slurm_get_cluster_name();
		slurmdbd_conn->cluster_name = xstrdup(slurmdbd_cluster);

		slurmdbd_conn->timeout = (slurm_get_msg_timeout() + 35) * 1000;

		slurmdbd_conn->rem_port = slurm_get_accounting_storage_port();
		if (!slurmdbd_conn->rem_port) {
			/* No port configured: use and record the default. */
			slurmdbd_conn->rem_port = SLURMDBD_PORT;
			slurm_set_accounting_storage_port(
				slurmdbd_conn->rem_port);
		}
	}

	slurmdbd_shutdown = 0;
	slurmdbd_conn->shutdown = &slurmdbd_shutdown;
	slurmdbd_conn->version  = SLURM_PROTOCOL_VERSION;

	/* Start every attempt from the configured primary host. */
	xfree(slurmdbd_conn->rem_host);
	slurmdbd_conn->rem_host = slurm_get_accounting_storage_host();
	if (!slurmdbd_conn->rem_host) {
		slurmdbd_conn->rem_host = xstrdup(DEFAULT_STORAGE_HOST);
		slurm_set_accounting_storage_host(
			slurmdbd_conn->rem_host);
	}

	rc = slurm_persist_conn_open(slurmdbd_conn);
	if (rc != SLURM_SUCCESS) {
		/*
		 * Primary failed: switch to the backup host and retry exactly
		 * once. With no backup host, rem_host is left NULL.
		 */
		xfree(slurmdbd_conn->rem_host);
		slurmdbd_conn->rem_host =
			slurm_get_accounting_storage_backup_host();
		if (slurmdbd_conn->rem_host)
			rc = slurm_persist_conn_open(slurmdbd_conn);
	}

	connected = (rc == SLURM_SUCCESS);
	db_unavailable = (rc == ESLURM_DB_CONNECTION);

	if (connected) {
		/* From here on use the timeout meant for all other messages. */
		slurmdbd_conn->timeout = SLURMDBD_TIMEOUT * 1000;
		if (slurmdbd_conn->trigger_callbacks.dbd_resumed)
			slurmdbd_conn->trigger_callbacks.dbd_resumed();
		if (slurmdbd_conn->trigger_callbacks.db_resumed)
			slurmdbd_conn->trigger_callbacks.db_resumed();
	}

	if (connected || (db_unavailable && !need_db)) {
		debug("slurmdbd: Sent PersistInit msg");
		/* clear errno (callers check it afterwards for errors) */
		errno = 0;
	} else {
		if (db_unavailable &&
		    slurmdbd_conn->trigger_callbacks.db_fail)
			slurmdbd_conn->trigger_callbacks.db_fail();

		error("slurmdbd: Sending PersistInit msg: %m");
		slurm_persist_conn_close(slurmdbd_conn);
	}
}
Пример #4
0
/*
 * Open a connection to the Slurm DBD and store it in slurmdbd_conn.
 *
 * If slurmdbd_conn already has a valid fd nothing is opened; errno is
 * cleared and the function returns. Otherwise the connection record is
 * allocated on first use, its remote host is refreshed from the accounting
 * storage configuration, and an open is attempted. When a backup host is
 * configured, PERSIST_FLAG_SUPPRESS_ERR is set for the first attempt and,
 * if that attempt fails, the backup host is tried once with the flag
 * cleared.
 *
 * IN need_db - when false, an ESLURM_DB_CONNECTION result from the open is
 *	handled like success (errno cleared, connection not closed)
 */
static void _open_slurmdbd_conn(bool need_db)
{
	char *backup_host;
	bool connected, db_unavailable;
	int rc;

	if (slurmdbd_conn && (slurmdbd_conn->fd >= 0)) {
		debug("Attempt to re-open slurmdbd socket");
		/* clear errno (callers check it afterwards for errors) */
		errno = 0;
		return;
	}

	slurm_persist_conn_close(slurmdbd_conn);

	if (!slurmdbd_conn) {
		/* First use: build the connection record. */
		slurmdbd_conn = xmalloc(sizeof(slurm_persist_conn_t));
		slurmdbd_conn->flags =
			PERSIST_FLAG_DBD | PERSIST_FLAG_RECONNECT;
		slurmdbd_conn->persist_type = PERSIST_TYPE_DBD;

		if (!slurmdbd_cluster)
			slurmdbd_cluster = slurm_get_cluster_name();
		slurmdbd_conn->cluster_name = xstrdup(slurmdbd_cluster);

		slurmdbd_conn->timeout = (slurm_get_msg_timeout() + 35) * 1000;

		slurmdbd_conn->rem_port = slurm_get_accounting_storage_port();
		if (!slurmdbd_conn->rem_port) {
			/* No port configured: use and record the default. */
			slurmdbd_conn->rem_port = SLURMDBD_PORT;
			slurm_set_accounting_storage_port(
				slurmdbd_conn->rem_port);
		}
	}

	slurmdbd_shutdown = 0;
	slurmdbd_conn->shutdown = &slurmdbd_shutdown;
	slurmdbd_conn->version  = SLURM_PROTOCOL_VERSION;

	/* Start every attempt from the configured primary host. */
	xfree(slurmdbd_conn->rem_host);
	slurmdbd_conn->rem_host = slurm_get_accounting_storage_host();
	if (!slurmdbd_conn->rem_host) {
		slurmdbd_conn->rem_host = xstrdup(DEFAULT_STORAGE_HOST);
		slurm_set_accounting_storage_host(
			slurmdbd_conn->rem_host);
	}

	/* See if a backup slurmdbd is configured. */
	backup_host = slurm_get_accounting_storage_backup_host();

	for (;;) {
		/*
		 * A connection failure is only an error if there is no backup
		 * left to try, so suppress errors while one is pending.
		 */
		if (backup_host)
			slurmdbd_conn->flags |= PERSIST_FLAG_SUPPRESS_ERR;
		else
			slurmdbd_conn->flags &= (~PERSIST_FLAG_SUPPRESS_ERR);

		rc = slurm_persist_conn_open(slurmdbd_conn);
		if ((rc == SLURM_SUCCESS) || !backup_host)
			break;

		/*
		 * Hand ownership of the backup host string to the connection
		 * and go around once more; backup_host is now NULL so the
		 * next pass is the last.
		 */
		xfree(slurmdbd_conn->rem_host);
		/* Force the next error to display. */
		slurmdbd_conn->comm_fail_time = 0;
		slurmdbd_conn->rem_host = backup_host;
		backup_host = NULL;
	}

	/* Only non-NULL here if the primary succeeded and it went unused. */
	xfree(backup_host);

	connected = (rc == SLURM_SUCCESS);
	db_unavailable = (rc == ESLURM_DB_CONNECTION);

	if (connected) {
		/* From here on use the timeout meant for all other messages. */
		slurmdbd_conn->timeout = SLURMDBD_TIMEOUT * 1000;
		if (slurmdbd_conn->trigger_callbacks.dbd_resumed)
			slurmdbd_conn->trigger_callbacks.dbd_resumed();
		if (slurmdbd_conn->trigger_callbacks.db_resumed)
			slurmdbd_conn->trigger_callbacks.db_resumed();
	}

	if (connected || (db_unavailable && !need_db)) {
		debug("slurmdbd: Sent PersistInit msg");
		/* clear errno (callers check it afterwards for errors) */
		errno = 0;
	} else {
		if (db_unavailable &&
		    slurmdbd_conn->trigger_callbacks.db_fail)
			slurmdbd_conn->trigger_callbacks.db_fail();

		error("slurmdbd: Sending PersistInit msg: %m");
		slurm_persist_conn_close(slurmdbd_conn);
	}
}