/*
 * Cycle a persistent connection: close it, then open it again.
 *
 * IN persist_conn - connection handed to slurm_persist_conn_close() and
 *	then to one of the two open routines.
 * IN with_init - true: reopen via slurm_persist_conn_open();
 *	false: reopen via slurm_persist_conn_open_without_init().
 * RET the value returned by whichever open routine was called.
 */
extern int slurm_persist_conn_reopen(slurm_persist_conn_t *persist_conn,
				     bool with_init)
{
	slurm_persist_conn_close(persist_conn);

	if (!with_init)
		return slurm_persist_conn_open_without_init(persist_conn);

	return slurm_persist_conn_open(persist_conn);
}
/*
 * Open (or re-open) the outgoing persistent connection to a sibling
 * cluster's controller, stored in cluster->fed.send.
 *
 * IN cluster - sibling cluster record; must not be fed_mgr_cluster_rec.
 * IN locked - false: cluster->lock is taken here and released before
 *	returning; true: the lock is not touched (presumably the caller
 *	already holds it - confirm against callers).
 * RET SLURM_ERROR if cluster is our own record or has no control
 *	host/port set, otherwise the result of slurm_persist_conn_open().
 */
static int _open_controller_conn(slurmdb_cluster_rec_t *cluster, bool locked)
{
	/* Message timeout in milliseconds, computed on first use. */
	static int timeout = -1;
	slurm_persist_conn_t *conn;
	int rc = SLURM_ERROR;

	if (timeout < 0)
		timeout = slurm_get_msg_timeout() * 1000;

	/* Never open a sibling connection to ourselves. */
	if (cluster == fed_mgr_cluster_rec) {
		info("%s: hey! how did we get here with ourselves?", __func__);
		return rc;
	}

	if (!locked)
		slurm_mutex_lock(&cluster->lock);

	/* Without a host and a port there is nothing to connect to. */
	if (!cluster->control_host ||
	    (cluster->control_host[0] == '\0') ||
	    (cluster->control_port == 0)) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
			info("%s: Sibling cluster %s doesn't appear up yet, skipping",
			     __func__, cluster->name);
		goto done;
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
		info("opening sibling conn to %s", cluster->name);

	conn = cluster->fed.send;
	if (conn) {
		/*
		 * Perhaps a backup came up, so don't assume it was the same
		 * host or port we had before: drop the old host and refresh
		 * both below.
		 */
		xfree(conn->rem_host);
	} else {
		conn = xmalloc(sizeof(*conn));
		cluster->fed.send = conn;

		/* Since this connection is coming from us, make it so ;) */
		conn->cluster_name = xstrdup(slurmctld_cluster_name);
		conn->my_port = slurmctld_conf.slurmctld_port;
		conn->shutdown = &slurmctld_config.shutdown_time;
		/* don't put this as 0 it could cause deadlock */
		conn->timeout = timeout;
	}
	conn->rem_host = xstrdup(cluster->control_host);
	conn->rem_port = cluster->control_port;

	rc = slurm_persist_conn_open(conn);
	if (rc == SLURM_SUCCESS) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
			info("opened sibling conn to %s:%d",
			     cluster->name, conn->fd);
	} else {
		error("fed_mgr: Unable to open connection to cluster %s using host %s(%u)",
		      cluster->name, conn->rem_host, conn->rem_port);
	}

done:
	if (!locked)
		slurm_mutex_unlock(&cluster->lock);

	return rc;
}
/* Open a connection to the Slurm DBD and set slurmdbd_conn */ static void _open_slurmdbd_conn(bool need_db) { bool try_backup = true; int rc; if (slurmdbd_conn && slurmdbd_conn->fd >= 0) { debug("Attempt to re-open slurmdbd socket"); /* clear errno (checked after this for errors) */ errno = 0; return; } slurm_persist_conn_close(slurmdbd_conn); if (!slurmdbd_conn) { slurmdbd_conn = xmalloc(sizeof(slurm_persist_conn_t)); slurmdbd_conn->flags = PERSIST_FLAG_DBD | PERSIST_FLAG_RECONNECT; slurmdbd_conn->persist_type = PERSIST_TYPE_DBD; if (!slurmdbd_cluster) slurmdbd_cluster = slurm_get_cluster_name(); slurmdbd_conn->cluster_name = xstrdup(slurmdbd_cluster); slurmdbd_conn->timeout = (slurm_get_msg_timeout() + 35) * 1000; slurmdbd_conn->rem_port = slurm_get_accounting_storage_port(); if (!slurmdbd_conn->rem_port) { slurmdbd_conn->rem_port = SLURMDBD_PORT; slurm_set_accounting_storage_port( slurmdbd_conn->rem_port); } } slurmdbd_shutdown = 0; slurmdbd_conn->shutdown = &slurmdbd_shutdown; slurmdbd_conn->version = SLURM_PROTOCOL_VERSION; xfree(slurmdbd_conn->rem_host); slurmdbd_conn->rem_host = slurm_get_accounting_storage_host(); if (!slurmdbd_conn->rem_host) { slurmdbd_conn->rem_host = xstrdup(DEFAULT_STORAGE_HOST); slurm_set_accounting_storage_host( slurmdbd_conn->rem_host); } again: if (((rc = slurm_persist_conn_open(slurmdbd_conn)) != SLURM_SUCCESS) && try_backup) { xfree(slurmdbd_conn->rem_host); try_backup = false; if ((slurmdbd_conn->rem_host = slurm_get_accounting_storage_backup_host())) goto again; } if (rc == SLURM_SUCCESS) { /* set the timeout to the timeout to be used for all other * messages */ slurmdbd_conn->timeout = SLURMDBD_TIMEOUT * 1000; if (slurmdbd_conn->trigger_callbacks.dbd_resumed) (slurmdbd_conn->trigger_callbacks.dbd_resumed)(); if (slurmdbd_conn->trigger_callbacks.db_resumed) (slurmdbd_conn->trigger_callbacks.db_resumed)(); } if ((!need_db && (rc == ESLURM_DB_CONNECTION)) || (rc == SLURM_SUCCESS)) { debug("slurmdbd: Sent PersistInit msg"); /* 
clear errno (checked after this for errors) */ errno = 0; } else { if ((rc == ESLURM_DB_CONNECTION) && slurmdbd_conn->trigger_callbacks.db_fail) (slurmdbd_conn->trigger_callbacks.db_fail)(); error("slurmdbd: Sending PersistInit msg: %m"); slurm_persist_conn_close(slurmdbd_conn); } }
/* Open a connection to the Slurm DBD and set slurmdbd_conn */ static void _open_slurmdbd_conn(bool need_db) { char *backup_host = NULL; int rc; if (slurmdbd_conn && slurmdbd_conn->fd >= 0) { debug("Attempt to re-open slurmdbd socket"); /* clear errno (checked after this for errors) */ errno = 0; return; } slurm_persist_conn_close(slurmdbd_conn); if (!slurmdbd_conn) { slurmdbd_conn = xmalloc(sizeof(slurm_persist_conn_t)); slurmdbd_conn->flags = PERSIST_FLAG_DBD | PERSIST_FLAG_RECONNECT; slurmdbd_conn->persist_type = PERSIST_TYPE_DBD; if (!slurmdbd_cluster) slurmdbd_cluster = slurm_get_cluster_name(); slurmdbd_conn->cluster_name = xstrdup(slurmdbd_cluster); slurmdbd_conn->timeout = (slurm_get_msg_timeout() + 35) * 1000; slurmdbd_conn->rem_port = slurm_get_accounting_storage_port(); if (!slurmdbd_conn->rem_port) { slurmdbd_conn->rem_port = SLURMDBD_PORT; slurm_set_accounting_storage_port( slurmdbd_conn->rem_port); } } slurmdbd_shutdown = 0; slurmdbd_conn->shutdown = &slurmdbd_shutdown; slurmdbd_conn->version = SLURM_PROTOCOL_VERSION; xfree(slurmdbd_conn->rem_host); slurmdbd_conn->rem_host = slurm_get_accounting_storage_host(); if (!slurmdbd_conn->rem_host) { slurmdbd_conn->rem_host = xstrdup(DEFAULT_STORAGE_HOST); slurm_set_accounting_storage_host( slurmdbd_conn->rem_host); } // See if a backup slurmdbd is configured backup_host = slurm_get_accounting_storage_backup_host(); again: // A connection failure is only an error if backup dne or also fails if (backup_host) slurmdbd_conn->flags |= PERSIST_FLAG_SUPPRESS_ERR; else slurmdbd_conn->flags &= (~PERSIST_FLAG_SUPPRESS_ERR); if (((rc = slurm_persist_conn_open(slurmdbd_conn)) != SLURM_SUCCESS) && backup_host) { xfree(slurmdbd_conn->rem_host); // Force the next error to display slurmdbd_conn->comm_fail_time = 0; slurmdbd_conn->rem_host = backup_host; backup_host = NULL; goto again; } xfree(backup_host); if (rc == SLURM_SUCCESS) { /* set the timeout to the timeout to be used for all other * messages */ 
slurmdbd_conn->timeout = SLURMDBD_TIMEOUT * 1000; if (slurmdbd_conn->trigger_callbacks.dbd_resumed) (slurmdbd_conn->trigger_callbacks.dbd_resumed)(); if (slurmdbd_conn->trigger_callbacks.db_resumed) (slurmdbd_conn->trigger_callbacks.db_resumed)(); } if ((!need_db && (rc == ESLURM_DB_CONNECTION)) || (rc == SLURM_SUCCESS)) { debug("slurmdbd: Sent PersistInit msg"); /* clear errno (checked after this for errors) */ errno = 0; } else { if ((rc == ESLURM_DB_CONNECTION) && slurmdbd_conn->trigger_callbacks.db_fail) (slurmdbd_conn->trigger_callbacks.db_fail)(); error("slurmdbd: Sending PersistInit msg: %m"); slurm_persist_conn_close(slurmdbd_conn); } }