/*
 * Close a persistent connection and immediately open it again.
 * IN/OUT persist_conn - connection to cycle; it is closed first, then
 *	reopened in place.
 * IN with_init - true: reopen with slurm_persist_conn_open();
 *	false: reopen with slurm_persist_conn_open_without_init().
 * Returns the return code of whichever open routine was used.
 */
extern int slurm_persist_conn_reopen(slurm_persist_conn_t *persist_conn,
				     bool with_init)
{
	slurm_persist_conn_close(persist_conn);

	if (!with_init)
		return slurm_persist_conn_open_without_init(persist_conn);

	return slurm_persist_conn_open(persist_conn);
}
/* run_dbd_backup - this is the backup controller, it should run in standby * mode, assuming control when the primary controller stops responding */ extern void run_dbd_backup(void) { slurm_persist_conn_t slurmdbd_conn; primary_resumed = false; memset(&slurmdbd_conn, 0, sizeof(slurm_persist_conn_t)); slurmdbd_conn.rem_host = slurmdbd_conf->dbd_addr; slurmdbd_conn.rem_port = slurmdbd_conf->dbd_port; slurmdbd_conn.cluster_name = "backup_slurmdbd"; slurmdbd_conn.fd = -1; slurmdbd_conn.shutdown = &shutdown_time; slurm_persist_conn_open_without_init(&slurmdbd_conn); /* repeatedly ping Primary */ while (!shutdown_time) { int writeable = slurm_persist_conn_writeable(&slurmdbd_conn); //info("%d %d", have_control, writeable); if (have_control && writeable == 1) { info("Primary has come back"); primary_resumed = true; shutdown_threads(); have_control = false; break; } else if (!have_control && writeable <= 0) { have_control = true; info("Taking Control"); break; } sleep(1); if (writeable <= 0) slurm_persist_conn_reopen(&slurmdbd_conn, false); } slurm_persist_conn_close(&slurmdbd_conn); return; }
/*
 * Open a persistent socket connection and perform the init handshake.
 * IN/OUT persist_conn - persistent connection needing rem_host and rem_port
 *	filled in. On a successful handshake its version is replaced by the
 *	value returned by the peer and the peer's flags are OR'ed into its
 *	flags.
 * Returns SLURM_SUCCESS on success. On failure returns SLURM_ERROR, the
 *	unpack error code, or the rc carried in the peer's reply; whenever a
 *	failure happens after the socket was opened, the fd is closed.
 */
extern int slurm_persist_conn_open(slurm_persist_conn_t *persist_conn)
{
	int rc = SLURM_ERROR;
	slurm_msg_t req_msg;
	persist_init_req_msg_t req;
	persist_msg_t msg;
	persist_rc_msg_t *resp = NULL;
	slurm_persist_conn_t unpack_conn;
	Buf buffer;

	if (slurm_persist_conn_open_without_init(persist_conn) !=
	    SLURM_SUCCESS)
		return SLURM_ERROR;

	/* Payload of the init request. */
	memset(&req, 0, sizeof(req));
	req.version = SLURM_PROTOCOL_VERSION;
	req.port = persist_conn->my_port;
	req.persist_type = persist_conn->persist_type;
	req.cluster_name = persist_conn->cluster_name;

	/*
	 * Envelope. Always send the lowest protocol since we don't know what
	 * version the other side is running yet; the value used is whatever
	 * the caller stored in persist_conn->version.
	 */
	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_PERSIST_INIT;
	req_msg.protocol_version = persist_conn->version;
	req_msg.data = &req;
	req_msg.flags |= SLURM_GLOBAL_AUTH_KEY;
	if (persist_conn->flags & PERSIST_FLAG_DBD)
		req_msg.flags |= SLURMDBD_CONNECTION;

	if (slurm_send_node_msg(persist_conn->fd, &req_msg) < 0) {
		error("%s: failed to send persistent connection init message to %s:%d",
		      __func__, persist_conn->rem_host,
		      persist_conn->rem_port);
		goto close_conn;
	}

	buffer = slurm_persist_recv_msg(persist_conn);
	if (!buffer) {
		if (_comm_fail_log(persist_conn))
			error("%s: No response to persist_init", __func__);
		goto close_conn;
	}

	/*
	 * The first unpack is done the same way for dbd or normal
	 * communication, so unpack through a copy of the connection with the
	 * DBD flag cleared rather than touching the caller's flags.
	 */
	memset(&msg, 0, sizeof(msg));
	memcpy(&unpack_conn, persist_conn, sizeof(unpack_conn));
	unpack_conn.flags &= (~PERSIST_FLAG_DBD);
	rc = slurm_persist_msg_unpack(&unpack_conn, &msg, buffer);
	free_buf(buffer);

	resp = (persist_rc_msg_t *) msg.data;
	if ((rc == SLURM_SUCCESS) && resp) {
		/* Adopt what the peer negotiated. */
		persist_conn->flags |= resp->flags;
		persist_conn->version = resp->ret_info;
		rc = resp->rc;
	}

	if (rc == SLURM_SUCCESS)
		goto cleanup;

	if (resp) {
		error("%s: Something happened with the receiving/processing of the persistent connection init message to %s:%d: %s",
		      __func__, persist_conn->rem_host,
		      persist_conn->rem_port, resp->comment);
	} else {
		error("%s: Failed to unpack persistent connection init resp message from %s:%d",
		      __func__, persist_conn->rem_host,
		      persist_conn->rem_port);
	}

close_conn:
	_close_fd(&persist_conn->fd);
cleanup:
	slurm_persist_free_rc_msg(resp);
	return rc;
}