/*
 * slurm_persist_send_msg - write one length-prefixed message to a
 *	persistent connection.
 *
 * The payload is the first get_buf_offset(buffer) bytes of the buffer's
 * data. It is preceded on the wire by that length as a 32-bit value in
 * network byte order.
 *
 * IN persist_conn - connection to write to (must not be NULL)
 * IN buffer - message to send
 * RET SLURM_SUCCESS once the header and the whole payload were written.
 *	EAGAIN if the connection has no valid fd, the writeable check
 *	returns less than 1, a write fails or the header write comes up
 *	short, or the reopen limit has been exceeded.
 *	ESLURM_ACCESS_DENIED if errno holds that value at the point a
 *	reopen would otherwise be attempted.
 *	SLURM_ERROR if buffer is NULL, or a reopen is needed but
 *	PERSIST_FLAG_RECONNECT is not set on the connection.
 */
extern int slurm_persist_send_msg(
	slurm_persist_conn_t *persist_conn, Buf buffer)
{
	uint32_t remaining, wire_len;
	char *cursor;
	ssize_t sent;
	int status, reopen_attempts = 0;

	xassert(persist_conn);

	if (persist_conn->fd < 0)
		return EAGAIN;

	if (!buffer)
		return SLURM_ERROR;

	status = slurm_persist_conn_writeable(persist_conn);

	/*
	 * Each pass through this loop is one complete send attempt. A pass
	 * is only repeated when the writeable check reports -1 while the
	 * payload is being sent; the connection is then reopened and the
	 * message is sent again from the start (header included).
	 */
	for (;;) {
		if (status == -1) {
			if (reopen_attempts++ > 3)
				return EAGAIN;
			/*
			 * If errno is ACCESS_DENIED do not try to reopen
			 * the connection, just return that.
			 */
			if (errno == ESLURM_ACCESS_DENIED)
				return ESLURM_ACCESS_DENIED;
			if (!(persist_conn->flags & PERSIST_FLAG_RECONNECT))
				return SLURM_ERROR;

			slurm_persist_conn_reopen(persist_conn, true);
			status = slurm_persist_conn_writeable(persist_conn);
		}
		if (status < 1)
			return EAGAIN;

		/* Length header, in network byte order */
		remaining = get_buf_offset(buffer);
		wire_len = htonl(remaining);
		sent = write(persist_conn->fd, &wire_len, sizeof(wire_len));
		if (sent != sizeof(wire_len))
			return EAGAIN;

		/* Payload; write() may accept only part of it per call */
		cursor = get_buf_data(buffer);
		while (remaining > 0) {
			status = slurm_persist_conn_writeable(persist_conn);
			if (status == -1)
				break;	/* reopen and start over */
			if (status < 1)
				return EAGAIN;
			sent = write(persist_conn->fd, cursor, remaining);
			if (sent <= 0)
				return EAGAIN;
			cursor += sent;
			remaining -= sent;
		}

		if (remaining == 0)
			return SLURM_SUCCESS;
		/* Here status == -1, so the next pass goes through reopen */
	}
}
/* run_dbd_backup - this is the backup controller, it should run in standby * mode, assuming control when the primary controller stops responding */ extern void run_dbd_backup(void) { slurm_persist_conn_t slurmdbd_conn; primary_resumed = false; memset(&slurmdbd_conn, 0, sizeof(slurm_persist_conn_t)); slurmdbd_conn.rem_host = slurmdbd_conf->dbd_addr; slurmdbd_conn.rem_port = slurmdbd_conf->dbd_port; slurmdbd_conn.cluster_name = "backup_slurmdbd"; slurmdbd_conn.fd = -1; slurmdbd_conn.shutdown = &shutdown_time; slurm_persist_conn_open_without_init(&slurmdbd_conn); /* repeatedly ping Primary */ while (!shutdown_time) { int writeable = slurm_persist_conn_writeable(&slurmdbd_conn); //info("%d %d", have_control, writeable); if (have_control && writeable == 1) { info("Primary has come back"); primary_resumed = true; shutdown_threads(); have_control = false; break; } else if (!have_control && writeable <= 0) { have_control = true; info("Taking Control"); break; } sleep(1); if (writeable <= 0) slurm_persist_conn_reopen(&slurmdbd_conn, false); } slurm_persist_conn_close(&slurmdbd_conn); return; }
/*
 * slurm_persist_recv_msg - read one length-prefixed message from a
 *	persistent connection.
 *
 * The message is expected as a 32-bit length in network byte order
 * followed by that many bytes of payload. Lengths below 2 are rejected.
 *
 * IN persist_conn - connection to read from (must not be NULL)
 * RET buffer built by create_buf() around the xmalloc'ed payload, or NULL
 *	if the connection has no valid fd, is not readable, the header
 *	could not be read in full, the length is invalid, or the payload
 *	was only partially read.
 *	NOTE(review): presumably the caller owns and frees the returned
 *	buffer; confirm against callers.
 *
 * On every failure except an invalid fd, the connection is reopened when
 * it is not shutting down and PERSIST_FLAG_RECONNECT is set.
 */
extern Buf slurm_persist_recv_msg(slurm_persist_conn_t *persist_conn)
{
	uint32_t msg_size, nw_size;
	char *msg;
	ssize_t msg_read, offset;
	Buf buffer;

	xassert(persist_conn);

	if (persist_conn->fd < 0)
		return NULL;

	if (!_conn_readable(persist_conn))
		goto endit;

	/* Length header; a short read is treated as a failure */
	msg_read = read(persist_conn->fd, &nw_size, sizeof(nw_size));
	if (msg_read != sizeof(nw_size))
		goto endit;
	msg_size = ntohl(nw_size);

	/*
	 * We don't error check for an upper limit here
	 * since size could possibly be massive
	 */
	if (msg_size < 2) {
		error("Persistent Conn: Invalid msg_size (%u)", msg_size);
		goto endit;
	}

	/* Payload; read() may return it in several pieces */
	msg = xmalloc(msg_size);
	offset = 0;
	while (msg_size > offset) {
		if (!_conn_readable(persist_conn))
			break;		/* problem with this socket */
		msg_read = read(persist_conn->fd, (msg + offset),
				(msg_size - offset));
		if (msg_read <= 0) {
			error("Persistent Conn: read: %m");
			break;
		}
		offset += msg_read;
	}

	if (msg_size != offset) {
		if (!(*persist_conn->shutdown)) {
			/* msg_size is a uint32_t, so print it with %u */
			error("Persistent Conn: only read %zd of %u bytes",
			      offset, msg_size);
		}	/* else in shutdown mode */
		xfree(msg);
		goto endit;
	}

	buffer = create_buf(msg, msg_size);
	return buffer;

endit:
	/*
	 * Close it since we abandoned it. If the connection does still exist
	 * on the other end we can't rely on it after this point since we
	 * didn't listen long enough for this response.
	 */
	if (!(*persist_conn->shutdown) &&
	    (persist_conn->flags & PERSIST_FLAG_RECONNECT))
		slurm_persist_conn_reopen(persist_conn, true);

	return NULL;
}