/*
 * slurm_persist_msg_pack - serialize a persistent-connection request.
 *
 * IN persist_conn - connection whose flags and version select the wire format
 * IN req_msg      - request to serialize
 *
 * When PERSIST_FLAG_DBD is set on the connection, the request is handed to
 * pack_slurmdbd_msg() unchanged (viewed as a slurmdbd_msg_t) and that
 * function's result is returned as-is.  Otherwise the request is wrapped in
 * a slurm_msg_t and written as a 16-bit message type followed by the output
 * of pack_msg().
 *
 * RET the packed buffer, or NULL if pack_msg() fails (the partially filled
 *     buffer is released first).
 */
extern Buf slurm_persist_msg_pack(slurm_persist_conn_t *persist_conn,
				  persist_msg_t *req_msg)
{
	slurm_msg_t wrapper;
	Buf packed;

	xassert(persist_conn);

	/* DBD connections use their own packer; nothing else to do. */
	if (persist_conn->flags & PERSIST_FLAG_DBD)
		return pack_slurmdbd_msg((slurmdbd_msg_t *)req_msg,
					 persist_conn->version);

	/* Generic path: mirror the request into a slurm_msg_t. */
	slurm_msg_t_init(&wrapper);
	wrapper.protocol_version = persist_conn->version;
	wrapper.msg_type = req_msg->msg_type;
	wrapper.data_size = req_msg->data_size;
	wrapper.data = req_msg->data;

	packed = init_buf(BUF_SIZE);
	pack16(req_msg->msg_type, packed);

	if (pack_msg(&wrapper, packed) == SLURM_SUCCESS)
		return packed;

	free_buf(packed);
	return NULL;
}
/*
 * _agent - loop that forwards the RPC buffers queued on agent_list to the
 * SlurmDBD connection until *slurmdbd_conn->shutdown becomes non-zero.
 *
 * IN x - never read by this function
 * RET always NULL
 *
 * Each pass:
 *   1. takes slurmdbd_lock and, if halt_agent is set, waits on slurmdbd_cond
 *      (send_recv_slurmdbd_msg() sets halt_agent and signals slurmdbd_cond);
 *   2. re-opens the connection if it is closed and at least 10 seconds have
 *      passed since fail_time;
 *   3. takes agent_lock, counts the queue and either sleeps up to 10 seconds
 *      on agent_cond or picks what to send (one buffer, or a packed
 *      DBD_SEND_MULT_MSG wrapping many);
 *   4. sends with agent_lock released, then re-takes agent_lock to clean up.
 *
 * Lock order when both are held: slurmdbd_lock first, then agent_lock.
 * On exit the queue is passed to _save_dbd_state() and then destroyed.
 */
static void *_agent(void *x)
{
	int cnt, rc;
	Buf buffer;
	struct timespec abs_time;
	/* Time of the most recent failure; 0 means "no recent failure". */
	static time_t fail_time = 0;
	int sigarray[] = {SIGUSR1, 0};
	/* Reusable envelope for batched sends; list_msg.my_list doubles as the
	 * "a multi-message was sent this pass" flag (non-NULL == batch). */
	slurmdbd_msg_t list_req;
	dbd_list_msg_t list_msg;

	list_req.msg_type = DBD_SEND_MULT_MSG;
	list_req.data = &list_msg;
	memset(&list_msg, 0, sizeof(dbd_list_msg_t));
	/* DEF_TIMERS; */

	/* Prepare to catch SIGUSR1 to interrupt pending
	 * I/O and terminate in a timely fashion. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	while (*slurmdbd_conn->shutdown == 0) {
		/* START_TIMER; */
		slurm_mutex_lock(&slurmdbd_lock);
		/* NOTE(review): guarded by "if", not "while", so a wakeup
		 * that is not tied to halt_agent being cleared is not
		 * re-checked here. */
		if (halt_agent)
			slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock);

		if ((slurmdbd_conn->fd < 0) &&
		    (difftime(time(NULL), fail_time) >= 10)) {
			/* The connection to Slurm DBD is not open */
			_open_slurmdbd_conn(1);
			if (slurmdbd_conn->fd < 0)
				fail_time = time(NULL);
		}

		slurm_mutex_lock(&agent_lock);
		/* NOTE(review): fd is tested for non-zero here rather than
		 * ">= 0"; the closed-connection case (fd < 0) is still caught
		 * by the condition just below. */
		if (agent_list && slurmdbd_conn->fd)
			cnt = list_count(agent_list);
		else
			cnt = 0;

		/* Nothing to send, no connection, or still inside the
		 * 10 second back-off window: drop slurmdbd_lock and sleep on
		 * agent_cond for at most 10 seconds. */
		if ((cnt == 0) || (slurmdbd_conn->fd < 0) ||
		    (fail_time && (difftime(time(NULL), fail_time) < 10))) {
			slurm_mutex_unlock(&slurmdbd_lock);
			abs_time.tv_sec = time(NULL) + 10;
			abs_time.tv_nsec = 0;
			slurm_cond_timedwait(&agent_cond, &agent_lock,
					     &abs_time);
			slurm_mutex_unlock(&agent_lock);
			continue;
		} else if ((cnt > 0) && ((cnt % 100) == 0))
			/* NOTE(review): cnt is an int but is printed
			 * with %u. */
			info("slurmdbd: agent queue size %u", cnt);
		/* Leave item on the queue until processing complete */
		if (agent_list) {
			int handle_agent_count = 1000;
			if (cnt > handle_agent_count) {
				/* Large backlog: reference entries from the
				 * front of the queue in a temporary list.
				 * The break fires after the enqueue, so up to
				 * handle_agent_count + 1 entries are taken.
				 * The temporary list is created with a NULL
				 * argument - presumably "no destructor", so
				 * the buffers stay owned by agent_list;
				 * confirm against list_create(). */
				int agent_count = 0;
				ListIterator agent_itr =
					list_iterator_create(agent_list);
				list_msg.my_list = list_create(NULL);
				while ((buffer = list_next(agent_itr))) {
					list_enqueue(list_msg.my_list, buffer);
					agent_count++;
					if (agent_count > handle_agent_count)
						break;
				}
				list_iterator_destroy(agent_itr);
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else if (cnt > 1) {
				/* Moderate backlog: wrap the whole queue
				 * directly, no temporary list needed. */
				list_msg.my_list = agent_list;
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else
				/* Single entry: send it as-is; it remains on
				 * the queue until the send is confirmed. */
				buffer = (Buf) list_peek(agent_list);
		} else
			buffer = NULL;
		slurm_mutex_unlock(&agent_lock);
		if (buffer == NULL) {
			/* Nothing usable to send this pass. */
			slurm_mutex_unlock(&slurmdbd_lock);

			slurm_mutex_lock(&assoc_cache_mutex);
			if (slurmdbd_conn->fd >= 0 && running_cache)
				slurm_cond_signal(&assoc_cache_cond);
			slurm_mutex_unlock(&assoc_cache_mutex);

			continue;
		}

		/* NOTE: agent_lock is clear here, so we can add more
		 * requests to the queue while waiting for this RPC to
		 * complete. */
		rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
		if (rc != SLURM_SUCCESS) {
			/* NOTE(review): when a batch was being sent, this
			 * break skips the cleanup below, so the packed
			 * buffer and any temporary list are not released on
			 * this path. */
			if (*slurmdbd_conn->shutdown) {
				slurm_mutex_unlock(&slurmdbd_lock);
				break;
			}
			error("slurmdbd: Failure sending message: %d: %m", rc);
		} else if (list_msg.my_list) {
			/* Batch was sent; the helper supplies the overall
			 * result.  Presumably it also removes acknowledged
			 * entries from agent_list - confirm in
			 * _handle_mult_rc_ret(). */
			rc = _handle_mult_rc_ret();
		} else {
			rc = _get_return_code();
			if (rc == EAGAIN) {
				if (*slurmdbd_conn->shutdown) {
					slurm_mutex_unlock(&slurmdbd_lock);
					break;
				}
				error("slurmdbd: Failure with "
				      "message need to resend: %d: %m", rc);
			}
		}
		slurm_mutex_unlock(&slurmdbd_lock);
		slurm_mutex_lock(&assoc_cache_mutex);
		if (slurmdbd_conn->fd >= 0 && running_cache)
			slurm_cond_signal(&assoc_cache_cond);
		slurm_mutex_unlock(&assoc_cache_mutex);

		slurm_mutex_lock(&agent_lock);
		if (agent_list && (rc == SLURM_SUCCESS)) {
			/*
			 * If we sent a mult_msg we just need to free buffer,
			 * we don't need to requeue, just mark list_msg.my_list
			 * as NULL as that is the sign we sent a mult_msg.
			 * The temporary list is destroyed only when it is not
			 * agent_list itself.  For a single message the entry
			 * is removed from the queue now and then freed.
			 */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
			} else
				buffer = (Buf) list_dequeue(agent_list);

			free_buf(buffer);
			fail_time = 0;
		} else {
			/* We need to free a mult_msg even on failure.
			 * A single message is left untouched on the queue,
			 * and fail_time starts the 10 second back-off. */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
				free_buf(buffer);
			}

			fail_time = time(NULL);
		}
		slurm_mutex_unlock(&agent_lock);
		/* END_TIMER; */
		/* info("at the end with %s", TIME_STR); */
	}

	/* Shutting down: hand whatever is still queued to
	 * _save_dbd_state(), then destroy the queue. */
	slurm_mutex_lock(&agent_lock);
	_save_dbd_state();
	FREE_NULL_LIST(agent_list);
	slurm_mutex_unlock(&agent_lock);
	return NULL;
}
/*
 * send_recv_slurmdbd_msg - send one RPC to the SlurmDBD and wait for an
 * arbitrary reply message.
 *
 * IN rpc_version - protocol version used to pack the request and unpack
 *                  the reply
 * IN req         - request to send
 * OUT resp       - filled in by unpack_slurmdbd_msg(); must be freed by the
 *                  caller
 *
 * The RPC will not be queued if an error occurs.
 *
 * RET SLURM_SUCCESS or an error code.  When the reply unpacks cleanly and is
 *     of type DBD_ID_RC, the return_code carried inside the reply is
 *     returned instead.
 */
extern int send_recv_slurmdbd_msg(uint16_t rpc_version,
				  slurmdbd_msg_t *req,
				  slurmdbd_msg_t *resp)
{
	Buf pkt;
	int status = SLURM_SUCCESS;

	xassert(req);
	xassert(resp);

	/*
	 * Raise halt_agent before contending for the lock so the agent
	 * yields instead of sending at an arbitrary moment; lower it again
	 * once the lock is ours.
	 */
	halt_agent = 1;
	slurm_mutex_lock(&slurmdbd_lock);
	halt_agent = 0;

	if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
		/*
		 * Either slurm_open_slurmdbd_conn() was never run or the
		 * connection to Slurm DBD has been closed.  DBD_GET_CONFIG
		 * opens with argument 0, every other request with 1.
		 */
		_open_slurmdbd_conn((req->msg_type == DBD_GET_CONFIG) ? 0 : 1);

		if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
			status = SLURM_ERROR;
			goto end_it;
		}
	}

	pkt = pack_slurmdbd_msg(req, rpc_version);
	if (!pkt) {
		status = SLURM_ERROR;
		goto end_it;
	}

	/* The request buffer is no longer needed once the send returns. */
	status = slurm_persist_send_msg(slurmdbd_conn, pkt);
	free_buf(pkt);
	if (status != SLURM_SUCCESS) {
		error("slurmdbd: Sending message type %s: %d: %m",
		      rpc_num2string(req->msg_type), status);
		goto end_it;
	}

	pkt = slurm_persist_recv_msg(slurmdbd_conn);
	if (!pkt) {
		error("slurmdbd: Getting response to message type %u",
		      req->msg_type);
		status = SLURM_ERROR;
		goto end_it;
	}

	status = unpack_slurmdbd_msg(resp, rpc_version, pkt);

	/* An id/rc reply carries the real outcome (e.g. of a job start). */
	if ((status == SLURM_SUCCESS) && (resp->msg_type == DBD_ID_RC))
		status = ((dbd_id_rc_msg_t *)resp->data)->return_code;

	free_buf(pkt);

end_it:
	/* Wake the agent, which may be parked on slurmdbd_cond. */
	slurm_cond_signal(&slurmdbd_cond);
	slurm_mutex_unlock(&slurmdbd_lock);

	return status;
}
static void _load_dbd_state(void) { char *dbd_fname; Buf buffer; int fd, recovered = 0; uint16_t rpc_version = 0; dbd_fname = slurm_get_state_save_location(); xstrcat(dbd_fname, "/dbd.messages"); fd = open(dbd_fname, O_RDONLY); if (fd < 0) { /* don't print an error message if there is no file */ if (errno == ENOENT) debug4("slurmdbd: There is no state save file to " "open by name %s", dbd_fname); else error("slurmdbd: Opening state save file %s: %m", dbd_fname); } else { char *ver_str = NULL; uint32_t ver_str_len; buffer = _load_dbd_rec(fd); if (buffer == NULL) goto end_it; /* This is set to the end of the buffer for send so we need to set it back to 0 */ set_buf_offset(buffer, 0); safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); debug3("Version string in dbd_state header is %s", ver_str); unpack_error: free_buf(buffer); buffer = NULL; if (ver_str) { /* get the version after VER */ rpc_version = slurm_atoul(ver_str + 3); xfree(ver_str); } while (1) { /* If the buffer was not the VER%d string it was an actual message so we don't want to skip it. */ if (!buffer) buffer = _load_dbd_rec(fd); if (buffer == NULL) break; if (rpc_version != SLURM_PROTOCOL_VERSION) { /* unpack and repack with new * PROTOCOL_VERSION just so we keep * things up to date. */ slurmdbd_msg_t msg; int rc; set_buf_offset(buffer, 0); rc = unpack_slurmdbd_msg( &msg, rpc_version, buffer); free_buf(buffer); if (rc == SLURM_SUCCESS) buffer = pack_slurmdbd_msg( &msg, SLURM_PROTOCOL_VERSION); else buffer = NULL; } if (!buffer) { error("no buffer given"); continue; } if (!list_enqueue(agent_list, buffer)) fatal("slurmdbd: list_enqueue, no memory"); recovered++; buffer = NULL; } end_it: verbose("slurmdbd: recovered %d pending RPCs", recovered); (void) close(fd); } xfree(dbd_fname); }