/*
 * spawn_resp_send_to_fd - serialize a spawn response and write it to fd.
 *
 * resp: response handed to spawn_resp_pack()
 * fd:   descriptor handed to _slurm_msg_sendto()
 *
 * Returns the value of _slurm_msg_sendto() unchanged.
 */
extern int spawn_resp_send_to_fd(spawn_resp_t *resp, int fd)
{
	Buf packed = init_buf(1024);
	int send_rc;

	/*
	 * sync with spawn_req_send_to_srun
	 *
	 * Note: no TREE_CMD_SPAWN_RESP command word is packed in front of the
	 * response; that step was left disabled in this function.
	 */
	spawn_resp_pack(resp, packed);

	send_rc = _slurm_msg_sendto(fd,
				    get_buf_data(packed),
				    get_buf_offset(packed),
				    SLURM_PROTOCOL_NO_SEND_RECV_FLAGS);

	/* the temporary pack buffer is released whether or not the send worked */
	free_buf(packed);

	return send_rc;
}
void *_forward_thread(void *arg) { forward_msg_t *fwd_msg = (forward_msg_t *)arg; Buf buffer = init_buf(fwd_msg->buf_len); List ret_list = NULL; slurm_fd_t fd = -1; ret_data_info_t *ret_data_info = NULL; char *name = NULL; hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist); slurm_addr_t addr; char *buf = NULL; int steps = 0; int start_timeout = fwd_msg->timeout; /* repeat until we are sure the message was sent */ while ((name = hostlist_shift(hl))) { if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) { error("forward_thread: can't find address for host " "%s, check slurm.conf", name); slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, SLURM_UNKNOWN_FORWARD_ADDR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(fwd_msg->forward_mutex); continue; } goto cleanup; } if ((fd = slurm_open_msg_conn(&addr)) < 0) { error("forward_thread to %s: %m", name); slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward( &fwd_msg->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); free(name); if (hostlist_count(hl) > 0) { slurm_mutex_unlock(fwd_msg->forward_mutex); continue; } goto cleanup; } buf = hostlist_ranged_string_xmalloc(hl); xfree(fwd_msg->header.forward.nodelist); fwd_msg->header.forward.nodelist = buf; fwd_msg->header.forward.cnt = hostlist_count(hl); /* info("sending %d forwards (%s) to %s", */ /* fwd_msg->header.forward.cnt, */ /* fwd_msg->header.forward.nodelist, name); */ if (fwd_msg->header.forward.nodelist[0]) { debug3("forward: send to %s along with %s", name, fwd_msg->header.forward.nodelist); } else debug3("forward: send to %s ", name); pack_header(&fwd_msg->header, buffer); /* add forward data to buffer */ if (remaining_buf(buffer) < fwd_msg->buf_len) { buffer->size += (fwd_msg->buf_len + BUF_SIZE); xrealloc(buffer->head, buffer->size); } if (fwd_msg->buf_len) { memcpy(&buffer->head[buffer->processed], fwd_msg->buf, fwd_msg->buf_len); buffer->processed += fwd_msg->buf_len; } /* * 
forward message */ if (_slurm_msg_sendto(fd, get_buf_data(buffer), get_buf_offset(buffer), SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) { error("forward_thread: slurm_msg_sendto: %m"); slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, errno); free(name); if (hostlist_count(hl) > 0) { free_buf(buffer); buffer = init_buf(fwd_msg->buf_len); slurm_mutex_unlock(fwd_msg->forward_mutex); slurm_close_accepted_conn(fd); fd = -1; continue; } goto cleanup; } if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) || (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) || (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) { slurm_mutex_lock(fwd_msg->forward_mutex); ret_data_info = xmalloc(sizeof(ret_data_info_t)); list_push(fwd_msg->ret_list, ret_data_info); ret_data_info->node_name = xstrdup(name); free(name); while ((name = hostlist_shift(hl))) { ret_data_info = xmalloc(sizeof(ret_data_info_t)); list_push(fwd_msg->ret_list, ret_data_info); ret_data_info->node_name = xstrdup(name); free(name); } goto cleanup; } if (fwd_msg->header.forward.cnt > 0) { static int message_timeout = -1; if (message_timeout < 0) message_timeout = slurm_get_msg_timeout() * 1000; steps = (fwd_msg->header.forward.cnt+1) / slurm_get_tree_width(); fwd_msg->timeout = (message_timeout*steps); /* info("got %d * %d = %d", message_timeout, steps, fwd_msg->timeout); */ steps++; fwd_msg->timeout += (start_timeout*steps); /* info("now + %d*%d = %d", start_timeout, steps, fwd_msg->timeout); */ } ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout); /* info("sent %d forwards got %d back", */ /* fwd_msg->header.forward.cnt, list_count(ret_list)); */ if (!ret_list || (fwd_msg->header.forward.cnt != 0 && list_count(ret_list) <= 1)) { slurm_mutex_lock(fwd_msg->forward_mutex); mark_as_failed_forward(&fwd_msg->ret_list, name, errno); free(name); if (ret_list) list_destroy(ret_list); if (hostlist_count(hl) > 0) { free_buf(buffer); buffer = init_buf(fwd_msg->buf_len); 
slurm_mutex_unlock(fwd_msg->forward_mutex); slurm_close_accepted_conn(fd); fd = -1; continue; } goto cleanup; } else if ((fwd_msg->header.forward.cnt+1) != list_count(ret_list)) { /* this should never be called since the above should catch the failed forwards and pipe them back down, but this is here so we never have to worry about a locked mutex */ ListIterator itr = NULL; char *tmp = NULL; int first_node_found = 0; hostlist_iterator_t host_itr = hostlist_iterator_create(hl); error("We shouldn't be here. We forwarded to %d " "but only got %d back", (fwd_msg->header.forward.cnt+1), list_count(ret_list)); while ((tmp = hostlist_next(host_itr))) { int node_found = 0; itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr))) { if (!ret_data_info->node_name) { first_node_found = 1; ret_data_info->node_name = xstrdup(name); } if (!strcmp(tmp, ret_data_info->node_name)) { node_found = 1; break; } } list_iterator_destroy(itr); if (!node_found) { mark_as_failed_forward( &fwd_msg->ret_list, tmp, SLURM_COMMUNICATIONS_CONNECTION_ERROR); } free(tmp); } hostlist_iterator_destroy(host_itr); if (!first_node_found) { mark_as_failed_forward(&fwd_msg->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); } } break; } slurm_mutex_lock(fwd_msg->forward_mutex); if (ret_list) { while ((ret_data_info = list_pop(ret_list)) != NULL) { if (!ret_data_info->node_name) { ret_data_info->node_name = xstrdup(name); } list_push(fwd_msg->ret_list, ret_data_info); debug3("got response from %s", ret_data_info->node_name); } list_destroy(ret_list); } free(name); cleanup: if ((fd >= 0) && slurm_close_accepted_conn(fd) < 0) error ("close(%d): %m", fd); hostlist_destroy(hl); destroy_forward(&fwd_msg->header.forward); free_buf(buffer); pthread_cond_signal(fwd_msg->notify); slurm_mutex_unlock(fwd_msg->forward_mutex); return (NULL); }