void *_fwd_tree_thread(void *arg) { fwd_tree_t *fwd_tree = (fwd_tree_t *)arg; List ret_list = NULL; char *name = NULL; char *buf = NULL; slurm_msg_t send_msg; slurm_msg_t_init(&send_msg); send_msg.msg_type = fwd_tree->orig_msg->msg_type; send_msg.data = fwd_tree->orig_msg->data; send_msg.protocol_version = fwd_tree->orig_msg->protocol_version; /* repeat until we are sure the message was sent */ while ((name = hostlist_shift(fwd_tree->tree_hl))) { if (slurm_conf_get_addr(name, &send_msg.address) == SLURM_ERROR) { error("fwd_tree_thread: can't find address for host " "%s, check slurm.conf", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward(&fwd_tree->ret_list, name, SLURM_UNKNOWN_FORWARD_ADDR); slurm_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); continue; } send_msg.forward.timeout = fwd_tree->timeout; if ((send_msg.forward.cnt = hostlist_count(fwd_tree->tree_hl))){ buf = hostlist_ranged_string_xmalloc( fwd_tree->tree_hl); send_msg.forward.nodelist = buf; } else send_msg.forward.nodelist = NULL; if (send_msg.forward.nodelist && send_msg.forward.nodelist[0]) { debug3("Tree sending to %s along with %s", name, send_msg.forward.nodelist); } else debug3("Tree sending to %s", name); ret_list = slurm_send_addr_recv_msgs(&send_msg, name, fwd_tree->timeout); xfree(send_msg.forward.nodelist); if (ret_list) { int ret_cnt = list_count(ret_list); /* This is most common if a slurmd is running an older version of Slurm than the originator of the message. */ if ((ret_cnt <= send_msg.forward.cnt) && (errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR)) { error("fwd_tree_thread: %s failed to forward " "the message, expecting %d ret got only " "%d", name, send_msg.forward.cnt + 1, ret_cnt); if (ret_cnt > 1) { /* not likely */ ret_data_info_t *ret_data_info = NULL; ListIterator itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr))) { if (xstrcmp(ret_data_info-> node_name, name)) hostlist_delete_host( fwd_tree-> tree_hl, ret_data_info-> node_name); } list_iterator_destroy(itr); } } slurm_mutex_lock(fwd_tree->tree_mutex); list_transfer(fwd_tree->ret_list, ret_list); slurm_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); FREE_NULL_LIST(ret_list); /* try next node */ if (ret_cnt <= send_msg.forward.cnt) { free(name); /* Abandon tree. This way if all the * nodes in the branch are down we * don't have to time out for each * node serially. */ _start_msg_tree_internal( fwd_tree->tree_hl, NULL, fwd_tree, hostlist_count(fwd_tree->tree_hl)); continue; } } else { /* This should never happen (when this was * written slurm_send_addr_recv_msgs always * returned a list */ error("fwd_tree_thread: no return list given from " "slurm_send_addr_recv_msgs spawned for %s", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward( &fwd_tree->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); slurm_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); continue; } free(name); /* check for error and try again */ if (errno == SLURM_COMMUNICATIONS_CONNECTION_ERROR) continue; break; } _destroy_tree_fwd(fwd_tree); return NULL; }
void *_fwd_tree_thread(void *arg) { fwd_tree_t *fwd_tree = (fwd_tree_t *)arg; List ret_list = NULL; char *name = NULL; char *buf = NULL; slurm_msg_t send_msg; slurm_msg_t_init(&send_msg); send_msg.msg_type = fwd_tree->orig_msg->msg_type; send_msg.data = fwd_tree->orig_msg->data; /* repeat until we are sure the message was sent */ while ((name = hostlist_shift(fwd_tree->tree_hl))) { if (slurm_conf_get_addr(name, &send_msg.address) == SLURM_ERROR) { error("fwd_tree_thread: can't find address for host " "%s, check slurm.conf", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward(&fwd_tree->ret_list, name, SLURM_UNKNOWN_FORWARD_ADDR); pthread_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); continue; } send_msg.forward.timeout = fwd_tree->timeout; if((send_msg.forward.cnt = hostlist_count(fwd_tree->tree_hl))) { buf = hostlist_ranged_string_xmalloc( fwd_tree->tree_hl); send_msg.forward.nodelist = buf; } else send_msg.forward.nodelist = NULL; if (send_msg.forward.nodelist && send_msg.forward.nodelist[0]) { debug3("Tree sending to %s along with %s", name, send_msg.forward.nodelist); } else debug3("Tree sending to %s", name); ret_list = slurm_send_addr_recv_msgs(&send_msg, name, fwd_tree->timeout); xfree(send_msg.forward.nodelist); if(ret_list) { slurm_mutex_lock(fwd_tree->tree_mutex); list_transfer(fwd_tree->ret_list, ret_list); pthread_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); list_destroy(ret_list); } else { /* This should never happen (when this was written slurm_send_addr_recv_msgs always returned a list */ error("fwd_tree_thread: no return list given from " "slurm_send_addr_recv_msgs spawned for %s", name); slurm_mutex_lock(fwd_tree->tree_mutex); mark_as_failed_forward( &fwd_tree->ret_list, name, SLURM_COMMUNICATIONS_CONNECTION_ERROR); pthread_cond_signal(fwd_tree->notify); slurm_mutex_unlock(fwd_tree->tree_mutex); free(name); continue; } free(name); /* check for error and try again */ if(errno == SLURM_COMMUNICATIONS_CONNECTION_ERROR) continue; break; } _destroy_tree_fwd(fwd_tree); return NULL; }