/* * Act on pipe commands */ static bool service_pipe_cmd(void) { bool ret = false; cmd_t cmd; cmd_list_item_t *cli; opal_fd_read(pipe_to_service_thread[0], cmd_size, &cmd); switch (cmd.pc_cmd) { case CMD_ADD_FD: OPAL_OUTPUT((-1, "fd service thread: CMD_ADD_FD")); if (OMPI_SUCCESS != service_pipe_cmd_add_fd(false, &cmd)) { ret = true; } break; case CMD_REMOVE_FD: OPAL_OUTPUT((-1, "fd service thread: CMD_REMOVE_FD")); if (OMPI_SUCCESS != service_pipe_cmd_remove_fd(&cmd)) { ret = true; } break; case CMD_CALL_FUNCTION: OPAL_OUTPUT((-1, "fd service thread: CMD_RUN_FUNCTION")); if (OMPI_SUCCESS != service_pipe_cmd_call_function(&cmd)) { ret = true; } break; case CMD_TIME_TO_QUIT: OPAL_OUTPUT((-1, "fd service thread: CMD_TIME_TO_QUIT")); ret = true; break; case ACK_RAN_FUNCTION: /* We don't have a guarantee that the main thread will check its pipe frequently, so we do some simple counting to ensure we just don't have too many outstanding commands to the main thread at any given time. The main thread will ACK every CALL_FUNCTION command, so this thread will always wake up and continue to drain any queued up functions. */ cli = (cmd_list_item_t*) opal_list_remove_first(&pending_to_main_thread); if (NULL != cli) { OPAL_OUTPUT((-1, "sending queued up cmd function to main thread")); opal_fd_write(pipe_to_main_thread[1], cmd_size, &(cli->cli_cmd)); OBJ_RELEASE(cli); } else { --waiting_for_ack_from_main_thread; } break; default: OPAL_OUTPUT((-1, "fd service thread: unknown pipe command!")); break; } return ret; }
/*
 * Event callback: consume the single byte that was written to the
 * progress thread pipe, announce that the handler ran, and re-add
 * stop_event.
 *
 * None of the callback arguments are used.
 */
static void stop_handler(int sd, short flags, void *cbdata)
{
    char drained;

    (void) sd;
    (void) flags;
    (void) cbdata;

    /* Pull the one byte off the pipe; its value is not examined */
    opal_fd_read(progress_thread_pipe[0], 1, &drained);

    fputs("Stop handler called\n", stderr);

    /* reset the event */
    opal_event_add(&stop_event, 0);
}
/*
 * Event callback run when a command is available on the pipe to the
 * main thread.  Reads one command and, if it is CMD_CALL_FUNCTION,
 * hands it to main_pipe_cmd_call_function(); any other command is only
 * logged.
 *
 * The fd, event and context arguments are not used.
 */
static void main_thread_event_callback(int fd, short event, void *context)
{
    cmd_t cmd;

    OPAL_OUTPUT((-1, "main thread -- reading command"));

    /* If the read fails, cmd holds indeterminate bytes; dispatching on
       (or printing) cmd.pc_cmd would then be meaningless.
       NOTE(review): a negative return is taken to mean failure --
       confirm opal_fd_read() never reports an error with a positive
       value. */
    if (0 > opal_fd_read(pipe_to_main_thread[0], cmd_size, &cmd)) {
        OPAL_OUTPUT((-1, "fd main thread: failed to read pipe command"));
        return;
    }

    switch (cmd.pc_cmd) {
    case CMD_CALL_FUNCTION:
        OPAL_OUTPUT((-1, "fd main thread: calling command"));
        main_pipe_cmd_call_function(&cmd);
        break;

    default:
        OPAL_OUTPUT((-1, "fd main thread: unknown pipe command: %d",
                     cmd.pc_cmd));
        break;
    }
}
/*
 * Body of the rmcast receive-processing thread.
 *
 * Marks itself as running, then repeatedly blocks on the receive pipe
 * for a message pointer.  Each non-NULL pointer is passed to
 * orte_rmcast.process_msg().  The loop ends, with the running flag
 * cleared, when the pipe read fails or when a NULL pointer arrives.
 * Both exits return OPAL_THREAD_CANCELLED.
 *
 * The obj argument is not used.
 */
static void *rcv_processing_thread(opal_object_t *obj)
{
    struct timespec linger = { 0, 10 };
    orte_rmcast_msg_t *incoming;
    int status;

    OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base: recv processing thread operational",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
    orte_rmcast_base.recv_process_ctl.running = true;
    ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);

    for (;;) {
        /* sleep on the pipe until somebody hands us a message pointer */
        status = opal_fd_read(orte_rmcast_base.recv_pipe[0],
                              sizeof incoming, &incoming);

        if (status < 0) {
            /* the read itself went wrong, so give up on this thread */
            opal_output(0, "%s PUNTING THREAD",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
            orte_rmcast_base.recv_process_ctl.running = false;
            ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);
            /* pause briefly so the main thread can reach
             * opal_thread_join before this thread goes away
             */
            nanosleep(&linger, NULL);
            return OPAL_THREAD_CANCELLED;
        }

        if (NULL != incoming) {
            /* hand it off; the processing function releases the msg */
            orte_rmcast.process_msg(incoming);
            continue;
        }

        /* a NULL pointer is the request to stop */
        ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
        orte_rmcast_base.recv_process_ctl.running = false;
        ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);
        return OPAL_THREAD_CANCELLED;
    }
}
/*
 * Parent side of a local launch.
 *
 * If the job forwards output and a child object was supplied, connect
 * the parent IOF endpoints.  Then block on read_fd, the pipe from the
 * child, printing every warning/error message the child sends, until
 * either the pipe closes or a fatal message arrives.
 *
 * Each message is a fixed-size header followed by up to three strings
 * (help file name, topic, rendered text) whose lengths are given in
 * the header.
 *
 * Returns:
 *   ORTE_SUCCESS              pipe closed with no fatal message; child
 *                             is marked RUNNING and ALIVE.
 *   ORTE_ERR_FAILED_TO_START  child reported a fatal error (child is
 *                             marked FAILED_TO_START), or a message
 *                             could not be processed (bad string
 *                             length or out of memory; child state is
 *                             set to UNDEF).
 *   other                     error code from the IOF setup or from a
 *                             failed pipe read (child state is set to
 *                             UNDEF).
 *
 * read_fd is closed on every return path.  The environ_copy argument
 * is not used here.
 */
static int do_parent(orte_app_context_t* context,
                     orte_proc_t *child,
                     char **environ_copy,
                     orte_job_t *jobdat, int read_fd,
                     orte_iof_base_io_conf_t opts)
{
    int rc;
    orte_odls_pipe_err_msg_t msg;
    char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;

    if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* connect endpoints IOF */
        rc = orte_iof_base_setup_parent(&child->name, &opts);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto undef_out;
        }
    }

    /* Block reading a message from the pipe */
    while (1) {
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }

        /* If Something Bad happened in the read, error out */
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto undef_out;
        }

        /* Otherwise, we got a warning or error message from the child */
        if (NULL != child) {
            if (msg.fatal) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            } else {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
            }
        }

        /* Start every message with empty strings so that a header with
           a zero file/topic length never exposes stale or
           uninitialized buffer contents below */
        file[0] = '\0';
        topic[0] = '\0';

        /* Read in the strings; ensure to terminate them with \0.  The
           lengths come off the pipe, so make sure they fit the local
           buffers (including the terminator) before reading. */
        if (msg.file_str_len > 0) {
            if ((size_t) msg.file_str_len >= sizeof(file)) {
                /* NOTE(review): no more specific error code is used in
                   this function; substitute one if available */
                rc = ORTE_ERR_FAILED_TO_START;
                ORTE_ERROR_LOG(rc);
                goto undef_out;
            }
            rc = opal_fd_read(read_fd, msg.file_str_len, file);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                goto undef_out;
            }
            file[msg.file_str_len] = '\0';
        }
        if (msg.topic_str_len > 0) {
            if ((size_t) msg.topic_str_len >= sizeof(topic)) {
                rc = ORTE_ERR_FAILED_TO_START;
                ORTE_ERROR_LOG(rc);
                goto undef_out;
            }
            rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                goto undef_out;
            }
            topic[msg.topic_str_len] = '\0';
        }
        if (msg.msg_str_len > 0) {
            /* calloc zero-fills, so the extra byte is the terminator */
            str = calloc(1, msg.msg_str_len + 1);
            if (NULL == str) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "calloc", __FILE__, __LINE__);
                /* rc still holds the success code of the previous
                   read; it must not be returned as-is.
                   NOTE(review): substitute a dedicated out-of-resource
                   code if one is available */
                rc = ORTE_ERR_FAILED_TO_START;
                goto undef_out;
            }
            rc = opal_fd_read(read_fd, msg.msg_str_len, str);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                goto undef_out;
            }

            /* Print out what we got.  We already have a rendered
               string, so use orte_show_help_norender(). */
            orte_show_help_norender(file, topic, false, str);
            free(str);
            str = NULL;
        }

        /* If msg.fatal is true, then the child exited with an error.
           Otherwise, whatever we just printed was a warning, so loop
           around and see what else is on the pipe (or if the pipe
           closed, indicating that the child launched
           successfully). */
        if (msg.fatal) {
            if (NULL != child) {
                child->state = ORTE_PROC_STATE_FAILED_TO_START;
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            close(read_fd);
            return ORTE_ERR_FAILED_TO_START;
        }
    }

    /* If we got here, it means that the pipe closed without
       indication of a fatal error, meaning that the child process
       launched successfully. */
    if (NULL != child) {
        child->state = ORTE_PROC_STATE_RUNNING;
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
    }
    close(read_fd);

    return ORTE_SUCCESS;

undef_out:
    /* Common exit for every failure that leaves the launch outcome
       unknown: release what we hold and mark the child accordingly */
    free(str);
    close(read_fd);
    if (NULL != child) {
        child->state = ORTE_PROC_STATE_UNDEF;
    }
    return rc;
}
/* * Startup the agent and share our MCA param values with the it. */ int opal_btl_usnic_connectivity_client_init(void) { /* If connectivity checking is not enabled, do nothing */ if (!mca_btl_usnic_component.connectivity_enabled) { return OPAL_SUCCESS; } assert(!initialized); /* Open local IPC socket to the agent */ agent_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (agent_fd < 0) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("socket() failed"); /* Will not return */ } char *ipc_filename = NULL; asprintf(&ipc_filename, "%s/%s", opal_process_info.job_session_dir, CONNECTIVITY_SOCK_NAME); if (NULL == ipc_filename) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("Out of memory"); /* Will not return */ } #if !defined(NDEBUG) struct sockaddr_un sun; assert(strlen(ipc_filename) <= sizeof(sun.sun_path)); #endif /* Wait for the agent to create its socket. Timeout after 10 seconds if we don't find the socket. */ struct stat sbuf; time_t start = time(NULL); while (1) { int ret = stat(ipc_filename, &sbuf); if (0 == ret) { break; } else if (ENOENT != errno) { /* If the error wasn't "file not found", then something else Bad happened */ OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("stat() failed"); /* Will not return */ } /* If the named socket wasn't there yet, then give the agent a little time to establish it */ usleep(1); if (time(NULL) - start > 10) { ABORT("connectivity client timeout waiting for server socket to appear"); /* Will not return */ } } /* Connect */ struct sockaddr_un address; memset(&address, 0, sizeof(struct sockaddr_un)); address.sun_family = AF_UNIX; strncpy(address.sun_path, ipc_filename, sizeof(address.sun_path) - 1); int count = 0; while (1) { int ret = connect(agent_fd, (struct sockaddr*) &address, sizeof(address)); if (0 == ret) { break; } // If we get ECONNREFUSED, delay a little and try again if (ECONNREFUSED == errno) { if (count < mca_btl_usnic_component.connectivity_num_retries) { usleep(100); ++count; continue; } } OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); 
ABORT("connect() failed"); /* Will not return */ } /* Send the magic token */ int tlen = strlen(CONNECTIVITY_MAGIC_TOKEN); if (OPAL_SUCCESS != opal_fd_write(agent_fd, tlen, CONNECTIVITY_MAGIC_TOKEN)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC connect write failed"); /* Will not return */ } /* Receive a magic token back */ char *ack = alloca(tlen + 1); if (NULL == ack) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("Out of memory"); /* Will not return */ } if (OPAL_SUCCESS != opal_fd_read(agent_fd, tlen, ack)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC connect read failed"); /* Will not return */ } if (memcmp(ack, CONNECTIVITY_MAGIC_TOKEN, tlen) != 0) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client got wrong token back from agent"); /* Will not return */ } /* All done */ initialized = true; opal_output_verbose(20, USNIC_OUT, "usNIC connectivity client initialized"); return OPAL_SUCCESS; }
/* * Send a listen command to the agent */ int opal_btl_usnic_connectivity_listen(opal_btl_usnic_module_t *module) { /* If connectivity checking is not enabled, do nothing */ if (!mca_btl_usnic_component.connectivity_enabled) { module->local_modex.connectivity_udp_port = 0; return OPAL_SUCCESS; } /* Send the LISTEN command */ int id = CONNECTIVITY_AGENT_CMD_LISTEN; if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(id), &id)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC write failed"); /* Will not return */ } /* Send the LISTEN command parameters */ opal_btl_usnic_connectivity_cmd_listen_t cmd = { .module = NULL, .ipv4_addr = module->local_modex.ipv4_addr, .netmask = module->local_modex.netmask, .max_msg_size = module->local_modex.max_msg_size }; /* Only the MPI process who is also the agent will send the pointer value (it doesn't make sense otherwise) */ if (0 == opal_process_info.my_local_rank) { cmd.module = module; } /* Ensure to NULL-terminate the passed strings */ strncpy(cmd.nodename, opal_process_info.nodename, CONNECTIVITY_NODENAME_LEN - 1); strncpy(cmd.usnic_name, module->fabric_info->fabric_attr->name, CONNECTIVITY_IFNAME_LEN - 1); if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC write failed"); /* Will not return */ } /* Wait for the reply with the UDP port */ opal_btl_usnic_connectivity_cmd_listen_reply_t reply; memset(&reply, 0, sizeof(reply)); if (OPAL_SUCCESS != opal_fd_read(agent_fd, sizeof(reply), &reply)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC read failed"); /* Will not return */ } /* Get the UDP port number that was received */ assert(CONNECTIVITY_AGENT_CMD_LISTEN == reply.cmd); module->local_modex.connectivity_udp_port = reply.udp_port; return OPAL_SUCCESS; } int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port, uint32_t dest_ipv4_addr, uint32_t dest_netmask, int dest_port, 
char *dest_nodename, size_t max_msg_size) { OPAL_THREAD_LOCK(&btl_usnic_lock); /* If connectivity checking is not enabled, do nothing */ if (!mca_btl_usnic_component.connectivity_enabled) { return OPAL_SUCCESS; } /* Send the PING command */ int id = CONNECTIVITY_AGENT_CMD_PING; if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(id), &id)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC write failed"); /* Will not return */ } /* Send the PING command parameters */ opal_btl_usnic_connectivity_cmd_ping_t cmd = { .src_ipv4_addr = src_ipv4_addr, .src_udp_port = src_port, .dest_ipv4_addr = dest_ipv4_addr, .dest_netmask = dest_netmask, .dest_udp_port = dest_port, .max_msg_size = max_msg_size }; /* Ensure to NULL-terminate the passed string */ strncpy(cmd.dest_nodename, dest_nodename, CONNECTIVITY_NODENAME_LEN - 1); if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) { OPAL_ERROR_LOG(OPAL_ERR_IN_ERRNO); ABORT("usnic connectivity client IPC write failed"); /* Will not return */ } OPAL_THREAD_UNLOCK(&btl_usnic_lock); return OPAL_SUCCESS; }
/*
 * Parent side of a local launch.
 *
 * Closes the pipe ends recorded in cd->opts that the parent does not
 * use, then blocks on read_fd, the pipe from the child, printing every
 * warning/error message the child sends, until either the pipe closes
 * or a fatal message arrives.
 *
 * Each message is a fixed-size header followed by up to three strings
 * (help file name, topic, rendered text) whose lengths are given in
 * the header.
 *
 * Returns:
 *   ORTE_SUCCESS              pipe closed with no fatal message; child
 *                             is marked RUNNING and ALIVE.
 *   ORTE_ERR_FAILED_TO_START  child reported a fatal error (child is
 *                             marked FAILED_TO_START), or a message
 *                             could not be processed (bad string
 *                             length or out of memory; child state is
 *                             set to UNDEF).
 *   other                     error code from a failed pipe read
 *                             (child state is set to UNDEF).
 *
 * read_fd is closed on every return path.
 */
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
{
    int rc;
    orte_odls_pipe_err_msg_t msg;
    char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;

    if (cd->opts.connect_stdin) {
        close(cd->opts.p_stdin[0]);
    }
    close(cd->opts.p_stdout[1]);
    if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
        close(cd->opts.p_stderr[1]);
    }
#if OPAL_PMIX_V1
    close(cd->opts.p_internal[1]);
#endif

    /* Block reading a message from the pipe */
    while (1) {
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }

        /* If Something Bad happened in the read, error out */
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto undef_out;
        }

        /* Otherwise, we got a warning or error message from the child */
        if (NULL != cd->child) {
            if (msg.fatal) {
                ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            } else {
                ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
            }
        }

        /* Start every message with empty strings so that a header with
           a zero file/topic length never exposes stale or
           uninitialized buffer contents below */
        file[0] = '\0';
        topic[0] = '\0';

        /* Read in the strings; ensure to terminate them with \0.  The
           lengths come off the pipe, so make sure they fit the local
           buffers (including the terminator) before reading. */
        if (msg.file_str_len > 0) {
            if ((size_t) msg.file_str_len >= sizeof(file)) {
                /* NOTE(review): no more specific error code is used in
                   this function; substitute one if available */
                rc = ORTE_ERR_FAILED_TO_START;
                ORTE_ERROR_LOG(rc);
                goto undef_out;
            }
            rc = opal_fd_read(read_fd, msg.file_str_len, file);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                goto undef_out;
            }
            file[msg.file_str_len] = '\0';
        }
        if (msg.topic_str_len > 0) {
            if ((size_t) msg.topic_str_len >= sizeof(topic)) {
                rc = ORTE_ERR_FAILED_TO_START;
                ORTE_ERROR_LOG(rc);
                goto undef_out;
            }
            rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                goto undef_out;
            }
            topic[msg.topic_str_len] = '\0';
        }
        if (msg.msg_str_len > 0) {
            /* calloc zero-fills, so the extra byte is the terminator */
            str = calloc(1, msg.msg_str_len + 1);
            if (NULL == str) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "calloc", __FILE__, __LINE__);
                /* rc still holds the success code of the previous
                   read; it must not be returned as-is.
                   NOTE(review): substitute a dedicated out-of-resource
                   code if one is available */
                rc = ORTE_ERR_FAILED_TO_START;
                goto undef_out;
            }
            rc = opal_fd_read(read_fd, msg.msg_str_len, str);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                goto undef_out;
            }

            /* Print out what we got.  We already have a rendered
               string, so use orte_show_help_norender(). */
            orte_show_help_norender(file, topic, false, str);
            free(str);
            str = NULL;
        }

        /* If msg.fatal is true, then the child exited with an error.
           Otherwise, whatever we just printed was a warning, so loop
           around and see what else is on the pipe (or if the pipe
           closed, indicating that the child launched
           successfully). */
        if (msg.fatal) {
            if (NULL != cd->child) {
                cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
                ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            }
            close(read_fd);
            return ORTE_ERR_FAILED_TO_START;
        }
    }

    /* If we got here, it means that the pipe closed without
       indication of a fatal error, meaning that the child process
       launched successfully. */
    if (NULL != cd->child) {
        cd->child->state = ORTE_PROC_STATE_RUNNING;
        ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
    }
    close(read_fd);

    return ORTE_SUCCESS;

undef_out:
    /* Common exit for every failure that leaves the launch outcome
       unknown: release what we hold and mark the child accordingly */
    free(str);
    close(read_fd);
    if (NULL != cd->child) {
        cd->child->state = ORTE_PROC_STATE_UNDEF;
    }
    return rc;
}