void orte_iof_hnp_stdin_cb(int fd, short event, void *cbdata) { bool should_process = orte_iof_hnp_stdin_check(0); if (should_process) { mca_iof_hnp_component.stdinev->active = true; opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0); } else { opal_event_del(&(mca_iof_hnp_component.stdinev->ev)); mca_iof_hnp_component.stdinev->active = false; } }
/* Setup to read local data. If the tag is other than STDIN,
 * then this is output being pushed from one of my child processes
 * and I'll write the data out myself. If the tag is STDIN,
 * then I need to setup to read from my stdin, and send anything
 * I get to the specified dst_name. The dst_name in this case tells
 * us which procs are to get stdin - only two options are supported:
 *
 * (a) a specific name, usually vpid=0; or
 *
 * (b) all procs, specified by vpid=ORTE_VPID_WILDCARD
 *
 * The orte_plm_base_launch_apps function calls iof.push after
 * the procs are launched and tells us how to distribute stdin. This
 * ensures that the procs are started -before- we begin reading stdin
 * and attempting to send it to remote procs
 */
static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_iof_sink_t *sink;
    orte_iof_proc_t *proct;
    opal_list_item_t *item;
    int flags;
    char *outfile;
    int fdout;
    orte_odls_job_t *jobdat=NULL;
    int np, numdigs;
    int rc;
    orte_ns_cmp_bitmask_t mask;

    /* don't do this if the dst vpid is invalid or the fd is negative! */
    if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:hnp pushing fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));

    if (!(src_tag & ORTE_IOF_STDIN)) {
        /* This is stdout/stderr/stddiag being pushed from a child proc.
         * Set the file descriptor to non-blocking - do this before we setup
         * and activate the read event in case it fires right away */
        if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
            opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                        __FILE__, __LINE__, errno);
        } else {
            flags |= O_NONBLOCK;
            fcntl(fd, F_SETFL, flags);
        }

        /* do we already have this process in our list? */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
                /* found it - just hook up the new read event */
                goto SETUP;
            }
        }

        /* if we get here, then we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch);
        opal_list_append(&mca_iof_hnp_component.procs, &proct->super);

        /* see if we are to output to a file */
        if (NULL != orte_output_filename) {
            /* get the local jobdata for this proc */
            for (item = opal_list_get_first(&orte_local_jobdata);
                 item != opal_list_get_end(&orte_local_jobdata);
                 item = opal_list_get_next(item)) {
                jobdat = (orte_odls_job_t*)item;
                if (jobdat->jobid == proct->name.jobid) {
                    break;
                }
            }
            /* NOTE: jobdat is the loop cursor, so after an exhausted search it
             * points at the *last* list item (or NULL for an empty list). We
             * must verify the jobid actually matched - checking only for NULL
             * would silently use the wrong job's data */
            if (NULL == jobdat || jobdat->jobid != proct->name.jobid) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            /* determine the number of digits required for max vpid so the
             * per-rank filenames are zero-padded to a uniform width */
            np = jobdat->num_procs / 10;
            numdigs = 1;
            while (np > 0) {
                numdigs++;
                np = np / 10;
            }
            /* construct the filename - check asprintf as outfile is
             * undefined on failure */
            if (0 > asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
                             (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                             numdigs, (unsigned long)proct->name.vpid)) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            /* create the file */
            fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
            free(outfile);
            if (fdout < 0) {
                /* couldn't be opened */
                ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                return ORTE_ERR_FILE_OPEN_FAILURE;
            }
            /* define a sink to that file descriptor */
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
                                 orte_iof_base_write_handler,
                                 &mca_iof_hnp_component.sinks);
        }

    SETUP:
        /* define a read event and activate it */
        if (src_tag & ORTE_IOF_STDOUT) {
            ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDERR) {
            ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDDIAG) {
            ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
                                orte_iof_hnp_read_local_handler, false);
        }
        /* if -all- of the readevents for this proc have been defined, then
         * activate them. Otherwise, we can think that the proc is complete
         * because one of the readevents fires -prior- to all of them having
         * been defined!
         */
        if (NULL != proct->revstdout &&
            NULL != proct->revstderr &&
            NULL != proct->revstddiag) {
            proct->revstdout->active = true;
            opal_event_add(&(proct->revstdout->ev), 0);
            proct->revstderr->active = true;
            opal_event_add(&(proct->revstderr->ev), 0);
            proct->revstddiag->active = true;
            opal_event_add(&(proct->revstddiag->ev), 0);
        }
        return ORTE_SUCCESS;
    }

    /* if we are pushing stdin, this is happening only during launch - setup
     * a target for this destination if it is going somewhere other than me
     */
    if (ORTE_VPID_WILDCARD == dst_name->vpid) {
        /* if wildcard, define a sink with that info so it gets sent out */
        ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                             stdin_write_handler,
                             &mca_iof_hnp_component.sinks);
    } else {
        /* no - lookup the proc's daemon and set that into sink */
        if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return ORTE_ERR_BAD_PARAM;
        }
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, dst_name->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if it is me, then don't set this up - we'll get it on the pull */
        if (ORTE_PROC_MY_NAME->vpid != proc->node->daemon->name.vpid) {
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                                 stdin_write_handler,
                                 &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
            sink->daemon.vpid = proc->node->daemon->name.vpid;
            ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon));
        }
    }

    /* now setup the read - but check to only do this once */
    if (NULL == mca_iof_hnp_component.stdinev) {
        /* Since we are the HNP, we don't want to set nonblocking on our
         * stdio stream.  If we do so, we set the file descriptor to
         * non-blocking for everyone that has that file descriptor, which
         * includes everyone else in our shell pipeline chain.  (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils).
         */
        if (0 != fd) {
            if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
                opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                            __FILE__, __LINE__, errno);
            } else {
                flags |= O_NONBLOCK;
                fcntl(fd, F_SETFL, flags);
            }
        }

        if (isatty(fd)) {
            /* We should avoid trying to read from stdin if we
             * have a terminal, but are backgrounded.  Catch the
             * signals that are commonly used when we switch
             * between being backgrounded and not.  If the
             * filedescriptor is not a tty, don't worry about it
             * and always stay connected.
             */
            opal_event_signal_set(opal_event_base, &mca_iof_hnp_component.stdinsig,
                                  SIGCONT, orte_iof_hnp_stdin_cb,
                                  NULL);
            /* setup a read event to read stdin, but don't activate it yet. The
             * dst_name indicates who should receive the stdin. If that recipient
             * doesn't do a corresponding pull, however, then the stdin will
             * be dropped upon receipt at the local daemon
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_hnp_read_local_handler, false);
            /* check to see if we want the stdin read event to be
             * active - we will always at least define the event,
             * but may delay its activation
             */
            if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
                mca_iof_hnp_component.stdinev->active = true;
                if (OPAL_SUCCESS != (rc = opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            }
        } else {
            /* if we are not looking at a tty, just setup a read event
             * and activate it
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_hnp_read_local_handler, true);
        }
    }
    return ORTE_SUCCESS;
}
/* this is the read handler for my own child procs. In this case,
 * the data is going nowhere - I just output it myself
 */
void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
{
    /* cbdata carries the read event descriptor that fired */
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    int rc;

    /* all exits from this handler must release this lock */
    OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    if (numbytes < 0) {
        /* either we have a connection error or it was a non-blocking read */

        /* non-blocking, retry */
        if (EAGAIN == errno || EINTR == errno) {
            /* re-arm the read event and try again later */
            opal_event_add(&rev->ev, 0);
            OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
            return;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s iof:hnp:read handler %s Error on connection:%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&rev->name), fd));
        /* Un-recoverable error. Allow the code to flow as usual in order to
         * to send the zero bytes message up the stream, and then close the
         * file descriptor and delete the event.
         */
        numbytes = 0;
    }

    /* is this read from our stdin? */
    if (ORTE_IOF_STDIN & rev->tag) {
        /* if job termination has been ordered, just ignore the
         * data and delete the read event
         */
        if (orte_job_term_ordered) {
            OBJ_RELEASE(mca_iof_hnp_component.stdinev);
            OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
            return;
        }
        /* cycle through our list of sinks - stdin may fan out to
         * multiple targets (local pipe and/or remote daemons) */
        for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
             item != opal_list_get_end(&mca_iof_hnp_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t* sink = (orte_iof_sink_t*)item;

            /* only look at stdin sinks */
            if (!(ORTE_IOF_STDIN & sink->tag)) {
                continue;
            }

            /* if the daemon is me, then this is a local sink */
            if (ORTE_PROC_MY_NAME->jobid == sink->daemon.jobid &&
                ORTE_PROC_MY_NAME->vpid == sink->daemon.vpid) {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s read %d bytes from stdin - writing to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&rev->name)));
                /* send the bytes down the pipe - we even send 0 byte events
                 * down the pipe so it forces out any preceding data before
                 * closing the output stream
                 */
                if (NULL != sink->wev) {
                    if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev)) {
                        /* getting too backed up - stop the read event for now if it is still active */
                        if (mca_iof_hnp_component.stdinev->active) {
                            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                                 "buffer backed up - holding"));
                            mca_iof_hnp_component.stdinev->active = false;
                        }
                        /* NOTE: read event stays defined but inactive; a later
                         * drain is expected to restart it */
                        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
                        return;
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s sending %d bytes from stdin to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&sink->daemon)));
                /* send the data to the daemon so it can
                 * write it to the proc's fd - in this case,
                 * we pass sink->name to indicate who is to
                 * receive the data. If the connection closed,
                 * numbytes will be zero so zero bytes will be
                 * sent - this will tell the daemon to close
                 * the fd for stdin to that proc
                 */
                orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &sink->name, ORTE_IOF_STDIN, data, numbytes);
            }
        }
        /* if num_bytes was zero, then we need to terminate the event */
        if (0 == numbytes) {
            /* this will also close our stdin file descriptor */
            OBJ_RELEASE(mca_iof_hnp_component.stdinev);
        } else {
            /* if we are looking at a tty, then we just go ahead and restart the
             * read event assuming we are not backgrounded
             */
            if (orte_iof_hnp_stdin_check(fd)) {
                restart_stdin(fd, 0, NULL);
            } else {
                /* delay for awhile and then restart */
                ORTE_TIMER_EVENT(0, 10000, restart_stdin);
            }
        }
        /* nothing more to do */
        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
        return;
    }

    /* this must be output from one of my local procs - see
     * if anyone else has requested a copy of this info
     */
    for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
         item != opal_list_get_end(&mca_iof_hnp_component.sinks);
         item = opal_list_get_next(item)) {
        orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
        /* if the target isn't set, then this sink is for another purpose - ignore it */
        if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
            continue;
        }
        /* match the stream tag, the job, and either a wildcard or exact vpid */
        if ((sink->tag & rev->tag) &&
            sink->name.jobid == rev->name.jobid &&
            (ORTE_VPID_WILDCARD == sink->name.vpid || sink->name.vpid == rev->name.vpid)) {
            /* need to send the data to the remote endpoint - if
             * the connection closed, numbytes will be zero, so
             * the remote endpoint will know to close its local fd.
             * In this case, we pass rev->name to indicate who the
             * data came from.
             */
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                 "%s sending data to tool %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&sink->daemon)));
            orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &rev->name, rev->tag, data, numbytes);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s read %d bytes from %s of %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         (ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
                         ORTE_NAME_PRINT(&rev->name)));

    if (0 == numbytes) {
        /* if we read 0 bytes from the stdout/err/diag, there is
         * nothing to output - find this proc on our list and
         * release the appropriate event. This will delete the
         * read event and close the file descriptor */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            if (proct->name.jobid == rev->name.jobid &&
                proct->name.vpid == rev->name.vpid) {
                /* found it - release corresponding event. This deletes
                 * the read event and closes the file descriptor.
                 * NOTE: the completion check below relies on OBJ_RELEASE
                 * resetting the released pointer to NULL */
                if (rev->tag & ORTE_IOF_STDOUT) {
                    OBJ_RELEASE(proct->revstdout);
                } else if (rev->tag & ORTE_IOF_STDERR) {
                    OBJ_RELEASE(proct->revstderr);
                } else if (rev->tag & ORTE_IOF_STDDIAG) {
                    OBJ_RELEASE(proct->revstddiag);
                }
                /* check to see if they are all done */
                if (NULL == proct->revstdout &&
                    NULL == proct->revstderr &&
                    NULL == proct->revstddiag) {
                    opal_buffer_t cmdbuf;
                    orte_daemon_cmd_flag_t command;
                    /* this proc's iof is complete */
                    opal_list_remove_item(&mca_iof_hnp_component.procs, item);
                    /* setup a cmd to notify that the iof is complete */
                    OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
                    command = ORTE_DAEMON_IOF_COMPLETE;
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
                CLEANUP:
                    /* the buffer is destructed on both the error and
                     * success paths; proct is released either way */
                    OBJ_DESTRUCT(&cmdbuf);
                    OBJ_RELEASE(proct);
                }
                break;
            }
        }
        /* EOF path: do NOT re-add the read event */
        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
        return;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
             item != opal_list_get_end(&mca_iof_hnp_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            /* is this the desired proc? */
            if (sink->name.jobid == rev->name.jobid &&
                sink->name.vpid == rev->name.vpid) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
    } else {
        /* output this to our local output */
        if (ORTE_IOF_STDOUT & rev->tag || orte_xml_output) {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev);
        } else {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev);
        }
    }

    /* re-add the event so we keep reading this stream */
    opal_event_add(&rev->ev, 0);

    OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
    return;
}