Beispiel #1
0
static int orted_pull(const orte_process_name_t* dst_name,
                      orte_iof_tag_t src_tag,
                      int fd)
{
    orte_iof_sink_t *sink;
    int flags;
    
    /* this is a local call - only stdin is supported */
    if (ORTE_IOF_STDIN != src_tag) {
        return ORTE_ERR_NOT_SUPPORTED;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:orted pulling fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));
    
    /* set the file descriptor to non-blocking - do this before we setup
     * the sink in case it fires right away
     */
    if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
        opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                    __FILE__, __LINE__, errno);
    } else {
        flags |= O_NONBLOCK;
        fcntl(fd, F_SETFL, flags);
    }

    ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
                         stdin_write_handler,
                         &mca_iof_orted_component.sinks);
    
    return ORTE_SUCCESS;
}
Beispiel #2
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int orte_iof_base_open(mca_base_open_flag_t flags)
{
    int xmlfd;

    /* daemons do not need to do this as they do not write out stdout/err */
    if (!ORTE_PROC_IS_DAEMON) {
        if (orte_xml_output) {
            if (NULL != orte_xml_fp) {
                /* user wants all xml-formatted output sent to file */
                xmlfd = fileno(orte_xml_fp);
            } else {
                xmlfd = 1;
            }
            /* setup the stdout event */
            ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
                                 xmlfd, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
            /* don't create a stderr event - all output will go to
             * the stdout channel
             */
        } else {
            /* setup the stdout event */
            ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
                                 1, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
            /* setup the stderr event */
            ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stderr, ORTE_PROC_MY_NAME,
                                 2, ORTE_IOF_STDERR, orte_iof_base_write_handler);
        }

        /* do NOT set these file descriptors to non-blocking. If we do so,
         * we set the file descriptor to non-blocking for everyone that has
         * that file descriptor, which includes everyone else in our shell
         * pipeline chain.  (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils).
         */
    }

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_iof_base_framework, flags);
}
Beispiel #3
0
static int orted_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
    int flags;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    orte_iof_sink_t *sink;
    char *outfile;
    int fdout;
    orte_job_t *jobdat=NULL;
    int np, numdigs;
    orte_ns_cmp_bitmask_t mask;

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:orted pushing fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));
    
    /* set the file descriptor to non-blocking - do this before we setup
     * and activate the read event in case it fires right away
     */
    if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
        opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                    __FILE__, __LINE__, errno);
    } else {
        flags |= O_NONBLOCK;
        fcntl(fd, F_SETFL, flags);
    }

    /* do we already have this process in our list? */
    for (item = opal_list_get_first(&mca_iof_orted_component.procs);
         item != opal_list_get_end(&mca_iof_orted_component.procs);
         item = opal_list_get_next(item)) {
        proct = (orte_iof_proc_t*)item;
        
        mask = ORTE_NS_CMP_ALL;

        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
            /* found it */
            goto SETUP;
        }
    }
    /* if we get here, then we don't yet have this proc in our list */
    proct = OBJ_NEW(orte_iof_proc_t);
    proct->name.jobid = dst_name->jobid;
    proct->name.vpid = dst_name->vpid;
    opal_list_append(&mca_iof_orted_component.procs, &proct->super);
    /* see if we are to output to a file */
    if (NULL != orte_output_filename) {
        /* get the local jobdata for this proc */
        if (NULL == (jobdat = orte_get_job_data_object(proct->name.jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        np = jobdat->num_procs / 10;
        /* determine the number of digits required for max vpid */
        numdigs = 1;
        while (np > 0) {
            numdigs++;
            np = np / 10;
        }
        /* construct the filename */
        asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
                 (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                 numdigs, (unsigned long)proct->name.vpid);
        /* create the file */
        fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
        free(outfile);
        if (fdout < 0) {
            /* couldn't be opened */
            ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
            return ORTE_ERR_FILE_OPEN_FAILURE;
        }
        /* define a sink to that file descriptor */
        ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
                             orte_iof_base_write_handler,
                             &mca_iof_orted_component.sinks);
    }
    
SETUP:
    /* define a read event and activate it */
    if (src_tag & ORTE_IOF_STDOUT) {
        ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
                            orte_iof_orted_read_handler, false);
    } else if (src_tag & ORTE_IOF_STDERR) {
        ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
                            orte_iof_orted_read_handler, false);
    } else if (src_tag & ORTE_IOF_STDDIAG) {
        ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
                            orte_iof_orted_read_handler, false);
    }
    /* if -all- of the readevents for this proc have been defined, then
     * activate them. Otherwise, we can think that the proc is complete
     * because one of the readevents fires -prior- to all of them having
     * been defined!
     */
    if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
        proct->revstdout->active = true;
        opal_event_add(proct->revstdout->ev, 0);
        proct->revstderr->active = true;
        opal_event_add(proct->revstderr->ev, 0);
        proct->revstddiag->active = true;
        opal_event_add(proct->revstddiag->ev, 0);
    }
    return ORTE_SUCCESS;
}
Beispiel #4
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int orte_iof_base_open(mca_base_open_flag_t flags)
{
    int rc, xmlfd;

    /* did the user request we print output to files? */
    if (NULL != orte_output_filename) {
        /* we will setup the files themselves as needed in the iof
         * module. For now, let's see if the filename contains a
         * path, or just a name
         */
        char *path;
        path = opal_dirname(orte_output_filename);
        if (NULL == path) {
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        if (0 != strcmp(path, orte_output_filename)) {
            /* there is a path in this name - ensure that the directory
             * exists, and create it if not
             */
            if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) {
                free(path);
                return rc;
            }
        }
        free(path);
    }

    /* daemons do not need to do this as they do not write out stdout/err */
    if (!ORTE_PROC_IS_DAEMON ||
        (ORTE_PROC_IS_DAEMON && ORTE_PROC_IS_CM)) {
        if (orte_xml_output) {
            if (NULL != orte_xml_fp) {
                /* user wants all xml-formatted output sent to file */
                xmlfd = fileno(orte_xml_fp);
            } else {
                xmlfd = 1;
            }
            /* setup the stdout event */
            ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
                                 xmlfd, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
            /* don't create a stderr event - all output will go to
             * the stdout channel
             */
        } else {
            /* setup the stdout event */
            ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
                                 1, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
            /* setup the stderr event */
            ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stderr, ORTE_PROC_MY_NAME,
                                 2, ORTE_IOF_STDERR, orte_iof_base_write_handler);
        }

        /* do NOT set these file descriptors to non-blocking. If we do so,
         * we set the file descriptor to non-blocking for everyone that has
         * that file descriptor, which includes everyone else in our shell
         * pipeline chain.  (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils).
         */
    }

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_iof_base_framework, flags);
}
Beispiel #5
0
/* Setup to read local data. If the tag is other than STDIN,
 * then this is output being pushed from one of my child processes
 * and I'll write the data out myself. If the tag is STDIN,
 * then I need to setup to read from my stdin, and send anything
 * I get to the specified dst_name. The dst_name in this case tells
 * us which procs are to get stdin - only two options are supported:
 *
 * (a) a specific name, usually vpid=0; or
 *
 * (b) all procs, specified by vpid=ORTE_VPID_WILDCARD
 *
 * The orte_plm_base_launch_apps function calls iof.push after
 * the procs are launched and tells us how to distribute stdin. This
 * ensures that the procs are started -before- we begin reading stdin
 * and attempting to send it to remote procs
 */
static int hnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_iof_sink_t *sink;
    orte_iof_proc_t *proct;
    opal_list_item_t *item;
    int flags;
    char *outfile;
    int fdout;
    orte_odls_job_t *jobdat=NULL;
    int np, numdigs;
    int rc;
    orte_ns_cmp_bitmask_t mask;

    /* don't do this if the dst vpid is invalid or the fd is negative! */
    if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
        return ORTE_SUCCESS;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:hnp pushing fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));
    
    if (!(src_tag & ORTE_IOF_STDIN)) {
        /* set the file descriptor to non-blocking - do this before we setup
         * and activate the read event in case it fires right away
         */
        if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
            opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                        __FILE__, __LINE__, errno);
        } else {
            flags |= O_NONBLOCK;
            fcntl(fd, F_SETFL, flags);
        }
        /* do we already have this process in our list? */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
                /* found it */
                goto SETUP;
            }
        }
        /* if we get here, then we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        ORTE_EPOCH_SET(proct->name.epoch,dst_name->epoch);
        opal_list_append(&mca_iof_hnp_component.procs, &proct->super);
        /* see if we are to output to a file */
        if (NULL != orte_output_filename) {
            /* get the local jobdata for this proc */
            for (item = opal_list_get_first(&orte_local_jobdata);
                 item != opal_list_get_end(&orte_local_jobdata);
                 item = opal_list_get_next(item)) {
                jobdat = (orte_odls_job_t*)item;
                if (jobdat->jobid == proct->name.jobid) {
                    break;
                }
            }
            if (NULL == jobdat) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            np = jobdat->num_procs / 10;
            /* determine the number of digits required for max vpid */
            numdigs = 1;
            while (np > 0) {
                numdigs++;
                np = np / 10;
            }
            /* construct the filename */
            asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
                     (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                     numdigs, (unsigned long)proct->name.vpid);
            /* create the file */
            fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
            free(outfile);
            if (fdout < 0) {
                /* couldn't be opened */
                ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                return ORTE_ERR_FILE_OPEN_FAILURE;
            }
            /* define a sink to that file descriptor */
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
                                 orte_iof_base_write_handler,
                                 &mca_iof_hnp_component.sinks);
        }
        
    SETUP:
        /* define a read event and activate it */
        if (src_tag & ORTE_IOF_STDOUT) {
            ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDERR) {
            ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
                                orte_iof_hnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDDIAG) {
            ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
                                orte_iof_hnp_read_local_handler, false);
        }
        /* if -all- of the readevents for this proc have been defined, then
         * activate them. Otherwise, we can think that the proc is complete
         * because one of the readevents fires -prior- to all of them having
         * been defined!
         */
        if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
            proct->revstdout->active = true;
            opal_event_add(&(proct->revstdout->ev), 0);
            proct->revstderr->active = true;
            opal_event_add(&(proct->revstderr->ev), 0);
            proct->revstddiag->active = true;
            opal_event_add(&(proct->revstddiag->ev), 0);
        }
        return ORTE_SUCCESS;
    }

    /* if we are pushing stdin, this is happening only during launch - setup
     * a target for this destination if it is going somewhere other than me
     */
    if (ORTE_VPID_WILDCARD == dst_name->vpid) {
        /* if wildcard, define a sink with that info so it gets sent out */
        ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                             stdin_write_handler,
                             &mca_iof_hnp_component.sinks);
    } else {
        /* no - lookup the proc's daemon and set that into sink */
        if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            return ORTE_ERR_BAD_PARAM;
        }
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, dst_name->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if it is me, then don't set this up - we'll get it on the pull */
        if (ORTE_PROC_MY_NAME->vpid != proc->node->daemon->name.vpid) {
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, -1, ORTE_IOF_STDIN,
                                 stdin_write_handler,
                                 &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
            sink->daemon.vpid = proc->node->daemon->name.vpid;
            ORTE_EPOCH_SET(sink->daemon.epoch,orte_ess.proc_get_epoch(&sink->daemon));
        }
    }
    
    /* now setup the read - but check to only do this once */
    if (NULL == mca_iof_hnp_component.stdinev) {
        /* Since we are the HNP, we don't want to set nonblocking on our
         * stdio stream.  If we do so, we set the file descriptor to
         * non-blocking for everyone that has that file descriptor, which
         * includes everyone else in our shell pipeline chain.  (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils). 
         */
        if (0 != fd) {
            if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
                opal_output(orte_iof_base.iof_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                            __FILE__, __LINE__, errno);
            } else {
                flags |= O_NONBLOCK;
                fcntl(fd, F_SETFL, flags);
            }            
        }
        if (isatty(fd)) {
            /* We should avoid trying to read from stdin if we
             * have a terminal, but are backgrounded.  Catch the
             * signals that are commonly used when we switch
             * between being backgrounded and not.  If the
             * filedescriptor is not a tty, don't worry about it
             * and always stay connected.
             */
            opal_event_signal_set(opal_event_base, &mca_iof_hnp_component.stdinsig,
                                  SIGCONT, orte_iof_hnp_stdin_cb,
                                  NULL);
            
            /* setup a read event to read stdin, but don't activate it yet. The
             * dst_name indicates who should receive the stdin. If that recipient
             * doesn't do a corresponding pull, however, then the stdin will
             * be dropped upon receipt at the local daemon
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_hnp_read_local_handler, false);
            
            /* check to see if we want the stdin read event to be
             * active - we will always at least define the event,
             * but may delay its activation
             */
            if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_hnp_stdin_check(fd)) {
                mca_iof_hnp_component.stdinev->active = true;
                if (OPAL_SUCCESS != (rc = opal_event_add(&(mca_iof_hnp_component.stdinev->ev), 0))) {
                    ORTE_ERROR_LOG(rc);
                }
            }
        } else {
            /* if we are not looking at a tty, just setup a read event
             * and activate it
             */
            ORTE_IOF_READ_EVENT(&mca_iof_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_hnp_read_local_handler, true);
        }
    }
    return ORTE_SUCCESS;
}
Beispiel #6
0
void orte_iof_hnp_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer, orte_rml_tag_t tag,
                       void* cbdata)
{
    orte_process_name_t origin, requestor;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    orte_iof_tag_t stream;
    int32_t count, numbytes;
    orte_iof_sink_t *sink;
    opal_list_item_t *item, *next;
    int rc;
    bool exclusive;

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s received IOF from proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* unpack the stream first as this may be flow control info */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    if (ORTE_IOF_XON & stream) {
        /* re-start the stdin read event */
        if (NULL != mca_iof_hnp_component.stdinev &&
            !orte_job_term_ordered &&
            !mca_iof_hnp_component.stdinev->active) {
            mca_iof_hnp_component.stdinev->active = true;
            opal_event_add(mca_iof_hnp_component.stdinev->ev, 0);
        }
        goto CLEAN_RETURN;
    } else if (ORTE_IOF_XOFF & stream) {
        /* stop the stdin read event */
        if (NULL != mca_iof_hnp_component.stdinev &&
            !mca_iof_hnp_component.stdinev->active) {
            opal_event_del(mca_iof_hnp_component.stdinev->ev);
            mca_iof_hnp_component.stdinev->active = false;
        }
        goto CLEAN_RETURN;
    }

    /* get name of the process whose io we are discussing */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s received IOF cmd from sender %s for source %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&requestor),
                         ORTE_NAME_PRINT(&origin)));

    /* check to see if a tool has requested something */
    if (ORTE_IOF_PULL & stream) {
        /* get name of the process wishing to be the sink */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &requestor, &count, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                             "%s received pull cmd from remote tool %s for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&requestor),
                             ORTE_NAME_PRINT(&origin)));

        if (ORTE_IOF_EXCLUSIVE & stream) {
            exclusive = true;
        } else {
            exclusive = false;
        }
        /* a tool is requesting that we send it a copy of the specified stream(s)
         * from the specified process(es), so create a sink for it
         */
        if (ORTE_IOF_STDOUT & stream) {
            ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDOUT,
                                 NULL, &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = requestor.jobid;
            sink->daemon.vpid = requestor.vpid;
            sink->exclusive = exclusive;
        }
        if (ORTE_IOF_STDERR & stream) {
            ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR,
                                 NULL, &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = requestor.jobid;
            sink->daemon.vpid = requestor.vpid;
            sink->exclusive = exclusive;
        }
        if (ORTE_IOF_STDDIAG & stream) {
            ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG,
                                 NULL, &mca_iof_hnp_component.sinks);
            sink->daemon.jobid = requestor.jobid;
            sink->daemon.vpid = requestor.vpid;
            sink->exclusive = exclusive;
        }
        goto CLEAN_RETURN;
    }

    if (ORTE_IOF_CLOSE & stream) {
        OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                             "%s received close cmd from remote tool %s for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender),
                             ORTE_NAME_PRINT(&origin)));
        /* a tool is requesting that we no longer forward a copy of the
         * specified stream(s) from the specified process(es) - remove the sink
         */
        item = opal_list_get_first(&mca_iof_hnp_component.sinks);
        while (item != opal_list_get_end(&mca_iof_hnp_component.sinks)) {
            next = opal_list_get_next(item);
            sink = (orte_iof_sink_t*)item;
            /* if the target isn't set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
                continue;
            }
            /* if this sink is the designated one, then remove it from list */
            if ((stream & sink->tag) &&
                sink->name.jobid == origin.jobid &&
                (ORTE_VPID_WILDCARD == sink->name.vpid ||
                 ORTE_VPID_WILDCARD == origin.vpid ||
                 sink->name.vpid == origin.vpid)) {
                /* send an ack message to the requestor - this ensures that the RML has
                 * completed sending anything to that requestor before it exits
                 */
                orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &origin, ORTE_IOF_CLOSE, NULL, 0);
                opal_list_remove_item(&mca_iof_hnp_component.sinks, item);
                OBJ_RELEASE(item);
            }
            item = next;
        }
        goto CLEAN_RETURN;
    }

    /* this must have come from a daemon forwarding output - unpack the data */
    numbytes=ORTE_IOF_BASE_MSG_MAX;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }
    /* numbytes will contain the actual #bytes that were sent */

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s unpacked %d bytes from remote proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         ORTE_NAME_PRINT(&origin)));

    /* cycle through the endpoints to see if someone else wants a copy */
    exclusive = false;
    for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
         item != opal_list_get_end(&mca_iof_hnp_component.sinks);
         item = opal_list_get_next(item)) {
        sink = (orte_iof_sink_t*)item;
        /* if the target isn't set, then this sink is for another purpose - ignore it */
        if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
            continue;
        }
        if ((stream & sink->tag) &&
            sink->name.jobid == origin.jobid &&
            (ORTE_VPID_WILDCARD == sink->name.vpid ||
             ORTE_VPID_WILDCARD == origin.vpid ||
             sink->name.vpid == origin.vpid)) {
            /* send the data to the tool */
            orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &origin, stream, data, numbytes);
            if (sink->exclusive) {
                exclusive = true;
            }
        }
    }

    /* output this to our local output unless one of the sinks was exclusive */
    if (!exclusive) {
        if (ORTE_IOF_STDOUT & stream || orte_xml_output) {
            orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev);
        } else {
            orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev);
        }
    }

 CLEAN_RETURN:
    return;
}
Beispiel #7
0
static int mrhnp_pull(const orte_process_name_t* dst_name,
                    orte_iof_tag_t src_tag,
                    int fd)
{
    orte_iof_sink_t *sink;
    int flags, j;
    orte_iof_proc_t *ptr, *proct;
    opal_list_item_t *item;
    orte_job_t *jdata;
    orte_iof_job_t *jptr;
    bool found;

    /* this is a local call - only stdin is supported */
    if (ORTE_IOF_STDIN != src_tag) {
        return ORTE_ERR_NOT_SUPPORTED;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:mrhnp pulling fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));
    
    /* get the job object for this proc and check to see if it
     * is a mapper - if so, add it to the jobs that receive
     * our stdin
     */
    jdata = orte_get_job_data_object(dst_name->jobid);
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_MAPPER, NULL, OPAL_BOOL)) {
        /* see if we already have it */
        found = false;
        for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
            if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
                continue;
            }
            if (jptr->jdata->jobid == jdata->jobid) {
                found = true;
                break;
            }
        }
        if (!found) {
            jptr = OBJ_NEW(orte_iof_job_t);
            OBJ_RETAIN(jdata);
            jptr->jdata = jdata;
            opal_bitmap_init(&jptr->xoff, jdata->num_procs);
            opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr);
        }
    }

    /* set the file descriptor to non-blocking - do this before we setup
     * the sink in case it fires right away
     */
    if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
        opal_output(orte_iof_base_framework.framework_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n", 
                    __FILE__, __LINE__, errno);
    } else {
        flags |= O_NONBLOCK;
        fcntl(fd, F_SETFL, flags);
    }
    
    ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
                         stdin_write_handler, NULL);
    sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;

    /* find the proct for this proc */
    proct = NULL;
    for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
         item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
         item = opal_list_get_next(item)) {
        ptr = (orte_iof_proc_t*)item;
        if (ptr->name.jobid == dst_name->jobid &&
            ptr->name.vpid == dst_name->vpid) {
            proct = ptr;
            break;
        }
    }
    if (NULL == proct) {
        /* we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super);
    }
    proct->sink = sink;

    return ORTE_SUCCESS;
}
Beispiel #8
0
int orte_iof_base_setup_output_files(const orte_process_name_t* dst_name,
                                     orte_job_t *jobdat,
                                     orte_iof_proc_t *proct)
{
    int rc;
    char *dirname, *outdir, *outfile;
    int np, numdigs, fdout, i;
    char *p, **s;
    bool usejobid = true;

    /* see if we are to output to a file */
    dirname = NULL;
    if (orte_get_attribute(&jobdat->attributes, ORTE_JOB_OUTPUT_TO_FILE, (void**)&dirname, OPAL_STRING) &&
        NULL != dirname) {
        np = jobdat->num_procs / 10;
        /* determine the number of digits required for max vpid */
        numdigs = 1;
        while (np > 0) {
            numdigs++;
            np = np / 10;
        }
        /* check for a conditional in the directory name */
        if (NULL != (p = strchr(dirname, ':'))) {
            *p = '\0';
            ++p;
            /* could me more than one directive */
            s = opal_argv_split(p, ',');
            for (i=0; NULL != s[i]; i++) {
                if (0 == strcasecmp(s[i], "nojobid")) {
                    usejobid = false;
                } else if (0 == strcasecmp(s[i], "nocopy")) {
                    proct->copy = false;
                }
            }
        }

        /* construct the directory where the output files will go */
        if (usejobid) {
            asprintf(&outdir, "%s/%d/rank.%0*lu", dirname,
                     (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                     numdigs, (unsigned long)proct->name.vpid);
        } else {
            asprintf(&outdir, "%s/rank.%0*lu", dirname,
                     numdigs, (unsigned long)proct->name.vpid);
        }
        /* ensure the directory exists */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(outdir, S_IRWXU|S_IRGRP|S_IXGRP))) {
            ORTE_ERROR_LOG(rc);
            free(outdir);
            return rc;
        }
        if (NULL != proct->revstdout && NULL == proct->revstdout->sink) {
            /* setup the stdout sink */
            asprintf(&outfile, "%s/stdout", outdir);
            fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
            free(outfile);
            if (fdout < 0) {
                /* couldn't be opened */
                ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                return ORTE_ERR_FILE_OPEN_FAILURE;
            }
            /* define a sink to that file descriptor */
            ORTE_IOF_SINK_DEFINE(&proct->revstdout->sink, dst_name,
                                 fdout, ORTE_IOF_STDOUT,
                                 orte_iof_base_write_handler);
        }

        if (NULL != proct->revstderr && NULL == proct->revstderr->sink) {
            /* if they asked for stderr to be combined with stdout, then we
             * only create one file and tell the IOF to put both streams
             * into it. Otherwise, we create separate files for each stream */
            if (orte_get_attribute(&jobdat->attributes, ORTE_JOB_MERGE_STDERR_STDOUT, NULL, OPAL_BOOL)) {
                /* just use the stdout sink */
                OBJ_RETAIN(proct->revstdout->sink);
                proct->revstdout->sink->tag = ORTE_IOF_STDMERGE;  // show that it is merged
                proct->revstderr->sink = proct->revstdout->sink;
            } else {
                asprintf(&outfile, "%s/stderr", outdir);
                fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
                free(outfile);
                if (fdout < 0) {
                    /* couldn't be opened */
                    ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                    return ORTE_ERR_FILE_OPEN_FAILURE;
                }
                /* define a sink to that file descriptor */
                ORTE_IOF_SINK_DEFINE(&proct->revstderr->sink, dst_name,
                                     fdout, ORTE_IOF_STDERR,
                                     orte_iof_base_write_handler);
            }
        }
#if OPAL_PMIX_V1
        if (NULL != proct->revstddiag && NULL == proct->revstddiag->sink) {
            /* always tie the sink for stddiag to stderr */
            OBJ_RETAIN(proct->revstderr->sink);
            proct->revstddiag->sink = proct->revstderr->sink;
        }
#endif
    }

    return ORTE_SUCCESS;
}