Beispiel #1
0
int
orte_session_dir_finalize(orte_process_name_t *proc)
{
    int rc;
    char *tmp;
    char *job, *job_session_dir, *vpid, *proc_session_dir;

    /* need to setup the top_session_dir with the prefix */
    tmp = opal_os_path(false,
                       orte_process_info.tmpdir_base,
                       orte_process_info.top_session_dir, NULL);
    
    /* define the proc and job session directories for this process */
    if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&job, proc))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&vpid, proc))) {
        ORTE_ERROR_LOG(rc);
        free(tmp);
        free(job);
        return rc;
    }
    job_session_dir = opal_os_path( false, orte_process_info.universe_session_dir,
                                    job, NULL );
    if( NULL == job_session_dir ) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(tmp);
        free(job);
        free(vpid);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    proc_session_dir = opal_os_path( false, job_session_dir, vpid, NULL );
    if( NULL == proc_session_dir ) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(tmp);
        free(job);
        free(vpid);
        free(job_session_dir);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    
    opal_os_dirpath_destroy(proc_session_dir,
                            false, orte_dir_check_file);
    opal_os_dirpath_destroy(job_session_dir,
                            false, orte_dir_check_file);
    opal_os_dirpath_destroy(orte_process_info.universe_session_dir,
                            false, orte_dir_check_file);
    opal_os_dirpath_destroy(tmp,
                            false, orte_dir_check_file);

    if (opal_os_dirpath_is_empty(proc_session_dir)) {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: found proc session dir empty - deleting");
    	}
    	rmdir(proc_session_dir);
    } else {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: proc session dir not empty - leaving");
    	}
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(job_session_dir)) {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");
    	}
    	rmdir(job_session_dir);
    } else {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: job session dir not empty - leaving");
    	}
        goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(orte_process_info.universe_session_dir)) {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: found univ session dir empty - deleting");
    	}
    	rmdir(orte_process_info.universe_session_dir);
    } else {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: univ session dir not empty - leaving");
    	}
    	goto CLEANUP;
    }

    if (opal_os_dirpath_is_empty(tmp)) {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: found top session dir empty - deleting");
    	}
    	rmdir(tmp);
    } else {
    	if (orte_debug_flag) {
    	    opal_output(0, "sess_dir_finalize: top session dir not empty - leaving");
    	}
    }

CLEANUP:
    free(tmp);
    free(job);
    free(vpid);
    free(job_session_dir);
    free(proc_session_dir);
    return ORTE_SUCCESS;
}
Beispiel #2
0
static int mca_oob_ud_send_self (orte_rml_send_t *msg)
{
    unsigned int srco, dsto;
    mca_oob_ud_req_t *req;
    int srci, dsti;
    int rc, size;

    MCA_OOB_UD_IOV_SIZE(msg, size);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s mca_oob_ud_send_self: sending %d bytes to myself",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size);

    rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, msg->tag, &req, (msg->iov != NULL) ? true : false);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    req->req_rem_data_len = size;
    req->req_is_eager     = true;

    rc = mca_oob_ud_recv_alloc (req);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
            free (req->req_data.iov.uiov);
        }
        OBJ_RELEASE(req);
        return rc;
    }

    srci = dsti = 0;
    srco = dsto = 0;

    if (msg->iov != NULL) {
        do {
            req->req_data_type = MCA_OOB_UD_REQ_IOV;
            size_t copy = min(msg->iov[srci].iov_len - srco,
                              req->req_data.iov.uiov[dsti].iov_len - dsto);

            memmove ((unsigned char *) req->req_data.iov.uiov[dsti].iov_base + dsto,
                     (unsigned char *) msg->iov[srci].iov_base + srco, copy);

            srco += copy;
            if (srco == msg->iov[srci].iov_len) {
                srci++;
                srco = 0;
            }

            dsto += copy;
            if (dsto == req->req_data.iov.uiov[dsti].iov_len) {
                dsti++;
                dsto = 0;
            }
        } while (srci < req->req_data.iov.count && dsti < msg->count);
    } else {
        req->req_data_type = MCA_OOB_UD_REQ_BUF;

        opal_buffer_t *buffer;
        buffer = OBJ_NEW(opal_buffer_t);

        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            return rc;
        }
        if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, &req->req_data.buf.size)))
        {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            free(req->req_data.buf.p);
            return rc;
        }
        OBJ_RELEASE(buffer);
    }

    req->state = MCA_OOB_UD_REQ_COMPLETE;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s mca_oob_ud_send_self: complete. calling callbacks",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* queue up recv callback */
    mca_oob_ud_event_queue_completed (req);

    req->rml_msg->status = ORTE_SUCCESS;

    return size;
}
Beispiel #3
0
int orte_sstore_base_get_all_snapshots(opal_list_t *all_snapshots, char *basedir)
{
#ifndef HAVE_DIRENT_H
    return ORTE_ERR_NOT_SUPPORTED;
#else
    int ret, exit_status = ORTE_SUCCESS;
    char *loc_basedir = NULL;
    char * tmp_str = NULL, * metadata_file = NULL;
    DIR *dirp = NULL;
    struct dirent *dir_entp = NULL;
    struct stat file_status;
    orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;

    /* Sanity check */
    if( NULL == all_snapshots || 
        (NULL == orte_sstore_base_global_snapshot_dir && NULL == basedir)) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if( NULL == basedir ) {
        loc_basedir = strdup(orte_sstore_base_global_snapshot_dir);
    } else {
        loc_basedir = strdup(basedir);
    }

    /*
     * Get all subdirectories under the base directory
     */
    dirp = opendir(loc_basedir);
    while( NULL != (dir_entp = readdir(dirp))) {
        /* Skip "." and ".." if they are in the list */
        if( 0 == strncmp("..", dir_entp->d_name, strlen("..") ) ||
            0 == strncmp(".",  dir_entp->d_name, strlen(".")  ) ) {
            continue;
        }

        /* Add the full path */
        asprintf(&tmp_str, "%s/%s", loc_basedir, dir_entp->d_name);
        if(0 != (ret = stat(tmp_str, &file_status) ) ){
            free( tmp_str);
            tmp_str = NULL;
            continue;
        } else {
            /* Is it a directory? */
            if(S_ISDIR(file_status.st_mode) ) {
                asprintf(&metadata_file, "%s/%s",
                         tmp_str,
                         orte_sstore_base_global_metadata_filename);
                if(0 != (ret = stat(metadata_file, &file_status) ) ){
                    free( tmp_str);
                    tmp_str = NULL;
                    free( metadata_file);
                    metadata_file = NULL;
                    continue;
                } else {
                    if(S_ISREG(file_status.st_mode) ) {
                        global_snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);

                        global_snapshot->ss_handle = 1;
                        global_snapshot->basedir = strdup(loc_basedir);
                        asprintf(&(global_snapshot->reference),
                                 "%s",
                                 dir_entp->d_name);
                        asprintf(&(global_snapshot->metadata_filename), 
                                 "%s/%s/%s",
                                 global_snapshot->basedir,
                                 global_snapshot->reference,
                                 orte_sstore_base_global_metadata_filename);

                        opal_list_append(all_snapshots, &(global_snapshot->super));
                    }
                }
                free( metadata_file);
                metadata_file = NULL;
            }
        }

        free( tmp_str);
        tmp_str = NULL;
    }
    
    closedir(dirp);

 cleanup:
    if( NULL != loc_basedir ) {
        free(loc_basedir);
        loc_basedir = NULL;
    }

    if( NULL != tmp_str) {
        free( tmp_str);
        tmp_str = NULL;
    }

    return exit_status;
#endif /* HAVE_DIRENT_H */
}
/*
 * PROC
 */
int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
                        int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, n, count, k;
    orte_attribute_t *kv;;
    orte_proc_t **procs;
    
    /* unpack into array of orte_proc_t objects */
    procs = (orte_proc_t**) dest;
    for (i=0; i < *num_vals; i++) {
        
        /* create the orte_proc_t object */
        procs[i] = OBJ_NEW(orte_proc_t);
        if (NULL == procs[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        
        /* unpack the name */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         &(procs[i]->name), &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the node it is on */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(procs[i]->parent)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
       /* unpack the local rank */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(procs[i]->local_rank)), &n, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the node rank */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                          (&(procs[i]->node_rank)), &n, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the state */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(procs[i]->state)), &n, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the app context index */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(procs[i]->app_idx)), &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the attributes */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv,
                                                             &n, ORTE_ATTRIBUTE))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            kv->local = ORTE_ATTR_GLOBAL;  // obviously not a local value
            opal_list_append(&procs[i]->attributes, &kv->super);
        }
    }
    return ORTE_SUCCESS;
}
/*
 * JOB_MAP
 * NOTE: There is no obvious reason to include all the node information when
 * sending a map - hence, we do not pack that field, so don't unpack it here
 */
int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
                       int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, n;
    orte_job_map_t **maps;
    
    /* unpack into array of orte_job_map_t objects */
    maps = (orte_job_map_t**) dest;
    for (i=0; i < *num_vals; i++) {
        
        /* create the orte_rmaps_base_map_t object */
        maps[i] = OBJ_NEW(orte_job_map_t);
        if (NULL == maps[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        
        /* unpack the requested mapper */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->req_mapper), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the policies */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->mapping), &n, ORTE_MAPPING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->ranking), &n, ORTE_RANKING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
#if OPAL_HAVE_HWLOC
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->binding), &n, OPAL_BINDING_POLICY))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
#endif
        /* unpack the ppr */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->ppr), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the display map flag */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->display_map), &n, OPAL_BOOL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* unpack the number of nodes involved in the job */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                                         &(maps[i]->num_nodes), &n, OPAL_UINT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }
    
    return ORTE_SUCCESS;
}
int main(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    char * tmp_env_var = NULL;

    /* init enough of opal to process cmd lines */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    opal_cmd_line_create(cmd_line, ompi_server_cmd_line_opts);
    mca_base_cmd_line_setup(cmd_line);
    if (OPAL_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
                                                   argc, argv))) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        return 1;
    }
    
    /* check for help request */
    if (help) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        str = opal_show_help_string("help-ompi-server.txt", 
                                    "ompiserver:usage", false,
                                    argv[0], args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        return 0;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
    
    /* if debug is set, then set orte_debug_flag so that the data server
     * code will output
     */
    if (debug) {
        putenv("OMPI_MCA_orte_debug=1");
    }
        
    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(debug == false &&
       no_daemonize == false) {
        opal_daemon_init(NULL);
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Disable the checkpoint notification routine for this
     * tool. As we will never need to checkpoint this tool.
     * Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);
    
    /* Select the none component, since we don't actually use a checkpointer */
    tmp_env_var = mca_base_param_env_var("crs");
    opal_setenv(tmp_env_var,
                "none",
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /* Mark as a tool program */
    tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#else
    tmp_env_var = NULL; /* Silence compiler warning */
#endif

    /* don't want session directories */
    orte_create_session_dirs = false;

    /* Perform the standard init, but flag that we are an HNP */
    if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
        fprintf(stderr, "ompi-server: failed to initialize -- aborting\n");
        exit(1);
    }
    
    /* report out our URI, if we were requested to do so, using syntax
     * proposed in an email thread by Jeff Squyres
     */
    if (NULL != report_uri) {
        rml_uri = orte_rml.get_contact_info();
        if (0 == strcmp(report_uri, "-")) {
            /* if '-', then output to stdout */
            printf("%s\n", rml_uri);
        } else if (0 == strcmp(report_uri, "+")) {
            /* if '+', output to stderr */
            fprintf(stderr, "%s\n", rml_uri);
        } else {
            /* treat it as a filename and output into it */
            FILE *fp;
            fp = fopen(report_uri, "w");
            if (NULL == fp) {
                fprintf(stderr, "ompi-server: failed to open designated file %s -- aborting\n", report_uri);
                orte_finalize();
                exit(1);
            }
            fprintf(fp, "%s\n", rml_uri);
            fclose(fp);
        }
        free(rml_uri);
    }
    
    /* setup the data server to listen for commands */
    if (ORTE_SUCCESS != (ret = orte_data_server_init())) {
        fprintf(stderr, "ompi-server: failed to start data server -- aborting\n");
        orte_finalize();
        exit(1);
    }
    
    /* setup to listen for commands sent specifically to me */
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                                  ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
    if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(ret);
        orte_finalize();
        exit(1);
    }

    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. 
     */
    opal_event_set(opal_event_base, &term_handler, SIGTERM, OPAL_EV_SIGNAL,
                   shutdown_callback, NULL);
    opal_event_add(&term_handler, NULL);
    opal_event_set(opal_event_base, &int_handler, SIGINT, OPAL_EV_SIGNAL,
                   shutdown_callback, NULL);
    opal_event_add(&int_handler, NULL);

    /* We actually do *not* want the server to voluntarily yield() the
       processor more than necessary.  The server already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the server, we want the
       OS to wake up the server in a timely fashion (which most OS's
       seem good about doing) and then we want the server to process
       the message as fast as possible.  If the server yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the server to run again
       (particularly if there is no IO event to wake it up).  Hence,
       publish and lookup (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the server that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    if (debug) {
        opal_output(0, "%s ompi-server: up and running!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }

    /* wait to hear we are done */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* should never get here, but if we do... */
    
    /* Finalize and clean up ourselves */
    if (ORTE_SUCCESS != (ret = orte_finalize())) {
        ORTE_ERROR_LOG(ret);
    }
    return ret;
}
/*
 * JOB
 * NOTE: We do not pack all of the job object's fields as many of them have no
 * value in sending them to another location. The only purpose in packing and
 * sending a job object is to communicate the data required to dynamically
 * spawn another job - so we only pack that limited set of required data.
 * Therefore, only unpack what was packed
 */
int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
                       int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, k, n, count;
    orte_job_t **jobs;
    orte_app_idx_t j;
    orte_attribute_t *kv;

    /* unpack into array of orte_job_t objects */
    jobs = (orte_job_t**) dest;
    for (i=0; i < *num_vals; i++) {

        /* create the orte_job_t object */
        jobs[i] = OBJ_NEW(orte_job_t);
        if (NULL == jobs[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* unpack the jobid */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                &(jobs[i]->jobid), &n, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the num apps */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                (&(jobs[i]->num_apps)), &n, ORTE_APP_IDX))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* if there are apps, unpack them */
        if (0 < jobs[i]->num_apps) {
            orte_app_context_t *app;
            for (j=0; j < jobs[i]->num_apps; j++) {
                n = 1;
                if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                               &app, &n, ORTE_APP_CONTEXT))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                opal_pointer_array_add(jobs[i]->apps, app);
            }
        }
        
        /* unpack num procs and offset */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                (&(jobs[i]->num_procs)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                (&(jobs[i]->offset)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* and the procs, if provided */
        if (0 < jobs[i]->num_procs) {
            orte_proc_t *proc;
            for (j=0; j < jobs[i]->num_procs; j++) {
                n = 1;
                if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                               &proc, &n, ORTE_PROC))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                opal_pointer_array_add(jobs[i]->procs, proc);
            }
        }

        /* unpack stdin target */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                            (&(jobs[i]->stdin_target)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the total slots allocated to the job */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(jobs[i]->total_slots_alloc)), &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* if the map is NULL, then we din't pack it as there was
         * nothing to pack. Instead, we packed a flag to indicate whether or not
         * the map is included         */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                            &j, &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (0 < j) {
            /* unpack the map */
            n = 1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                                            (&(jobs[i]->map)), &n, ORTE_JOB_MAP))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
        
        /* no bookmark of oversubscribe_override flags to unpack */
        
        /* unpack the job state */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(jobs[i]->state)), &n, ORTE_JOB_STATE))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the flags */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(jobs[i]->flags)), &n, ORTE_JOB_FLAGS_T))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the attributes */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv,
                                                             &n, ORTE_ATTRIBUTE))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            kv->local = ORTE_ATTR_GLOBAL;  // obviously not a local value
            opal_list_append(&jobs[i]->attributes, &kv->super);
        }
    }

    return ORTE_SUCCESS;
}
static void recv_data(int fd, short args, void *cbdata)
{
    bool found;
    int i, rc;
    orte_node_t *nd, *nd2;
    opal_list_t nds, ndtmp;
    opal_list_item_t *item, *itm;
    char recv_msg[8192];
    int nbytes, idx, sjob;
    char **alloc, *nodelist, *tpn;
    local_jobtracker_t *ptr, *jtrk;
    local_apptracker_t *aptrk;
    orte_app_context_t *app;
    orte_jobid_t jobid;
    orte_job_t *jdata;

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation - data recvd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* read the data from the socket and put it in the
     * nodes field of op
     */
    memset(recv_msg, 0, sizeof(recv_msg));
    nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation msg: %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg);

    /* check if we got something */
    if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) {
        /* show an error here - basically, a "nothing was available"
         * message
         */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true,
                       (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        return;
    }

    /* break the message into its component parts, separated by colons */
    alloc = opal_argv_split(recv_msg, ':');

    /* the first section contains the ORTE jobid for this allocation */
    tpn = strchr(alloc[0], '=');
    orte_util_convert_string_to_jobid(&jobid, tpn+1);
    /* get the corresponding job object */
    jdata = orte_get_job_data_object(jobid);
    jtrk = NULL;
    /* find the associated tracking object */
    for (item = opal_list_get_first(&jobs);
            item != opal_list_get_end(&jobs);
            item = opal_list_get_next(item)) {
        ptr = (local_jobtracker_t*)item;
        if (ptr->jobid == jobid) {
            jtrk = ptr;
            break;
        }
    }
    if (NULL == jtrk) {
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER");
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }

    /* stop the timeout event */
    opal_event_del(&jtrk->timeout_ev);

    /* cycle across all the remaining parts - each is the allocation for
     * an app in this job
     */
    OBJ_CONSTRUCT(&nds, opal_list_t);
    OBJ_CONSTRUCT(&ndtmp, opal_list_t);
    idx = -1;
    sjob = -1;
    nodelist = NULL;
    for (i=1; NULL != alloc[i]; i++) {
        if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            return;
        }
        if (idx < 0 || NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            return;
        }
        /* track the Slurm jobid */
        if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
            aptrk = OBJ_NEW(local_apptracker_t);
            opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
        }
        aptrk->sjob = sjob;
        /* release the current dash_host as that contained the *desired* allocation */
        opal_argv_free(app->dash_host);
        app->dash_host = NULL;
        /* since the nodelist/tpn may contain regular expressions, parse them */
        if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
            ORTE_ERROR_LOG(rc);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            return;
        }
        /* transfer the discovered nodes to our node list, and construct
         * the new dash_host entry to match what was allocated
         */
        while (NULL != (item = opal_list_remove_first(&ndtmp))) {
            nd = (orte_node_t*)item;
            opal_argv_append_nosize(&app->dash_host, nd->name);
            /* check for duplicates */
            found = false;
            for (itm = opal_list_get_first(&nds);
                    itm != opal_list_get_end(&nds);
                    itm = opal_list_get_next(itm)) {
                nd2 = (orte_node_t*)itm;
                if (0 == strcmp(nd->name, nd2->name)) {
                    found = true;
                    nd2->slots += nd->slots;
                    OBJ_RELEASE(item);
                    break;
                }
            }
            if (!found) {
                /* append the new node to our list */
                opal_list_append(&nds, item);
            }
        }
        /* cleanup */
        free(nodelist);
        free(tpn);
    }
    /* cleanup */
    opal_argv_free(alloc);
    OBJ_DESTRUCT(&ndtmp);

    if (opal_list_is_empty(&nds)) {
        /* if we get here, then we were able to contact slurm,
         * which means we are in an actively managed cluster.
         * However, slurm indicated that nothing is currently
         * available that meets our requirements. This is a fatal
         * situation - we do NOT have the option of running on
         * user-specified hosts as the cluster is managed.
         */
        OBJ_DESTRUCT(&nds);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    /* store the found nodes */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nds);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }
    OBJ_DESTRUCT(&nds);

    /* default to no-oversubscribe-allowed for managed systems */
    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
    }
    /* flag that the allocation is managed */
    orte_managed_allocation = true;
    /* move the job along */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
    /* all done */
    return;
}
/* we cannot use the RML to communicate with SLURM as it doesn't
 * understand our internal protocol, so we have to do a bare-bones
 * exchange based on sockets
 */
static int dyn_allocate(orte_job_t *jdata)
{
    char *cmd_str, **cmd=NULL, *tmp, *jstring;
    char *node_list;
    orte_app_context_t *app;
    int i;
    struct timeval tv;
    local_jobtracker_t *jtrk;

    if (NULL == mca_ras_slurm_component.config_file) {
        opal_output(0, "Cannot perform dynamic allocation as no Slurm configuration file provided");
        return ORTE_ERR_NOT_FOUND;
    }

    /* track this request */
    jtrk = OBJ_NEW(local_jobtracker_t);
    jtrk->jobid = jdata->jobid;
    opal_list_append(&jobs, &jtrk->super);

    /* construct the command - note that the jdata structure contains
     * a field for the minimum number of nodes required for the job.
     * The node list can be constructed from the union of all the nodes
     * contained in the dash_host field of the app_contexts. So you'll
     * need to do a little work to build the command. We don't currently
     * have a field in the jdata structure for "mandatory" vs "optional"
     * allocations, so we'll have to add that someday. Likewise, you may
     * want to provide a param to adjust the timeout value
     */
    /* construct the cmd string */
    opal_argv_append_nosize(&cmd, "allocate");
    /* add the jobid */
    orte_util_convert_jobid_to_string(&jstring, jdata->jobid);
    asprintf(&tmp, "jobid=%s", jstring);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);
    free(jstring);
    /* if we want the allocation for all apps in one shot,
     * then tell slurm
     *
     * RHC: we don't currently have the ability to handle
     * rolling allocations in the rest of the code base
     */
#if 0
    if (!mca_ras_slurm_component.rolling_alloc) {
        opal_argv_append_nosize(&cmd, "return=all");
    }
#else
    opal_argv_append_nosize(&cmd, "return=all");
#endif

    /* pass the timeout */
    asprintf(&tmp, "timeout=%d", mca_ras_slurm_component.timeout);
    opal_argv_append_nosize(&cmd, tmp);
    free(tmp);

    /* for each app, add its allocation request info */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* add the app id, preceded by a colon separator */
        asprintf(&tmp, ": app=%d", (int)app->idx);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* add the number of process "slots" we need */
        asprintf(&tmp, "np=%d", app->num_procs);
        opal_argv_append_nosize(&cmd, tmp);
        free(tmp);
        /* if we were given a minimum number of nodes, pass it along */
        if (0 < app->min_number_of_nodes) {
            asprintf(&tmp, "N=%ld", (long int)app->min_number_of_nodes);
            opal_argv_append_nosize(&cmd, tmp);
            free(tmp);
        }
        /* add the list of nodes, if one was given, ensuring
         * that each node only appears once
         */
        node_list =  get_node_list(app);
        if (NULL != node_list) {
            asprintf(&tmp, "node_list=%s", node_list);
            opal_argv_append_nosize(&cmd, tmp);
            free(node_list);
            free(tmp);
        }
        /* add the mandatory/optional flag */
        if (app->mandatory) {
            opal_argv_append_nosize(&cmd, "flag=mandatory");
        } else {
            opal_argv_append_nosize(&cmd, "flag=optional");
        }
    }

    /* assemble it into the final cmd to be sent */
    cmd_str = opal_argv_join(cmd, ' ');
    opal_argv_free(cmd);

    /* start a timer - if the response to our request doesn't appear
     * in the defined time, then we will error out as Slurm isn't
     * responding to us
     */
    opal_event_evtimer_set(orte_event_base, &jtrk->timeout_ev, timeout, jtrk);
    tv.tv_sec = mca_ras_slurm_component.timeout * 2;
    tv.tv_usec = 0;
    opal_event_evtimer_add(&jtrk->timeout_ev, &tv);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s slurm:dynalloc cmd_str = %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        cmd_str);

    if (send(socket_fd, cmd_str, strlen(cmd_str)+1, 0) < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
    }
    free(cmd_str);

    /* we cannot wait here for a response as we
     * are already in an event. So return a value
     * that indicates we are waiting for an
     * allocation so the base functions know
     * that they shouldn't progress the job
     */
    return ORTE_ERR_ALLOCATION_PENDING;
}
/**
 * Discover the available resources.
 *
 * In order to fully support slurm, we need to be able to handle
 * node regexp/task_per_node strings such as:
 * foo,bar    5,3
 * foo        5
 * foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16)
 *
 * @param *regexp A node regular expression from SLURM (i.e. SLURM_NODELIST)
 * @param *tasks_per_node A tasks per node expression from SLURM
 *                        (i.e. SLURM_TASKS_PER_NODE)
 * @param *nodelist A list which has already been constucted to return
 *                  the found nodes in
 */
static int orte_ras_slurm_discover(char *regexp, char *tasks_per_node,
                                   opal_list_t* nodelist)
{
    int i, j, len, ret, count, reps, num_nodes;
    char *base, **names = NULL;
    char *begptr, *endptr, *orig;
    int *slots;
    bool found_range = false;
    bool more_to_come = false;

    orig = base = strdup(regexp);
    if (NULL == base) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                         "%s ras:slurm:allocate:discover: checking nodelist: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         regexp));

    do {
        /* Find the base */
        len = strlen(base);
        for (i = 0; i <= len; ++i) {
            if (base[i] == '[') {
                /* we found a range. this gets dealt with below */
                base[i] = '\0';
                found_range = true;
                break;
            }
            if (base[i] == ',') {
                /* we found a singleton node, and there are more to come */
                base[i] = '\0';
                found_range = false;
                more_to_come = true;
                break;
            }
            if (base[i] == '\0') {
                /* we found a singleton node */
                found_range = false;
                more_to_come = false;
                break;
            }
        }
        if(i == 0) {
            /* we found a special character at the beginning of the string */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
                           1, regexp, tasks_per_node, "SLURM_NODELIST");
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            free(orig);
            return ORTE_ERR_BAD_PARAM;
        }

        if (found_range) {
            /* If we found a range, now find the end of the range */
            for (j = i; j < len; ++j) {
                if (base[j] == ']') {
                    base[j] = '\0';
                    break;
                }
            }
            if (j >= len) {
                /* we didn't find the end of the range */
                orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
                               1, regexp, tasks_per_node, "SLURM_NODELIST");
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                free(orig);
                return ORTE_ERR_BAD_PARAM;
            }

            ret = orte_ras_slurm_parse_ranges(base, base + i + 1, &names);
            if(ORTE_SUCCESS != ret) {
                orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value",
                               1, regexp, tasks_per_node, "SLURM_NODELIST");
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            if(base[j + 1] == ',') {
                more_to_come = true;
                base = &base[j + 2];
            } else {
                more_to_come = false;
            }
        } else {
            /* If we didn't find a range, just add the node */

            OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                 "%s ras:slurm:allocate:discover: found node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 base));

            if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(&names, base))) {
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            /* set base equal to the (possible) next base to look at */
            base = &base[i + 1];
        }
    } while(more_to_come);

    free(orig);

    num_nodes = opal_argv_count(names);

    /* Find the number of slots per node */

    slots = malloc(sizeof(int) * num_nodes);
    if (NULL == slots) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    memset(slots, 0, sizeof(int) * num_nodes);

    orig = begptr = strdup(tasks_per_node);
    if (NULL == begptr) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(slots);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    j = 0;
    while (begptr) {
        count = strtol(begptr, &endptr, 10);
        if ((endptr[0] == '(') && (endptr[1] == 'x')) {
            reps = strtol((endptr+2), &endptr, 10);
            if (endptr[0] == ')') {
                endptr++;
            }
        } else {
            reps = 1;
        }

        /**
         * TBP: it seems like it would be an error to have more slot
         * descriptions than nodes. Turns out that this valid, and SLURM will
         * return such a thing. For instance, if I did:
         * srun -A -N 30 -w odin001
         * I would get SLURM_NODELIST=odin001 SLURM_TASKS_PER_NODE=4(x30)
         * That is, I am allocated 30 nodes, but since I only requested
         * one specific node, that's what is in the nodelist.
         * I'm not sure this is what users would expect, but I think it is
         * more of a SLURM issue than a orte issue, since SLURM is OK with it,
         * I'm ok with it
         */
        for (i = 0; i < reps && j < num_nodes; i++) {
            slots[j++] = count;
        }

        if (*endptr == ',') {
            begptr = endptr + 1;
        } else if (*endptr == '\0' || j >= num_nodes) {
            break;
        } else {
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1,
                           regexp, tasks_per_node, "SLURM_TASKS_PER_NODE");
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            free(slots);
            free(orig);
            return ORTE_ERR_BAD_PARAM;
        }
    }

    free(orig);

    /* Convert the argv of node names to a list of node_t's */

    for (i = 0; NULL != names && NULL != names[i]; ++i) {
        orte_node_t *node;

        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate:discover: adding node %s (%d slot%s)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             names[i], slots[i], (1 == slots[i]) ? "" : "s"));

        node = OBJ_NEW(orte_node_t);
        if (NULL == node) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(slots);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        node->name = strdup(names[i]);
        node->state = ORTE_NODE_STATE_UP;
        node->slots_inuse = 0;
        node->slots_max = 0;
        node->slots = slots[i];
        opal_list_append(nodelist, &node->super);
    }
    free(slots);
    opal_argv_free(names);

    /* All done */
    return ret;
}
/*
 * Parse a single range in a set and add the full names of the nodes
 * found to the names argv
 *
 * @param base     The base text of the node name
 * @param *ranges  A pointer to a single range. (i.e. "1-3" or "5")
 * @param ***names An argv array to add the newly discovered nodes to
 */
static int orte_ras_slurm_parse_range(char *base, char *range, char ***names)
{
    char *str, temp1[BUFSIZ];
    size_t i, j, start, end;
    size_t base_len, len, num_len;
    size_t num_str_len;
    bool found;
    int ret;

    len = strlen(range);
    base_len = strlen(base);
    /* Silence compiler warnings; start and end are always assigned
       properly, below */
    start = end = 0;

    /* Look for the beginning of the first number */

    for (found = false, i = 0; i < len; ++i) {
        if (isdigit((int) range[i])) {
            if (!found) {
                start = atoi(range + i);
                found = true;
                break;
            }
        }
    }
    if (!found) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Look for the end of the first number */

    for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
        if (!isdigit((int) range[i])) {
            break;
        }
    }

    /* Was there no range, just a single number? */

    if (i >= len) {
        end = start;
        found = true;
    }

    /* Nope, there was a range.  Look for the beginning of the second
       number */

    else {
        for (; i < len; ++i) {
            if (isdigit((int) range[i])) {
                end = atoi(range + i);
                found = true;
                break;
            }
        }
    }
    if (!found) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* Make strings for all values in the range */

    len = base_len + num_str_len + 32;
    str = malloc(len);
    if (NULL == str) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    strcpy(str, base);
    for (i = start; i <= end; ++i) {
        str[base_len] = '\0';
        snprintf(temp1, BUFSIZ - 1, "%lu", (long) i);

        /* Do we need zero pading? */

        if ((num_len = strlen(temp1)) < num_str_len) {
            for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
                str[j] = '0';
            }
            str[j] = '\0';
        }
        strcat(str, temp1);
        ret = opal_argv_append_nosize(names, str);
        if(ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free(str);
            return ret;
        }
    }
    free(str);

    /* All done */
    return ORTE_SUCCESS;
}
/**
 * Discover available (pre-allocated) nodes.  Allocate the
 * requested number of nodes/process slots to the job.
 *
 */
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
{
    int ret, cpus_per_task;
    char *slurm_node_str, *regexp;
    char *tasks_per_node, *node_tasks;
    char *tmp;
    char *slurm_jobid;

    if (NULL == (slurm_jobid = getenv("SLURM_JOBID"))) {
        /* we are not in a slurm allocation - see if dyn alloc
         * is enabled
         */
        if (!mca_ras_slurm_component.dyn_alloc_enabled) {
            /* nope - nothing we can do */
            opal_output_verbose(2, orte_ras_base_framework.framework_output,
                                "%s ras:slurm: no prior allocation and dynamic alloc disabled",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    } else {
        /* save this value in the global job ident string for
         * later use in any error reporting
         */
        orte_job_ident = strdup(slurm_jobid);
    }

    slurm_node_str = getenv("SLURM_NODELIST");
    if (NULL == slurm_node_str) {
        /* see if dynamic allocation is enabled */
        if (mca_ras_slurm_component.dyn_alloc_enabled) {
            /* attempt to get the allocation - the function
            * dyn_allocate will return as ORTE_ERR_ALLOCATION_PENDING
            * if it succeeds in sending the allocation request
            */
            ret = dyn_allocate(jdata);
            /* return to the above layer in ras/base/ras_base_allocate.c
             * to wait for event (libevent) happening
             */
            return ret;
        }
        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }
    regexp = strdup(slurm_node_str);

    tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
    if (NULL == tasks_per_node) {
        /* try an older variation */
        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
        if (NULL == tasks_per_node) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_TASKS_PER_NODE");
            return ORTE_ERR_NOT_FOUND;
        }
    }
    node_tasks = strdup(tasks_per_node);

    if(NULL == regexp || NULL == node_tasks) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* get the number of CPUs per task that the user provided to slurm */
    tmp = getenv("SLURM_CPUS_PER_TASK");
    if(NULL != tmp) {
        cpus_per_task = atoi(tmp);
        if(0 >= cpus_per_task) {
            opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
                        "Variable was: %s\n", tmp);
            ORTE_ERROR_LOG(ORTE_ERROR);
            return ORTE_ERROR;
        }
    } else {
        cpus_per_task = 1;
    }

    ret = orte_ras_slurm_discover(regexp, node_tasks, nodes);
    free(regexp);
    free(node_tasks);
    if (ORTE_SUCCESS != ret) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate: discover failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ret;
    }
    /* record the number of allocated nodes */
    orte_num_allocated_nodes = opal_list_get_size(nodes);

    /* All done */

    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                         "%s ras:slurm:allocate: success",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}
/* init the module */
static int init(void)
{
    char *slurm_host=NULL;
    uint16_t port=0;
    struct sockaddr_in address;
    int flags;
    struct hostent *h;

    if (mca_ras_slurm_component.dyn_alloc_enabled) {
        if (NULL == mca_ras_slurm_component.config_file) {
            orte_show_help("help-ras-slurm.txt", "dyn-alloc-no-config", true);
            return ORTE_ERR_SILENT;
        }
        /* setup the socket */
        if (ORTE_SUCCESS != read_ip_port(mca_ras_slurm_component.config_file,
                                         &slurm_host, &port) ||
                NULL == slurm_host || 0 == port) {
            return ORTE_ERR_SILENT;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_ras_base_framework.framework_output,
                             "ras:slurm got [ ip = %s, port = %u ] from %s\n",
                             slurm_host, port, mca_ras_slurm_component.config_file));

        /* obtain a socket for our use */
        if ((socket_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* connect to the Slurm dynamic allocation port */
        bzero(&address, sizeof(address));
        address.sin_family = AF_INET;
        if (!opal_net_isaddr(slurm_host)) {
            /* if the ControlMachine was not specified as an IP address,
             * we need to resolve it here
             */
            if (NULL == (h = gethostbyname(slurm_host))) {
                /* could not resolve it */
                orte_show_help("help-ras-slurm.txt", "host-not-resolved",
                               true, slurm_host);
                free(slurm_host);
                return ORTE_ERR_SILENT;
            }
            free(slurm_host);
            slurm_host = strdup(inet_ntoa(*(struct in_addr*)h->h_addr_list[0]));
        }
        address.sin_addr.s_addr = inet_addr(slurm_host);
        address.sin_port =  htons(port);
        if (connect(socket_fd, (struct sockaddr*)&address, sizeof(address)) < 0) {
            orte_show_help("help-ras-slurm.txt", "connection-failed",
                           true, slurm_host, (int)port);
            return ORTE_ERR_SILENT;
        }

        /* set socket up to be non-blocking */
        if ((flags = fcntl(socket_fd, F_GETFL, 0)) < 0) {
            opal_output(0, "ras:slurm:dyn: fcntl(F_GETFL) failed: %s (%d)",
                        strerror(opal_socket_errno), opal_socket_errno);
            return ORTE_ERROR;
        } else {
            flags |= O_NONBLOCK;
            if (fcntl(socket_fd, F_SETFL, flags) < 0) {
                opal_output(0, "ras:slurm:dyn: fcntl(F_SETFL) failed: %s (%d)",
                            strerror(opal_socket_errno), opal_socket_errno);
                return ORTE_ERROR;
            }
        }

        /* setup to recv data */
        opal_event_set(orte_event_base, &recv_ev, socket_fd,
                       OPAL_EV_READ, recv_data, NULL);
        opal_event_add(&recv_ev, 0);

        /* initialize the list of jobs for tracking dynamic allocations */
        OBJ_CONSTRUCT(&jobs, opal_list_t);
    }
    return ORTE_SUCCESS;
}
Beispiel #14
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
int orte_ras_base_open(void)
{
    int value, rc, param;
    orte_data_type_t tmp;
    char *requested;

    /* Debugging / verbose output */

    orte_ras_base.ras_output = opal_output_open(NULL);
    mca_base_param_reg_int_name("ras", "base_verbose", 
                                "Enable debugging for the RAS framework (nonzero = enabled)",
                                false, false, 0, &value);
    if (value != 0) {
        orte_ras_base.ras_output = opal_output_open(NULL);
    } else {
        orte_ras_base.ras_output = -1;
    }

    /* Defaults */

    orte_ras_base.ras_opened_valid = false;
    orte_ras_base.ras_using_proxy = false;
    orte_ras_base.ras_available_valid = false;

    /** register the base system types with the DSS */
    tmp = ORTE_RAS_NODE;
    if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ras_base_pack_node,
                                                     orte_ras_base_unpack_node,
                                                     (orte_dss_copy_fn_t)orte_ras_base_copy_node,
                                                     (orte_dss_compare_fn_t)orte_ras_base_compare_node,
                                                     (orte_dss_size_fn_t)orte_ras_base_size_node,
                                                     (orte_dss_print_fn_t)orte_ras_base_print_node,
                                                     (orte_dss_release_fn_t)orte_ras_base_std_obj_release,
                                                     ORTE_DSS_STRUCTURED,
                                                     "ORTE_RAS_NODE", &tmp))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Some systems do not want any RAS support. In those cases,
        * memory consumption is also an issue. For those systems, we
        * avoid opening the RAS components by checking for a directive
        * to use the "null" component.
        */
    param = mca_base_param_reg_string_name("ras", NULL, NULL,
                                           false, false, NULL, NULL);
    if (ORTE_ERROR == mca_base_param_lookup_string(param, &requested)) {
        return ORTE_ERROR;
    }
    if (NULL != requested && 0 == strcmp(requested, "null")) {
        /* the user has specifically requested that we use the "null"
        * component. In this case, that means we do NOT open any
        * components, and we simply use the default module we have
        * already defined above
        */
        orte_ras_base.ras_opened_valid = false;
        orte_ras = orte_ras_no_op; /* use the no_op module */
        return ORTE_SUCCESS;
    }

    /* check for timing tests */
    param = mca_base_param_reg_int_name("orte", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        orte_ras_base.timing = true;
    } else {
        orte_ras_base.timing = false;
    }
    
    /* Open up all available components */
    if (ORTE_SUCCESS != 
        mca_base_components_open("ras", orte_ras_base.ras_output,
                                 mca_ras_base_static_components, 
                                 &orte_ras_base.ras_opened, true)) {
        return ORTE_ERROR;
    }

    /* if we are not on a HNP, select the proxy 'module' */
    if (!orte_process_info.seed) {
        orte_ras = orte_ras_base_proxy_module;
        /* initialize the module */
        orte_ras_base_proxy_init(&rc);
        orte_ras_base.ras_using_proxy = true;
        return ORTE_SUCCESS;
    }

    /* All done */

    orte_ras_base.ras_opened_valid = true;
    return ORTE_SUCCESS;
}
Beispiel #15
0
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat = NULL;
    orte_odls_child_t *child;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID;
    orte_ns_cmp_bitmask_t mask;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                "errmgr:default_orted:update_state() %s) "
                "------- %s state updated for process %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                ((NULL == proc) ? "App. Process" : 
                 (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
                (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
        ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        return ORTE_SUCCESS;
    }

    /***   UPDATE COMMAND FOR A JOB   ***/
    if (NULL == proc) {
        /* this is an update for an entire job */
        if (ORTE_JOBID_INVALID == job) {
            /* whatever happened, we don't know what job
             * it happened to
             */
            orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error",
                           true, orte_job_state_to_str(jobstate));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* pack the "invalid" jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
            return rc;
        }

        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;

            /* is this the specified job? */
            if (jobdat->jobid == job) {
                break;
            }
        }
        if (NULL == jobdat) {
            return ORTE_ERR_NOT_FOUND;
        }

        switch (jobstate) {
        case ORTE_JOB_STATE_FAILED_TO_START:
            failed_start(jobdat, exit_code);
            break;
        case ORTE_JOB_STATE_RUNNING:
            /* update all local child states */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
            break;
        case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
            /* update all procs in job */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            /* order all local procs for this job to be killed */
            killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
        case ORTE_JOB_STATE_COMM_FAILED:
            /* kill all local procs */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* tell the caller we can't recover */
            return ORTE_ERR_UNRECOVERABLE;
            break;
        case ORTE_JOB_STATE_HEARTBEAT_FAILED:
            /* let the HNP handle this */
            return ORTE_SUCCESS;
            break;

        default:
            break;
        }
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the job info */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            return ORTE_SUCCESS;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            return ORTE_SUCCESS;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die
             */
            orte_quit();
        }
        /* if not, then indicate we can continue */
        return ORTE_SUCCESS;
    }

    /* lookup the local jobdat for this job */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;

        /* is this the specified job? */
        if (jobdat->jobid == proc->jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* must already be complete */
        return ORTE_SUCCESS;
    }

    /* if there are no local procs for this job, we can
     * ignore this call
     */
    if (0 == jobdat->num_local_procs) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s pid %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid));
 
    /***  UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                    /* Decrement the number of local procs */
                    jobdat->num_local_procs--;
                    /* kill this proc */
                    killprocs(proc->jobid, proc->vpid);
                }
                return ORTE_SUCCESS;
            }
        }
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away
         */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                }
                /* now pack the child's info */
                if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* remove the child from our local list as it is no longer alive */
                opal_list_remove_item(&orte_local_children, &child->super);
                /* Decrement the number of local procs */
                jobdat->num_local_procs--;

                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                     "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(child->name),
                                     jobdat->num_local_procs));
                
                /* release the child object */
                OBJ_RELEASE(child);
                /* done with loop */
                break;
            }
        }

        /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

 REPORT_STATE:
    /* find this proc in the local children so we can update its state */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
            if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                child->state = state;
                if (0 < pid) {
                    child->pid = pid;
                }
                child->exit_code = exit_code;
            }
            /* done with loop */
            break;
        }
    }

    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack all the local child vpids and epochs */
            for (item = opal_list_get_first(&orte_local_children);
                 item != opal_list_get_end(&orte_local_children);
                 item = opal_list_get_next(item)) {
                child = (orte_odls_child_t*)item;
                if (child->name->jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto FINAL_CLEANUP;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&alert);
                return rc;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }        
        return rc;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;

            /* is this the specified job? */
            if (jobdat->jobid == proc->jobid) {
                break;
            }
        }
        if (NULL == jobdat) {
            /* race condition - may not have been formed yet */
            return ORTE_SUCCESS;
        }

        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }

FINAL_CLEANUP:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobdat->jobid)));
        
        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = next) {
            child = (orte_odls_child_t*)item;
            next = opal_list_get_next(item);

            if (jobdat->jobid == child->name->jobid) {
                opal_list_remove_item(&orte_local_children, &child->super);
                OBJ_RELEASE(child);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jobdat->jobid);

        /* remove this job from our local job data since it is complete */
        opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
        OBJ_RELEASE(jobdat);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }

        /* indicate that the job is complete */
        return rc;
    }
    return ORTE_SUCCESS;
}
int orte_list_local_hnps(opal_list_t *hnps, bool connect)
{
    int ret;
    DIR *cur_dirp = NULL;
    struct dirent * dir_entry;
    char *contact_filename = NULL;
    orte_hnp_contact_t *hnp;
    char *headdir;
    
    /*
     * Check to make sure we have access to the top-level directory
     */
    headdir = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, NULL);
    
    if( ORTE_SUCCESS != (ret = opal_os_dirpath_access(headdir, 0) )) {
        /* it is okay not to find this as there may not be any
         * HNP's present, and we don't write our own session dir
         */
        if (ORTE_ERR_NOT_FOUND != ret) {
            ORTE_ERROR_LOG(ret);
        }
        goto cleanup;
    }
    
    /*
     * Open up the base directory so we can get a listing
     */
    if( NULL == (cur_dirp = opendir(headdir)) ) {
        goto cleanup;
    }
    /*
     * For each directory
     */
    while( NULL != (dir_entry = readdir(cur_dirp)) ) {
        
        /*
         * Skip the obvious
         */
        if( 0 == strncmp(dir_entry->d_name, ".", strlen(".")) ||
            0 == strncmp(dir_entry->d_name, "..", strlen("..")) ) {
            continue;
        }
        
        /*
         * See if a contact file exists in this directory and read it
         */
        contact_filename = opal_os_path( false, headdir,
                                         dir_entry->d_name, "contact.txt", NULL );
        
        hnp = OBJ_NEW(orte_hnp_contact_t);
        if (ORTE_SUCCESS == (ret = orte_read_hnp_contact_file(contact_filename, hnp, connect))) {
            opal_list_append(hnps, &(hnp->super));
        } else {
            OBJ_RELEASE(hnp);
        }
     }
    
cleanup:
    if( NULL != cur_dirp )
        closedir(cur_dirp);
    free(headdir);
    if( NULL != contact_filename)
        free(contact_filename);
    
    return (opal_list_is_empty(hnps) ? ORTE_ERR_NOT_FOUND : ORTE_SUCCESS);
}
Beispiel #17
0
static int pull_handle_info(orte_sstore_central_app_snapshot_info_t *handle_info )
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_sstore_central_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_sstore_base_handle_t loc_id;
    orte_rml_recv_cb_t* rb = NULL;

    buffer = OBJ_NEW(opal_buffer_t);

    /*
     * Ask the daemon to send us the info that we need
     */
    command = ORTE_SSTORE_CENTRAL_PULL;
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(handle_info->id), 1, ORTE_SSTORE_HANDLE))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer,
                                                       ORTE_RML_TAG_SSTORE_INTERNAL,
                                                       orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /* buffer should not be released here; the callback releases it */
    buffer = NULL;

    /*
     * Receive the response
     */
    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(app): pull() from %s -> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));

    rb = OBJ_NEW(orte_rml_recv_cb_t);
    rb->active = true;
    orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON, ORTE_RML_TAG_SSTORE_INTERNAL,
                            0, orte_rml_recv_callback, rb);
    ORTE_WAIT_FOR_COMPLETION(rb->active);

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &command, &count, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &loc_id, &count, ORTE_SSTORE_HANDLE))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }
    if( loc_id != handle_info->id ) {
        ; /* JJH Big problem */
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->seq_num), &count, OPAL_INT))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->global_ref_name), &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->local_location), &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->metadata_filename), &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(app): pull() from %s -> %s (%d, %d, %s)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON),
                         handle_info->id, 
                         handle_info->seq_num,
                         handle_info->global_ref_name
                         ));
 cleanup:
    if (NULL != buffer) {
        OBJ_RELEASE(buffer);
        buffer = NULL;
    }
    if (NULL != rb) {
        OBJ_RELEASE(rb);
        buffer = NULL;
    }

    return exit_status;
}
Beispiel #18
0
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
                                                        orte_proc_t *proc,
                                                        opal_list_t *local_snapshots)
{
    int exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL;
    orte_std_cntr_t i_app;
    int argc = 0;
    orte_app_context_t *cur_app_context = NULL;
    orte_app_context_t *new_app_context = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *cache_location_str = NULL;
    char *ref_location_fmt_str = NULL;
    char *tmp_str = NULL;
    char *global_snapshot_ref = NULL;
    char *global_snapshot_seq = NULL;
    char *sload;

    /*
     * Get the snapshot restart command for this process
     * JJH CLEANUP: Pass in the vpid_snapshot, so we don't have to look it up every time?
     */
    for(item  = opal_list_get_first(local_snapshots);
        item != opal_list_get_end(local_snapshots);
        item  = opal_list_get_next(item) ) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                       &vpid_snapshot->process_name,
                                                       &proc->name) ) {
            break;
        }
        else {
            vpid_snapshot = NULL;
        }
    }

    if( NULL == vpid_snapshot ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_REF,
                         &global_snapshot_ref);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                         &global_snapshot_seq);

    /*
     * Find current app_context
     */
    cur_app_context = NULL;
    for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
        cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps,
                                                                            i_app);
        if( NULL == cur_app_context ) {
            continue;
        }
        if(proc->app_idx == cur_app_context->idx) {
            break;
        }
    }

    if( NULL == cur_app_context ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * if > 1 processes in this app context
     *   Create a new app_context
     *   Copy over attributes
     *   Add it to the job_t data structure
     *   Associate it with this process in the job
     * else
     *   Reuse this app_context
     */
    if( cur_app_context->num_procs > 1 ) {

        /* Create a new app_context */
        opal_dss.copy((void**)&new_app_context, cur_app_context, ORTE_APP_CONTEXT);

        /* clear unused attributes */
        new_app_context->idx                    = cur_app_context->idx;
        free(new_app_context->app);
        new_app_context->app                    = NULL;
        new_app_context->num_procs              = 1;
        opal_argv_free(new_app_context->argv);
        new_app_context->argv                   = NULL;

        orte_remove_attribute(&new_app_context->attributes, ORTE_APP_PRELOAD_BIN);

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&sload,
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);
        orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
        free(sload);

        /* Add it to the job_t data structure */
        /*current_global_jobdata->num_apps++; */
        new_app_context->idx = (jobdata->num_apps);
        proc->app_idx = new_app_context->idx;

        opal_pointer_array_add(jobdata->apps, new_app_context);
        ++(jobdata->num_apps);

        /* Remove association with the old app_context */
        --(cur_app_context->num_procs);
    }
    else {
        new_app_context = cur_app_context;

        /* Cleanout old stuff */
        free(new_app_context->app);
        new_app_context->app = NULL;

        opal_argv_free(new_app_context->argv);
        new_app_context->argv = NULL;

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&sload,
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);
        orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
        free(sload);
    }

    /*
     * Update the app_context with the restart informaiton
     */
    new_app_context->app = strdup("opal-restart");
    opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);
    opal_argv_append(&argc, &(new_app_context->argv), "-l");
    opal_argv_append(&argc, &(new_app_context->argv), location_str);
    opal_argv_append(&argc, &(new_app_context->argv), "-m");
    opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);
    opal_argv_append(&argc, &(new_app_context->argv), "-r");
    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }
    asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
    opal_argv_append(&argc, &(new_app_context->argv), tmp_str);

 cleanup:
    if( NULL != tmp_str) {
        free(tmp_str);
        tmp_str = NULL;
    }
    if( NULL != location_str ) {
        free(location_str);
        location_str = NULL;
    }
    if( NULL != cache_location_str ) {
        free(cache_location_str);
        cache_location_str = NULL;
    }
    if( NULL != reference_fmt_str ) {
        free(reference_fmt_str);
        reference_fmt_str = NULL;
    }
    if( NULL != ref_location_fmt_str ) {
        free(ref_location_fmt_str);
        ref_location_fmt_str = NULL;
    }

    return exit_status;
}
Beispiel #19
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *contact_path, *jobfam_dir;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /** setup callbacks for abort signals - from this point
     * forward, we need to abort in a manner that allows us
     * to cleanup. However, we cannot directly use libevent
     * to trap these signals as otherwise we cannot respond
     * to them if we are stuck in an event! So instead use
     * the basic POSIX trap functions to handle the signal,
     * and then let that signal handler do some magic to
     * avoid the hang
     *
     * NOTE: posix traps don't allow us to do anything major
     * in them, so use a pipe tied to a libevent event to
     * reach a "safe" place where the termination event can
     * be created
     */
    pipe(term_pipe);
    /* setup an event to attempt normal termination on signal */
    opal_event_set(orte_event_base, &term_handler, term_pipe[0], OPAL_EV_READ, clean_abort, NULL);
    opal_event_set_priority(&term_handler, ORTE_ERROR_PRI);
    opal_event_add(&term_handler, NULL);

    /* Set both ends of this pipe to be close-on-exec so that no
       children inherit it */
    if (opal_fd_set_cloexec(term_pipe[0]) != OPAL_SUCCESS ||
        opal_fd_set_cloexec(term_pipe[1]) != OPAL_SUCCESS) {
        error = "unable to set the pipe to CLOEXEC";
        goto error;
    }

    /* point the signal trap to a function that will activate that event */
    signal(SIGTERM, abort_signal_callback);
    signal(SIGINT, abort_signal_callback);
    signal(SIGHUP, abort_signal_callback);

    /** setup callbacks for signals we should foward */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback);
    setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback);
    setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif

    /* if we are using xml for output, put an mpirun start tag */
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "<mpirun>\n");
        fflush(orte_xml_fp);
    }

    /* setup the global nidmap/pidmap object */
    orte_nidmap.bytes = NULL;
    orte_nidmap.size = 0;
    orte_pidmap.bytes = NULL;
    orte_pidmap.size = 0;

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
  
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Since we are the HNP, then responsibility for
     * defining the name falls to the PLM component for our
     * respective environment - hence, we have to open the PLM
     * first and select that component.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_select";
        goto error;
    }
    /* if we were spawned by a singleton, our jobid was given to us */
    if (NULL != orte_ess_base_jobid) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_string_to_jobid";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = 0;
    } else {
        if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_set_hnp_name";
            goto error;
        }
    }
    /* Setup the communication infrastructure */
    
    /*
     * OOB Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }

    /*
     * Runtime Messaging Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* init the nidmap - just so we register that verbosity */
    orte_util_nidmap_init(NULL);

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    /* mark that the daemons have reported as we are the
     * only ones in the system right now, and we definitely
     * are running!
     */
    jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
   
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;

    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
#if OPAL_HAVE_HWLOC
    /* add it to the array of known topologies */
    opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology);
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    OBJ_RETAIN(node);  /* keep accounting straight */
    proc->node = node;
    proc->nodename = node->name;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);

    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    node->daemon_launched = true;
    node->state = ORTE_NODE_STATE_UP;
    
    /* if we are to retain aliases, get ours */
    if (orte_retain_aliases) {
        opal_ifgetaliases(&node->alias);
        /* add our own local name to it */
        opal_argv_append_nosize(&node->alias, orte_process_info.nodename);
    }

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;

    /*
     * Routed system
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_db_base_select(true))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }
    /* set our id */
    opal_db.set_id((opal_identifier_t*)ORTE_PROC_MY_NAME);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }

    /*
     * Setup the remaining resource
     * management and errmgr frameworks - application procs
     * and daemons do not open these frameworks as they only use
     * the hnp proxy support in the PLM framework.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ras_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_open";
        goto error;
    }    
    if (ORTE_SUCCESS != (ret = orte_ras_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_find_available";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_open";
        goto error;
    }    
    if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_find_available";
        goto error;
    }
#if OPAL_HAVE_HWLOC
    {
        char *coprocessors, **sns;
        uint32_t h;
        int idx;

        /* if a topology file was given, then the rmaps framework open
         * will have reset our topology. Ensure we always get the right
         * one by setting our node topology afterwards
         */
        node->topology = opal_hwloc_topology;

        /* init the hash table, if necessary */
        if (NULL == orte_coprocessors) {
            orte_coprocessors = OBJ_NEW(opal_hash_table_t);
            opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
        }
        /* detect and add any coprocessors */
        coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
        if (NULL != coprocessors) {
            /* separate the serial numbers of the coprocessors
             * on this host
             */
            sns = opal_argv_split(coprocessors, ',');
            for (idx=0; NULL != sns[idx]; idx++) {
                /* compute the hash */
                OPAL_HASH_STR(sns[idx], h);
                /* mark that this coprocessor is hosted by this node */
                opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&(ORTE_PROC_MY_NAME->vpid));
            }
            opal_argv_free(sns);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
        /* see if I am on a coprocessor */
        coprocessors = opal_hwloc_base_check_on_coprocessor();
        if (NULL != coprocessors) {
            node->serial_number = coprocessors;
            orte_coprocessors_detected = true;
        }
    }
#endif

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* we are an hnp, so update the contact info field for later use */
    orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
    proc->rml_uri = strdup(orte_process_info.my_hnp_uri);

    /* we are also officially a daemon, so better update that field too */
    orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
    
    /* setup the orte_show_help system to recv remote output */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP,
                            ORTE_RML_PERSISTENT, orte_show_help_recv, NULL);

    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        /* take a pass thru the session directory code to fillin the
         * tmpdir names - don't create anything yet
         */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories laying around
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        
        /* Once the session directory location has been established, set
           the opal_output hnp file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* save my contact info in a file for others to find */
        jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
        contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
        free(jobfam_dir);
        
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s writing contact file %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             contact_path));
        
        if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s writing contact file failed with error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_ERROR_NAME(ret)));
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s wrote contact file",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        }
        free(contact_path);
    }

    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }

    /* For HNP, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
    
    /* setup the dfs framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* if a tool has launched us and is requesting event reports,
     * then set its contact info into the comm system
     */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) {
            error = "could not connect to tool";
            goto error;
        }
    }

    /* We actually do *not* want an HNP to voluntarily yield() the
       processor more than necessary.  Orterun already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.
     
       For example: when a message arrives at orterun, we want the
       OS to wake us up in a timely fashion (which most OS's
       seem good about doing) and then we want orterun to process
       the message as fast as possible.  If orterun yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules orterun to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    
    return ORTE_ERR_SILENT;
}
Beispiel #20
0
int
main(int argc, char *argv[])
{
    int ret, exit_status = ORTE_SUCCESS;

    /***************
     * Initialize
     ***************/
    if (ORTE_SUCCESS != (ret = ckpt_init(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*************************************
     * Listing only Checkpoint References
     *************************************/
    if( orte_checkpoint_globals.list_only ) {
        if (ORTE_SUCCESS != (ret = list_all_snapshots())) {
            exit_status = ret;
            goto cleanup;
        }
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /***************************
     * Find the HNP that we want to connect to, if it exists
     ***************************/
    if (ORTE_SUCCESS != (ret = find_hnp())) {
        /* Error printed by called function */
        exit_status = ret;
        goto cleanup;
    }

    /*******************************
     * Checkpoint the requested PID
     *******************************/
    is_checkpoint_finished    = false;
    is_checkpoint_recovered   = false;
    is_checkpoint_established = false;

    if( orte_checkpoint_globals.verbose ) {
        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "orte_checkpoint: Checkpointing...");
        if (0 < orte_checkpoint_globals.pid) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t PID %d",
                                orte_checkpoint_globals.pid);
        } else if (ORTE_JOBID_INVALID != orte_checkpoint_globals.req_hnp){
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Mpirun (%s)",
                                ORTE_JOBID_PRINT(orte_checkpoint_globals.req_hnp));
        }

        opal_output_verbose(10, orte_checkpoint_globals.output,
                            "\t Connected to Mpirun %s",
                            ORTE_NAME_PRINT(&orterun_hnp->name));
         
        if(orte_checkpoint_globals.options->term) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Terminating after checkpoint\n");
        }
        if(orte_checkpoint_globals.options->stop) {
            opal_output_verbose(10, orte_checkpoint_globals.output,
                                "\t Stopping after checkpoint\n");
        }
    }

    if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.options)) ) {
        opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
                       orte_checkpoint_globals.pid, ret);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Wait for the checkpoint to complete
     */
    if(!orte_checkpoint_globals.nowait) {
        while( !is_checkpoint_finished ) {
            opal_progress();
        }
    }

    if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR   == orte_checkpoint_globals.ckpt_status ) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if(!orte_checkpoint_globals.nowait) {
        pretty_print_reference();
    }

 cleanup:
    /***************
     * Cleanup
     ***************/
    if (ORTE_SUCCESS != (ret = ckpt_finalize())) {
        return ret;
    }

    return exit_status;
}
/*
 * NODE
 */
int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
                        int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, n, k, count;
    orte_node_t **nodes;
    uint8_t flag;
    orte_attribute_t *kv;

    /* unpack into array of orte_node_t objects */
    nodes = (orte_node_t**) dest;
    for (i=0; i < *num_vals; i++) {
        
        /* create the node object */
        nodes[i] = OBJ_NEW(orte_node_t);
        if (NULL == nodes[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        
        /* do not unpack the index - meaningless here */
        
        /* unpack the node name */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         &(nodes[i]->name), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* do not unpack the daemon name or launch id */
        
        /* unpack the number of procs on the node */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(nodes[i]->num_procs)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* do not unpack the proc info */
        
        /* unpack whether we are oversubscribed */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&flag), &n, OPAL_UINT8))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (flag) {
            ORTE_FLAG_SET(nodes[i], ORTE_NODE_FLAG_OVERSUBSCRIBED);
        }

        /* unpack the state */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(nodes[i]->state)), &n, ORTE_NODE_STATE))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* unpack the attributes */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv,
                                                             &n, ORTE_ATTRIBUTE))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            kv->local = ORTE_ATTR_GLOBAL;  // obviously not a local value
            opal_list_append(&nodes[i]->attributes, &kv->super);
        }
    }
    return ORTE_SUCCESS;
}
Beispiel #22
0
static int list_all_snapshots(void) {
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_t *all_snapshots = NULL;
    opal_list_item_t* item = NULL;
    orte_sstore_base_global_snapshot_info_t *global_snapshot = NULL;
    int s;

    all_snapshots = OBJ_NEW(opal_list_t);

    if( ORTE_SUCCESS != (ret = orte_sstore_base_get_all_snapshots(all_snapshots, NULL)) ) {
        opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
                    orte_sstore_base_global_snapshot_dir);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * For each reference
     */
    for(item  = opal_list_get_first(all_snapshots);
        item != opal_list_get_end(all_snapshots);
        item  = opal_list_get_next(item) ) {
        global_snapshot = (orte_sstore_base_global_snapshot_info_t*)item;

        /*
         * Get a list of valid sequence numbers
         */
        if( ORTE_SUCCESS != (ret = orte_sstore_base_find_all_seq_nums(global_snapshot,
                                                                      &(global_snapshot->num_seqs),
                                                                      &(global_snapshot->all_seqs)))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        s = 0; /* Silence a compiler warning */
#if OPAL_ENABLE_CRDEBUG == 1
        /* Pretty print the result - C/R Debug version */
        if( orte_checkpoint_globals.enable_crdebug ) {
            for(s = 0; s < global_snapshot->num_seqs; ++s) {
                printf("-s %s %s\n", global_snapshot->all_seqs[s], global_snapshot->reference);
            }
        }
        else
#endif
        {
            /* Pretty print the result */
            printf("Snapshot Ref.: %s\t[",
                   global_snapshot->reference);
            if( 0 >= global_snapshot->num_seqs ) {
                printf("No Valid Checkpoints");
            } else {
                printf("%s",
                       opal_argv_join(global_snapshot->all_seqs, ','));
            }
            printf("]\n");
        }
    }

 cleanup:
    while (NULL != (item = opal_list_remove_first(all_snapshots))) {
        OBJ_RELEASE(item);
    }
    OBJ_RELEASE(all_snapshots);

    return exit_status;
}
/*
 * APP_CONTEXT
 */
int orte_dt_unpack_app_context(opal_buffer_t *buffer, void *dest,
                               int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    orte_app_context_t **app_context;
    int32_t i, max_n=1, count, k;
    orte_attribute_t *kv;

    /* unpack into array of app_context objects */
    app_context = (orte_app_context_t**) dest;
    for (i=0; i < *num_vals; i++) {

        /* create the app_context object */
        app_context[i] = OBJ_NEW(orte_app_context_t);
        if (NULL == app_context[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* get the app index number */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->idx),
                                                         &max_n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the application name */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->app),
                                                         &max_n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* get the number of processes */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->num_procs),
                                                         &max_n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* get the first rank for this app */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &(app_context[i]->first_rank),
                                                         &max_n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* get the number of argv strings that were packed */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &max_n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* if there are argv strings, allocate the required space for the char * pointers */
        if (0 < count) {
            app_context[i]->argv = (char **)malloc((count+1) * sizeof(char*));
            if (NULL == app_context[i]->argv) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            app_context[i]->argv[count] = NULL;

            /* and unpack them */
            max_n = count;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, app_context[i]->argv, &max_n, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* get the number of env strings */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &max_n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* if there are env strings, allocate the required space for the char * pointers */
        if (0 < count) {
            app_context[i]->env = (char **)malloc((count+1) * sizeof(char*));
            if (NULL == app_context[i]->env) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            app_context[i]->env[count] = NULL;

            /* and unpack them */
            max_n = count;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, app_context[i]->env, &max_n, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }

        /* unpack the cwd */
        max_n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &app_context[i]->cwd,
                                                         &max_n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the attributes */
        max_n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &max_n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            max_n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv,
                                                             &max_n, ORTE_ATTRIBUTE))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* obviously, this isn't a local value */
            kv->local = false;
            opal_list_append(&app_context[i]->attributes, &kv->super);
        }
    }

    return ORTE_SUCCESS;
}
void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata)
{
    int rc, cnt;
    orte_process_name_t pname;
    char *hostname;
    orte_vpid_t daemon;
    orte_node_rank_t node_rank;
    orte_local_rank_t local_rank;
    orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata;
    opal_hwloc_locality_t locality;

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s STORING PEER MODEX DATA",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the process name */
    cnt=1;
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &pname, &cnt, ORTE_NAME))) {
        /* unpack and store the hostname */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &hostname, &cnt, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack and store the daemon vpid */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &daemon, &cnt, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_DAEMON_VPID, &daemon, OPAL_UINT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack and store the node rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* unpack the local rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* compute the locality and store in the database */
#if OPAL_HAVE_HWLOC
        {
            char *cpuset;

            /* unpack and store the cpuset - could be NULL */
            cnt = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &cpuset, &cnt, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_CPUSET, cpuset, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                                 "%s store:peer:modex setting proc %s cpuset %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname), cpuset));

            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &pname, ORTE_PROC_MY_NAME)) {
                /* if this data is from myself, then set locality to all */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                     "%s store:peer:modex setting proc %s locale ALL",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pname)));
                locality = OPAL_PROC_ALL_LOCAL;
            } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
                /* this is on a different node, then mark as non-local */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                     "%s store:peer:modex setting proc %s locale NONLOCAL",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pname)));
                locality = OPAL_PROC_NON_LOCAL;
            } else if (NULL == cpuset || NULL == orte_process_info.cpuset) {
                /* one or both of us is not bound, so all we can say is we are on the
                 * same node
                 */
                locality = OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.cpuset,
                                                                 cpuset);
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                     "%s store:peer:modex setting proc %s locale %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pname),
                                     opal_hwloc_base_print_locality(locality)));
            }
        }
#else
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &pname, ORTE_PROC_MY_NAME)) {
            /* if this data is from myself, then set locality to all */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:base:modex setting proc %s locale ALL",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname)));
            locality = OPAL_PROC_ALL_LOCAL;
        } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
            /* this is on a different node, then mark as non-local */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s store:peer:modex setting proc %s locale NONLOCAL",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname)));
            locality = OPAL_PROC_NON_LOCAL;
        } else {
            /* must be on our node */
            locality = OPAL_PROC_ON_NODE;
        }
#endif
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s store:peer:modex: adding modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&pname)));
        
        /* update the modex database */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&pname, rbuf))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }            
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s store:peer:modex: completed modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&pname)));
    }    

 cleanup:
    /* flag the collective as complete */
    modex->active = false;
    /* cleanup the list, but don't release the
     * collective object as it was passed into us
     */
    opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super);
    /* notify that the modex is complete */
    if (NULL != modex->cbfunc) {
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s CALLING MODEX RELEASE",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        modex->cbfunc(NULL, modex->cbdata);
    } else {
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s store:peer:modex NO MODEX RELEASE CBFUNC",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }
}
Beispiel #25
0
int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
{
    mca_oob_ud_msg_op_t *op = (mca_oob_ud_msg_op_t*)cbdata;

    orte_process_name_t hop;
    mca_oob_ud_peer_t *peer;
    mca_oob_ud_port_t *port;
    mca_oob_ud_msg_t  *req_msg;
    mca_oob_ud_req_t  *send_req;
    bool send_eager = false;
    char *pack_ptr;
    int rc, size, i;

    if (OPAL_EQUAL == orte_util_compare_name_fields
        (ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, &op->msg->dst)) {
        return mca_oob_ud_send_self (op->msg);
    }

    /* if we have a route to this peer, then we can reach it */
    hop = orte_routed.get_route(NULL, &op->msg->dst);
    if (ORTE_JOBID_INVALID == hop.jobid ||
        ORTE_VPID_INVALID == hop.vpid) {
        ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
        return ORTE_ERR_UNREACH;
    }

    rc = mca_oob_ud_peer_lookup (&hop, &peer);
    if(ORTE_SUCCESS != rc || NULL == peer) {
        ORTE_ERROR_LOG((NULL == peer) ? ORTE_ERR_UNREACH : rc);
        return (NULL == peer) ? ORTE_ERR_UNREACH : rc;
    }

    opal_output_verbose(2, orte_oob_base_framework.framework_output,
                        "%s oob:ud:send_nb to pear %s via hop %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&op->msg->dst), ORTE_NAME_PRINT(&hop));

    /* NTH: TODO -- get a random port? */
    port = (mca_oob_ud_port_t *) opal_list_get_first (&((mca_oob_ud_device_t *)peer->peer_context)->ports);

    send_req = OBJ_NEW(mca_oob_ud_req_t);
    if (!send_req) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* fill in request */
    send_req->req_target = op->msg->dst;
    send_req->req_origin = op->msg->origin;
    send_req->req_tag    = op->msg->tag;
    send_req->req_seq_num  = op->msg->seq_num;

    if (op->msg->data != NULL) {
        size = op->msg->count;

        send_req->req_data_type = MCA_OOB_UD_REQ_TR;

        send_req->req_data.buf.p = (char *)calloc(size, sizeof(char));
        memcpy(send_req->req_data.buf.p, op->msg->data, op->msg->count);
        send_req->req_data.buf.size = op->msg->count;
    } else {
        MCA_OOB_UD_IOV_SIZE(op->msg, size);

        if (op->msg->iov != NULL) {
            send_req->req_data_type = MCA_OOB_UD_REQ_IOV;
            send_req->req_data.iov.uiov   = op->msg->iov;
            send_req->req_data.iov.count  = op->msg->count;
        } else {
            send_req->req_data_type = MCA_OOB_UD_REQ_BUF;

            opal_buffer_t *buffer;
            buffer = OBJ_NEW(opal_buffer_t);

            if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, op->msg->buffer))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(buffer);
                return rc;
            }

            if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&send_req->req_data.buf.p, &send_req->req_data.buf.size)))
            {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(buffer);
                free(send_req->req_data.buf.p);
                return rc;
            }
            OBJ_RELEASE(buffer);
        }
    }
    send_req->rml_msg = op->msg;
    send_req->req_cbdata = op->msg->cbdata;
    send_req->req_peer   = peer;
    send_req->req_mtu    = port->mtu;
    send_req->req_port   = port;
    send_req->req_rc     = 0;

    send_req->state      = MCA_OOB_UD_REQ_PENDING;
    send_req->type       = MCA_OOB_UD_REQ_SEND;

    OBJ_RETAIN(peer);

    if (size + sizeof (mca_oob_ud_msg_hdr_t) <= (unsigned int)port->mtu) {
        send_eager = true;
    }

    rc = mca_oob_ud_msg_get (port, send_req, &port->listen_qp, peer, false, &req_msg);
    if (ORTE_SUCCESS != rc) {
        OBJ_RELEASE (send_req);
        return rc;
    }

    /* fill in message header */
    req_msg->hdr->msg_type     = MCA_OOB_UD_MSG_REQUEST;
    req_msg->hdr->msg_rem_ctx  = send_req;

    req_msg->hdr->msg_origin   = op->msg->origin;
    req_msg->hdr->msg_target   = op->msg->dst;
    req_msg->hdr->msg_seq_num  = op->msg->seq_num;

    req_msg->hdr->msg_data.req.data_len = size;
    req_msg->hdr->msg_data.req.mtu      = port->mtu;
    req_msg->hdr->msg_data.req.tag      = op->msg->tag;

    if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                             "%s-%s send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p."
                             "count = %d. uiov = %p.\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&op->msg->dst),
                             op->msg->tag, (unsigned long)size,
                             (void *) req_msg,
                             (void *) peer, (void *) send_req,
                              send_req->req_data.iov.count, (void *) send_req->req_data.iov.uiov);
    } else {
        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                             "%s-%s send_nb: tag %d size %lu. msg: %p. peer = %p. req = %p."
                             "buffer = %p.\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&op->msg->dst),
                             op->msg->tag, (unsigned long)size,
                             (void *) req_msg,
                             (void *) peer, (void *) send_req, (void *) send_req->req_data.buf.p);
    }

    if (!send_eager) {
        mca_oob_ud_req_append_to_list (send_req, &mca_oob_ud_component.ud_active_sends);

        /* send request */
        return mca_oob_ud_msg_post_send (req_msg);
    }

    pack_ptr = (char *)(req_msg->hdr + 1);

    if (op->msg->iov != NULL) {
        for (i = 0 ; i < op->msg->count ; ++i) {
            memcpy (pack_ptr, op->msg->iov[i].iov_base, op->msg->iov[i].iov_len);
            pack_ptr += op->msg->iov[i].iov_len;
        }
    } else {
        memcpy(pack_ptr, send_req->req_data.buf.p, send_req->req_data.buf.size);
    }

    send_req->req_list = NULL;

    req_msg->hdr->msg_data.req.data_follows = true;

    req_msg->cbfunc = mca_oob_ud_send_cb;
    req_msg->req    = send_req;

    do {
        /* send request */
        rc = mca_oob_ud_msg_post_send (req_msg);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }
    } while (0);

    return rc;
}
int orte_grpcomm_base_pack_modex_entries(opal_buffer_t *buf)
{
    int rc;
    int32_t num_entries;
    opal_value_t *kv;
    opal_list_t data;
    opal_list_item_t *item, *next;

    /* fetch our data */
    OBJ_CONSTRUCT(&data, opal_list_t);
    if (ORTE_SUCCESS != (rc = opal_db.fetch_multiple((opal_identifier_t*)ORTE_PROC_MY_NAME, NULL, &data))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* count the number of entries we will send, purging the rest */
    num_entries = 0;
    item = opal_list_get_first(&data);
    while (item != opal_list_get_end(&data)) {
        kv = (opal_value_t*)item;
        next = opal_list_get_next(item);
        /* if this is an entry we get from the nidmap, then don't include it here */
        if (0 == strcmp(kv->key, ORTE_DB_HOSTNAME) ||
            0 == strcmp(kv->key, ORTE_DB_DAEMON_VPID) ||
            0 == strcmp(kv->key, ORTE_DB_NODERANK) ||
            0 == strcmp(kv->key, ORTE_DB_LOCALRANK) ||
            0 == strcmp(kv->key, ORTE_DB_CPUSET)) {
            opal_list_remove_item(&data, item);
        } else {
            num_entries++;
        }
        item = next;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:pack_modex: reporting %d entries",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_entries));
    
    /* put the number of entries into the buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &num_entries, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* if there are entries, store them */
    while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&data))) {
        if (ORTE_SUCCESS != (opal_dss.pack(buf, &kv, 1, OPAL_VALUE))) {
            ORTE_ERROR_LOG(rc);
            break;
        }
        OBJ_RELEASE(kv);
    }

 cleanup:
    while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&data))) {
        OBJ_RELEASE(kv);
    }
    OBJ_DESTRUCT(&data);

    return rc;
}
Beispiel #27
0
int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
    int wr_index, wr_count, sge_count, sge_index, iov_index;
    unsigned int iov_left, iov_offset, packet_size;
    const unsigned int mtu = send_req->req_mtu;
    const struct timeval aquire_timeout = {0, 500000};
    mca_oob_ud_msg_t *com_msg;
    int data_len;
    int rc = ORTE_SUCCESS;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:send_try sending to %s, tag = %d, "
                         "req = %p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&send_req->req_peer->peer_name),
                         send_req->req_tag, (void *) send_req);

    do {
        if (NULL == send_req->req_qp) {
            rc = mca_oob_ud_qp_data_aquire (send_req->req_port, &send_req->req_qp);
            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        (void) mca_oob_ud_qp_purge (send_req->req_qp);

        rc = mca_oob_ud_msg_get (send_req->req_port, send_req, send_req->req_qp, send_req->req_peer, false,
                                 &com_msg);
        if (ORTE_SUCCESS != rc) {
            break;
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            if (NULL == send_req->req_data.iov.mr) {
                /* allocate space for memory registers */
                send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count, sizeof (struct ibv_mr *));
                if (NULL == send_req->req_data.iov.mr) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    break;
                }
            }

            rc = mca_oob_ud_register_iov (send_req->req_data.iov.uiov, send_req->req_data.iov.count,
                                          send_req->req_data.iov.mr, send_req->req_port->device->ib_pd,
                                          mtu, &sge_count, &wr_count, &data_len);

            if (ORTE_SUCCESS != rc) {
                break;
            }
        } else {
            data_len = send_req->req_data.buf.size;
            rc = mca_oob_ud_register_buf(send_req->req_data.buf.p, send_req->req_data.buf.size,
                                         &send_req->req_data.buf.mr, send_req->req_port->device->ib_pd,
                                         mtu, &sge_count, &wr_count);

            if (ORTE_SUCCESS != rc) {
                break;
            }
        }

        wr_count = (data_len + mtu - 1) / mtu;

        if (data_len > 0) {
            data_len = data_len + 0;
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try sending %d bytes in %d "
                                 "work requests, %d sges. uiov = %p.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                                 wr_count, sge_count, (void *) send_req->req_data.iov.uiov);
        } else {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try sending %d bytes in %d "
                                 "work requests, %d sges. buf = %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), data_len,
                                 wr_count, sge_count, (void *) send_req->req_data.buf.p);
        }

        if (wr_count && NULL == send_req->req_wr.send) {
            send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr));
            if (NULL == send_req->req_wr.send) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (wr_count && NULL == send_req->req_sge) {
            send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));

            if (NULL == send_req->req_sge) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                ORTE_ERROR_LOG(rc);
                break;
            }
        }

        if (MCA_OOB_UD_REQ_IOV == send_req->req_data_type) {
            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                 "%s oob:ud:send_try posting message using iovec",
                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            iov_left   = send_req->req_data.iov.uiov[0].iov_len;
            iov_offset = 0;
            iov_index  = 0;

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                do {
                    int to_send = min (iov_left, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.iov.uiov[iov_index].iov_base + iov_offset,
                                        to_send, send_req->req_data.iov.mr[iov_index]->lkey);

                    iov_offset  += to_send;
                    iov_left    -= to_send;
                    packet_size += to_send;

                    if (0 == iov_left) {
                        iov_index++;
                        iov_offset = 0;

                        if (iov_index < send_req->req_data.iov.count) {
                            iov_left = send_req->req_data.iov.uiov[iov_index].iov_len;
                        }
                    }
                } while ((packet_size < mtu) && (iov_left > 0));

                mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
                                        send_req->req_sge + sge_first,
                                        sge_index - sge_first, send_req->req_peer);

                /* we don't care about completions for data  */
                send_req->req_wr.send[wr_index].send_flags       = IBV_SEND_SOLICITED;

                /* sequence number */
                send_req->req_wr.send[wr_index].imm_data         = wr_index;
                send_req->req_wr.send[wr_index].wr.ud.remote_qpn = send_req->req_rem_qpn;
                send_req->req_wr.send[wr_index].opcode           = IBV_WR_SEND_WITH_IMM;

                if (wr_index + 1 < wr_count) {
                    send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1;
                }
            }
        } else {//data is in buffer
            unsigned int buffer_offset = 0;
            unsigned int buffer_size = send_req->req_data.buf.size;

            opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:send_try posting message using buffer",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

            for (wr_index = 0, sge_index = 0 ; wr_index < wr_count ; ++wr_index) {
                int sge_first = sge_index;

                packet_size = 0;

                do {
                    int to_send = min (buffer_size, mtu - packet_size);

                    mca_oob_ud_fill_sge(send_req->req_sge + sge_index++,
                                        (char *)send_req->req_data.buf.p + buffer_offset,
                                        to_send, send_req->req_data.buf.mr->lkey);

                    buffer_offset  += to_send;
                    buffer_size    -= to_send;
                    packet_size += to_send;
                } while ((packet_size < mtu) && (buffer_size > 0));

                mca_oob_ud_fill_send_wr(send_req->req_wr.send + wr_index,
                                        send_req->req_sge + sge_first,
                                        sge_index - sge_first, send_req->req_peer);

                /* we don't care about completions for data  */
                send_req->req_wr.send[wr_index].send_flags       = IBV_SEND_SOLICITED;

                /* sequence number */
                send_req->req_wr.send[wr_index].imm_data         = wr_index;
                send_req->req_wr.send[wr_index].wr.ud.remote_qpn = send_req->req_rem_qpn;
                send_req->req_wr.send[wr_index].opcode           = IBV_WR_SEND_WITH_IMM;

                opal_output_verbose(10, orte_oob_base_framework.framework_output,
                                     "%s oob:ud:send_try imm_data = %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), wr_index);

                if (wr_index + 1 < wr_count) {
                    send_req->req_wr.send[wr_index].next = send_req->req_wr.send + wr_index + 1;
                }
            }
        }

        /* send data */
        rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            break;
        }

        opal_output_verbose(10, orte_oob_base_framework.framework_output,
                             "%s oob:ud:send_try posting completion message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

        /* Fill in completion message. This message will go to the peers listen QP but
           must originate from our data qp to ensure that it is sent last. */
        com_msg->hdr->msg_type    = MCA_OOB_UD_MSG_COMPLETE;
        com_msg->hdr->msg_lcl_ctx = send_req->req_rem_ctx;
        com_msg->hdr->msg_rem_ctx = send_req;

        /* send message header */
        rc = mca_oob_ud_msg_post_send (com_msg);

        /* post_send already returned the message */
        com_msg = NULL;
    } while (0);

    if (ORTE_ERR_TEMP_OUT_OF_RESOURCE == rc) {
        /* set timer to retry post */
        mca_oob_ud_req_timer_set (send_req, &aquire_timeout, 1, mca_oob_ud_send_try_to);
        rc = ORTE_SUCCESS;
    }

    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        /* damn */
        return mca_oob_ud_send_complete (send_req, rc);
    }

    send_req->state = MCA_OOB_UD_REQ_ACTIVE;

    return rc;
}
/***************  MODEX SECTION **************/
void orte_grpcomm_base_modex(int fd, short args, void *cbdata)
{
    orte_grpcomm_caddy_t *caddy = (orte_grpcomm_caddy_t*)cbdata;
    orte_grpcomm_collective_t *modex = caddy->op;
    int rc;
    orte_namelist_t *nm;
    opal_list_item_t *item;
    bool found;
    orte_grpcomm_collective_t *cptr;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (0 == opal_list_get_size(&modex->participants)) {
        /* record the collective */
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* put our process name in the buffer so it can be unpacked later */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* add a wildcard name to the participants so the daemon knows
         * the jobid that is involved in this collective
         */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&modex->participants, &nm->super);
        modex->next_cb = orte_grpcomm_base_store_modex;
    } else {
        /* see if the collective is already present - a race condition
         * exists where other participants may have already sent us their
         * contribution. This would place the collective on the global
         * array, but leave it marked as "inactive" until we call
         * modex with the list of participants
         */
        found = false;
        for (item = opal_list_get_first(&orte_grpcomm_base.active_colls);
             item != opal_list_get_end(&orte_grpcomm_base.active_colls);
             item = opal_list_get_next(item)) {
            cptr = (orte_grpcomm_collective_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s CHECKING COLL id %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 cptr->id));
            
            if (modex->id == cptr->id) {
                found = true;
                /* remove the old entry - we will replace it
                 * with the modex one
                 */
                opal_list_remove_item(&orte_grpcomm_base.active_colls, item);
                break;
            }
        }
        if (found) {
            /* since it already exists, the list of
             * targets contains the list of procs
             * that have already sent us their info. Cycle
             * thru the targets and move those entries to
             * the modex object
             */
            while (NULL != (item = opal_list_remove_first(&cptr->targets))) {
                opal_list_append(&modex->targets, item);
            }
            /* copy the previously-saved data across */
            opal_dss.copy_payload(&modex->local_bucket, &cptr->local_bucket);
            /* cleanup */
            OBJ_RELEASE(cptr);
        }
        /* now add the modex to the global list of active collectives */
        modex->next_cb = orte_grpcomm_base_store_peer_modex;
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* this is not amongst our peers, but rather between a select
         * group of processes - e.g., during a connect/accept operation.
         * Thus, this requires we send additional info
         */

        /* pack the collective id */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &modex->id, 1, ORTE_GRPCOMM_COLL_ID_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our name */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our hostname */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.nodename, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    
        /* pack our daemon's vpid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    
        /* pack our node rank */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.my_node_rank, 1, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    
        /* pack our local rank */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.my_local_rank, 1, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    
#if OPAL_HAVE_HWLOC
        /* pack our binding info so other procs can determine our locality */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.cpuset, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
#endif
    }

    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&modex->buffer))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* execute the allgather */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(modex))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: modex posted",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    return;

 cleanup:
    return;
}
Beispiel #29
0
int orte_sstore_base_extract_global_metadata(orte_sstore_base_global_snapshot_info_t *global_snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    FILE *metadata = NULL;
    char * token = NULL;
    char * value = NULL;
    orte_process_name_t proc;
    opal_list_item_t* item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;

    /*
     * Cleanup the structure a bit, so we can refresh it below
     */
    while (NULL != (item = opal_list_remove_first(&global_snapshot->local_snapshots))) {
        OBJ_RELEASE(item);
    }

    if( NULL != global_snapshot->start_time ) {
        free( global_snapshot->start_time );
        global_snapshot->start_time = NULL;
    }

    if( NULL != global_snapshot->end_time ) {
        free( global_snapshot->end_time );
        global_snapshot->end_time = NULL;
    }

    /*
     * Open the metadata file
     */
    if (NULL == (metadata = fopen(global_snapshot->metadata_filename, "r")) ) {
        opal_output(orte_sstore_base_output,
                    "sstore:base:extract_global_metadata() Unable to open the file (%s)\n",
                    global_snapshot->metadata_filename);
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Seek to the sequence number requested
     */
    if( ORTE_SUCCESS != (ret = orte_sstore_base_metadata_seek_to_seq_num(metadata, global_snapshot->seq_num))) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Extract each token and make the records
     */
    do {
        if( ORTE_SUCCESS != orte_sstore_base_metadata_read_next_token(metadata, &token, &value) ) {
            break;
        }

        if(0 == strncmp(token, SSTORE_METADATA_GLOBAL_SNAP_SEQ_STR,  strlen(SSTORE_METADATA_GLOBAL_SNAP_SEQ_STR)) ||
           0 == strncmp(token, SSTORE_METADATA_INTERNAL_MIG_SEQ_STR, strlen(SSTORE_METADATA_INTERNAL_MIG_SEQ_STR)) ) {
            break;
        }

        if( 0 == strncmp(token, SSTORE_METADATA_INTERNAL_PROCESS_STR, strlen(SSTORE_METADATA_INTERNAL_PROCESS_STR)) ) {
            orte_util_convert_string_to_process_name(&proc, value);

            /* Not the first process, so append it to the list */
            if( NULL != vpid_snapshot) {
                opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
            }

            vpid_snapshot = OBJ_NEW(orte_sstore_base_local_snapshot_info_t);
            vpid_snapshot->ss_handle = global_snapshot->ss_handle;

            vpid_snapshot->process_name.jobid  = proc.jobid;
            vpid_snapshot->process_name.vpid   = proc.vpid;
            ORTE_EPOCH_SET(vpid_snapshot->process_name.epoch,proc.epoch);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_CRS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_CRS_COMP_STR))) {
            vpid_snapshot->crs_comp = strdup(value);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_COMPRESS_COMP_STR, strlen(SSTORE_METADATA_LOCAL_COMPRESS_COMP_STR))) {
            vpid_snapshot->compress_comp = strdup(value);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX_STR, strlen(SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX_STR))) {
            vpid_snapshot->compress_postfix = strdup(value);
        }
        else if(0 == strncmp(token, SSTORE_METADATA_INTERNAL_TIME_STR, strlen(SSTORE_METADATA_INTERNAL_TIME_STR)) ) {
            if( NULL == global_snapshot->start_time) {
                global_snapshot->start_time = strdup(value);
            }
            else {
                global_snapshot->end_time   = strdup(value);
            }
        }
        else if(0 == strncmp(token, SSTORE_METADATA_GLOBAL_AMCA_PARAM_STR, strlen(SSTORE_METADATA_GLOBAL_AMCA_PARAM_STR))) {
            global_snapshot->amca_param  = strdup(value);
        }
    } while(0 == feof(metadata) );
    
    /* Append the last item */
    if( NULL != vpid_snapshot) {
        opal_list_append(&global_snapshot->local_snapshots, &(vpid_snapshot->super));
    }
    
 cleanup:
    if( NULL != metadata ) {
        fclose(metadata);
        metadata = NULL;
    }
    if( NULL != value ) {
        free(value);
        value = NULL;
    }
    if( NULL != token ) {
        free(token);
        token = NULL;
    }

    return exit_status;
}
Beispiel #30
0
int orte_iof_base_write_output(orte_process_name_t *name, orte_iof_tag_t stream,
                                unsigned char *data, int numbytes,
                               orte_iof_write_event_t *channel)
{
    char starttag[ORTE_IOF_BASE_TAG_MAX], endtag[ORTE_IOF_BASE_TAG_MAX], *suffix;
    orte_iof_write_output_t *output;
    int i, j, k, starttaglen, endtaglen, num_buffered;
    bool endtagged;
    char qprint[10];

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s write:output setting up to write %d bytes to %s for %s on fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         (ORTE_IOF_STDIN & stream) ? "stdin" : ((ORTE_IOF_STDOUT & stream) ? "stdout" : ((ORTE_IOF_STDERR & stream) ? "stderr" : "stddiag")),
                         ORTE_NAME_PRINT(name), 
                         (NULL == channel) ? -1 : channel->fd));

    /* setup output object */
    output = OBJ_NEW(orte_iof_write_output_t);
    
    /* write output data to the corresponding tag */
    if (ORTE_IOF_STDIN & stream) {
        /* copy over the data to be written */
        if (0 < numbytes) {
            /* don't copy 0 bytes - we just need to pass
             * the zero bytes so the fd can be closed
             * after it writes everything out
             */
            memcpy(output->data, data, numbytes);
        }
        output->numbytes = numbytes;
        goto process;
    } else if (ORTE_IOF_STDOUT & stream) {
        /* write the bytes to stdout */
        suffix = "stdout";
    } else if (ORTE_IOF_STDERR & stream) {
        /* write the bytes to stderr */
        suffix = "stderr";
    } else if (ORTE_IOF_STDDIAG & stream) {
        /* write the bytes to stderr */
        suffix = "stddiag";
    } else {
        /* error - this should never happen */
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s stream %0x", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), stream));
        return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
    }

    /* if this is to be xml tagged, create a tag with the correct syntax - we do not allow
     * timestamping of xml output
     */
    if (orte_xml_output) {
        snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "<%s rank=\"%s\">", suffix, ORTE_VPID_PRINT(name->vpid));
        snprintf(endtag, ORTE_IOF_BASE_TAG_MAX, "</%s>", suffix);
        goto construct;
    }
    
    /* if we are to timestamp output, start the tag with that */
    if (orte_timestamp_output) {
        time_t mytime;
        char *cptr;
        /* get the timestamp */
        time(&mytime);
        cptr = ctime(&mytime);
        cptr[strlen(cptr)-1] = '\0';  /* remove trailing newline */
        
        if (orte_tag_output) {
            /* if we want it tagged as well, use both */
            snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "%s[%s,%s]<%s>:",
                     cptr, ORTE_LOCAL_JOBID_PRINT(name->jobid),
                     ORTE_VPID_PRINT(name->vpid), suffix);
        } else {
            /* only use timestamp */
            snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "%s<%s>:", cptr, suffix);
        }
        /* no endtag for this option */
        memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX);
        goto construct;
    }
    
    if (orte_tag_output) {
        snprintf(starttag, ORTE_IOF_BASE_TAG_MAX, "[%s,%s]<%s>:",
                 ORTE_LOCAL_JOBID_PRINT(name->jobid),
                 ORTE_VPID_PRINT(name->vpid), suffix);
        /* no endtag for this option */
        memset(endtag, '\0', ORTE_IOF_BASE_TAG_MAX);
        goto construct;
    }
    
    /* if we get here, then the data is not to be tagged - just copy it
     * and move on to processing
     */
    if (0 < numbytes) {
        /* don't copy 0 bytes - we just need to pass
         * the zero bytes so the fd can be closed
         * after it writes everything out
         */
        memcpy(output->data, data, numbytes);
    }
    output->numbytes = numbytes;
    goto process;

construct:
    starttaglen = strlen(starttag);
    endtaglen = strlen(endtag);
    endtagged = false;
    /* start with the tag */
    for (j=0, k=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
        output->data[k++] = starttag[j];
    }        
    /* cycle through the data looking for <cr>
     * and replace those with the tag
     */
    for (i=0; i < numbytes && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; i++) {
        if (orte_xml_output) {
            if ('&' == data[i]) {
                if (k+5 >= ORTE_IOF_BASE_TAGGED_OUT_MAX) {
                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                    goto process;
                }
                snprintf(qprint, 10, "&amp;");
                for (j=0; j < (int)strlen(qprint) && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
                    output->data[k++] = qprint[j];
                }
            } else if ('<' == data[i]) {
                if (k+4 >= ORTE_IOF_BASE_TAGGED_OUT_MAX) {
                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                    goto process;
                }
                snprintf(qprint, 10, "&lt;");
                for (j=0; j < (int)strlen(qprint) && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
                    output->data[k++] = qprint[j];
                }
            } else if ('>' == data[i]) {
                if (k+4 >= ORTE_IOF_BASE_TAGGED_OUT_MAX) {
                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                    goto process;
                }
                snprintf(qprint, 10, "&gt;");
                for (j=0; j < (int)strlen(qprint) && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
                    output->data[k++] = qprint[j];
                }
            } else if (data[i] < 32 || data[i] > 127) {
                /* this is a non-printable character, so escape it too */
                if (k+7 >= ORTE_IOF_BASE_TAGGED_OUT_MAX) {
                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                    goto process;
                }
                snprintf(qprint, 10, "&#%03d;", (int)data[i]);
                for (j=0; j < (int)strlen(qprint) && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
                    output->data[k++] = qprint[j];
                }
                /* if this was a \n, then we also need to break the line with the end tag */
                if ('\n' == data[i] && (k+endtaglen+1) < ORTE_IOF_BASE_TAGGED_OUT_MAX) {
                    /* we need to break the line with the end tag */
                    for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX-1; j++) {
                        output->data[k++] = endtag[j];
                    }
                    /* move the <cr> over */
                    output->data[k++] = '\n';
                    /* if this isn't the end of the data buffer, add a new start tag */
                    if (i < numbytes-1 && (k+starttaglen) < ORTE_IOF_BASE_TAGGED_OUT_MAX) {
                        for (j=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
                            output->data[k++] = starttag[j];
                            endtagged = false;
                        }
                    } else {
                        endtagged = true;
                    }
                }
            } else {
                output->data[k++] = data[i];
            }
        } else {
            if ('\n' == data[i]) {
                /* we need to break the line with the end tag */
                for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX-1; j++) {
                    output->data[k++] = endtag[j];
                }
                /* move the <cr> over */
                output->data[k++] = '\n';
                /* if this isn't the end of the data buffer, add a new start tag */
                if (i < numbytes-1) {
                    for (j=0; j < starttaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX; j++) {
                        output->data[k++] = starttag[j];
                        endtagged = false;
                    }
                } else {
                    endtagged = true;
                }
            } else {
                output->data[k++] = data[i];
            }
        }
    }
    if (!endtagged) {
        /* need to add an endtag */
        for (j=0; j < endtaglen && k < ORTE_IOF_BASE_TAGGED_OUT_MAX-1; j++) {
            output->data[k++] = endtag[j];
        }
        output->data[k++] = '\n';
    }
    output->numbytes = k;
    
process:
    /* add this data to the write list for this fd */
    opal_list_append(&channel->outputs, &output->super);

    /* record how big the buffer is */
    num_buffered = opal_list_get_size(&channel->outputs);
    
    /* is the write event issued? */
    if (!channel->pending) {
        /* issue it */
        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s write:output adding write event",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        opal_event_add(channel->ev, 0);
        channel->pending = true;
    }
    
    return num_buffered;
}