static int
orte_grpcomm_bad_get_proc_attr(const orte_process_name_t proc,
                               const char * attribute_name, void **val, 
                               size_t *size)
{
    if (false == allgather_complete) {
        ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1);
    }
    return orte_grpcomm_base_get_proc_attr(proc, attribute_name, val, size);
}
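/* Several of the examples below block on ORTE_PROGRESSED_WAIT. Its real
 * definition lives in ORTE's runtime headers; conceptually it spins the
 * OPAL progress engine until either a flag flips or a counter reaches a
 * limit. A minimal sketch of that behavior (an illustration, not the
 * actual macro) follows.
 */
#define PROGRESSED_WAIT_SKETCH(flag, counter, limit)           \
    do {                                                       \
        while (!(flag) && ((counter) < (limit))) {             \
            opal_progress();   /* drive pending RML events */  \
        }                                                      \
    } while (0)
/* Thus ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1) waits on the flag,
 * while ORTE_PROGRESSED_WAIT(false, num_recvd, 1) waits on the counter.
 */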
static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc;
    opal_buffer_t coll;
    orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_ALLGATHER;
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* everyone sends data to their local daemon */
    OBJ_CONSTRUCT(&coll, opal_buffer_t);
    /* tell the daemon we are doing an allgather */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&coll, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }    
    /* add our data to it */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll, sbuf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* send to local daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &coll, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    OBJ_DESTRUCT(&coll);
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:bad allgather buffer sent",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* now receive the final result. Be sure to do this in
     * a manner that allows us to return without being in a recv!
     */
    allgather_complete = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                 ORTE_RML_NON_PERSISTENT, allgather_recv, rbuf);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1);
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    return ORTE_SUCCESS;
}
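/* The allgather_recv callback registered above is not part of this
 * excerpt. Because rbuf is handed over as cbdata, a minimal sketch -
 * assuming the standard RML buffer-callback signature - only has to
 * transfer the payload and flip the completion flag.
 */
static void allgather_recv(int status, orte_process_name_t *sender,
                           opal_buffer_t *buffer, orte_rml_tag_t tag,
                           void *cbdata)
{
    opal_buffer_t *rbuf = (opal_buffer_t*)cbdata;  /* the caller's result buffer */
    int rc;

    /* transfer the final result into the caller's buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, buffer))) {
        ORTE_ERROR_LOG(rc);
    }
    /* release the ORTE_PROGRESSED_WAIT in allgather() */
    allgather_complete = true;
}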
static int tool_close(const orte_process_name_t* src_name,
                      orte_iof_tag_t src_tag)
{
    /* if we are a tool, then we need to request the HNP to stop
     * forwarding data from this process/stream
     */
    
    opal_buffer_t *buf;
    orte_iof_tag_t tag;
    orte_process_name_t hnp;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s closing output for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(src_name)));
    
    buf = OBJ_NEW(opal_buffer_t);
    
    /* setup the tag to stop the copy */
    tag = src_tag | ORTE_IOF_CLOSE;
    
    /* pack the tag - we do this first so that flow control messages can
     * consist solely of the tag
     */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }
    /* pack the name of the source */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, src_name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }
    
    /* flag that the close is incomplete */
    mca_iof_tool_component.closed = false;

    /* send the buffer to the correct HNP */
    ORTE_HNP_NAME_FROM_JOB(&hnp, src_name->jobid);
    orte_rml.send_buffer_nb(&hnp, buf, ORTE_RML_TAG_IOF_HNP,
                            0, send_cb, NULL);
    
    /* wait right here until the close is confirmed */
    ORTE_PROGRESSED_WAIT(mca_iof_tool_component.closed, 0, 1);
    
    return ORTE_SUCCESS;
}
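/* The send_cb passed to send_buffer_nb above is defined elsewhere in the
 * iof tool component; a plausible minimal version - assuming its only job
 * is to release the buffer once the non-blocking send completes - is:
 */
static void send_cb(int status, orte_process_name_t *peer,
                    opal_buffer_t *buf, orte_rml_tag_t tag,
                    void *cbdata)
{
    /* the buffer was OBJ_NEW'd by the sender - release it now
     * that the transfer is done
     */
    OBJ_RELEASE(buf);
}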
static int barrier(void)
{
    opal_buffer_t buf;
    orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_BARRIER;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad entering barrier",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* everyone sends barrier to local daemon */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* tell the daemon we are doing a barrier */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }    
    /* send to local daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    OBJ_DESTRUCT(&buf);
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:bad barrier sent",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* now receive the release. Be sure to do this in
     * a manner that allows us to return without being in a recv!
     */
    barrier_recvd = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BARRIER,
                                 ORTE_RML_NON_PERSISTENT, barrier_recv, NULL);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    ORTE_PROGRESSED_WAIT(barrier_recvd, 0, 1);
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:bad received barrier release",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
int orte_routed_base_register_sync(bool setup)
{
    opal_buffer_t buffer;
    int rc;
    orte_daemon_cmd_flag_t command=ORTE_DAEMON_SYNC_BY_PROC;
    char *rml_uri;
    
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s registering sync to daemon %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));
    
    /* we need to get the oob to establish
     * the connection - the oob will leave the connection "alive"
     * thereafter so we can communicate readily
     */
    
    OBJ_CONSTRUCT(&buffer, opal_buffer_t);
    
    /* if we are setting up, tell the daemon to send back a nidmap */
    if (setup) {
        command = ORTE_DAEMON_SYNC_WANT_NIDMAP;
    }


    /* tell the daemon to sync */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }
    
    /* add our contact info to the buffer so the daemon can explicitly
     * store it
     */
    rml_uri = orte_rml.get_contact_info();
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &rml_uri, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        free(rml_uri);
        return rc;
    }
    if (NULL != rml_uri) free(rml_uri);
    
    /* send the sync command to our daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }
    OBJ_DESTRUCT(&buffer);
    
    /* get the ack - need this to ensure that the sync communication
     * gets serviced by the event library on the orted prior to the
     * process exiting
     */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s registering sync waiting for ack",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    sync_recvd = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC,
                                 ORTE_RML_NON_PERSISTENT, report_sync, NULL);
    if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    ORTE_PROGRESSED_WAIT(sync_recvd, 0, 1);
    
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s registering sync ack recvd",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
    /* the linear module routes all proc communications through
     * the local daemon. Daemons must identify which of their
     * daemon-peers is "hosting" the specified recipient and
     * route the message to that daemon. Daemon contact info
     * is handled elsewhere, so all we need to do here is
     * ensure that the procs are told to route through their
     * local daemon, and that daemons are told how to route
     * for each proc
     */
    int rc;

    /* if I am a tool, then I stand alone - there is nothing to do */
    if (ORTE_PROC_IS_TOOL) {
        return ORTE_SUCCESS;
    }
    
    /* if I am a daemon or HNP, then I have to extract the routing info for this job
     * from the data sent to me for launch and update the routing tables to
     * point at the daemon for each proc
     */
    if (ORTE_PROC_IS_DAEMON) {
        
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear: init routes for daemon job %s\n\thnp_uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri));
        
        if (NULL == ndat) {
            /* indicates this is being called during orte_init.
             * Get the HNP's name for possible later use
             */
            if (NULL == orte_process_info.my_hnp_uri) {
                /* fatal error */
                ORTE_ERROR_LOG(ORTE_ERR_FATAL);
                return ORTE_ERR_FATAL;
            }
            /* set the contact info into the hash table */
            if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) {
                ORTE_ERROR_LOG(rc);
                return(rc);
            }
            
            /* extract the hnp name and store it */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                               ORTE_PROC_MY_HNP, NULL))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }

            /* set our lifeline to the HNP - we will abort if that connection is lost */
            lifeline = ORTE_PROC_MY_HNP;
            
            /* daemons will send their contact info back to the HNP as
             * part of the message confirming they are ready to go. HNPs
             * load their contact info during orte_init
             */
        } else {
            /* ndat != NULL means we are getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
                             "%s routed_linear: completed init routes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
        return ORTE_SUCCESS;
    }
    

    if (ORTE_PROC_IS_HNP) {
        
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear: init routes for HNP job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job)));
        
        if (NULL == ndat) {
            /* if ndat is NULL, then this is being called during init, so just
             * make myself available to catch any reported contact info
             */
            if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* the HNP has no lifeline */
            lifeline = NULL;
        } else {
            /* if this is for my own jobid, then I am getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_PROC_MY_NAME->jobid == job) {
                if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                /* if not, then I need to process the callback */
                if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }

        return ORTE_SUCCESS;
    }

    {  /* MUST BE A PROC */
        /* if ndat != NULL, then this is being invoked by the proc to
         * init a route to a specified process that is outside of our
         * job family. We want that route to go through our HNP, routed via
         * our local daemon - however, we cannot know for
         * certain that the HNP already knows how to talk to the specified
         * procs. For example, in OMPI's publish/subscribe procedures, the
         * DPM framework looks for an mca param containing the global ompi-server's
         * uri. This info will come here so the proc can setup a route to
         * the server - we need to pass the routing info to our HNP
         */
        if (NULL != ndat) {
            int rc;
            
            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                 "%s routed_linear: init routes w/non-NULL data",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            /* if this is for a job family of zero, then we know that the enclosed
             * procs are local slaves to our daemon. In that case, we can just ignore this
             * as our daemon - given that it had to spawn the local slave - already
             * knows how to talk to them
             */
            if (0 == ORTE_JOB_FAMILY(job)) {
                return ORTE_SUCCESS;
            }
            
            if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
                /* if this is for a different job family, then we route via our HNP
                 * to minimize connection counts to entities such as ompi-server, so
                 * start by sending the contact info to the HNP for update
                 */
                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                     "%s routed_linear_init_routes: diff job family - sending update to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
                
                if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, ndat,
                                                   ORTE_RML_TAG_RML_INFO_UPDATE, 0))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                
                /* wait right here until the HNP acks the update to ensure that
                 * any subsequent messaging can succeed
                 */
                ack_recvd = false;
                rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
                                             ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
                
                ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);
                
                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                     "%s routed_linear_init_routes: ack recvd",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                
                /* our get_route function automatically routes all messages for
                 * other job families via the HNP, so nothing more to do here
                 */
            }
            return ORTE_SUCCESS;
        }
        
        /* if ndat=NULL, then we are being called during orte_init. In this
         * case, we need to setup a few critical pieces of info
         */
        
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri,
                             (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri));
                
        if (NULL == orte_process_info.my_daemon_uri) {
            /* in this module, we absolutely MUST have this information - if
             * we didn't get it, then error out
             */
            opal_output(0, "%s ERROR: Failed to identify the local daemon's URI",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: This is a fatal condition when the linear router",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: has been selected - either select the unity router",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: or ensure that the local daemon info is provided",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERR_FATAL;
        }
            
        /* we have to set the HNP's name, even though we won't route messages directly
         * to it. This is required to ensure that we -do- send messages to the correct
         * HNP name
         */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                           ORTE_PROC_MY_HNP, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* Set the contact info in the RML - this won't actually establish
         * the connection, but just tells the RML how to reach the daemon
         * if/when we attempt to send to it
         */
        if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_daemon_uri))) {
            ORTE_ERROR_LOG(rc);
            return(rc);
        }
        /* extract the daemon's name so we can update the routing table */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
                                                           ORTE_PROC_MY_DAEMON, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        /* set our lifeline to the local daemon - we will abort if this connection is lost */
        lifeline = ORTE_PROC_MY_DAEMON;
        
        /* register ourselves - this sends a message to the daemon (warming up that connection)
         * and sends our contact info to the HNP when all local procs have reported
         *
         * NOTE: it may seem odd that we send our contact info to the HNP - after all,
         * the HNP doesn't really need to know how to talk to us directly if we are
         * using this routing method. However, this is good for two reasons:
         *
         * (1) some debuggers and/or tools may need RML contact
         *     info to set themselves up
         *
         * (2) doing so allows the HNP to "block" in a dynamic launch
         *     until all procs are reported running, thus ensuring that no communication
         *     is attempted until the overall ORTE system knows how to talk to everyone -
         *     otherwise, the system can just hang.
         */
        if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* no answer is expected or coming */
        
        return ORTE_SUCCESS;
    }
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn
 * 
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 */
void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    /* fill in the proc table for the application processes */
    
    if (orte_debug_flag) {
        opal_output(0, "Info: Setting up debugger process table for applications\n");
    }
    
    MPIR_debug_state = 1;
    
    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;
    
    /* allocate MPIR_proctable */
    MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
                                                     MPIR_proctable_size);
    if (MPIR_proctable == NULL) {
        opal_output(0, "Error: Out of memory\n");
        return;
    }
    
    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }
        
        MPIR_proctable[i].host_name = strdup(proc->node->name);
        if (0 == strncmp(appctx->app, OPAL_PATH_SEP, 1)) {
            MPIR_proctable[i].executable_name =
                opal_os_path(false, appctx->app, NULL);
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path(false, appctx->cwd, appctx->app, NULL);
        }
        MPIR_proctable[i].pid = proc->pid;
    }

    if (orte_debug_flag) {
        dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
        
        (void) MPIR_Breakpoint();
        
        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
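/* The MPIR_* symbols used above come from the MPICH/TotalView (MPIR)
 * process-acquisition interface. The sketch below shows the shape of
 * those declarations; exact signatures vary slightly between
 * implementations, so treat this as illustrative.
 */
struct MPIR_PROCDESC {
    char *host_name;        /* node on which the process is running */
    char *executable_name;  /* absolute path of the executable */
    int   pid;              /* pid of the process on that node */
};

struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
volatile int MPIR_debug_state = 0;
int MPIR_being_debugged = 0;

/* the debugger plants a breakpoint on this function; the body is
 * intentionally empty
 */
void MPIR_Breakpoint(void)
{
}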
static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc;
    orte_daemon_cmd_flag_t command=ORTE_DAEMON_COLL_CMD;
    struct timeval ompistart, ompistop;
    opal_buffer_t coll;
    orte_grpcomm_coll_t coll_type=ORTE_GRPCOMM_ALLGATHER;
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:basic entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
        gettimeofday(&ompistart, NULL);
    }
    
    /* everyone sends data to their local daemon */
    OBJ_CONSTRUCT(&coll, opal_buffer_t);
    /* tell the daemon to collect the data */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&coll, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* tell the daemon we are doing an allgather */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&coll, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }    
    /* add our data to it */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll, sbuf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* send to local daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &coll, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    OBJ_DESTRUCT(&coll);
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:basic allgather buffer sent",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* setup the buffer that will recv the results */
    allgather_buf = OBJ_NEW(opal_buffer_t);
    
    /* now receive the final result. Be sure to do this in
     * a manner that allows us to return without being in a recv!
     */
    allgather_complete = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                 ORTE_RML_NON_PERSISTENT, allgather_recv, NULL);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1);
    
    /* copy payload to the caller's buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(allgather_buf);
        return rc;
    }
    OBJ_RELEASE(allgather_buf);
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s allgather buffer received",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (orte_timing) {
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup a receive to hear when the rank=N proc has received the data
             * release - in most xcast schemes, this will always be the final recvr
             */
            barrier_timer = false;
            rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
                                    ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
            gettimeofday(&ompistop, NULL);
            opal_output(0, "%s allgather: time to complete %ld usec",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                                   (ompistop.tv_usec - ompistart.tv_usec)));
        } else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs-1) {
            /* if we are rank=N, send a message back to indicate
             * the xcast completed for timing purposes
             */
            orte_process_name_t name;
            opal_buffer_t buf;
            
            name.jobid = ORTE_PROC_MY_NAME->jobid;
            name.vpid = 0;
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            if (0 > (rc = orte_rml.send_buffer(&name,&buf,ORTE_RML_TAG_COLLECTIVE_TIMER,0))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            rc = ORTE_SUCCESS;
            OBJ_DESTRUCT(&buf);
        }
    }
    
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:basic allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    return ORTE_SUCCESS;
}
#if ORTE_ENABLE_EPOCH
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
                                   orte_epoch_t epoch, int *num_procs, orte_proc_t ***proc_info_array)
#else
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
                                   int *num_procs, orte_proc_t ***proc_info_array)
#endif
{
    int ret;
    int32_t cnt, cnt_procs, n;
    opal_buffer_t *cmd;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
    orte_proc_t **proc_info;

    /* set default response */
    *num_procs = 0;
    *proc_info_array = NULL;
    
    /* query the HNP for info on the procs in this job */
    cmd = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
#if ORTE_ENABLE_EPOCH
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
#endif
    /* define a max time to wait for send to complete */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
    
    /* do the send */
    if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
                                           send_cbfunc, NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    
    /* wait for send to complete */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
    
    /* release the buffer */
    OBJ_RELEASE(cmd);
    
    /* did it succeed? */
    if (ORTE_SUCCESS != error_exit) {
        return error_exit;
    }

    /* define a max time to wait for an answer */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);
    
    /* get the answer */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);
    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_TOOL,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      recv_info,
                                                      NULL))) {
        /* cancel the timer */
        if (NULL != quicktime) {
            opal_event_evtimer_del(quicktime);
            free(quicktime);
            quicktime = NULL;
        }
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }
    
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
    
    if (ORTE_SUCCESS != error_exit) {
        OBJ_DESTRUCT(&answer);
        return error_exit;
    }
    
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* allocate the required memory */
    if (0 < cnt_procs) {
        proc_info = (orte_proc_t**)malloc(cnt_procs * sizeof(orte_proc_t*));
        /* unpack the procs */
        for (n=0; n < cnt_procs; n++) {
            cnt = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) {
                ORTE_ERROR_LOG(ret);
                OBJ_DESTRUCT(&answer);
                free(proc_info);
                return ret;
            }
        }
        *proc_info_array = proc_info;
        *num_procs = (int)cnt_procs;
    }
    OBJ_DESTRUCT(&answer);

    return ORTE_SUCCESS;
}
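/* ORTE_DETECT_TIMEOUT above arms an event-library timer that bounds each
 * wait. A plausible sketch of the quicktime_cb it invokes - flag the
 * failure and release the progressed wait - follows; the matching recv
 * callback is what cancels this timer on the success path.
 */
static void quicktime_cb(int fd, short event, void *cbdata)
{
    /* record that the peer never answered in time */
    error_exit = ORTE_ERROR;
    /* release the ORTE_PROGRESSED_WAIT */
    timer_fired = true;
}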
/* report an event to a connected tool */
int orte_util_comm_report_event(orte_comm_event_t ev)
{
    int rc, i;
    opal_buffer_t buf;
    orte_node_t *node;
    
    /* if nothing is connected, ignore this */
    if (!tool_connected) {
        return ORTE_SUCCESS;
    }
    
    /* init a buffer for the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* flag the type of event */
    opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT);

    switch (ev) {
        case ORTE_COMM_EVENT_ALLOCATE:
            /* loop through nodes, storing just node names */
            for (i=0; i < orte_node_pool->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                opal_dss.pack(&buf, &node->name, 1, OPAL_STRING);
            }
            break;
        
        case ORTE_COMM_EVENT_MAP:
            break;
        
        case ORTE_COMM_EVENT_LAUNCH:
            break;
        
        default:
            ORTE_ERROR_LOG(ORTE_ERROR);
            OBJ_DESTRUCT(&buf);
            return ORTE_ERROR;
            break;
    }
    
    /* do the send */
    if (0 > (rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    
    if (step) {
        /* the caller wants to wait until an ack is received -
         * define a max time to wait for an answer
         */
        OBJ_CONSTRUCT(&answer, opal_buffer_t);
        timer_fired = false;
        error_exit = ORTE_SUCCESS;
        ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
        
        /* get the answer */
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                           ORTE_RML_TAG_TOOL,
                                                           ORTE_RML_NON_PERSISTENT,
                                                           recv_info,
                                                           NULL))) {
            /* cancel the timer */
            if (NULL != quicktime) {
                opal_event_evtimer_del(quicktime);
                free(quicktime);
                quicktime = NULL;
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&answer);
            return rc;
        }
        
        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
        
        /* cleanup */
        OBJ_DESTRUCT(&answer);

        if (ORTE_SUCCESS != error_exit) {
            return error_exit;
        }
    }
    
    return ORTE_SUCCESS;
}
int
main(int argc, char *argv[]){
    int count;
    int msgsize;
    uint8_t *msg;
    int i, j, rc;
    orte_process_name_t peer;
    double maxpower;
    
    /*
     * Init
     */
    orte_init(&argc, &argv, ORTE_PROC_NON_MPI);

    if (argc > 1) {
        count = atoi(argv[1]);
        if (count < 0) {
            count = INT_MAX-1;
        }
    } else {
        count = MAX_COUNT;
    }
    
    peer.jobid = ORTE_PROC_MY_NAME->jobid;
    
    for (j=1; j < count+1; j++) {
        peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
        
        /* rank0 starts ring */
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup the initiating buffer - put random sized message in it */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            
            maxpower = (double)(j%7);
            msgsize = (int)pow(10.0, maxpower);
            opal_output(0, "Ring %d message size %d bytes", j, msgsize);
            msg = (uint8_t*)malloc(msgsize);
            opal_dss.pack(&buf, msg, msgsize, OPAL_BYTE);
            
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
            /* wait for it to come around */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
        } else {
            /* wait for msg */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            /* send it along */
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
        }
    }

    orte_finalize();

    return 0;
}
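/* This ring test relies on a few file-scope definitions that the excerpt
 * omits (MY_TAG, MAX_COUNT, the shared buffer, and the recv callback).
 * A minimal sketch of the assumed pieces:
 */
#define MY_TAG    12345   /* arbitrary tag outside the reserved system range */
#define MAX_COUNT 3

static opal_buffer_t buf;          /* shared between main() and recv_ack() */
static volatile bool msg_recvd;

static void recv_ack(int status, orte_process_name_t *sender,
                     opal_buffer_t *buffer, orte_rml_tag_t tag,
                     void *cbdata)
{
    /* keep the payload so it can be forwarded around the ring */
    opal_dss.copy_payload(&buf, buffer);
    /* release the ORTE_PROGRESSED_WAIT in main() */
    msg_recvd = true;
}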
/* For a complete description of this algorithm, please look at
 * ompi/mca/coll/tuned/coll_tuned_allgather.c
 */
static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
                             orte_jobid_t jobid, orte_vpid_t np, orte_vpid_t *vpids)
{
    orte_vpid_t rank, distance, nv;
    int32_t num_remote, total_entries, cnt;
    opal_buffer_t collection, buf;
    orte_process_name_t peer;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:recdub algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* initialize */
    total_entries = num_entries;
    
    /* start by seeding the collection with our own data */
    OBJ_CONSTRUCT(&collection, opal_buffer_t);
    opal_dss.copy_payload(&collection, sendbuf);
    
    /* collective is constrained to take place within the specified jobid */
    peer.jobid = jobid;
    
    /* Communication step:
     At every step i, rank r:
     - exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
     */
    /* find my position in the group of participants. This
     * value is the "rank" we will use in the algo
     */
    rank = ORTE_VPID_INVALID;
    for (nv=0; nv < np; nv++) {
        if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) {
            rank = nv;
            break;
        }
    }
    
    /* check for bozo case */
    if (ORTE_VPID_INVALID == rank) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    
    for (distance = 0x1; distance < np; distance<<=1) {
        
        /* first send my current contents */
        nv = rank ^ distance;
        peer.vpid = vpids[nv];
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));

        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, &collection);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:recdub sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            OBJ_DESTRUCT(&collection);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
        
        /* now setup to recv from my other partner */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&bucket);
            OBJ_DESTRUCT(&collection);
            return rc;
        }
        /* and wait for it to get here */
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);
        
        /* extract the number of entries in the remote buffer */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&bucket);
            OBJ_DESTRUCT(&collection);
            return rc;
        }
        
        /* add it to our running total */
        total_entries += num_remote;
        
        /* transfer the data to our collection */
        opal_dss.copy_payload(&collection, &bucket);
        
        /* cleanup */
        OBJ_DESTRUCT(&bucket);
    }
    
    /* output of a collective begins with the total number of entries */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* transfer the collected data */
    opal_dss.copy_payload(recvbuf, &collection);
    
    /* cleanup */
    OBJ_DESTRUCT(&collection);
    
    return ORTE_SUCCESS;
}
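/* A concrete trace of the exchange step above: with np = 4, the loop runs
 * for distance = 1 and distance = 2. At distance 1, ranks 0<->1 and 2<->3
 * swap buffers, so each holds two contributions; at distance 2, ranks
 * 0<->2 and 1<->3 swap their accumulated buffers, after which every rank
 * holds all four contributions. log2(np) rounds therefore suffice - note
 * that this simple form of the algorithm assumes np is a power of two,
 * since rank ^ distance must name a valid participant.
 */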
/*
 * The Two-Proc Algorithm
 *
 * Zero sends to one; one waits to recv from zero,
 * adds its data to the message, and sends the result back to zero
 */
static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf, int32_t num_entries,
                   orte_jobid_t jobid, orte_vpid_t *vpids)
{
    orte_process_name_t peer;
    int32_t num_remote, cnt;
    int rc;
    opal_buffer_t buf;
    
    peer.jobid = jobid;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:two-proc algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
        /* I send first */
        peer.vpid = vpids[1];
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));

        /* setup a temp buffer so I can inform the other side as to the
         * number of entries in my buffer
         */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
        
        /* wait for reply */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my return message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
    } else {
        /* if I am not the start, then I recv first */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my starting message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
        /* send my data back */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        peer.vpid = vpids[0];
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));
        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
    }
    
    /* extract the number of entries in the remote buffer */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* output of a collective begins with the total number of entries */
    num_remote += num_entries;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &num_remote, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* xfer my data */
    opal_dss.copy_payload(recvbuf, sendbuf);
    /* xfer the recvd data */
    opal_dss.copy_payload(recvbuf, &bucket);
    
    /* cleanup */
    OBJ_DESTRUCT(&bucket);
    
    return ORTE_SUCCESS;
}
static int daemon_collective(orte_process_name_t *sender, opal_buffer_t *data)
{
    orte_jobid_t jobid;
    orte_odls_job_t *jobdat;
    orte_routed_tree_t *child;
    orte_std_cntr_t n;
    opal_list_t daemon_tree;
    opal_list_item_t *item, *next;
    int32_t num_contributors;
    opal_buffer_t buf;
    orte_process_name_t my_parent, proc;
    orte_vpid_t daemonvpid;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s odls: daemon collective called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* unpack the jobid using this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* lookup the job record for it */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;
        
        /* is this the specified job? */
        if (jobdat->jobid == jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - someone sent us a collective before we could
         * parse the add_local_procs cmd. Just add the jobdat object
         * and continue
         */
        jobdat = OBJ_NEW(orte_odls_job_t);
        jobdat->jobid = jobid;
        opal_list_append(&orte_local_jobdata, &jobdat->super);
    }
    
    /* it may be possible to get here prior to having actually finished processing our
     * local launch msg due to the race condition between different nodes and when
     * they start their individual procs. Hence, we have to first ensure that we
     * -have- finished processing the launch msg, or else we won't know whether
     * or not to wait before sending this on
     */
    ORTE_PROGRESSED_WAIT(jobdat->launch_msg_processed, 0, 1);
    
    /* unpack the collective type */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->collective_type, &n, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* unpack the number of contributors in this data bucket */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    jobdat->num_contributors += num_contributors;
    
    /* xfer the data */
    opal_dss.copy_payload(&jobdat->collection_bucket, data);
    
    /* count the number of participants collected */
    jobdat->num_collected++;
    
    /* if we haven't already done so, figure out how many participants we
     * should be expecting
     */
    if (jobdat->num_participating < 0) {
        if (0 < jobdat->num_local_procs) {
            /* we have children, so account for our own participation */
            jobdat->num_participating = 1;
        } else {
            jobdat->num_participating = 0;
        }
        /* now see if anyone else will be sending us something */
        OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
        orte_routed.get_routing_tree(&daemon_tree);
        /* unfortunately, there is no simple way to determine which of our "child"
         * daemons in the routing tree will be sending us something. All we can do
         * is brute force a search, though we attempt to keep it as short as possible
         */
        proc.jobid = jobid;
        proc.vpid = 0;
        while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
            /* get the daemon that hosts this proc */
            daemonvpid = orte_ess.proc_get_daemon(&proc);
            /* is this daemon one of our children, or at least its contribution
             * will pass through one of our children
             */
            item = opal_list_get_first(&daemon_tree);
            while (item != opal_list_get_end(&daemon_tree)) {
                next = opal_list_get_next(item);
                child = (orte_routed_tree_t*)item;
                if (child->vpid == daemonvpid || opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) {
                    /* it does - add to num_participating */
                    jobdat->num_participating++;
                    /* remove this from the list so we don't double count it */
                    opal_list_remove_item(&daemon_tree, item);
                    /* done with search */
                    break;
                }
                item = next;
            }
            proc.vpid++;
        }
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: daemon collective for job %s from %s type %ld num_collected %d num_participating %d num_contributors %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid),
                         ORTE_NAME_PRINT(sender),
                         (long)jobdat->collective_type, jobdat->num_collected,
                         jobdat->num_participating, jobdat->num_contributors));
    
    if (jobdat->num_collected == jobdat->num_participating) {
        /* if I am the HNP, go process the results */
        if (ORTE_PROC_IS_HNP) {
            goto hnp_process;
        }
        
        /* if I am not the HNP, send to my parent */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        /* pack the jobid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* pack the collective type */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->collective_type, 1, ORTE_GRPCOMM_COLL_T))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* pack the number of contributors */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* xfer the payload*/
        opal_dss.copy_payload(&buf, &jobdat->collection_bucket);
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send it */
        my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
        my_parent.vpid = orte_routed.get_routing_tree(NULL);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                             "%s grpcomm:bad: daemon collective not the HNP - sending to parent %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&my_parent)));
        if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
    }
    return ORTE_SUCCESS;
    
hnp_process:
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: daemon collective HNP - xcasting to job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));
    /* setup a buffer to send the results back to the job members */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    
    if (ORTE_GRPCOMM_BARRIER == jobdat->collective_type) {
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* don't need anything in this for a barrier */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, ORTE_RML_TAG_BARRIER))) {
            ORTE_ERROR_LOG(rc);
        }
    } else if (ORTE_GRPCOMM_ALLGATHER == jobdat->collective_type) {
        int32_t numc;
        /* add the data */
        numc = jobdat->num_contributors;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send the buffer */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, ORTE_RML_TAG_ALLGATHER))) {
            ORTE_ERROR_LOG(rc);
        }
    } else {
        /* no other collectives currently supported! */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        rc = ORTE_ERR_NOT_IMPLEMENTED;
    }
    
cleanup:
    OBJ_DESTRUCT(&buf);
    
    return rc;
}
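/* A concrete example of the participant accounting above: take four
 * daemons in a routing tree where vpid 0 (the HNP) has children 1 and 2,
 * daemon 1 has child 3, and one app proc runs on each of daemons 1, 2,
 * and 3. Daemon 1 participates for its own local proc and expects one
 * more contribution relayed from daemon 3, so its num_participating is 2;
 * daemons 2 and 3 expect only their own contribution; the HNP hosts no
 * procs, so it waits on the two buckets arriving from daemons 1 and 2
 * before xcasting the result.
 */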
static int onesided_barrier(void)
{
    int num_participating;
    opal_list_t daemon_tree;
    opal_buffer_t buf;
    orte_process_name_t my_parent;
    opal_event_t *quicktime=NULL;
    struct timeval quicktimeval;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: onesided barrier called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* if we are not to use the barrier, then just return */
    if (!orte_orted_exit_with_barrier) {
        if (ORTE_PROC_IS_HNP) {
            /* if we are the HNP, we need to do a little delay to give
             * the orteds a chance to exit before we leave
             */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:bad: onesided barrier adding delay timer",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            quicktimeval.tv_sec = 0;
            quicktimeval.tv_usec = 100;
            timer_fired = false;
            ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb);
            ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
        }
        return ORTE_SUCCESS;
    }
    
    /* initialize things */
    num_onesided_barrier_recvd = 0;
    num_participating = 0;
    
    /* figure out how many participants we should be expecting */
    OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
    my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
    my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree);
    num_participating = opal_list_get_size(&daemon_tree);
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: onesided barrier num_participating %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_participating));
    
    /* set the recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_ONESIDED_BARRIER,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      onesided_barrier_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
    
    /* wait to recv them */
    ORTE_PROGRESSED_WAIT(false, num_onesided_barrier_recvd, num_participating);
    
    /* cancel the recv */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER);

    /* if I am the HNP, then we are done */
    if (ORTE_PROC_IS_HNP) {
        return ORTE_SUCCESS;
    }
    
    /* send a zero-byte msg to my parent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* send it */
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad:onsided:barrier not the HNP - sending to parent %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&my_parent)));
    if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    OBJ_DESTRUCT(&buf);
    
    return ORTE_SUCCESS;
}
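/* A plausible sketch of the onesided_barrier_recv callback used above.
 * Each arrival is a zero-byte message, so the callback only counts it;
 * since the recv was posted non-persistent, the sketch assumes it must
 * re-post itself for the next child, which is why the code above cancels
 * the lingering recv once the wait is released.
 */
static void onesided_barrier_recv(int status, orte_process_name_t *sender,
                                  opal_buffer_t *buffer, orte_rml_tag_t tag,
                                  void *cbdata)
{
    int rc;

    /* count the arrival - that is all the zero-byte message carries */
    num_onesided_barrier_recvd++;

    /* re-post the non-persistent recv for the next child */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_ONESIDED_BARRIER,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      onesided_barrier_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
}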
int orte_grpcomm_base_allgather_list(opal_list_t *names, opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    opal_list_item_t *item;
    orte_namelist_t *peer, *root;
    orte_std_cntr_t num_peers;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm: entering allgather_list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* the first entry on the list is the "root" that collects
     * all the data - everyone else just sends and gets back
     * the results
     */
    root = (orte_namelist_t*)opal_list_get_first(names);
    
    /***   NON-ROOT   ***/
    if (OPAL_EQUAL != opal_dss.compare(&root->name, ORTE_PROC_MY_NAME, ORTE_NAME)) {
        /* everyone but root sends data */
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s allgather_list: sending my data to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&root->name)));
        
        if (0 > orte_rml.send_buffer(&root->name, sbuf, ORTE_RML_TAG_ALLGATHER_LIST, 0)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            return ORTE_ERR_COMM_FAILURE;
        }
        
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s allgather_list: buffer sent",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
        /* setup the buffer that will recv the results */
        allgather_buf = OBJ_NEW(opal_buffer_t);
        
        /* now receive the final result from rank=0. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        allgather_num_recvd = 0;
        allgather_failed = false;
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST,
                                     ORTE_RML_NON_PERSISTENT, allgather_client_recv, NULL);
        if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        
        ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, 1);
        
        /* if the allgather failed, return an error */
        if (allgather_failed) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            OBJ_RELEASE(allgather_buf);
            return ORTE_ERR_COMM_FAILURE;
        }
        
        /* copy payload to the caller's buffer */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(allgather_buf);
            return rc;
        }
        OBJ_RELEASE(allgather_buf);
        
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s allgather_list: buffer received",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
        return ORTE_SUCCESS;
    }
    
    
    /***   ROOT   ***/
    /* count how many peers are participating, including myself */
    num_peers = (orte_std_cntr_t)opal_list_get_size(names);

    /* seed the outgoing buffer with the num_procs so it can be unpacked */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &num_peers, 1, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* put my own information into the outgoing buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, sbuf))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* setup the recv conditions */
    allgather_failed = false;
    allgather_num_recvd = 0;
    
    /* setup the buffer that will recv the results */
    allgather_buf = OBJ_NEW(opal_buffer_t);
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s allgather_list: waiting to recv %ld inputs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)num_peers-1));
    
    /* post the non-blocking recv */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST,
                                 ORTE_RML_NON_PERSISTENT, allgather_server_recv, NULL);
    if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, num_peers-1);
    
    /* cancel the lingering recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(allgather_buf);
        return rc;
    }
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s allgather_list: received all data",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* copy the received info to the caller's buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(allgather_buf);
        return rc;
    }
    OBJ_RELEASE(allgather_buf);
    
    /* broadcast the results */
    allgather_num_sent = 0;
    for (item = opal_list_get_first(names);
         item != opal_list_get_end(names);
         item = opal_list_get_next(item)) {
        peer = (orte_namelist_t*)item;
        
        /* skip myself */
        if (OPAL_EQUAL == opal_dss.compare(&root->name, &peer->name, ORTE_NAME)) {
            continue;
        }
        
        /* transmit the buffer to this process */
        if (0 > orte_rml.send_buffer_nb(&peer->name, rbuf, ORTE_RML_TAG_ALLGATHER_LIST,
                                        0, allgather_send_cb, 0)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            return ORTE_ERR_COMM_FAILURE;
        }
    }
    
    ORTE_PROGRESSED_WAIT(false, allgather_num_sent, num_peers-1);
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm: allgather_list completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    return ORTE_SUCCESS;
}
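/* The root's final fan-out above counts completions through
 * allgather_send_cb. A minimal sketch, assuming the callback only needs
 * to bump the sent counter that the closing ORTE_PROGRESSED_WAIT is
 * watching:
 */
static void allgather_send_cb(int status, orte_process_name_t *peer,
                              opal_buffer_t *buffer, orte_rml_tag_t tag,
                              void *cbdata)
{
    /* one more result buffer delivered to a peer */
    allgather_num_sent++;
}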