Example #1
/*
 * Check the status of the connection. If the connection failed, we will
 * retry later. Otherwise, send this process's identifier to the peer on
 * the newly connected socket.
 */
static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer)
{
    int so_error = 0;
    opal_socklen_t so_length = sizeof(so_error);

    /* unregister from receiving event notifications */
    opal_event_del(&peer->peer_send_event);

    /* check connect completion status */
    if(getsockopt(peer->peer_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
        opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: getsockopt() failed: %s (%d)\n", 
            ORTE_NAME_ARGS(orte_process_info.my_name),
            ORTE_NAME_ARGS(&(peer->peer_name)),
            strerror(opal_socket_errno),
            opal_socket_errno);
        mca_oob_tcp_peer_close(peer);
        return;
    }

    if(so_error == EINPROGRESS) {
        opal_event_add(&peer->peer_send_event, 0);
        return;
    } else if (so_error == ECONNREFUSED || so_error == ETIMEDOUT) {
        struct timeval tv = { 1,0 };
        if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
            opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
                        "connection failed: %s (%d) - retrying\n", 
                        ORTE_NAME_ARGS(orte_process_info.my_name),
                        ORTE_NAME_ARGS(&(peer->peer_name)),
                        strerror(so_error),
                        so_error);
        }
        mca_oob_tcp_peer_shutdown(peer);
        opal_evtimer_add(&peer->peer_timer_event, &tv);
        return;
    } else if(so_error != 0) {
        /* No need to worry about the return code here - we return regardless
           at this point, and if an error did occur a message has already been
           printed for the user */
        mca_oob_tcp_peer_try_connect(peer);
        return;
    }

    if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
        opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: "
                    "sending ack, %d",
                    ORTE_NAME_ARGS(orte_process_info.my_name),
                    ORTE_NAME_ARGS(&(peer->peer_name)), so_error);
    }

    if(mca_oob_tcp_peer_send_connect_ack(peer) == ORTE_SUCCESS) {
        peer->peer_state = MCA_OOB_TCP_CONNECT_ACK;
        opal_event_add(&peer->peer_recv_event, 0);
    } else {
        opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_complete_connect: unable to send connect ack.",
            ORTE_NAME_ARGS(orte_process_info.my_name),
            ORTE_NAME_ARGS(&(peer->peer_name)));
        mca_oob_tcp_peer_close(peer);
    }
}
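
The completion check above follows the standard POSIX pattern for a non-blocking connect(): once the socket becomes writable, read SO_ERROR to learn how the asynchronous connect finished, and on ECONNREFUSED/ETIMEDOUT fall back to a retry timer. A minimal, self-contained sketch of just the SO_ERROR check, independent of the ORTE peer types (the function name is illustrative):

#include <sys/socket.h>

/* After a non-blocking connect() and a writable notification, SO_ERROR holds
 * the outcome: 0 on success, otherwise the saved errno value.
 * Returns that value, or -1 if getsockopt() itself failed. */
static int check_connect_completion(int sd)
{
    int so_error = 0;
    socklen_t len = sizeof(so_error);

    if (getsockopt(sd, SOL_SOCKET, SO_ERROR, &so_error, &len) < 0) {
        return -1;
    }
    return so_error;   /* e.g. 0, EINPROGRESS, ECONNREFUSED, ETIMEDOUT */
}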
/* this function is called periodically by the event
 * library so we can check on the state
 * of the various orteds
 */
static void check_heartbeat(int fd, short dummy, void *arg)
{
    int v;
    orte_proc_t *proc;
    orte_job_t *daemons;
    struct timeval timeout;
    bool died = false;
    opal_event_t *tmp = (opal_event_t*)arg;
    struct timeval now;
    
    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:base:check_heartbeat",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* if we are aborting or shutting down, ignore this */
    if (orte_abnormal_term_ordered || 0 == orte_heartbeat_rate) {
        return;
    }
    
    /* get the job object for the daemons */
    if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return;
    }
    
    /* get current time */
    gettimeofday(&timeout, NULL);
    
    /* cycle through the daemons - make sure we check them all
     * in case multiple daemons died so all of those that did die
     * can be appropriately flagged
     */
    for (v=1; v < daemons->procs->size; v++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) {
            continue;
        }
        if ((timeout.tv_sec - proc->beat) > HEARTBEAT_CK*orte_heartbeat_rate) {
            /* declare this orted dead */
            proc->state = ORTE_PROC_STATE_ABORTED;
            proc->exit_code = ORTE_ERROR_DEFAULT_EXIT_CODE;
            if (NULL == daemons->aborted_proc) {
                daemons->aborted_proc = proc;
            }
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            died = true;
        }
    }
    
    /* if any daemon died, abort */
    if (died) {
        orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1,
                                    ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_ABORTED);
        return;
    }
    
    /* reset the timer */
    now.tv_sec = HEARTBEAT_CK*orte_heartbeat_rate;
    now.tv_usec = 0;
    opal_evtimer_add(tmp, &now);
}
void orte_plm_base_heartbeat(int fd, short event, void *arg)
{
    opal_buffer_t buf;
    orte_plm_cmd_flag_t command = ORTE_PLM_HEARTBEAT_CMD;
    opal_event_t *tmp = (opal_event_t*)arg;
    struct timeval now;
    int rc;
    
    /* setup the buffer */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    
    /* tell the HNP this is a heartbeat */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    
    /* send heartbeat to HNP */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    
    /* reset the timer */
    now.tv_sec = orte_heartbeat_rate;
    now.tv_usec = 0;
    opal_evtimer_add(tmp, &now);
    
CLEANUP:
    OBJ_DESTRUCT(&buf);
}
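
Both heartbeat callbacks above re-arm themselves: the opal_event_t is passed back through the arg pointer and handed to opal_evtimer_add again once the work is done, which is how a one-shot evtimer becomes a periodic one. A minimal sketch of that self-rearming idiom; the callback, helper, interval, and event names are hypothetical, the OPAL calls are the ones used above:

/* hypothetical periodic callback: do the work, then re-arm the same timer */
static void periodic_cb(int fd, short event, void *arg)
{
    opal_event_t *ev = (opal_event_t*)arg;   /* the timer event itself */
    struct timeval next;

    do_periodic_work();                      /* hypothetical payload */

    next.tv_sec = period_secs;               /* hypothetical interval */
    next.tv_usec = 0;
    opal_evtimer_add(ev, &next);             /* schedule the next tick */
}

/* setup: bind the callback, pass the event to itself, and arm it once */
opal_evtimer_set(&timer_event, periodic_cb, &timer_event);
opal_evtimer_add(&timer_event, &first_interval);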
Example #4
static int mca_oob_tcp_peer_start_connect(mca_oob_tcp_peer_t* peer)
{
    int flags;

    /* create socket */
    peer->peer_state = MCA_OOB_TCP_CONNECTING;
    peer->peer_sd = socket(AF_INET, SOCK_STREAM, 0);
    if (peer->peer_sd < 0) {
        /* if we didn't successfully connect, wait 1 second and then try again */
        struct timeval tv = { 1,0 };
        opal_output(0, 
            "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_start_connect: socket() failed: %s (%d)\n",
            ORTE_NAME_ARGS(orte_process_info.my_name),
            ORTE_NAME_ARGS(&(peer->peer_name)),
            strerror(opal_socket_errno),
            opal_socket_errno);
        mca_oob_tcp_peer_shutdown(peer);
        opal_evtimer_add(&peer->peer_timer_event, &tv);
        return ORTE_ERR_UNREACH;
    }

    /* setup socket options */
    mca_oob_tcp_set_socket_options(peer->peer_sd);

    /* setup event callbacks */
    mca_oob_tcp_peer_event_init(peer);

    /* setup the socket as non-blocking */
    if((flags = fcntl(peer->peer_sd, F_GETFL, 0)) < 0) {
        opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_GETFL) failed: %s (%d)\n", 
            ORTE_NAME_ARGS(orte_process_info.my_name),
            ORTE_NAME_ARGS(&(peer->peer_name)),
            strerror(opal_socket_errno),
            opal_socket_errno);
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(peer->peer_sd, F_SETFL, flags) < 0)
            opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_connect: fcntl(F_SETFL) failed: %s (%d)\n", 
                ORTE_NAME_ARGS(orte_process_info.my_name),
                ORTE_NAME_ARGS(&(peer->peer_name)),
                strerror(opal_socket_errno),
                opal_socket_errno);
    }

    /* 
     * We should parse all the IP addresses exported by the peer and try to connect to each of them.
     */

    return mca_oob_tcp_peer_try_connect(peer);
}
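
The fcntl() sequence above is the usual POSIX read-modify-write of the file status flags. A stripped-down version outside the peer structure, with the error reporting reduced to a return code:

#include <fcntl.h>

/* Put a descriptor into non-blocking mode.
 * Returns 0 on success, -1 on error (errno is set by fcntl). */
static int set_socket_nonblocking(int fd)
{
    int flags = fcntl(fd, F_GETFL, 0);       /* read current status flags */
    if (flags < 0) {
        return -1;
    }
    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        return -1;
    }
    return 0;
}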
Example #5
File: orted.c  Project: aosm/openmpi
static void halt_vm(void)
{
    int ret;
    struct timeval tv = { 1, 0 };
    opal_event_t* event;
    opal_list_t attrs;
    opal_list_item_t *item;
    
    /* terminate the vm - this will also wake us up so we can exit */
    OBJ_CONSTRUCT(&attrs, opal_list_t);
    orte_rmgr.add_attribute(&attrs, ORTE_NS_INCLUDE_DESCENDANTS, ORTE_UNDEF, NULL, ORTE_RMGR_ATTR_OVERRIDE);
    ret = orte_pls.terminate_orteds(0, &orte_abort_timeout, &attrs);
    while (NULL != (item = opal_list_remove_first(&attrs))) OBJ_RELEASE(item);
    OBJ_DESTRUCT(&attrs);
    
    /* setup a delay to give the orteds time to complete their departure */
    if (NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
        opal_evtimer_set(event, exit_callback, NULL);
        opal_evtimer_add(event, &tv);
    }
}
Example #6
static int orte_pls_rsh_launch_threaded(orte_jobid_t jobid)
{
    struct timeval tv = { 0, 0 };
    struct opal_event event;
    struct orte_pls_rsh_stack_t stack;

    OBJ_CONSTRUCT(&stack, orte_pls_rsh_stack_t);

    stack.jobid = jobid;
    if( opal_event_progress_thread() ) {
        stack.rc = orte_pls_rsh_launch( jobid );
    } else {
        opal_evtimer_set(&event, orte_pls_rsh_launch_cb, &stack);
        opal_evtimer_add(&event, &tv);

        OPAL_THREAD_LOCK(&stack.mutex);
        while (stack.complete == false) {
            opal_condition_wait(&stack.cond, &stack.mutex);
        }
        OPAL_THREAD_UNLOCK(&stack.mutex);
    }
    OBJ_DESTRUCT(&stack);
    return stack.rc;
}
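
When the caller is not the event/progress thread, the launch is handed off by arming a timer with a zero timeout, which fires on the next pass of the event loop, and the caller then blocks on a condition variable until the callback reports completion. A condensed sketch of that hand-off; the struct, callback, and work function are hypothetical, the OPAL calls are the ones used above:

/* hypothetical container for the deferred request */
struct deferred_work_t {
    opal_mutex_t     mutex;
    opal_condition_t cond;
    volatile bool    complete;
    int              rc;
};

/* runs on the event/progress thread when the zero-delay timer fires */
static void deferred_cb(int fd, short event, void *arg)
{
    struct deferred_work_t *work = (struct deferred_work_t*)arg;

    work->rc = do_the_work();                /* hypothetical payload */

    OPAL_THREAD_LOCK(&work->mutex);
    work->complete = true;
    opal_condition_signal(&work->cond);      /* wake the waiting caller */
    OPAL_THREAD_UNLOCK(&work->mutex);
}

/* caller side: arm the zero-delay timer, then wait for the callback */
struct deferred_work_t work;
opal_event_t ev;
struct timeval zero = { 0, 0 };
opal_evtimer_set(&ev, deferred_cb, &work);
opal_evtimer_add(&ev, &zero);
OPAL_THREAD_LOCK(&work.mutex);
while (!work.complete) {
    opal_condition_wait(&work.cond, &work.mutex);
}
OPAL_THREAD_UNLOCK(&work.mutex);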
Example #7
/*
 *  Receive the peers globally unique process identification from a newly
 *  connected socket and verify the expected response. If so, move the
 *  socket to a connected state.
 */
static int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* peer)
{
    mca_oob_tcp_hdr_t hdr;
    if((mca_oob_tcp_peer_recv_blocking(peer, &hdr, sizeof(hdr))) != sizeof(hdr)) {
        /* If the peer state is still CONNECT_ACK, that indicates that
           the error was a reset from the remote host because the
           connection was not able to be fully established.  In that
           case, clean up the connection and give it another go.  */
        if (peer->peer_state == MCA_OOB_TCP_CONNECT_ACK) {
            struct timeval tv = { 1,0 };
            if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
                opal_output(0,
                            "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack "
                            "connect failed during receive.  Restarting (%s).",
                            ORTE_NAME_ARGS(orte_process_info.my_name),
                            ORTE_NAME_ARGS(&(peer->peer_name)),
                            strerror(opal_socket_errno));
            }
            opal_event_del(&peer->peer_recv_event);
            mca_oob_tcp_peer_shutdown(peer);
            opal_evtimer_add(&peer->peer_timer_event, &tv);
            return ORTE_SUCCESS;
        } else {
            mca_oob_tcp_peer_close(peer);
            return ORTE_ERR_UNREACH;
        }
    }
    MCA_OOB_TCP_HDR_NTOH(&hdr);
    if(hdr.msg_type != MCA_OOB_TCP_CONNECT) {
        opal_output(0, "mca_oob_tcp_peer_recv_connect_ack: invalid header type: %d\n", 
                    hdr.msg_type);
        mca_oob_tcp_peer_close(peer);
        return ORTE_ERR_UNREACH;
    }

    /* compare the peers name to the expected value */
    if(memcmp(&peer->peer_name, &hdr.msg_src, sizeof(orte_process_name_t)) != 0) {
        opal_output(0, "[%lu,%lu,%lu]-[%lu,%lu,%lu] mca_oob_tcp_peer_recv_connect_ack: "
            "received unexpected process identifier [%d,%d,%d]\n",
            ORTE_NAME_ARGS(orte_process_info.my_name),
            ORTE_NAME_ARGS(&(peer->peer_name)),
            ORTE_NAME_ARGS(&(hdr.msg_src)));
        mca_oob_tcp_peer_close(peer);
        return ORTE_ERR_UNREACH;
    }

    /* if we have an invalid name or do not have one assigned at all -
     * use the name returned by the peer.  This needs to be a LITERAL
     * comparison - we do NOT want wildcard values to return EQUAL
     */
    if(orte_process_info.my_name == NULL) {
        orte_ns.create_process_name(&orte_process_info.my_name, 
            hdr.msg_dst.cellid, hdr.msg_dst.jobid, hdr.msg_dst.vpid);
    } else if (orte_ns.compare_fields(ORTE_NS_CMP_ALL, orte_process_info.my_name, ORTE_NAME_INVALID) == ORTE_EQUAL) {
        *orte_process_info.my_name = hdr.msg_dst;
    }

    /* connected */
    mca_oob_tcp_peer_connected(peer);
    if(mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT) {
        mca_oob_tcp_peer_dump(peer, "connected");
    }
    return ORTE_SUCCESS;
}
Example #8
static void check_debugger(int fd, short event, void *arg)
{
    struct timeval now;
    opal_event_t *tmp = (opal_event_t*)arg;
    orte_job_t *jdata;
    orte_app_context_t *app;
    char cwd[OPAL_PATH_MAX];
    int rc;
    int32_t ljob;

    if (MPIR_being_debugged) {
        if (orte_debug_flag) {
            opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
            MPIR_executable_path);
        }
        
        /* a debugger has attached! All the MPIR_Proctable
         * data is already available, so we only need to
         * check to see if we should spawn any daemons
         */
        if ('\0' != MPIR_executable_path[0]) {
            /* this will be launched just like a regular job,
             * so we do not use the global orte_debugger_daemon
             * as this is reserved for co-location upon startup
             */
            jdata = OBJ_NEW(orte_job_t);
            /* create a jobid for these daemons - this is done solely
             * to avoid confusing the rest of the system's bookkeeping
             */
            orte_plm_base_create_jobid(jdata);
            /* flag the job as being debugger daemons */
            jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
            /* unless directed, we do not forward output */
            if (!MPIR_forward_output) {
                jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
            }
            /* set the mapping policy to "pernode" so we only get
             * one debugger daemon on each node
             */
            jdata->map = OBJ_NEW(orte_job_map_t);
            jdata->map->npernode = 1;
            /* add it to the global job pool */
            ljob = ORTE_LOCAL_JOBID(jdata->jobid);
            opal_pointer_array_set_item(orte_job_data, ljob, jdata);
            /* create an app_context for the debugger daemon */
            app = OBJ_NEW(orte_app_context_t);
            app->app = strdup((char*)MPIR_executable_path);
            if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
                orte_show_help("help-orterun.txt", "orterun:init-failure",
                               true, "get the cwd", rc);
                OBJ_RELEASE(jdata);
                goto RELEASE;
            }
            app->cwd = strdup(cwd);
            app->user_specified_cwd = false;
            opal_argv_append_nosize(&app->argv, app->app);
            build_debugger_args(app);
            opal_pointer_array_add(jdata->apps, &app->super);
            jdata->num_apps = 1;
            /* now go ahead and spawn this job */
            if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        
    RELEASE:
        /* notify the debugger that all is ready */
        MPIR_Breakpoint();

    } else {
        /* reissue the timer to wake us up again */
        now.tv_sec = orte_debugger_check_rate;
        now.tv_usec = 0;
        opal_evtimer_add(tmp, &now);
    }
}
Example #9
static void
rml_oob_recv_route_callback(int status,
                            struct orte_process_name_t* peer,
                            struct iovec* iov,
                            int count,
                            orte_rml_tag_t tag,
                            void *cbdata)
{
    orte_rml_oob_msg_header_t *hdr = 
        (orte_rml_oob_msg_header_t*) iov[0].iov_base;
    int real_tag;
    int ret;
    orte_process_name_t next, origin;

    /* BWB -- propagate errors here... */
    assert(status >= 0);

    ORTE_RML_OOB_MSG_HEADER_NTOH(*hdr);

    origin = hdr->origin;

    next = orte_routed.get_route(&hdr->destination);
    if (next.vpid == ORTE_VPID_INVALID) {
        opal_output(0, "%s:route_callback tried routing message from %s to %s:%d, can't find route",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&origin),
                    ORTE_NAME_PRINT(&hdr->destination),
                    hdr->tag);
        opal_backtrace_print(stderr);
        orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
    }

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
        opal_output(0, "%s:route_callback trying to get message from %s to %s:%d, routing loop",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&origin),
                    ORTE_NAME_PRINT(&hdr->destination),
                    hdr->tag);
        opal_backtrace_print(stderr);
        orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
    }

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, &hdr->destination)) {
        real_tag = hdr->tag;
    } else {
        real_tag = ORTE_RML_TAG_RML_ROUTE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                         "%s routing message from %s for %s to %s (tag: %d)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&hdr->origin),
                         ORTE_NAME_PRINT(&hdr->destination),
                         ORTE_NAME_PRINT(&next),
                         hdr->tag));

    ORTE_RML_OOB_MSG_HEADER_HTON(*hdr);

    ret = orte_rml_oob_module.active_oob->oob_send_nb(&next,
                                                      &origin,
                                                      iov,
                                                      count,
                                                      real_tag,
                                                      0,
                                                      rml_oob_recv_route_send_callback,
                                                      NULL);

    if (ORTE_SUCCESS != ret) {
        if (ORTE_ERR_ADDRESSEE_UNKNOWN == ret) {
            /* no route -- queue and hope we find a route */
            orte_rml_oob_queued_msg_t *qmsg = OBJ_NEW(orte_rml_oob_queued_msg_t);
            OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                                 "%s: no OOB information for %s.  Queuing for later.",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&next)));
            ORTE_RML_OOB_MSG_HEADER_NTOH(*hdr);
            qmsg->payload[0].iov_base = (char *) malloc(iov[0].iov_len);
            if (NULL == qmsg->payload[0].iov_base) abort();
            qmsg->payload[0].iov_len = iov[0].iov_len;
            memcpy(qmsg->payload[0].iov_base, iov[0].iov_base, iov[0].iov_len);
            OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
            opal_list_append(&orte_rml_oob_module.queued_routing_messages,
                             &qmsg->super);
            if (1 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
                opal_evtimer_add(orte_rml_oob_module.timer_event,
                                 &orte_rml_oob_module.timeout);
            }
            OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
        } else {
            opal_output(0,
                        "%s failed to send message to %s: %s (rc = %d)",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&next),
                        opal_strerror(ret),
                        ret);
            orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        }
    }
}
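
Worth noting in the queuing branch above: the retry timer is armed only when the list length goes from zero to one, so no matter how many undeliverable messages accumulate, at most one timer is pending and it drains the whole queue when it fires (see the queued-progress handler in the next example). The guard in generic form, with hypothetical list, lock, and timer names:

/* arm the retry timer only on the empty -> non-empty transition,
 * so repeated enqueues never stack additional timers */
OPAL_THREAD_LOCK(&queue_lock);                       /* hypothetical lock */
opal_list_append(&pending_msgs, &item->super);       /* hypothetical list/item */
if (1 == opal_list_get_size(&pending_msgs)) {
    opal_evtimer_add(retry_timer, &retry_interval);  /* hypothetical timer */
}
OPAL_THREAD_UNLOCK(&queue_lock);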
Example #10
static void 
rml_oob_queued_progress(int fd, short event, void *arg)
{
    orte_rml_oob_queued_msg_t *qmsg;
    orte_rml_oob_msg_header_t *hdr;
    int real_tag;
    int ret;
    orte_process_name_t next, origin;

    while (true) {
        OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
        qmsg = (orte_rml_oob_queued_msg_t*) opal_list_remove_first(&orte_rml_oob_module.queued_routing_messages);
        OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
        if (NULL == qmsg) break;

        hdr = (orte_rml_oob_msg_header_t*) qmsg->payload[0].iov_base;
        origin = hdr->origin;

        next = orte_routed.get_route(&hdr->destination);
        if (next.vpid == ORTE_VPID_INVALID) {
            opal_output(0,
                        "%s:queued progress tried routing message from %s to %s:%d, can't find route",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&hdr->origin),
                        ORTE_NAME_PRINT(&hdr->destination),
                        hdr->tag);
            opal_backtrace_print(stderr);
            orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        }

        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
            opal_output(0, "%s:queued progress trying to get message from %s to %s:%d, routing loop",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&hdr->origin),
                        ORTE_NAME_PRINT(&hdr->destination),
                        hdr->tag);
            opal_backtrace_print(stderr);
            orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        }

        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, &hdr->destination)) {
            real_tag = hdr->tag;
        } else {
            real_tag = ORTE_RML_TAG_RML_ROUTE;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                             "%s routing message from %s for %s to %s (tag: %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&hdr->origin),
                             ORTE_NAME_PRINT(&hdr->destination),
                             ORTE_NAME_PRINT(&next),
                             hdr->tag));

        ORTE_RML_OOB_MSG_HEADER_HTON(*hdr);

        ret = orte_rml_oob_module.active_oob->oob_send_nb(&next,
                                                          &origin,
                                                          qmsg->payload,
                                                          1,
                                                          real_tag,
                                                          0,
                                                          rml_oob_recv_route_queued_send_callback,
                                                          qmsg);

        if (ORTE_SUCCESS != ret) {
            if (ORTE_ERR_ADDRESSEE_UNKNOWN == ret) {
                /* still no route -- try again */
                ORTE_RML_OOB_MSG_HEADER_NTOH(*hdr);
                OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
                opal_list_append(&orte_rml_oob_module.queued_routing_messages,
                                 &qmsg->super);
                if (1 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
                    opal_evtimer_add(orte_rml_oob_module.timer_event,
                                     &orte_rml_oob_module.timeout);
                }
                OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
            } else {
                opal_output(0,
                            "%s failed to send message from %s to %s:%d %s (rc = %d)",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&next),
                            ORTE_NAME_PRINT(&origin),
                            real_tag,
                            ORTE_ERROR_NAME(ret),
                            ret);
                abort();
            }
        }
    }
}
Example #11
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout)
{
    int rc;
    orte_buffer_t cmd;
    orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS;
    opal_list_item_t *item;
    orte_pls_daemon_info_t *dmn;
    opal_event_t* event = NULL;

    OPAL_TRACE(1);

    OBJ_CONSTRUCT(&cmd, orte_buffer_t);
    
    /* pack the command */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        return rc;
    }
    
    /* pack the jobid */
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&cmd);
        return rc;
    }
    
    /* send the commands as fast as we can */
    for (item = opal_list_get_first(daemons);
         item != opal_list_get_end(daemons);
         item = opal_list_get_next(item)) {
        dmn = (orte_pls_daemon_info_t*)item;

        if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED,
                                        0, orte_pls_base_orted_send_cb, NULL)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            OBJ_DESTRUCT(&cmd);
            return ORTE_ERR_COMM_FAILURE;   /* rc still holds ORTE_SUCCESS here */
        }
            
        orted_cmd_num_active++;
    }
    OBJ_DESTRUCT(&cmd);

    /* post the receive for the ack's */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK,
                                 ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* define the default completion status */
    completion_status = ORTE_SUCCESS;
    
    /* wait for all commands to have been received */
    OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock);
    if (orted_cmd_num_active > 0) {
        /* setup a delay to give the orteds time to complete their departure - wake us up if they
        * don't exit by the prescribed time
        */
        if (NULL != timeout &&  /* only do this if the user gave us a time to wait */
            NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) {
            opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL);
            opal_evtimer_add(event, timeout);
        }
        /* now go to sleep until woken up */
        opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock);
    }
    OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock);
    
    /* log an error if one occurred */
    if (ORTE_SUCCESS != completion_status) {
        ORTE_ERROR_LOG(completion_status);
    }

    /* if started, kill the timer event so it doesn't hit us later */
    if (NULL != event) {
        opal_evtimer_del(event);
        free(event);
    }    
    
    /* we're done! */
    return completion_status;
}
Example #12
static int show_help(const char *filename, const char *topic,
                     const char *output, orte_process_name_t *sender)
{
    int rc;
    tuple_list_item_t *tli = NULL;
    orte_namelist_t *pnli;
    time_t now = time(NULL);

    /* If we're aggregating, check for duplicates.  Otherwise, don't
       track duplicates at all and always display the message. */
    if (orte_help_want_aggregate) {
        rc = get_tli(filename, topic, &tli);
    } else {
        rc = ORTE_ERR_NOT_FOUND;
    }

    /* Was it already displayed? */
    if (ORTE_SUCCESS == rc) {
        /* Yes.  But do we want to print anything?  That's complicated.

           We always show the first message of a given (filename,
           topic) tuple as soon as it arrives.  But we don't want to
           show duplicate notices often, because we could get overrun
           with them.  So we want to gather them up and say "We got N
           duplicates" every once in a while.

           And keep in mind that at termination, we'll unconditionally
           show all accumulated duplicate notices.

           A simple scheme is as follows:
           - when the first of a (filename, topic) tuple arrives
             - print the message
             - if a timer is not set, set T=now
           - when a duplicate (filename, topic) tuple arrives
             - if now>(T+5) and timer is not set (due to
               non-pre-emptiveness of our libevent, a timer *could* be
               set!)
               - print all accumulated duplicates
               - reset T=now
             - else if a timer was not set, set the timer for T+5
             - else if a timer was set, do nothing (just wait)
           - set T=now when the timer expires
        */
        ++tli->tli_count_since_last_display;
        if (now > show_help_time_last_displayed + 5 && !show_help_timer_set) {
            show_accumulated_duplicates(0, 0, NULL);
        } else if (!show_help_timer_set) {
            opal_evtimer_set(&show_help_timer_event,
                             show_accumulated_duplicates, NULL);
            opal_evtimer_add(&show_help_timer_event, &show_help_interval);
            show_help_timer_set = true;
        }
    }
    /* Not already displayed */
    else if (ORTE_ERR_NOT_FOUND == rc) {
        if (orte_xml_output) {
            char *tmp;
            tmp = xml_format((unsigned char*)output);
            fprintf(orte_xml_fp, "%s", tmp);
            fflush(orte_xml_fp);
            free(tmp);
        } else {
            fprintf(stderr, "%s", output);
        }
        if (!show_help_timer_set) {
            show_help_time_last_displayed = now;
        }
    }
    /* Some other error occurred */
    else {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* If we're aggregating, add this process name to the list */
    if (orte_help_want_aggregate) {
        pnli = OBJ_NEW(orte_namelist_t);
        if (NULL == pnli) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        pnli->name = *sender;
        opal_list_append(&(tli->tli_processes), &(pnli->item));
    }
    return ORTE_SUCCESS;
}
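
The long comment in this example describes a debounce scheme: show the first (filename, topic) message immediately, then batch duplicates and flush them either right away if more than five seconds have passed since the last display, or via a one-shot timer otherwise. A reduced sketch of that logic with hypothetical state variables and helpers (the real code keys its counters per tuple via get_tli):

static time_t         last_display;              /* when duplicates were last flushed */
static bool           timer_set;                 /* is a flush timer already pending? */
static opal_event_t   flush_timer;
static struct timeval flush_delay = { 5, 0 };    /* matches the 5-second window above */

/* called for every duplicate (filename, topic) message */
static void on_duplicate(void)
{
    time_t now = time(NULL);

    if (now > last_display + 5 && !timer_set) {
        flush_duplicates();                      /* hypothetical: print "got N duplicates" */
        last_display = now;
    } else if (!timer_set) {
        /* defer the flush; flush_timer_cb (hypothetical) flushes the batch,
         * clears timer_set, and updates last_display when the timer expires */
        opal_evtimer_set(&flush_timer, flush_timer_cb, NULL);
        opal_evtimer_add(&flush_timer, &flush_delay);
        timer_set = true;
    }
    /* else: a timer is already pending, just wait for it */
}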