Ejemplo n.º 1
0
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route)
{
    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* if I am an application process, we don't update the route since
     * we automatically route everything through the local daemon
     */
    if (ORTE_PROC_IS_APP) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_binomial_update: %s --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(route)));


    /* if I am a daemon and the target is my HNP, then check
     * the route - if it isn't direct, then we just flag that
     * we have a route to the HNP
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
        OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
        hnp_direct = false;
        return ORTE_SUCCESS;
    }

    return ORTE_SUCCESS;
}
Ejemplo n.º 2
0
void orte_sstore_central_local_recv(int status,
                                    orte_process_name_t* sender,
                                    opal_buffer_t* buffer,
                                    orte_rml_tag_t tag,
                                    void* cbdata)
{
    int ret;
    orte_sstore_central_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_sstore_base_handle_t loc_id;
    orte_sstore_central_local_snapshot_info_t *handle_info = NULL;

    if( ORTE_RML_TAG_SSTORE_INTERNAL != tag ) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(local): process_cmd(%s)",
                         ORTE_NAME_PRINT(sender)));

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &loc_id, &count, ORTE_SSTORE_HANDLE )) ) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /*
     * Find the referenced handle (Create if it does not exist)
     */
    if(NULL == (handle_info = find_handle_info(loc_id)) ) {
        handle_info = create_new_handle_info(loc_id);
    }

    /*
     * Process the command
     */
    if( ORTE_SSTORE_CENTRAL_PULL == command ) {
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, sender)) {
            process_global_pull(sender, buffer, handle_info);
        } else {
            process_app_pull(sender, buffer, handle_info);
        }
    }
    else if( ORTE_SSTORE_CENTRAL_PUSH == command ) {
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, sender)) {
            process_global_push(sender, buffer, handle_info);
        } else {
            process_app_push(sender, buffer, handle_info);
        }
    }

 cleanup:
    return;
}
Ejemplo n.º 3
0
/*
 * One of our local procs wants us to close the specifed
 * stream(s), thus terminating any potential io to/from it.
 * For the orted, this just means closing the local fd
 */
static int orted_close(const orte_process_name_t* peer,
                       orte_iof_tag_t source_tag)
{
    opal_list_item_t *item, *next_item;
    orte_iof_sink_t* sink;
    orte_ns_cmp_bitmask_t mask;

    OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);
    
    for(item = opal_list_get_first(&mca_iof_orted_component.sinks);
        item != opal_list_get_end(&mca_iof_orted_component.sinks);
        item = next_item ) {
        sink = (orte_iof_sink_t*)item;
        next_item = opal_list_get_next(item);
        
        mask = ORTE_NS_CMP_ALL;

        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, peer) &&
           (source_tag & sink->tag)) {

            /* No need to delete the event or close the file
             * descriptor - the destructor will automatically
             * do it for us.
             */
            opal_list_remove_item(&mca_iof_orted_component.sinks, item);
            OBJ_RELEASE(item);
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);

    return ORTE_SUCCESS;
}
Ejemplo n.º 4
0
static int
_process_name_compare(const opal_process_name_t p1, const opal_process_name_t p2)
{
    orte_process_name_t* o1 = (orte_process_name_t*)&p1;
    orte_process_name_t* o2 = (orte_process_name_t*)&p2;
    return orte_util_compare_name_fields(ORTE_NS_CMP_ALL, o1, o2);
}
Ejemplo n.º 5
0
int orte_sstore_central_global_unpack(orte_process_name_t* peer, opal_buffer_t* buffer, orte_sstore_base_handle_t *handle)
{
    int ret, exit_status = ORTE_SUCCESS;

    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(global): unpack()"));

    /*
     * Unpack the handle id
     */
    if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID,
                                                   ORTE_PROC_MY_NAME,
                                                   peer)) {
        /*
         * Differ to the orted version, so if we have application then they get updated too
         */
        if( ORTE_SUCCESS != (ret = orte_sstore_central_local_unpack(peer, buffer, handle)) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }

 cleanup:
    return exit_status;
}
Ejemplo n.º 6
0
/**
* PROC
 */
int orte_dt_compare_proc(orte_proc_t *value1, orte_proc_t *value2, opal_data_type_t type)
{
    orte_ns_cmp_bitmask_t mask;

    /** check vpids */
    mask = ORTE_NS_CMP_VPID;

    return orte_util_compare_name_fields(mask, &value1->name, &value2->name);
}
/* Find endpoint for specific subnet/lid/message */
static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* process_name,
        uint64_t subnet_id, uint16_t lid, uint8_t message_type)
{
    size_t i;
    mca_btl_openib_proc_t *ib_proc;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;
    bool found = false;

    BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
                "jobid %d, vpid %d, sid %d, lid %d",
                process_name->jobid, process_name->vpid, subnet_id, lid));
    /* find ibproc */
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
    for (ib_proc = (mca_btl_openib_proc_t*)
            opal_list_get_first(&mca_btl_openib_component.ib_procs);
            ib_proc != (mca_btl_openib_proc_t*)
            opal_list_get_end(&mca_btl_openib_component.ib_procs);
            ib_proc  = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
        if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                    &ib_proc->proc_guid, process_name) == OPAL_EQUAL) {
            found = true;
            break;
        }
    }
    /* we found our ib_proc, lets find endpoint now */
    if (found) {
        for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
            ib_endpoint = ib_proc->proc_endpoints[i];
            /* we need to check different
             * lid for different message type */
            if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type ||
                    ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) {
                /* response message */
                if (ib_endpoint->subnet_id == subnet_id &&
                        ib_endpoint->ib_addr->lid == lid) {
                    break; /* Found one */
                }
            } else {
                /* request message */
                if (ib_endpoint->subnet_id == subnet_id &&
                        ib_endpoint->endpoint_btl->lid == lid) {
                    break; /* Found one */
                }
            }
        }
        if (NULL == ib_endpoint) {
                BTL_ERROR(("can't find suitable endpoint for this peer\n"));
        }
    } else {
            BTL_ERROR(("can't find suitable endpoint for this peer\n"));
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
    return ib_endpoint;
}
Ejemplo n.º 8
0
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
    orte_ns_cmp_bitmask_t mask;

    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    /* if it is me, the local rank is zero */
    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
        return 0;
    }
    
    /* otherwise, no idea */
    return ORTE_LOCAL_RANK_INVALID;
}
Ejemplo n.º 9
0
static char* proc_get_hostname(orte_process_name_t *proc)
{
    orte_ns_cmp_bitmask_t mask;

    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    /* if it is me, the answer is my nodename */
    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
        return orte_process_info.nodename;
    }
    
    /* otherwise, no idea */
    return NULL;
}
Ejemplo n.º 10
0
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
{
    orte_ns_cmp_bitmask_t mask;

    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;

    /* if it is me, the answer is my daemon's vpid */
    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, proc, ORTE_PROC_MY_NAME)) {
        return ORTE_PROC_MY_DAEMON->vpid;
    }

    /* otherwise, no idea */
    return ORTE_VPID_INVALID;
}
Ejemplo n.º 11
0
static int route_lost(const orte_process_name_t *route)
{
    opal_list_item_t *item;
    orte_routed_tree_t *child;

    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s route to %s lost",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(route)));

    /* if we lose the connection to the lifeline and we are NOT already,
     * in finalize, tell the OOB to abort.
     * NOTE: we cannot call abort from here as the OOB needs to first
     * release a thread-lock - otherwise, we will hang!!
     */
    if (!orte_finalizing &&
        NULL != lifeline &&
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed:binomial: Connection to lifeline %s lost",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(lifeline)));
        return ORTE_ERR_FATAL;
    }

    /* if we are the HNP or a daemon, is it a daemon, and one of my children? if so, then
     * remove it from the child list
     */
    if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
        route->jobid == ORTE_PROC_MY_NAME->jobid) {
        for (item = opal_list_get_first(&my_children);
             item != opal_list_get_end(&my_children);
             item = opal_list_get_next(item)) {
            child = (orte_routed_tree_t*)item;
            if (child->vpid == route->vpid) {
                OPAL_OUTPUT_VERBOSE((4, orte_routed_base_framework.framework_output,
                                     "%s routed_binomial: removing route to child daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(route)));
                opal_list_remove_item(&my_children, item);
                OBJ_RELEASE(item);
                return ORTE_SUCCESS;
            }
        }
    }

    /* we don't care about this one, so return success */
    return ORTE_SUCCESS;
}
Ejemplo n.º 12
0
static void mca_oob_tcp_msg_ident(mca_oob_tcp_msg_t* msg, mca_oob_tcp_peer_t* peer)
{
    orte_process_name_t src = msg->msg_hdr.msg_src;
    
    OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);

    if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &peer->peer_name, &src) != OPAL_EQUAL) {
        opal_hash_table_remove_value_uint64(&mca_oob_tcp_component.tcp_peers, 
                                            orte_util_hash_name(&peer->peer_name));
        peer->peer_name = src;
        opal_hash_table_set_value_uint64(&mca_oob_tcp_component.tcp_peers, 
                                         orte_util_hash_name(&peer->peer_name), peer);
    }
    OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
Ejemplo n.º 13
0
int orte_rml_oob_send_nb(orte_process_name_t* peer,
                         struct iovec* iov,
                         int count,
                         orte_rml_tag_t tag,
                         orte_rml_callback_fn_t cbfunc,
                         void* cbdata)
{
    orte_rml_send_request_t *req;

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                         "%s rml_send to peer %s at tag %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(peer), tag));

    if (ORTE_RML_TAG_INVALID == tag) {
        /* cannot send to an invalid tag */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    if( NULL == peer ||
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
        /* cannot send to an invalid peer */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* get ourselves into an event to protect against
     * race conditions and threads
     */
    req = OBJ_NEW(orte_rml_send_request_t);
    req->post.dst = *peer;
    req->post.iov = iov;
    req->post.count = count;
    req->post.tag = tag;
    req->post.cbfunc.iov = cbfunc;
    req->post.cbdata = cbdata;
    /* setup the event for the send callback */
    opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req);
    opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
    opal_event_active(&req->ev, OPAL_EV_WRITE, 1);

    return ORTE_SUCCESS;
}
Ejemplo n.º 14
0
bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint,
                                 struct sockaddr* addr, int sd)
{
    mca_btl_tcp2_proc_t* this_proc = mca_btl_tcp2_proc_local();
    mca_btl_tcp2_proc_t *endpoint_proc = btl_endpoint->endpoint_proc;
    int cmpval;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);

    if(NULL == btl_endpoint->endpoint_addr) {
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        return false;
    }

    cmpval = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, 
                                    &endpoint_proc->proc_ompi->proc_name,
                                    &this_proc->proc_ompi->proc_name);
    if((btl_endpoint->endpoint_sd < 0) ||
       (btl_endpoint->endpoint_state != MCA_BTL_TCP_CONNECTED &&
        cmpval < 0)) {
        mca_btl_tcp2_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_sd = sd;
        if(mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint) != OMPI_SUCCESS) {
            mca_btl_tcp2_endpoint_close(btl_endpoint);
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return false;
        }
        mca_btl_tcp2_endpoint_event_init(btl_endpoint);
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
        mca_btl_tcp2_endpoint_connected(btl_endpoint);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
        mca_btl_tcp2_endpoint_dump(btl_endpoint, "accepted");
#endif
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        return true;
    }
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
    return false;
}
Ejemplo n.º 15
0
static void cleanup_local_proc(orte_job_t *jdata,
                               orte_process_name_t *proc)
{
    orte_proc_t *pptr;
    int i;

    /* see if this is a local proc to me */
    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, proc, &pptr->name)) {
            opal_pointer_array_set_item(orte_local_children, i, NULL);
            OBJ_RELEASE(pptr);
            jdata->num_local_procs--;
            return;
        }
    }
}
Ejemplo n.º 16
0
static int route_lost(const orte_process_name_t *route)
{
    /* if we lose the connection to the lifeline and we are NOT already,
     * in finalize, tell the OOB to abort.
     * NOTE: we cannot call abort from here as the OOB needs to first
     * release a thread-lock - otherwise, we will hang!!
     */
    if (!orte_finalizing &&
        NULL != lifeline &&
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
        opal_output(0, "%s routed:linear: Connection to lifeline %s lost",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(lifeline));
        return ORTE_ERR_FATAL;
    }

    /* we don't care about this one, so return success */
    return ORTE_SUCCESS;
}
Ejemplo n.º 17
0
/*
 *  Receive the endpoints globally unique process identification from a newly
 *  connected socket and verify the expected response. If so, move the
 *  socket to a connected state.
 */
static int mca_btl_tcp2_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
    orte_process_name_t guid;
    mca_btl_tcp2_proc_t* btl_proc = btl_endpoint->endpoint_proc;

    if((mca_btl_tcp2_endpoint_recv_blocking(btl_endpoint, &guid, sizeof(orte_process_name_t))) != sizeof(orte_process_name_t)) {
        return OMPI_ERR_UNREACH;
    }
    ORTE_PROCESS_NAME_NTOH(guid);
    /* compare this to the expected values */
    if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                    &btl_proc->proc_ompi->proc_name,
                                                    &guid)) {
        BTL_ERROR(("received unexpected process identifier %s", 
                   ORTE_NAME_PRINT(&guid)));
        mca_btl_tcp2_endpoint_close(btl_endpoint);
        return OMPI_ERR_UNREACH;
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 18
0
static orte_sstore_central_local_app_snapshot_info_t *find_app_handle_info(orte_sstore_central_local_snapshot_info_t *handle_info,
                                                                           orte_process_name_t *name)
{
    orte_sstore_central_local_app_snapshot_info_t *app_info = NULL;
    opal_list_item_t* item = NULL;
    orte_ns_cmp_bitmask_t mask;

    for(item  = opal_list_get_first(handle_info->app_info_handle);
        item != opal_list_get_end(handle_info->app_info_handle);
        item  = opal_list_get_next(item) ) {
        app_info = (orte_sstore_central_local_app_snapshot_info_t*)item;

        mask = ORTE_NS_CMP_ALL;

        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &app_info->name, name)) {
            return app_info;
        }
    }

    return NULL;
}
Ejemplo n.º 19
0
static oshmem_proc_t *
oshmem_proc_find_and_add(const orte_process_name_t * name, bool* isnew)
{
    oshmem_proc_t *proc, *rproc = NULL;
    orte_ns_cmp_bitmask_t mask;

    /* return the proc-struct which matches this jobid+process id */
    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
            proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
            proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (OPAL_EQUAL
                == orte_util_compare_name_fields(mask,
                                                 (orte_process_name_t*)&proc->super.proc_name,
                                                 name)) {
            rproc = proc;
            *isnew = false;
            break;
        }
    }

    /* if we didn't find this proc in the list, create a new
     * proc_t and append it to the list
     */
    if (NULL == rproc) {
        *isnew = true;
        rproc = OBJ_NEW(oshmem_proc_t);
        if (NULL != rproc) {
            opal_list_append(&oshmem_proc_list, (opal_list_item_t*)rproc);
            rproc->super.proc_name = *(opal_process_name_t*)name;
        }
        /* caller had better fill in the rest of the proc, or there's
         going to be pain later... */
    }

    OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    return rproc;
}
Ejemplo n.º 20
0
oshmem_proc_t * oshmem_proc_find(const orte_process_name_t * name)
{
    oshmem_proc_t *proc, *rproc = NULL;
    orte_ns_cmp_bitmask_t mask;

    /* return the proc-struct which matches this jobid+process id */
    mask = ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID;
    OPAL_THREAD_LOCK(&oshmem_proc_lock);
    for (proc = (oshmem_proc_t*) opal_list_get_first(&oshmem_proc_list);
            proc != (oshmem_proc_t*) opal_list_get_end(&oshmem_proc_list);
            proc = (oshmem_proc_t*) opal_list_get_next(proc)) {
        if (OPAL_EQUAL
                == orte_util_compare_name_fields(mask,
                                                 (orte_process_name_t*)&proc->super.proc_name,
                                                 name)) {
            rproc = proc;
            break;
        }
    } OPAL_THREAD_UNLOCK(&oshmem_proc_lock);

    return rproc;
}
Ejemplo n.º 21
0
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    orte_ns_cmp_bitmask_t mask;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_app: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));
    
    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        mask = ORTE_NS_CMP_ALL;
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
            return ORTE_SUCCESS;
        }
        /* see is this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            return ORTE_ERR_UNRECOVERABLE;
        }
    }
    return ORTE_SUCCESS;
}
Ejemplo n.º 22
0
static orte_rml_channel_t * get_channel ( orte_process_name_t * peer,
        opal_list_t *qos_attributes,
        bool recv)
{
    orte_rml_channel_t *channel = NULL;
    int32_t i = 0;
    /* search available channels and return channel that matches the attributes */
    for (i=0; i < orte_rml_base.open_channels.size; i++) {
        if (NULL != (channel = (orte_rml_channel_t*) opal_pointer_array_get_item (&orte_rml_base.open_channels, i))) {
            /* compare basic properties */
            if ((OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &channel->peer, peer)) &&
                    ((orte_rml_channel_open == channel->state) ||
                     (orte_rml_channel_opening == channel->state)) &&
                    (channel->recv == recv))
            {
                /* compare channel attributes */
                if( ORTE_SUCCESS == orte_qos_cmp_channel ( channel->qos, channel->qos_channel_ptr, qos_attributes))
                    return channel;

            }
        }
    }
    return NULL;
}
Ejemplo n.º 23
0
int orte_global_comm(orte_process_name_t *recipient,
                     opal_buffer_t *buf, orte_rml_tag_t tag,
                     orte_default_cbfunc_t cbfunc)
{
    int ret;
    orte_ns_cmp_bitmask_t mask;

    mask = ORTE_NS_CMP_ALL;
    
    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, recipient, ORTE_PROC_MY_NAME) &&
        NULL != cbfunc) {
        /* if I am the recipient and a direct fn is provided, use a message event */
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buf, tag, cbfunc);
        ret = ORTE_SUCCESS;
    } else {
        /* go ahead and send it */
        if (0 > (ret = orte_rml.send_buffer(recipient, buf, tag, 0))) {
            ORTE_ERROR_LOG(ret);
        } else {
            ret = ORTE_SUCCESS;
        }
    }
    return ret;
}
Ejemplo n.º 24
0
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
        orte_sstore_base_handle_t ss_handle,
        int ckpt_status)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *loc_buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_UPDATE_CMD;
    char *global_snapshot_handle = NULL;
    char *tmp_str = NULL;
    int seq_num;
    orte_ns_cmp_bitmask_t mask;

    /*
     * Noop if invalid peer, or peer not specified (JJH Double check this)
     */
    if( NULL == peer ||
            OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
        /*return ORTE_ERR_BAD_PARAM;*/
        return ORTE_SUCCESS;
    }

    mask = ORTE_NS_CMP_ALL;

    /*
     * Do not send to self, as that is silly.
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
        return ORTE_SUCCESS;
    }

    /*
     * Pass on the checkpoint state.
     */
    orte_snapc_ckpt_state_notify(ckpt_status);

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_update_cmd: Sending update command <status %d>\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                         ckpt_status));

    /********************
     * Send over the status of the checkpoint
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number        (upon finish only)
     ********************/
    if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        OBJ_RELEASE(loc_buffer);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &ckpt_status, 1, OPAL_INT))) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        OBJ_RELEASE(loc_buffer);
        goto cleanup;
    }

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED == ckpt_status ||
            ORTE_SNAPC_CKPT_STATE_ESTABLISHED  == ckpt_status ||
            ORTE_SNAPC_CKPT_STATE_STOPPED   == ckpt_status ||
            ORTE_SNAPC_CKPT_STATE_ERROR     == ckpt_status ) {

        if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
            if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
                                       SSTORE_METADATA_GLOBAL_SNAP_REF,
                                       &global_snapshot_handle)) ) {
                opal_output(orte_snapc_base_framework.framework_output,
                            "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
                            ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
                ORTE_ERROR_LOG(ret);
                /* Do not exit here, continue so that we can inform the tool
                 * that the checkpoint has failed
                 */
            }

            if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
                                       SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                                       &tmp_str)) ) {
                opal_output(orte_snapc_base_framework.framework_output,
                            "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
                            ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
                ORTE_ERROR_LOG(ret);
                /* Do not exit here, continue so that we can inform the tool
                 * that the checkpoint has failed
                 */
            }

            if( NULL != tmp_str ) {
                seq_num = atoi(tmp_str);
            } else {
                seq_num = -1;
            }
        } else {
            /* Checkpoint Error Case */
            global_snapshot_handle = NULL;
            seq_num = -1;
        }

        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                             ckpt_status, global_snapshot_handle, seq_num));

        if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &global_snapshot_handle, 1, OPAL_STRING))) {
            opal_output(orte_snapc_base_framework.framework_output,
                        "%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
                        ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                        ret, __LINE__);
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            OBJ_RELEASE(loc_buffer);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &seq_num, 1, OPAL_INT))) {
            opal_output(orte_snapc_base_framework.framework_output,
                        "%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
                        ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                        ret, __LINE__);
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            OBJ_RELEASE(loc_buffer);
            goto cleanup;
        }
    }

    if (0 > (ret = orte_rml.send_buffer_nb(peer, loc_buffer,
                                           ORTE_RML_TAG_CKPT,
                                           orte_rml_send_callback, NULL))) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        OBJ_RELEASE(loc_buffer);
        goto cleanup;
    }

cleanup:
    if( NULL != global_snapshot_handle ) {
        free(global_snapshot_handle);
        global_snapshot_handle = NULL;
    }
    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }

    return exit_status;
}
Ejemplo n.º 25
0
int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
        opal_buffer_t* buffer,
        opal_crs_base_ckpt_options_t *options,
        orte_jobid_t *jobid)
{
    int ret, exit_status = ORTE_SUCCESS;
    orte_std_cntr_t count = 1;
    orte_ns_cmp_bitmask_t mask;

    mask = ORTE_NS_CMP_ALL;

    /*
     * Do not send to self, as that is silly.
     */
    if (OPAL_EQUAL ==
            orte_util_compare_name_fields(mask, peer, ORTE_PROC_MY_HNP)) {
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_init_cmd: Receiving commands\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));

    /********************
     * Receive command line checkpoint request:
     * - Command (already received)
     * - options
     * - jobid
     ********************/
    if( ORTE_SUCCESS != (ret = orte_snapc_base_unpack_options(buffer, options)) ) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_init_cmd: Error: Unpack (options) Failure (ret = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if ( ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, jobid, &count, ORTE_JOBID)) ) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_init_cmd: Received [%d, %d, %s]\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                         (int)(options->term),
                         (int)(options->stop),
                         ORTE_JOBID_PRINT(*jobid)));

cleanup:
    return exit_status;
}
Ejemplo n.º 26
0
/* this is the read handler for my own child procs and stdin
 */
void orte_iof_mrhnp_read_local_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    int i, j;
    orte_ns_cmp_bitmask_t mask;
    orte_job_t *jdata;
    orte_iof_job_t *iofjob;
    orte_node_t *node;
    orte_proc_t *daemon;
    orte_job_map_t *map;
    bool write_out=false;

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */
    
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:mrhnp:read handler read %d bytes from %s:%d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         ORTE_NAME_PRINT(&rev->name), fd));

    if (numbytes < 0) {
        /* either we have a connection error or it was a non-blocking read */
        
        /* non-blocking, retry */
        if (EAGAIN == errno || EINTR == errno) {
            opal_event_add(rev->ev, 0);
            return;
        } 

        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s iof:mrhnp:read handler %s Error on connection:%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&rev->name), fd));
        /* Un-recoverable error. Allow the code to flow as usual in order to
         * to send the zero bytes message up the stream, and then close the
         * file descriptor and delete the event.
         */
        numbytes = 0;
    }
    
    /* if job termination has been ordered, just ignore the
     * data and delete the stdin read event, if that is what fired
     */
    if (orte_job_term_ordered) {
        if (ORTE_IOF_STDIN & rev->tag) {
            OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev);
        }
        return;
    }

    if (ORTE_IOF_STDIN & rev->tag) {
        /* The event has fired, so it's no longer active until we
         * re-add it
         */
        mca_iof_mr_hnp_component.stdinev->active = false;    
        /* if this was read from my stdin, I need to send this input to all
         * daemons who host mapper procs
         */
        for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
            if (NULL == (iofjob = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
                continue;
            }
            jdata = iofjob->jdata;
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                 "%s read %d bytes from stdin - writing to job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            map = jdata->map;
            for (i=0; i < map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                    continue;
                }
                daemon = node->daemon;

                if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* if it is me, then send the bytes down the stdin pipe
                     * for every local proc (they are all on my proct list) - we even send 0 byte events
                     * down the pipe so it forces out any preceding data before
                     * closing the output stream. We add a 0 byte message if
                     * numbytes < sizeof(data) as this means the chunk we read
                     * was the end of the file.
                     */
                    for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
                         item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
                         item = opal_list_get_next(item)) {
                        proct = (orte_iof_proc_t*)item;
                        if (proct->name.jobid == jdata->jobid) {
                            if (NULL == proct->sink) {
                                opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
                                continue;
                            }
                            if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev)) {
                                /* getting too backed up - stop the read event for now if it is still active */
                                if (mca_iof_mr_hnp_component.stdinev->active) {
                                    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                                         "buffer backed up - holding"));
                                    mca_iof_mr_hnp_component.stdinev->active = false;
                                }
                                return;
                            }
                            if (0 < numbytes && numbytes < (int)sizeof(data)) {
                                /* need to write a 0-byte event to clear the stream and close it */
                                orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev);
                                proct->sink = NULL;
                            }
                        }
                    }
                } else {
                    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                         "%s sending %d bytes from stdin to daemon %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                         ORTE_NAME_PRINT(&daemon->name)));
                
                    /* send the data to the daemon so it can
                     * write it to all local procs from this job.
                     * If the connection closed,
                     * numbytes will be zero so zero bytes will be
                     * sent - this will tell the daemon to close
                     * the fd for stdin to that proc
                     */
                    send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
                    if (0 < numbytes && numbytes < (int)sizeof(data)) {
                        /* need to send a 0-byte message to clear the stream and close it */
                        send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, 0);
                    }
                }
            }
        }
        /* if num_bytes was zero, then we need to terminate the event */
        if (0 == numbytes || numbytes < (int)sizeof(data)) {
            /* this will also close our stdin file descriptor */
            if (NULL != mca_iof_mr_hnp_component.stdinev) {
                OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev);
            }
        } else {
            /* if we are looking at a tty, then we just go ahead and restart the
             * read event assuming we are not backgrounded
             */
            if (orte_iof_mrhnp_stdin_check(fd)) {
                restart_stdin(fd, 0, NULL);
            } else {
                /* delay for awhile and then restart */
                ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI);
            }
        }
        return;
    }

    if (ORTE_IOF_STDOUT & rev->tag && 0 < numbytes) {
        /* see if we need to forward this output */
        jdata = orte_get_job_data_object(rev->name.jobid);
        if (ORTE_JOBID_INVALID == jdata->stdout_target) {
            /* end of the chain - just output the info */
            write_out = true;
            goto PROCESS;
        }
        /* it goes to the next job in the chain */
        jdata = orte_get_job_data_object(jdata->stdout_target);
        map = jdata->map;
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            daemon = node->daemon;

            if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
                /* if it is me, then send the bytes down the stdin pipe
                 * for every local proc (they are all on my proct list)
                 */
                for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
                     item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
                     item = opal_list_get_next(item)) {
                    proct = (orte_iof_proc_t*)item;
                    if (proct->name.jobid == jdata->jobid) {
                        if (NULL == proct->sink) {
                            opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
                            continue;
                        }
                        orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev);
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s sending %d bytes from stdout of %s to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&rev->name),
                                     ORTE_NAME_PRINT(&daemon->name)));
                
                /* send the data to the daemon so it can
                 * write it to all local procs from this job
                 */
                send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
            }
        }
    }
    
 PROCESS:
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s read %d bytes from %s of %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         (ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
                         ORTE_NAME_PRINT(&rev->name)));
    
    if (0 == numbytes) {
        /* if we read 0 bytes from the stdout/err/diag, find this proc
         * on our list and
         * release the appropriate event. This will delete the
         * read event and close the file descriptor
         */
        for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
                /* found it - release corresponding event. This deletes
                 * the read event and closes the file descriptor
                 */
                if (rev->tag & ORTE_IOF_STDOUT) {
                    OBJ_RELEASE(proct->revstdout);
                } else if (rev->tag & ORTE_IOF_STDERR) {
                    OBJ_RELEASE(proct->revstderr);
                } else if (rev->tag & ORTE_IOF_STDDIAG) {
                    OBJ_RELEASE(proct->revstddiag);
                }
                /* check to see if they are all done */
                if (NULL == proct->revstdout &&
                    NULL == proct->revstderr &&
                    NULL == proct->revstddiag) {
                    /* this proc's iof is complete */
                    opal_list_remove_item(&mca_iof_mr_hnp_component.procs, item);
                    ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                    OBJ_RELEASE(proct);
                }
                break;
            }
        }
        return;
    } else {
        /* output this to our local output */
        if (ORTE_IOF_STDOUT & rev->tag) {
            if (write_out) {
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev);
            }
        } else {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev);
        }
    }
    
    /* re-add the event */
    opal_event_add(rev->ev, 0);

    return;
}
Ejemplo n.º 27
0
static void sstore_central_global_recv(int status,
                                       orte_process_name_t* sender,
                                       opal_buffer_t* buffer,
                                       orte_rml_tag_t tag,
                                       void* cbdata)
{
    int ret;
    orte_sstore_central_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_sstore_base_handle_t loc_id;
    orte_sstore_central_global_snapshot_info_t *handle_info = NULL;

    if( ORTE_RML_TAG_SSTORE_INTERNAL != tag ) {
        return;
    }

    /*
     * If this was an application process contacting us, then act like an orted
     * instead of an HNP
     */
    if(OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID,
                                                   ORTE_PROC_MY_NAME,
                                                   sender)) {
        orte_sstore_central_local_recv(status, sender, buffer, tag, cbdata);
        return;
    }


    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(global): process_cmd(%s)",
                         ORTE_NAME_PRINT(sender)));

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &loc_id, &count, ORTE_SSTORE_HANDLE )) ) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /*
     * Find the referenced handle
     */
    if(NULL == (handle_info = find_handle_info(loc_id)) ) {
        ; /* JJH big problem */
    }

    /*
     * Process the command
     */
    if( ORTE_SSTORE_CENTRAL_PULL == command ) {
        process_local_pull(sender, buffer, handle_info);
    }
    else if( ORTE_SSTORE_CENTRAL_PUSH == command ) {
        process_local_push(sender, buffer, handle_info);
    }

 cleanup:
    return;
}
Ejemplo n.º 28
0
static int route_lost(const orte_process_name_t *route)
{
    opal_list_item_t *item;
    orte_routed_tree_t *child;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s route to %s lost",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(route)));

    /* if the route is to a different job family and we are the HNP, look it up */
    if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
        ORTE_PROC_IS_HNP) {
        jfamily = ORTE_JOB_FAMILY(route->jobid);
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jfam->job_family == jfamily) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                     "%s routed_radix: route to %s lost",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOB_FAMILY_PRINT(route->jobid)));
                opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
                OBJ_RELEASE(jfam);
                break;
            }
        }
    }

    /* if we lose the connection to the lifeline and we are NOT already,
     * in finalize, tell the OOB to abort.
     * NOTE: we cannot call abort from here as the OOB needs to first
     * release a thread-lock - otherwise, we will hang!!
     */
    if (!orte_finalizing &&
        NULL != lifeline &&
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed:radix: Connection to lifeline %s lost",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(lifeline)));
        return ORTE_ERR_FATAL;
    }

    /* if we are the HNP or daemon, and the route is a daemon,
     * see if it is one of our children - if so, remove it
     */
    if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
        route->jobid == ORTE_PROC_MY_NAME->jobid) {
        for (item = opal_list_get_first(&my_children);
             item != opal_list_get_end(&my_children);
             item = opal_list_get_next(item)) {
            child = (orte_routed_tree_t*)item;
            if (child->vpid == route->vpid) {
                opal_list_remove_item(&my_children, item);
                OBJ_RELEASE(item);
                return ORTE_SUCCESS;
            }
        }
    }

    /* we don't care about this one, so return success */
    return ORTE_SUCCESS;
}
Ejemplo n.º 29
0
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t *ret, daemon;
    opal_list_item_t *item;
    orte_routed_tree_t *child;
    int i;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;

    if (!orte_routing_is_enabled) {
        ret = target;
        goto found;
    }

    /* initialize */
    daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
    daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;

    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* if it is me, then the route is just direct */
    if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
        ret = target;
        goto found;
    }
    
    /* if I am an application process, always route via my local daemon */
    if (ORTE_PROC_IS_APP) {
        ret = ORTE_PROC_MY_DAEMON;
        goto found;
    }

    /* if I am a tool, the route is direct if target is in
     * my own job family, and to the target's HNP if not
     */
    if (ORTE_PROC_IS_TOOL) {
        if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
            ret = target;
            goto found;
        } else {
            ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
            ret = &daemon;
            goto found;
        }
    }
    
    /******     HNP AND DAEMONS ONLY     ******/
    
    /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        /* if I am a daemon, route this via the HNP */
        if (ORTE_PROC_IS_DAEMON) {
            ret = ORTE_PROC_MY_HNP;
            goto found;
        }
        
        /* if I am the HNP or a tool, then I stored a route to
         * this job family, so look it up
         */
        jfamily = ORTE_JOB_FAMILY(target->jobid);
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jfam->job_family == jfamily) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                     "%s routed_binomial: route to %s found",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOB_FAMILY_PRINT(target->jobid)));
                ret = &jfam->route;
                goto found;
            }
        }
        /* not found - so we have no route */
        ret = ORTE_NAME_INVALID;
        goto found;
    }
     
    /* THIS CAME FROM OUR OWN JOB FAMILY... */

    /* if this is going to the HNP, then send it direct if we don't know
     * how to get there - otherwise, send it via the tree
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        if (!hnp_direct || orte_static_ports) {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                 "%s routing to the HNP through my parent %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
            ret = ORTE_PROC_MY_PARENT;
            goto found;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                 "%s routing direct to the HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ret = ORTE_PROC_MY_HNP;
            goto found;
        }
    }
    
    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    /* find out what daemon hosts this proc */
    if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ret = ORTE_NAME_INVALID;
        goto found;
    }
    
    /* if the daemon is me, then send direct to the target! */
    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
        ret = target;
        goto found;
    } else if (orte_process_info.num_procs < mca_routed_radix_component.max_connections) {
        /* if the job is small enough, send direct to the target's daemon */
        ret = &daemon;
        goto found;
    } else {
        /* search routing tree for next step to that daemon */
        for (item = opal_list_get_first(&my_children);
             item != opal_list_get_end(&my_children);
             item = opal_list_get_next(item)) {
            child = (orte_routed_tree_t*)item;
            if (child->vpid == daemon.vpid) {
                /* the child is hosting the proc - just send it there */
                ret = &daemon;
                goto found;
            }
            /* otherwise, see if the daemon we need is below the child */
            if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
                /* yep - we need to step through this child */
                daemon.vpid = child->vpid;
                ret = &daemon;
                goto found;
            }
        }
    }
    
    /* if we get here, then the target daemon is not beneath
     * any of our children, so we have to step up through our parent
     */
    daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
    
    ret = &daemon;
    
found:
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_radix_get(%s) --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target), 
                         ORTE_NAME_PRINT(ret)));
    
    return *ret;
}
Ejemplo n.º 30
0
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route)
{ 
    int i;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;
    
    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* if I am an application process, we don't update the route since
     * we automatically route everything through the local daemon
     */
    if (ORTE_PROC_IS_APP) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_radix_update: %s --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target), 
                         ORTE_NAME_PRINT(route)));


    /* if I am a daemon and the target is my HNP, then check
     * the route - if it isn't direct, then we just flag that
     * we have a route to the HNP
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) &&
        OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) {
        hnp_direct = false;
        return ORTE_SUCCESS;
    }

    /* if this is from a different job family, then I need to
     * track how to send messages to it
     */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        
        /* if I am a daemon, then I will automatically route
         * anything to this job family via my HNP - so nothing to do
         * here, just return
         */
        if (ORTE_PROC_IS_DAEMON) {
            return ORTE_SUCCESS;
        }
        
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_radix_update: diff job family routing job %s --> %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(target->jobid), 
                             ORTE_NAME_PRINT(route)));
        
        /* see if this target is already present */
        jfamily = ORTE_JOB_FAMILY(target->jobid);
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jfam->job_family == jfamily) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                     "%s routed_radix: updating route to %s via %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOB_FAMILY_PRINT(target->jobid),
                                     ORTE_NAME_PRINT(route)));
                jfam->route.jobid = route->jobid;
                jfam->route.vpid = route->vpid;
                return ORTE_SUCCESS;
            }
        }

        /* not there, so add the route FOR THE JOB FAMILY*/
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed_radix: adding route to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOB_FAMILY_PRINT(target->jobid)));
        jfam = OBJ_NEW(orte_routed_jobfam_t);
        jfam->job_family = jfamily;
        jfam->route.jobid = route->jobid;
        jfam->route.vpid = route->vpid;
        opal_pointer_array_add(&orte_routed_jobfams, jfam);
        return ORTE_SUCCESS;
    }
    
    return ORTE_SUCCESS;
}