Exemple #1
0
/*
 * Blocking receive of a packed buffer from a multicast channel.
 *
 * While holding the module lock (ctl), a non-persistent recv is queued for
 * the requested channel/tag. The module lock is then dropped and the caller
 * parks on the recv's own thread control; once that acquire returns, the
 * sender's name, the sequence number and the payload are copied out of the
 * recv object, which is then released.
 *
 * name     - optional (may be NULL); receives the sender's identity
 * channel  - channel to listen on; the GROUP_INPUT/GROUP_OUTPUT aliases are
 *            translated to this process's actual input/output channel
 * tag      - tag to match
 * seq_num  - receives the sequence number stored on the recv
 * buf      - destination for the payload
 *
 * Returns ORTE_ERR_COMM_DISABLED when comm_enabled is false, the error from
 * orte_rmcast_base_queue_recv() if the recv could not be queued, otherwise
 * the result of opal_dss.copy_payload().
 */
static int udp_recv_buffer(orte_process_name_t *name,
                           orte_rmcast_channel_t channel,
                           orte_rmcast_tag_t tag,
                           orte_rmcast_seq_t *seq_num,
                           opal_buffer_t *buf)
{
    rmcast_base_recv_t *posted;
    orte_rmcast_channel_t target;
    int rc;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (!comm_enabled) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_ERR_COMM_DISABLED;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:udp: recv_buffer called on multicast channel %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel));

    /* translate the group aliases into a concrete channel number */
    target = (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel)
                 ? orte_rmcast_base.my_input_channel->channel
                 : (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel)
                       ? orte_rmcast_base.my_output_channel->channel
                       : channel;

    /* queue a one-shot recv with no callbacks attached */
    rc = orte_rmcast_base_queue_recv(&posted, target, tag,
                                     ORTE_RMCAST_NON_PERSISTENT,
                                     NULL, NULL, NULL, true);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        ORTE_RELEASE_THREAD(&ctl);
        return rc;
    }
    ORTE_RELEASE_THREAD(&ctl);

    /* park on the recv's own control object
     * (presumably until the matching message is delivered - confirm
     * against the ORTE_ACQUIRE_THREAD definition)
     */
    posted->ctl.active = true;
    ORTE_ACQUIRE_THREAD(&posted->ctl);

    /* hand the results back to the caller */
    if (NULL != name) {
        /* sender identity was requested */
        name->jobid = posted->name.jobid;
        name->vpid = posted->name.vpid;
        ORTE_EPOCH_SET(name->epoch,posted->name.epoch);
    }
    *seq_num = posted->seq_num;

    rc = opal_dss.copy_payload(buf, posted->buf);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* done with the recv object */
    OBJ_RELEASE(posted);

    return rc;
}
Exemple #2
0
/*
 * Blocking receive of an iovec array from a multicast channel.
 *
 * While holding the module lock (ctl), a non-persistent recv is queued for
 * the requested channel/tag. The module lock is then dropped and the caller
 * parks on the recv's own thread control; once that acquire returns, the
 * sender's name, sequence number and iovec array are handed to the caller.
 *
 * name     - optional (may be NULL); receives the sender's identity
 * channel  - channel to listen on; the GROUP_INPUT/GROUP_OUTPUT aliases are
 *            translated to this process's actual input/output channel
 * tag      - tag to match
 * seq_num  - receives the sequence number stored on the recv
 * msg      - receives the iovec array pointer taken from the recv
 * count    - receives the number of iovec entries
 *
 * Returns ORTE_ERR_COMM_DISABLED when comm_enabled is false, the error from
 * orte_rmcast_base_queue_recv() if the recv could not be queued, otherwise
 * ORTE_SUCCESS.
 */
static int udp_recv(orte_process_name_t *name,
                    orte_rmcast_channel_t channel,
                    orte_rmcast_tag_t tag,
                    orte_rmcast_seq_t *seq_num,
                    struct iovec **msg, int *count)
{
    rmcast_base_recv_t *posted;
    orte_rmcast_channel_t target;
    int rc;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (!comm_enabled) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_ERR_COMM_DISABLED;
    }

    /* translate the group aliases into a concrete channel number */
    target = (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel)
                 ? orte_rmcast_base.my_input_channel->channel
                 : (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel)
                       ? orte_rmcast_base.my_output_channel->channel
                       : channel;

    /* queue a one-shot recv with no callbacks attached */
    rc = orte_rmcast_base_queue_recv(&posted, target, tag,
                                     ORTE_RMCAST_NON_PERSISTENT,
                                     NULL, NULL, NULL, true);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        ORTE_RELEASE_THREAD(&ctl);
        return rc;
    }
    ORTE_RELEASE_THREAD(&ctl);

    /* park on the recv's own control object
     * (presumably until the matching message is delivered - confirm
     * against the ORTE_ACQUIRE_THREAD definition)
     */
    posted->ctl.active = true;
    ORTE_ACQUIRE_THREAD(&posted->ctl);

    /* hand the results back to the caller */
    if (NULL != name) {
        /* sender identity was requested */
        name->jobid = posted->name.jobid;
        name->vpid = posted->name.vpid;
        ORTE_EPOCH_SET(name->epoch,posted->name.epoch);
    }
    *seq_num = posted->seq_num;
    *msg = posted->iovec_array;
    *count = posted->iovec_count;

    /* the caller now holds the iovec array, so detach it from the recv
     * before dropping our reference to the recv object
     */
    posted->iovec_array = NULL;
    posted->iovec_count = 0;
    OBJ_RELEASE(posted);

    return ORTE_SUCCESS;
}
Exemple #3
0
/*
 * Turn multicast communication on for this module.
 *
 * Under the module lock (ctl), starts the rmcast base threads and then
 * raises the comm_enabled flag that the send/recv entry points test.
 */
static void enable_comm(void)
{
    ORTE_ACQUIRE_THREAD(&ctl);
    /* threads are started before the flag is raised */
    orte_rmcast_base_start_threads();
    comm_enabled = true;
    ORTE_RELEASE_THREAD(&ctl);
}
Exemple #4
0
/*
 * Turn multicast communication off for this module.
 *
 * Under the module lock (ctl), clears the comm_enabled flag that the
 * send/recv entry points test and then stops the rmcast base threads.
 */
static void disable_comm(void)
{
    ORTE_ACQUIRE_THREAD(&ctl);
    /* the flag is lowered before the threads are stopped */
    comm_enabled = false;
    orte_rmcast_base_stop_threads();
    ORTE_RELEASE_THREAD(&ctl);
}
Exemple #5
0
/*
 * Send an iovec array on a multicast channel.
 *
 * The whole operation runs under the module lock (ctl). A send object is
 * built on the stack, pointed at the caller's iovec array, and passed to
 * send_data().
 *
 * channel - channel to transmit on
 * tag     - tag to attach to the message
 * msg     - caller-owned iovec array (only borrowed here)
 * count   - number of entries in msg
 *
 * Returns ORTE_ERR_COMM_DISABLED when comm_enabled is false, otherwise the
 * result of send_data().
 */
static int udp_send(orte_rmcast_channel_t channel,
                    orte_rmcast_tag_t tag,
                    struct iovec *msg, int count)
{
    rmcast_base_send_t request;
    int rc;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (comm_enabled) {
        /* sending while the lock is held preserves ordering */
        OBJ_CONSTRUCT(&request, rmcast_base_send_t);
        request.iovec_array = msg;
        request.iovec_count = count;
        request.tag = tag;

        rc = send_data(&request, channel);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }

        /* the iovec array belongs to the caller - detach it before the
         * send object is destructed
         */
        request.iovec_array = NULL;
        request.iovec_count = 0;
        OBJ_DESTRUCT(&request);
    } else {
        rc = ORTE_ERR_COMM_DISABLED;
    }

    ORTE_RELEASE_THREAD(&ctl);
    return rc;
}
Exemple #6
0
/*
 * Send a packed buffer on a multicast channel, recording a completion
 * callback on the send object.
 *
 * The whole operation runs under the module lock (ctl). A send object is
 * built on the stack, pointed at the caller's buffer, and passed to
 * send_data().
 *
 * channel - channel to transmit on
 * tag     - tag to attach to the message
 * buf     - caller-owned buffer (only borrowed here)
 * cbfunc  - callback stored on the send object
 * cbdata  - opaque pointer stored alongside cbfunc
 *
 * Returns ORTE_ERR_COMM_DISABLED when comm_enabled is false, otherwise the
 * result of send_data().
 */
static int udp_send_buffer_nb(orte_rmcast_channel_t channel,
                              orte_rmcast_tag_t tag,
                              opal_buffer_t *buf,
                              orte_rmcast_callback_buffer_fn_t cbfunc,
                              void *cbdata)
{
    rmcast_base_send_t request;
    int rc;

    ORTE_ACQUIRE_THREAD(&ctl);

    if (comm_enabled) {
        /* sending while the lock is held preserves ordering */
        OBJ_CONSTRUCT(&request, rmcast_base_send_t);
        request.buf = buf;
        request.tag = tag;
        request.cbfunc_buffer = cbfunc;
        request.cbdata = cbdata;

        rc = send_data(&request, channel);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }

        /* the buffer belongs to the caller - detach it before the
         * send object is destructed
         */
        request.buf = NULL;
        OBJ_DESTRUCT(&request);
    } else {
        rc = ORTE_ERR_COMM_DISABLED;
    }

    ORTE_RELEASE_THREAD(&ctl);
    return rc;
}
Exemple #7
0
/*
 * Register a callback-driven buffer receive on a multicast channel.
 *
 * Under the module lock (ctl), queues a recv for the given channel/tag with
 * the supplied flags and callback. This entry point does not test
 * comm_enabled.
 *
 * channel - channel to listen on; the GROUP_INPUT/GROUP_OUTPUT aliases are
 *           translated to this process's actual input/output channel
 * tag     - tag to match
 * flags   - passed through to orte_rmcast_base_queue_recv()
 * cbfunc  - callback passed through to the queued recv
 * cbdata  - opaque pointer passed through with cbfunc
 *
 * Returns ORTE_SUCCESS if the recv was queued or orte_rmcast_base_queue_recv()
 * reported ORTE_EXISTS; any other error is logged and returned.
 */
static int udp_recv_buffer_nb(orte_rmcast_channel_t channel,
                                orte_rmcast_tag_t tag,
                                orte_rmcast_flag_t flags,
                                orte_rmcast_callback_buffer_fn_t cbfunc, void *cbdata)
{
    orte_rmcast_channel_t target;
    int rc;

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:udp: recv_buffer_nb called on multicast channel %d tag %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, tag));

    ORTE_ACQUIRE_THREAD(&ctl);

    /* translate the group aliases into a concrete channel number */
    target = (ORTE_RMCAST_GROUP_INPUT_CHANNEL == channel)
                 ? orte_rmcast_base.my_input_channel->channel
                 : (ORTE_RMCAST_GROUP_OUTPUT_CHANNEL == channel)
                       ? orte_rmcast_base.my_output_channel->channel
                       : channel;

    rc = orte_rmcast_base_queue_recv(NULL, target, tag, flags,
                                     NULL, cbfunc, cbdata, false);
    if (ORTE_EXISTS == rc) {
        /* an ORTE_EXISTS result is reported to the caller as success */
        rc = ORTE_SUCCESS;
    } else if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    ORTE_RELEASE_THREAD(&ctl);

    return rc;
}
/*
 * Stop the recv processing thread, if it is running.
 *
 * The thread is asked to exit by writing a NULL message pointer into the
 * recv pipe (the processing thread treats a NULL message as "stop"), after
 * which this function joins it. The recv_process_ctl lock is dropped around
 * the write and join so the exiting thread can take it to clear its
 * "running" flag.
 *
 * If the stop request cannot be written to the pipe, the join is skipped:
 * the thread was never told to exit, so waiting for it would block the
 * caller indefinitely.
 */
void orte_rmcast_base_stop_threads(void)
{
    /* NULL message pointer == shutdown sentinel */
    opal_buffer_t *msg=NULL;

    OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base: stopping recv processing thread",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
    if (orte_rmcast_base.recv_process_ctl.running) {
        ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);
        if (0 > opal_fd_write(orte_rmcast_base.recv_pipe[1], sizeof(opal_buffer_t*), &msg)) {
            /* the sentinel never reached the thread - do not wait for an
             * exit that will not happen. The lock is already released.
             */
            OPAL_OUTPUT_VERBOSE((0, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base: unable to signal recv processing thread - not joining",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            return;
        }
        opal_thread_join(&orte_rmcast_base.recv_process, NULL);
        ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
    }
    ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);

    OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base: all threads stopped",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
/*
 * Body of the recv processing thread.
 *
 * Marks itself as running, then loops reading message pointers from the
 * recv pipe and handing each one to orte_rmcast.process_msg(). The loop
 * ends when the pipe read fails or when a NULL message pointer (the stop
 * request) is read; in both cases the "running" flag is cleared under the
 * recv_process_ctl lock before the thread returns.
 *
 * obj - thread object argument (unused here)
 *
 * Always returns OPAL_THREAD_CANCELLED.
 */
static void* rcv_processing_thread(opal_object_t *obj)
{
    orte_rmcast_msg_t *incoming;
    int status;
    struct timespec exit_delay={0, 10};

    OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base: recv processing thread operational",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
    orte_rmcast_base.recv_process_ctl.running = true;
    ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);

    for (;;) {
        /* wait for the next message pointer to show up on the pipe */
        status = opal_fd_read(orte_rmcast_base.recv_pipe[0],
                              sizeof(orte_rmcast_msg_t*), &incoming);
        if (0 > status) {
            /* the pipe read failed - give up */
            opal_output(0, "%s PUNTING THREAD", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
            orte_rmcast_base.recv_process_ctl.running = false;
            ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);
            /* pause briefly so the main thread can reach
             * opal_thread_join before this thread is gone
             */
            nanosleep(&exit_delay, NULL);
            break;
        }

        if (NULL == incoming) {
            /* a NULL pointer is the request to stop */
            ORTE_ACQUIRE_THREAD(&orte_rmcast_base.recv_process_ctl);
            orte_rmcast_base.recv_process_ctl.running = false;
            ORTE_RELEASE_THREAD(&orte_rmcast_base.recv_process_ctl);
            break;
        }

        /* the processing function takes care of releasing the message */
        orte_rmcast.process_msg(incoming);
    }

    return OPAL_THREAD_CANCELLED;
}
Exemple #10
0
static int open_channel(const char *app,
                        const char *version,
                        const char *release,
                        orte_jobid_t jobid,
                        orcm_pnp_open_channel_cbfunc_t cbfunc)
{
    orcm_triplet_t *triplet;
    orcm_triplet_group_t *grp;
    orcm_pnp_request_t *request;
    opal_list_item_t *item;
    int i, rc;
    bool done;
    
    if (NULL == cbfunc) {
        /* makes no sense */
        ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
        return ORCM_ERR_BAD_PARAM;
    }

    OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                         "%s pnp:default:open_channel for %s:%s:%s job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == app) ? "NULL" : app,
                         (NULL == version) ? "NULL" : version,
                         (NULL == release) ? "NULL": release,
                         ORTE_JOBID_PRINT(jobid)));

    /* protect the global arrays */
    ORTE_ACQUIRE_THREAD(&local_thread);
    
    /* see if we already know this triplet - automatically
     * creates it if not
     */
    triplet = orcm_get_triplet(app, version, release, true);
    /* record the policy */
    triplet->pnp_cb_policy = jobid;
    triplet->pnp_cbfunc = cbfunc;

    /* if the jobid is wildcard, we execute the callback for every group */
    if (ORTE_JOBID_WILDCARD == jobid) {
        /* cycle thru this triplet's known groups */
        for (i=0; i < triplet->groups.size; i++) {
            if (NULL == (grp = (orcm_triplet_group_t*)opal_pointer_array_get_item(&triplet->groups, i))) {
                continue;
            }
            grp->pnp_cbfunc = cbfunc;
            if (ORCM_PNP_INVALID_CHANNEL != grp->input) {
                /* if the user requested a callback, they probably intend to send
                 * something to this triplet - so ensure the channel to its input is open.
                 * No need to release threads first as this call cannot result in callbacks
                 */
                if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(grp->input, triplet->string_id, NULL, -1, NULL, ORTE_RMCAST_XMIT))) {
                    ORTE_ERROR_LOG(rc);
                    ORTE_RELEASE_THREAD(&triplet->ctl);
                    continue;
                }
                /* release the threads before doing the callback in
                 * case the caller sends messages
                 */
                ORTE_RELEASE_THREAD(&triplet->ctl);
                ORTE_RELEASE_THREAD(&local_thread);
                cbfunc(app, version, release, grp->input);
                /* reacquire the threads */
                ORTE_ACQUIRE_THREAD(&local_thread);
                ORTE_ACQUIRE_THREAD(&triplet->ctl);
                /* flag that this group has executed its callback */
                grp->pnp_cbfunc = NULL;
            }
        }
        /* release the threads */
        ORTE_RELEASE_THREAD(&triplet->ctl);
        ORTE_RELEASE_THREAD(&local_thread);
        return ORCM_SUCCESS;
    }

    if (ORTE_JOBID_INVALID == jobid) {
        /* see if we have know about any group with this triplet */
        done = false;
        for (i=0; i < triplet->groups.size; i++) {
            if (NULL == (grp = (orcm_triplet_group_t*)opal_pointer_array_get_item(&triplet->groups, i))) {
                continue;
            }
            grp->pnp_cbfunc = cbfunc;
            if (ORCM_PNP_INVALID_CHANNEL != grp->input) {
                /* if the user requested a callback, they probably intend to send
                 * something to this triplet - so ensure the channel to its input is open.
                 * No need to release threads first as this call cannot result in callbacks
                 */
                if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(grp->input, triplet->string_id, NULL, -1, NULL, ORTE_RMCAST_XMIT))) {
                    ORTE_ERROR_LOG(rc);
                    continue;
                }
                 /* flag that we already did the callback so we don't do it again */
                done = true;
                ORTE_RELEASE_THREAD(&triplet->ctl);
                ORTE_RELEASE_THREAD(&local_thread);
                cbfunc(app, version, release, grp->input);
                /* reacquire the threads */
                ORTE_ACQUIRE_THREAD(&local_thread);
                ORTE_ACQUIRE_THREAD(&triplet->ctl);
                break;
            }
        }
        /* if we did the callback, remove any remaining cbfunc entries to ensure
         * that we only do this once for the triplet
         */
        if (done) {
            for (i=0; i < triplet->groups.size; i++) {
                if (NULL == (grp = (orcm_triplet_group_t*)opal_pointer_array_get_item(&triplet->groups, i))) {
                    continue;
                }
                grp->pnp_cbfunc = NULL;
            }
        }
        /* release the threads */
        ORTE_RELEASE_THREAD(&triplet->ctl);
        ORTE_RELEASE_THREAD(&local_thread);
        return ORCM_SUCCESS;
    }

    /* left with the case of a specific jobid - record the policy */
    done = false;
    for (i=0; i < triplet->groups.size; i++) {
        if (NULL == (grp = (orcm_triplet_group_t*)opal_pointer_array_get_item(&triplet->groups, i))) {
            continue;
        }
        if (grp->jobid == jobid) {
            /* found the group */
            grp->pnp_cbfunc = cbfunc;
            done = true;  /* flag that we found the group */
            if (ORCM_PNP_INVALID_CHANNEL != grp->input) {
                /* if the user requested a callback, they probably intend to send
                 * something to this triplet - so ensure the channel to its input is open.
                 * No need to release threads first as this call cannot result in callbacks
                 */
                if (ORTE_SUCCESS != (rc = orte_rmcast.open_channel(grp->input, triplet->string_id, NULL, -1, NULL, ORTE_RMCAST_XMIT))) {
                    ORTE_ERROR_LOG(rc);
                    continue;
                }
                 /* release the threads before doing the callback in
                 * case the caller sends messages
                 */
                ORTE_RELEASE_THREAD(&triplet->ctl);
                ORTE_RELEASE_THREAD(&local_thread);
                cbfunc(app, version, release, grp->input);
                /* reacquire the threads */
                ORTE_ACQUIRE_THREAD(&local_thread);
                ORTE_ACQUIRE_THREAD(&triplet->ctl);
                /* flag that this group has executed its callback */
                grp->pnp_cbfunc = NULL;
                break;
            }
        }
    }
    /* if we didn't find the group, then we have to add it */
    if (!done) {
        grp = OBJ_NEW(orcm_triplet_group_t);
        grp->jobid = jobid;
        grp->pnp_cbfunc = cbfunc;
        opal_pointer_array_add(&triplet->groups, grp);
    }

    ORTE_RELEASE_THREAD(&triplet->ctl);
    ORTE_RELEASE_THREAD(&local_thread);
    return ORCM_SUCCESS;
}
Exemple #11
0
/*
 * Cancel previously registered receives.
 *
 * app/version/release - the triplet whose receives are to be cancelled
 * channel             - which channel(s) to search:
 *     ORCM_PNP_WILDCARD_CHANNEL     - every known channel; a request is
 *                                     removed if the triplet string id is
 *                                     the wildcard id or matches the
 *                                     request's id
 *     ORCM_PNP_GROUP_INPUT_CHANNEL  - the triplet's stored input recvs plus
 *                                     the input channel of each of its groups
 *     ORCM_PNP_GROUP_OUTPUT_CHANNEL - same, for output
 *     anything else                 - that one channel, matching on the
 *                                     triplet string id
 * tag                 - tag to match; ORCM_PNP_TAG_WILDCARD matches all
 *
 * Runs under the local_thread lock. In the group input/output cases the
 * triplet lock is released here without a visible acquire, so
 * orcm_get_triplet() presumably returns a non-NULL triplet with it held -
 * confirm against its definition.
 *
 * Always returns ORCM_SUCCESS.
 */
static int cancel_receive(const char *app,
                          const char *version,
                          const char *release,
                          orcm_pnp_channel_t channel,
                          orcm_pnp_tag_t tag)
{
    orcm_pnp_channel_obj_t *chan;
    orcm_pnp_request_t *req;
    orcm_triplet_t *triplet;
    orcm_triplet_group_t *grp;
    opal_list_item_t *item, *next;
    char *string_id;
    int ret=ORCM_SUCCESS;
    int i;
    
    OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                         "%s pnp:default:cancel_recv app %s version %s release %s channel %s tag %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == app) ? "NULL" : app,
                         (NULL == version) ? "NULL" : version,
                         (NULL == release) ? "NULL" : release,
                         orcm_pnp_print_channel(channel),
                         orcm_pnp_print_tag(tag)));
    
    /* since we are modifying global lists, lock
     * the thread
     */
    ORTE_ACQUIRE_THREAD(&local_thread);
    
    /* if this is the wildcard channel, loop across all channels */
    if (ORCM_PNP_WILDCARD_CHANNEL == channel) {
        /* get the string id for this triplet */
        ORCM_CREATE_STRING_ID(&string_id, app, version, release);
        for (i=0; i < orcm_pnp_base.channels.size; i++) {
            if (NULL == (chan = (orcm_pnp_channel_obj_t*)opal_pointer_array_get_item(&orcm_pnp_base.channels, i))) {
                continue;
            }
            /* grab the successor before a possible removal so the
             * walk survives deleting the current item
             */
            item = opal_list_get_first(&chan->recvs);
            while (item != opal_list_get_end(&chan->recvs)) {
                next = opal_list_get_next(item);
                req = (orcm_pnp_request_t*)item;
                if (0 == strcasecmp(string_id, ORCM_WILDCARD_STRING_ID) ||
                    0 == strcasecmp(string_id, req->string_id)) {
                    if (ORCM_PNP_TAG_WILDCARD == tag || tag == req->tag) {
                        opal_list_remove_item(&chan->recvs, item);
                        OBJ_RELEASE(item);
                    }
                }
                item = next;
            }
        }
        /* the string id is owned by us - release it, exactly as the
         * specific-channel path below does
         */
        free(string_id);
        goto cleanup;
    }

    /* are we looking at the group input channel? */
    if (ORCM_PNP_GROUP_INPUT_CHANNEL == channel) {
        triplet = orcm_get_triplet(app, version, release, false);
        if (NULL != triplet) {
            /* remove the triplet-stored recvs */
            item = opal_list_get_first(&triplet->input_recvs);
            while (item != opal_list_get_end(&triplet->input_recvs)) {
                next = opal_list_get_next(item);
                req = (orcm_pnp_request_t*)item;
                if (ORCM_PNP_TAG_WILDCARD == tag || tag == req->tag) {
                    opal_list_remove_item(&triplet->input_recvs, item);
                    OBJ_RELEASE(item);
                }
                item = next;
            }
            for (i=0; i < triplet->groups.size; i++) {
                if (NULL == (grp = (orcm_triplet_group_t*)opal_pointer_array_get_item(&triplet->groups, i))) {
                    continue;
                }
                if (ORCM_PNP_INVALID_CHANNEL != grp->input) {
                    /* just look thru the default group input channel */
                    chan = (orcm_pnp_channel_obj_t*)opal_pointer_array_get_item(&orcm_pnp_base.channels, grp->input);
                    if (NULL == chan) {
                        /* nothing to do
                         * NOTE(review): this stops at the first group without a
                         * channel object, leaving later groups unvisited -
                         * confirm that is intended
                         */
                        ORTE_RELEASE_THREAD(&triplet->ctl);
                        goto cleanup;
                    }
                    item = opal_list_get_first(&chan->recvs);
                    while (item != opal_list_get_end(&chan->recvs)) {
                        next = opal_list_get_next(item);
                        req = (orcm_pnp_request_t*)item;
                        if (ORCM_PNP_TAG_WILDCARD == tag || tag == req->tag) {
                            opal_list_remove_item(&chan->recvs, item);
                            OBJ_RELEASE(item);
                        }
                        item = next;
                    }
                }
            }
            /* release the triplet */
            ORTE_RELEASE_THREAD(&triplet->ctl);
        }
        goto cleanup;
    }

    /* are we looking at the group output channel? */
    if (ORCM_PNP_GROUP_OUTPUT_CHANNEL == channel) {
        triplet = orcm_get_triplet(app, version, release, false);
        if (NULL != triplet) {
            /* remove the triplet-stored recvs */
            item = opal_list_get_first(&triplet->output_recvs);
            while (item != opal_list_get_end(&triplet->output_recvs)) {
                next = opal_list_get_next(item);
                req = (orcm_pnp_request_t*)item;
                if (ORCM_PNP_TAG_WILDCARD == tag || tag == req->tag) {
                    opal_list_remove_item(&triplet->output_recvs, item);
                    OBJ_RELEASE(item);
                }
                item = next;
            }
            for (i=0; i < triplet->groups.size; i++) {
                if (NULL == (grp = (orcm_triplet_group_t*)opal_pointer_array_get_item(&triplet->groups, i))) {
                    continue;
                }
                if (ORCM_PNP_INVALID_CHANNEL != grp->output) {
                    /* just look thru the default group output channel */
                    chan = (orcm_pnp_channel_obj_t*)opal_pointer_array_get_item(&orcm_pnp_base.channels, grp->output);
                    if (NULL == chan) {
                        /* nothing to do
                         * NOTE(review): this stops at the first group without a
                         * channel object, leaving later groups unvisited -
                         * confirm that is intended
                         */
                        ORTE_RELEASE_THREAD(&triplet->ctl);
                        goto cleanup;
                    }
                    item = opal_list_get_first(&chan->recvs);
                    while (item != opal_list_get_end(&chan->recvs)) {
                        next = opal_list_get_next(item);
                        req = (orcm_pnp_request_t*)item;
                        if (ORCM_PNP_TAG_WILDCARD == tag || tag == req->tag) {
                            opal_list_remove_item(&chan->recvs, item);
                            OBJ_RELEASE(item);
                        }
                        item = next;
                    }
                }
            }
            /* release the triplet */
            ORTE_RELEASE_THREAD(&triplet->ctl);
        }
        goto cleanup;
    }

    /* if this isn't either input or output channel, then get the channel object */
    if (NULL != (chan = (orcm_pnp_channel_obj_t*)opal_pointer_array_get_item(&orcm_pnp_base.channels, channel))) {
        ORCM_CREATE_STRING_ID(&string_id, app, version, release);
        item = opal_list_get_first(&chan->recvs);
        while (item != opal_list_get_end(&chan->recvs)) {
            next = opal_list_get_next(item);
            req = (orcm_pnp_request_t*)item;
            if (0 == strcasecmp(string_id, req->string_id)) {
                if (ORCM_PNP_TAG_WILDCARD == tag || tag == req->tag) {
                    opal_list_remove_item(&chan->recvs, item);
                    OBJ_RELEASE(item);
                }
            }
            item = next;
        }
        free(string_id);
    }

 cleanup:
    /* clear the thread */
    ORTE_RELEASE_THREAD(&local_thread);
    
    return ret;
}
Exemple #12
0
/*
 * Register a receive callback for messages from an app/version/release
 * triplet on a given channel and tag.
 *
 * app/version/release - the triplet of interest (created if unknown)
 * channel             - channel to receive on; ORCM_PNP_INVALID_CHANNEL
 *                       is rejected
 * tag                 - tag to match
 * cbfunc / cbdata     - callback and opaque pointer stored on the request
 *
 * Behaviour depends on whether the triplet's string id contains '@'
 * (treated here as the marker of a wildcard triplet):
 *
 *  - wildcard triplet, GROUP_INPUT/GROUP_OUTPUT channel: the request is
 *    stored on the wildcard triplet (no duplicates) and also recorded on
 *    every other known triplet that orcm_triplet_cmp() says matches.
 *  - wildcard triplet, specific channel: the request is stored on that
 *    channel's object (created if needed, no duplicates). For channels at
 *    or above ORCM_PNP_SYS_CHANNEL the rmcast channel is opened and a
 *    persistent wildcard-tag recv is posted on it.
 *  - non-wildcard triplet: the request is passed to
 *    orcm_pnp_base_record_recv().
 *
 * Locking: local_thread is taken here. triplet->ctl is released on exit
 * without a visible acquire, so orcm_get_triplet() presumably returns with
 * it held - confirm against its definition.
 *
 * Returns ORTE_ERR_BAD_PARAM for an invalid channel, otherwise ORCM_SUCCESS
 * or the last error reported by the helpers called above.
 */
static int register_receive(const char *app,
                            const char *version,
                            const char *release,
                            orcm_pnp_channel_t channel,
                            orcm_pnp_tag_t tag,
                            orcm_pnp_callback_fn_t cbfunc,
                            void *cbdata)
{
    orcm_triplet_t *triplet, *trp;
    int i;
    int ret=ORCM_SUCCESS;
    orcm_pnp_channel_obj_t *recvr;
    orcm_pnp_request_t *req;

    OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                         "%s pnp:default:register_recv app %s version %s release %s channel %s tag %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == app) ? "NULL" : app,
                         (NULL == version) ? "NULL" : version,
                         (NULL == release) ? "NULL" : release,
                         orcm_pnp_print_channel(channel),
                         orcm_pnp_print_tag(tag)));
    
    /* bozo check - can't receive on an invalid channel */
    if (ORCM_PNP_INVALID_CHANNEL == channel) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* since we are modifying global lists, lock
     * the thread
     */
    ORTE_ACQUIRE_THREAD(&local_thread);
    
    /* get a triplet object for this triplet - creates
     * it if one doesn't already exist
     */
    triplet = orcm_get_triplet(app, version, release, true);        
    
    /* if the triplet involves wildcards, we treat it separately. Such
     * recvs are maintained on a separate list so they can be properly
     * applied to any subsequent triplets covered by the wildcard
     */
    if (NULL != strchr(triplet->string_id, '@')) {
        /* if we were given an INPUT or OUTPUT channel, then we
         * have to record the recv so we can apply it to triplets
         * as they become known since we don't know the channel
         */
        if (ORCM_PNP_GROUP_INPUT_CHANNEL == channel ||
            ORCM_PNP_GROUP_OUTPUT_CHANNEL == channel) {
            /* store this recv on this wildcard triplet so we retain a record of it,
             * ensuring no duplicates
             */
            if (ORCM_PNP_GROUP_INPUT_CHANNEL == channel) {
                if (NULL == orcm_pnp_base_find_request(&triplet->input_recvs, triplet->string_id, tag)) {
                    /* create it */
                    req = OBJ_NEW(orcm_pnp_request_t);
                    req->string_id = strdup(triplet->string_id);
                    req->tag = tag;
                    req->cbfunc = cbfunc;
                    req->cbdata = cbdata;
                    opal_list_append(&triplet->input_recvs, &req->super);
                }
            } else {
                if (NULL == orcm_pnp_base_find_request(&triplet->output_recvs, triplet->string_id, tag)) {
                    /* create it */
                    req = OBJ_NEW(orcm_pnp_request_t);
                    req->string_id = strdup(triplet->string_id);
                    req->tag = tag;
                    req->cbfunc = cbfunc;
                    req->cbdata = cbdata;
                    opal_list_append(&triplet->output_recvs, &req->super);
                }
            }

            /* lock the global triplet arrays for our use */
            ORTE_ACQUIRE_THREAD(&orcm_triplets->ctl);

            /* check all known triplets to find those that match */
            for (i=0; i < orcm_triplets->array.size; i++) {
                if (NULL == (trp = (orcm_triplet_t*)opal_pointer_array_get_item(&orcm_triplets->array, i))) {
                    continue;
                }
                if (trp == triplet) {
                    /* don't copy from ourselves */
                    continue;
                }
                /* lock the triplet thread */
                ORTE_ACQUIRE_THREAD(&trp->ctl);
                if (orcm_triplet_cmp(trp->string_id, triplet->string_id)) {
                    /* triplet matches - transfer the recv */
                    if (ORCM_SUCCESS != (ret = orcm_pnp_base_record_recv(trp, channel, tag, cbfunc, cbdata))) {
                        ORTE_ERROR_LOG(ret);
                    }
                }
                /* release this triplet */
                ORTE_RELEASE_THREAD(&trp->ctl);
            }

            /* release the global arrays */
            ORTE_RELEASE_THREAD(&orcm_triplets->ctl);
        } else {
            /* if we were given a specific channel, then we can add this
             * recv to it
             */
            if (NULL == (recvr = (orcm_pnp_channel_obj_t*)opal_pointer_array_get_item(&orcm_pnp_base.channels, channel))) {
                recvr = OBJ_NEW(orcm_pnp_channel_obj_t);
                recvr->channel = channel;
                opal_pointer_array_set_item(&orcm_pnp_base.channels, recvr->channel, recvr);
            }
            if (NULL == (req = orcm_pnp_base_find_request(&recvr->recvs, triplet->string_id, tag))) {
                /* not already present - create it, keyed by the triplet's
                 * id and the requested tag (the same fields the lookup
                 * above searches on)
                 */
                req = OBJ_NEW(orcm_pnp_request_t);
                req->string_id = strdup(triplet->string_id);
                req->tag = tag;
                req->cbfunc = cbfunc;
                req->cbdata = cbdata;
                opal_list_append(&recvr->recvs, &req->super);
            }
            if (channel < ORCM_PNP_SYS_CHANNEL) {
                /* can't register rmcast recvs on group_input, group_output, and wildcard channels */
                goto cleanup;
            }
            /* open this channel - will just return if already open */
            if (ORCM_SUCCESS != (ret = orte_rmcast.open_channel(channel, triplet->string_id,
                                                                NULL, -1, NULL,
                                                                ORTE_RMCAST_RECV))) {
                if (ORTE_EXISTS != ret) {
                    ORTE_ERROR_LOG(ret);
                    goto cleanup;
                }
            }
            /* setup to listen to it - will just return if we already are */
            if (ORTE_SUCCESS != (ret = orte_rmcast.recv_buffer_nb(channel, ORTE_RMCAST_TAG_WILDCARD,
                                                                  ORTE_RMCAST_PERSISTENT,
                                                                  orcm_pnp_base_recv_input_buffers, NULL))) {
                if (ORTE_EXISTS == ret) {
                    ret = ORTE_SUCCESS;
                    goto cleanup;
                }
                ORTE_ERROR_LOG(ret);
            }
        }

    } else {
        /* we are dealing with a non-wildcard triplet - record the request */
        if (ORCM_SUCCESS != (ret = orcm_pnp_base_record_recv(triplet, channel, tag, cbfunc, cbdata))) {
            ORTE_ERROR_LOG(ret);
        }
    }

 cleanup:
    /* clear the threads */
    ORTE_RELEASE_THREAD(&triplet->ctl);
    ORTE_RELEASE_THREAD(&local_thread);
    
    return ret;
}
Exemple #13
0
/*
 * Activate monitoring of the install-configuration directory.
 *
 * First call: marks the component enabled and sets up change detection for
 * mca_orcm_cfgi_file_component.dir -
 *   - when built with HAVE_SYS_INOTIFY_H and the directory exists, an
 *     inotify watch plus a persistent read event on the notifier fd;
 *   - otherwise (directory missing, or the watch could not be added) a
 *     probe timer using the component's configured rate.
 * In either case check_config() is run once immediately, after the
 * orcm_cfgi_base.ctl lock has been released.
 *
 * Later calls (already enabled): only re-run check_installed(true) so that
 * pending jobs can be started when daemons reappear.
 *
 * If neither mechanism can be set up (no inotify and a non-positive rate),
 * an error is printed and the component is marked disabled again.
 */
static void activate(void)
{
    DIR *dirp;

    /* take control */
    ORTE_ACQUIRE_THREAD(&orcm_cfgi_base.ctl);

    if (enabled) {
        /* we get reentered when daemons reappear so that
         * any pending jobs can be started
         */
        check_installed(true);
        /* release control */
        ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
        return;
    }
    enabled = true;

    /* check for existence of the directory. If it doesn't yet
     * exist, then we have to use the timer until it shows up
     */
    if (NULL == (dirp = opendir(mca_orcm_cfgi_file_component.dir))) {
        if (0 < opal_output_get_verbosity(orcm_cfgi_base.output)) {
            orte_show_help("help-cfgi-file.txt", "no-dir",
                           true, mca_orcm_cfgi_file_component.dir);
        }
        timer_in_use = true;
        goto fallback;
    }
    /* the handle was only needed as an existence check - close it so the
     * directory stream is not leaked
     */
    closedir(dirp);

#ifdef HAVE_SYS_INOTIFY_H
    /* setup to watch the config dir - CREATE always is followed by
     * a MODIFY event, so don't need both
     */
    if (0 > (watch = inotify_add_watch(notifier, mca_orcm_cfgi_file_component.dir,
                                           IN_DELETE | IN_MODIFY | IN_MOVE))) {
        /* error */
        close(notifier);
        goto fallback;
    }
    /* start the watcher event */
    probe_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    opal_event_set(opal_event_base, probe_ev, notifier,
                   OPAL_EV_READ|OPAL_EV_PERSIST, inotify_handler, NULL);
    timer_in_use = false;
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
    /* process it the first time */
    check_config(0, 0, NULL);
    return;
#endif

 fallback:

    /* setup the probe timer */
    if (0 <  mca_orcm_cfgi_file_component.rate) {
        probe_time.tv_sec = mca_orcm_cfgi_file_component.rate;
        probe_time.tv_usec = 0;
        probe_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_evtimer_set(opal_event_base, probe_ev, check_config, NULL);
        timer_in_use = true;
        /* process it the first time */
        ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
        check_config(0, 0, NULL);
        return;
    }

    opal_output(0, "%s CANNOT ACTIVATE INSTALL CONFIG MONITORING",
                   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    enabled = false;
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
}
Exemple #14
0
/*
 * Non-blocking send of a pnp message.
 *
 * channel   - channel to multicast on; the group input/output aliases are
 *             replaced by this process's actual channels
 * recipient - NULL or a fully wildcarded name => multicast; a fully
 *             specified name => point-to-point send over the RML
 * tag       - pnp tag attached to the message
 * msg/count - iovec payload (used when msg is non-NULL)
 * buffer    - buffer payload (used when msg is NULL)
 * cbfunc    - completion callback recorded in the send tracker
 * cbdata    - opaque data recorded alongside cbfunc
 *
 * Returns ORCM_ERR_NOT_AVAILABLE if we have not announced yet,
 * ORCM_ERR_COMM_DISABLED if communication is disabled,
 * ORTE_ERR_NOT_IMPLEMENTED if exactly one field of recipient is a
 * wildcard, the error from message construction or from the underlying
 * send, and otherwise ORCM_SUCCESS (for the multicast path, whatever
 * orte_rmcast.send_buffer_nb returned).
 */
static int default_output_nb(orcm_pnp_channel_t channel,
                             orte_process_name_t *recipient,
                             orcm_pnp_tag_t tag,
                             struct iovec *msg, int count,
                             opal_buffer_t *buffer,
                             orcm_pnp_callback_fn_t cbfunc,
                             void *cbdata)
{
    int ret;
    orcm_pnp_send_t *send;
    opal_buffer_t *buf;
    orcm_pnp_channel_t chan;

    /* if we have not announced, ignore this message */
    if (NULL == orcm_pnp_base.my_string_id) {
        return ORCM_ERR_NOT_AVAILABLE;
    }

    if (!orcm_pnp_base.comm_enabled) {
        return ORCM_ERR_COMM_DISABLED;
    }

    /* if only one name field is WILDCARD, I don't know how to send
     * it - at least, not right now. Reject it up front, before the
     * send tracker and the message buffer are created, so that there
     * is nothing to clean up on this path
     */
    if (NULL != recipient &&
        ((ORTE_JOBID_WILDCARD == recipient->jobid) !=
         (ORTE_VPID_WILDCARD == recipient->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        return ORTE_ERR_NOT_IMPLEMENTED;
    }

    /* protect against threading */
    ORTE_ACQUIRE_THREAD(&local_thread);
    
    /* record everything the completion callback will need */
    send = OBJ_NEW(orcm_pnp_send_t);
    send->tag = tag;
    send->msg = msg;
    send->count = count;
    send->buffer = buffer;
    send->cbfunc = cbfunc;
    send->cbdata = cbdata;

    /* setup the message for xmission */
    if (ORTE_SUCCESS != (ret = orcm_pnp_base_construct_msg(&buf, buffer, tag, msg, count))) {
        ORTE_ERROR_LOG(ret);
        /* nobody will ever be handed the tracker - don't leak it */
        OBJ_RELEASE(send);
        ORTE_RELEASE_THREAD(&local_thread);
        return ret;
    }
    
    /* if this is intended for everyone who might be listening to my output,
     * multicast it
     */
    if (NULL == recipient ||
        (ORTE_JOBID_WILDCARD == recipient->jobid &&
         ORTE_VPID_WILDCARD == recipient->vpid)) {
        /* if this is going on the group channel, then substitute that channel here */
        if (ORCM_PNP_GROUP_OUTPUT_CHANNEL == channel) {
            chan = orcm_pnp_base.my_output_channel->channel;
        } else if (ORCM_PNP_GROUP_INPUT_CHANNEL == channel) {
            chan = orcm_pnp_base.my_input_channel->channel;
        } else {
            chan = channel;
        }
        OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                             "%s pnp:default:sending_nb multicast of %d %s to channel %s tag %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == msg) ? (int)buffer->bytes_used : count,
                             (NULL == msg) ? "bytes" : "iovecs",
                             orcm_pnp_print_channel(channel),
                             orcm_pnp_print_tag(tag)));
        
        /* release thread prior to send */
        ORTE_RELEASE_THREAD(&local_thread);
        /* send the data to the channel */
        if (ORCM_SUCCESS != (ret = orte_rmcast.send_buffer_nb(chan, tag, buf,
                                                              rmcast_callback, send))) {
            ORTE_ERROR_LOG(ret);
        }
        return ret;
    }
    
    /* intended for a specific recipient, send it over p2p */
    OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                         "%s pnp:default:sending_nb p2p message of %d %s to %s tag %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == msg) ? (int)buffer->bytes_used : count,
                         (NULL == msg) ? "bytes" : "iovecs",
                         ORTE_NAME_PRINT(recipient),
                         orcm_pnp_print_tag(tag)));

    /* release thread prior to send */
    ORTE_RELEASE_THREAD(&local_thread);
    
    /* send the msg - a negative return is an error, anything else
     * is reported to the caller as plain success
     */
    if (0 > (ret = orte_rml.send_buffer_nb(recipient, buf,
                                           ORTE_RML_TAG_MULTICAST_DIRECT, 0,
                                           rml_callback, send))) {
        ORTE_ERROR_LOG(ret);
    } else {
        ret = ORCM_SUCCESS;
    }
    return ret;
}
Exemple #15
0
/*
 * RML receive callback that services a "missed message" exchange.
 *
 * The incoming buffer carries a channel and, unless the channel is
 * UINT32_MAX, a starting sequence number. A channel of UINT32_MAX is the
 * reply telling us that our own request was too far behind to be
 * satisfied, in which case we abort. Otherwise every cached message on
 * that channel with a sequence number greater than the given start is
 * copied and sent back to the requester; if the oldest cached message is
 * already newer than the requested start (or nothing is cached) the
 * requester is sent a UINT32_MAX channel to tell it recovery is hopeless.
 *
 * status and tag are unused. The rmcast control (ctl) is held for the
 * whole operation so no other multicast op runs during the repair.
 */
static void resend_data(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    int n, rc;
    orte_rmcast_channel_t channel;
    orte_rmcast_seq_t start;
    rmcast_base_channel_t *ch;
    rmcast_send_log_t *log;
    opal_buffer_t *recover;

    /* block any further ops until we complete the missing
     * message repair
     */
    ORTE_ACQUIRE_THREAD(&ctl);

    n=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &channel, &n, ORTE_RMCAST_CHANNEL_T))) {
        ORTE_ERROR_LOG(rc);
        goto release;
    }

    /* if the channel is UINT32_MAX, then we know that this is a
     * a response from a sender telling us that our request for
     * missing messages is too far behind, so we should just
     * abort
     */
    if (UINT32_MAX == channel) {
        opal_output(0, "%s CANNOT RECOVER FROM LOST MESSAGE - TOO FAR BEHIND - ABORTING",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        orte_errmgr.abort(1, NULL);
        goto release;
    }

    n=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &start, &n, ORTE_RMCAST_SEQ_T))) {
        ORTE_ERROR_LOG(rc);
        goto release;
    }

    opal_output(0, "%s request resend data from %s for channel %d start %d",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                ORTE_NAME_PRINT(sender), channel, start);

    /* get the referenced channel object */
    if (NULL == (ch = orte_rmcast_base_get_channel(channel))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto release;
    }

    /* see if we can bring the proc up to date - if it is too
     * far behind, then there is no hope of recovery
     */
    log = (rmcast_send_log_t*)opal_ring_buffer_poke(&ch->cache, 0);
    if (NULL == log || start < log->seq_num) {
        /* no hope - tell them */
        channel = UINT32_MAX;
        recover = OBJ_NEW(opal_buffer_t);
        if (ORTE_SUCCESS != (rc = opal_dss.pack(recover, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
            ORTE_ERROR_LOG(rc);
            /* the reply never left - don't leak the buffer */
            OBJ_RELEASE(recover);
            goto release;
        }
        if (0 > (rc = orte_rml.send_buffer_nb(sender, recover, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(recover);
        }
        goto release;
    }

    /* search its ring buffer for the starting message - function
     * automatically starts at the oldest message and works up
     * from there
     */
    for (n=0; n < ch->cache.size; n++) {
        log = (rmcast_send_log_t*)opal_ring_buffer_poke(&ch->cache, n);
        /* skip empty slots and anything the requester already has */
        if (NULL == log ||
            log->seq_num <= start) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((0, orte_rmcast_base.rmcast_output,
                             "%s resending msg %d to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             log->seq_num, ORTE_NAME_PRINT(sender)));
        recover = OBJ_NEW(opal_buffer_t);
        /* don't send a buffer whose payload could not be copied */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(recover, log->buf))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(recover);
            goto release;
        }
        if (0 > (rc = orte_rml.send_buffer_nb(sender, recover, ORTE_RML_TAG_MULTICAST, 0, cbfunc, NULL))) {
            OBJ_RELEASE(recover);
            ORTE_ERROR_LOG(rc);
            goto release;
        }
    }

 release:
    ORTE_RELEASE_THREAD(&ctl);
}
/*
 * Process one received multicast message and deliver it to the first
 * matching posted recv.
 *
 * Steps:
 *  1. extract the header (sender, channel, tag, restart flag, seq number)
 *  2. drop messages from ourselves, heartbeats when we are not a daemon,
 *     and messages from another job family that are not on the system
 *     channel (unless we are the HNP or a daemon)
 *  3. on an unreliable transport, track the per-sender/per-channel
 *     sequence number for dynamic channels: repeats are dropped, and a
 *     gap triggers a resend request to the sender (the message is then
 *     dropped as well)
 *  4. find the first recv posted for this channel and tag (or wildcard
 *     tag), removing it from the list if it is non-persistent
 *  5. unpack the payload (iovecs or buffer) and either invoke the recv's
 *     callback or copy the data into the recv and wake its blocked thread
 *
 * msg is released before returning on every path. A matched
 * non-persistent recv is released as well.
 */
void orte_rmcast_base_process_msg(orte_rmcast_msg_t *msg)
{
    orte_rmcast_channel_t channel;
    rmcast_base_recv_t *ptr, *recv=NULL;
    orte_process_name_t name;
    orte_rmcast_tag_t tag;
    int8_t flag;
    struct iovec *iovec_array=NULL;
    int32_t iovec_count=0, i, n, isz;
    int rc=ORTE_SUCCESS;
    orte_rmcast_seq_t recvd_seq_num;
    opal_list_item_t *item;
    rmcast_seq_tracker_t *trkr, *tptr;
    rmcast_recv_log_t *log, *logptr;
    bool restart;
    opal_buffer_t alert;

    /* extract the header */
    if (ORTE_SUCCESS != (rc = extract_hdr(msg->buf, &name, &channel, &tag, &restart, &recvd_seq_num))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* if this message is from myself, ignore it */
    if (name.jobid == ORTE_PROC_MY_NAME->jobid && name.vpid == ORTE_PROC_MY_NAME->vpid) {
        OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv sent from myself: %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name)));
        goto cleanup;
    }
    
    /* if this is a heartbeat and I am not a daemon, then ignore it
     * to avoid swamping tools
     */
    if (!ORTE_PROC_IS_DAEMON && ORTE_RMCAST_TAG_HEARTBEAT == tag) {
        OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv ignoring heartbeat",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this message is from a different job family, ignore it unless
     * it is on the system channel. We ignore these messages to avoid
     * confusion between different jobs since we all may be sharing
     * multicast channels. The system channel is left open to support
     * cross-job communications for detecting multiple conflicting DVMs.
     */
    if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) &&
        (ORTE_RMCAST_SYS_CHANNEL != channel)) {
        /* if we are not the HNP or a daemon, then we ignore this */
        if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
            OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv from a different job family: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&name)));
        } else {
            goto cleanup;
        }
    }
    
    if (orte_rmcast_base.unreliable_xport) {
        /* if the message is not on a system-specified channel, then check to see if we
         * are missing any messages and need a resend
         */
        if (ORTE_RMCAST_DYNAMIC_CHANNELS <= channel) {
            log = NULL;
            for (item = opal_list_get_first(&orte_rmcast_base.msg_logs);
                 item != opal_list_get_end(&orte_rmcast_base.msg_logs);
                 item = opal_list_get_next(item)) {
                logptr = (rmcast_recv_log_t*)item;
                /* look for this source */
                if (name.jobid == logptr->name.jobid &&
                    name.vpid == logptr->name.vpid) {
                    log = logptr;
                    break;
                }
            }
            if (NULL == log) {
                /* new source */
                log = OBJ_NEW(rmcast_recv_log_t);
                log->name.jobid = name.jobid;
                log->name.vpid = name.vpid;
                opal_list_append(&orte_rmcast_base.msg_logs, &log->super);
            }
            /* look for the channel */
            trkr = NULL;
            for (item = opal_list_get_first(&log->last_msg);
                 item != opal_list_get_end(&log->last_msg);
                 item = opal_list_get_next(item)) {
                tptr = (rmcast_seq_tracker_t*)item;
                if (channel == tptr->channel) {
                    trkr = tptr;
                    break;
                }
            }
            if (NULL == trkr) {
                /* new channel */
                trkr = OBJ_NEW(rmcast_seq_tracker_t);
                trkr->channel = channel;
                opal_list_append(&log->last_msg, &trkr->super);
                OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                                     "%s NEW CHANNEL: %d SENDER: %s SEQ %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     channel, ORTE_NAME_PRINT(&log->name), recvd_seq_num));
            } else if (ORTE_RMCAST_SEQ_INVALID != trkr->seq_num && !restart) {
                /* if this is a repeat msg, ignore it - we already have it,
                 * so there is nothing to deliver and nothing to re-request.
                 * A 0 arriving after ORTE_RMCAST_SEQ_MAX is deliberately not
                 * classified here; it is left to the gap check below, which
                 * has its own handling for that pair of values
                 */
                if (recvd_seq_num <= trkr->seq_num &&
                    !(ORTE_RMCAST_SEQ_MAX == trkr->seq_num && 0 == recvd_seq_num)) {
                    OPAL_OUTPUT_VERBOSE((1, orte_rmcast_base.rmcast_output,
                                         "%s Repeat msg %d on channel %d from source %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recvd_seq_num, channel,
                                         ORTE_NAME_PRINT(&name)));
                    goto cleanup;
                }
                if (1 != (recvd_seq_num - trkr->seq_num) ||
                    (ORTE_RMCAST_SEQ_MAX == trkr->seq_num && 0 != recvd_seq_num)) {
                    /* missing a message - request it */
                    OPAL_OUTPUT_VERBOSE((1, orte_rmcast_base.rmcast_output,
                                         "%s Missing msg %d (%d) on channel %d from source %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recvd_seq_num,
                                         trkr->seq_num, channel, ORTE_NAME_PRINT(&name)));
                    /* the request carries the channel and the last seq num we hold */
                    OBJ_CONSTRUCT(&alert, opal_buffer_t);
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
                        ORTE_ERROR_LOG(rc);
                        exit(1);
                    }
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &trkr->seq_num, 1, ORTE_RMCAST_SEQ_T))) {
                        ORTE_ERROR_LOG(rc);
                        exit(1);
                    }
                    if (0 > (rc = orte_rml.send_buffer(&name, &alert, ORTE_RML_TAG_MISSED_MSG, 0))) {
                        ORTE_ERROR_LOG(rc);
                        exit(1);
                    }
                    OBJ_DESTRUCT(&alert);
                    /* drop this out-of-order message; the tracker is left unchanged */
                    goto cleanup;
                }
                OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                                     "%s CHANNEL: %d SENDER: %s SEQ: %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     channel, ORTE_NAME_PRINT(&log->name), recvd_seq_num));
            }
            trkr->seq_num = recvd_seq_num;
        }
    }

    /* unpack the iovec vs buf flag */
    n=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &flag, &n, OPAL_INT8))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base:process_recv sender: %s channel: %d tag: %d %s seq_num: %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&name), channel, (int)tag,
                         (0 == flag) ? "iovecs" : "buffer", recvd_seq_num));
    
    
    /* find the recv for this channel, tag, and type */
    ORTE_ACQUIRE_THREAD(&orte_rmcast_base.main_ctl);
    for (item = opal_list_get_first(&orte_rmcast_base.recvs);
         item != opal_list_get_end(&orte_rmcast_base.recvs);
         item = opal_list_get_next(item)) {
        ptr = (rmcast_base_recv_t*)item;
        
        OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv checking channel %d tag %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (int)ptr->channel, (int)ptr->tag));
        
        if (channel != ptr->channel) {
            continue;
        }
        
        if (tag != ptr->tag && ORTE_RMCAST_TAG_WILDCARD != ptr->tag) {
            continue;
        }
        
        /* first match wins */
        ptr->seq_num = recvd_seq_num;
        recv = ptr;
        break;
    }

    if (NULL == recv) {
        /* recv not found - dump msg */
        ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
        goto cleanup;
    }

    if (!(ORTE_RMCAST_PERSISTENT & recv->flags)) {
        OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv removing non-persistent recv",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        opal_list_remove_item(&orte_rmcast_base.recvs, &recv->item);
    }
    ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base:process_recv delivering message to channel %d tag %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv->channel, (int)tag));
        
    /* we have a matching recv - unpack the data */
    if (0 == flag) {
        /* get the number of iovecs in the buffer */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &iovec_count, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* the count came off the wire - refuse nonsense before sizing
         * an allocation with it
         */
        if (iovec_count < 0) {
            rc = ORTE_ERR_BAD_PARAM;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* allocate the required space, zeroed so that every iov_base
         * starts out NULL and the cleanup code can safely free all
         * entries even if we bail out part way through the loop below
         */
        iovec_array = (struct iovec *)calloc(iovec_count, sizeof(struct iovec));
        if (NULL == iovec_array && 0 < iovec_count) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* unpack the iovecs */
        for (i=0; i < iovec_count; i++) {
            /* unpack the number of bytes in this iovec */
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &isz, &n, OPAL_INT32))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* a negative length would turn into a huge iov_len */
            if (isz < 0) {
                rc = ORTE_ERR_BAD_PARAM;
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            iovec_array[i].iov_base = NULL;
            iovec_array[i].iov_len = isz;
            if (0 < isz) {
                /* allocate the space */
                iovec_array[i].iov_base = (IOVBASE_TYPE*)malloc(isz);
                if (NULL == iovec_array[i].iov_base) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
                /* unpack the data */
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, iovec_array[i].iov_base, &isz, OPAL_UINT8))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }                    
            }
        }
        if (NULL != recv->cbfunc_iovec) {
            OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv delivering iovecs to channel %d tag %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv->channel, (int)tag));
            recv->cbfunc_iovec(ORTE_SUCCESS, recv->channel, recv->seq_num, tag,
                              &name, iovec_array, iovec_count, recv->cbdata);
        } else {
            /* if something is already present, then we have a problem */
            if (NULL != recv->iovec_array) {
                OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                     "%s rmcast:base:process_recv blocking recv already fulfilled",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto cleanup;
            }
            /* copy over the iovec array since it will be released by
             * the blocking recv
             */
            recv->iovec_array = (struct iovec *)malloc(iovec_count * sizeof(struct iovec));
            recv->iovec_count = iovec_count;
            for (i=0; i < iovec_count; i++) {
                recv->iovec_array[i].iov_base = (IOVBASE_TYPE*)malloc(iovec_array[i].iov_len);
                recv->iovec_array[i].iov_len = iovec_array[i].iov_len;
                memcpy(recv->iovec_array[i].iov_base, iovec_array[i].iov_base, iovec_array[i].iov_len);
            }
            /* release blocking recv */
            ORTE_WAKEUP_THREAD(&recv->ctl);
        }
    } else {
        if (NULL != recv->cbfunc_buffer) {
            OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv delivering buffer to channel %d tag %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv->channel, (int)tag));
            recv->cbfunc_buffer(ORTE_SUCCESS, recv->channel, recv->seq_num, tag,
                               &name, msg->buf, recv->cbdata);
        } else {
            /* if something is already present, then we have a problem */
            if (NULL != recv->buf) {
                OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                     "%s rmcast:base:process_recv blocking recv already fulfilled",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv copying buffer for blocking recv",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* copy the buffer across since it will be released
             * by the blocking recv
             */
            recv->buf = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(recv->buf, msg->buf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }                    
            /* release blocking recv */
            ORTE_WAKEUP_THREAD(&recv->ctl);
        }
    }
    
 cleanup:
    /* the local iovec copy is always ours to free - callbacks and
     * blocking recvs were handed the data or their own copy above
     */
    if (NULL != iovec_array) {
        for (i=0; i < iovec_count; i++) {
            free(iovec_array[i].iov_base);
        }
        free(iovec_array);
        iovec_array = NULL;
        iovec_count = 0;
    }
    if (NULL != msg) {
        OBJ_RELEASE(msg);
    }
    /* a non-persistent recv was taken off the list above, so drop it */
    if (NULL != recv && !(ORTE_RMCAST_PERSISTENT & recv->flags)) {
        OBJ_RELEASE(recv);
    }

    return;
}
Exemple #17
0
/*
 * Announce this process under the given app/version/release triplet.
 *
 * Records the string id, the announce callback, and this process's
 * triplet and group in orcm_pnp_base, moves any recvs pre-posted on the
 * triplet over to the channels, and then sends an announcement message
 * on the application public channel (apps) or the system channel
 * (everything else).
 *
 * Returns ORCM_ERR_BAD_PARAM if any of the three strings is NULL,
 * ORCM_ERR_COMM_DISABLED if communication is disabled, ORCM_SUCCESS if
 * an announcement was already made, and otherwise the result of packing
 * and sending the announcement.
 */
static int announce(const char *app,
                    const char *version,
                    const char *release,
                    orcm_pnp_announce_fn_t cbfunc)
{
    opal_buffer_t announcement;
    orcm_pnp_channel_t target;
    int rc;

    /* all three identity strings are mandatory */
    if (!(NULL != app && NULL != version && NULL != release)) {
        ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
        return ORCM_ERR_BAD_PARAM;
    }

    /* nothing can be sent while communication is switched off */
    if (!orcm_pnp_base.comm_enabled) {
        return ORCM_ERR_COMM_DISABLED;
    }

    /* serialize against other pnp operations */
    ORTE_ACQUIRE_THREAD(&local_thread);

    /* a non-NULL string id means we have been here already */
    if (NULL != orcm_pnp_base.my_string_id) {
        OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                             "%s pnp:default:announce called before",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        ORTE_RELEASE_THREAD(&local_thread);
        return ORCM_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((2, orcm_pnp_base.output,
                         "%s pnp:default:announce app %s version %s release %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         app, version, release));

    /* remember who we are - a non-NULL my_string_id is what
     * switches communication on
     */
    ORCM_CREATE_STRING_ID(&orcm_pnp_base.my_string_id, app, version, release);

    /* remember whom to tell about announcements */
    orcm_pnp_base.my_announce_cbfunc = cbfunc;

    /* fetch our own triplet, creating it if necessary */
    orcm_pnp_base.my_triplet = orcm_get_triplet(app, version, release, true);

    /* fetch our group within that triplet and fill in its identity */
    orcm_pnp_base.my_group = orcm_get_triplet_group(orcm_pnp_base.my_triplet,
                                                    ORTE_PROC_MY_NAME->jobid, true);
    orcm_pnp_base.my_group->uid = orcm_pnp_base.my_uid;
    orcm_pnp_base.my_group->input = orcm_pnp_base.my_input_channel->channel;
    orcm_pnp_base.my_group->output = orcm_pnp_base.my_output_channel->channel;

    /* recvs that were pre-posted against the triplet get copied
     * to the channel array now that the channels are known
     */
    orcm_pnp_base_check_pending_recvs(orcm_pnp_base.my_triplet,
                                      orcm_pnp_base.my_group);

    /* done with the triplet
     * NOTE(review): presumably orcm_get_triplet returned it with its
     * ctl held - confirm against that function
     */
    ORTE_RELEASE_THREAD(&orcm_pnp_base.my_triplet->ctl);

    /* shared state is settled - drop the lock before sending */
    ORTE_RELEASE_THREAD(&local_thread);

    /* build the announcement */
    OBJ_CONSTRUCT(&announcement, opal_buffer_t);

    rc = orcm_pnp_base_pack_announcement(&announcement, ORTE_NAME_INVALID);
    if (ORCM_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&announcement);
        return rc;
    }

    /* apps announce on the public channel, everyone else on the system channel */
    target = ORCM_PROC_IS_APP ? ORTE_RMCAST_APP_PUBLIC_CHANNEL
                              : ORTE_RMCAST_SYS_CHANNEL;

    /* multicast it (NULL recipient) */
    rc = default_output(target, NULL, ORCM_PNP_TAG_ANNOUNCE,
                        NULL, 0, &announcement);
    if (ORCM_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

    /* the buffer is ours to tear down regardless of the outcome */
    OBJ_DESTRUCT(&announcement);

    return rc;
}
Exemple #18
0
/*
 * Event callback that rescans the configuration directory and reconciles
 * orcm_cfgi_base.installed_apps with what is currently on disk.
 *
 * The fd/args/cbdata parameters are the usual event-callback arguments;
 * none of them is read here.
 *
 * Outline:
 *   1. parse every eligible file in the directory into a temporary
 *      found_apps array
 *   2. for each installed app:
 *        - if it is missing from found_apps, remove it and queue an
 *          ORCM_CFGI_KILL_JOB command for each of its instances
 *        - otherwise pick up changed limits, remove executables/versions
 *          that disappeared (queueing ORCM_CFGI_KILL_EXE for affected
 *          binaries), and move newly added executables/versions across
 *   3. anything left in found_apps is a new app - move it to
 *      installed_apps and flag it as modified
 *   4. call check_installed(false)
 *   5. re-arm the probe event (inotify if available and the directory
 *      exists, otherwise the timer)
 *
 * orcm_cfgi_base.ctl is held from entry until the function returns.
 */
static void check_config(int fd, short args, void *cbdata)
{
    DIR *dirp = NULL;
    struct dirent * dir_entry;
    int i, rc, n, j, k, m;
    char *fullpath;
    orcm_cfgi_app_t *app, *app2, *aptr;
    orcm_cfgi_run_t *run;
    orcm_cfgi_exec_t *exec, *exec2, *eptr;
    orcm_cfgi_version_t *vers, *vers2, *vptr;
    orcm_cfgi_bin_t *bin;
    opal_pointer_array_t found_apps;
    bool dir_found;
    orcm_cfgi_caddy_t *caddy;

    /* take control */
    ORTE_ACQUIRE_THREAD(&orcm_cfgi_base.ctl);

    OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                         "%s CHECKING CONFIG DIRECTORY %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         mca_orcm_cfgi_file_component.dir));

    /* Open the directory so we can get a listing */
    if (NULL == (dirp = opendir(mca_orcm_cfgi_file_component.dir))) {
        if (0 < opal_output_get_verbosity(orcm_cfgi_base.output)) {
            orte_show_help("help-cfgi-file.txt", "no-dir",
                           true, mca_orcm_cfgi_file_component.dir);
        }
        dir_found = false;
        /* found_apps has not been constructed yet - the restart label
         * sits after its destruction, so skipping there is safe
         */
        goto restart;
    }
    dir_found = true;

    /* setup the array of apps */
    OBJ_CONSTRUCT(&found_apps, opal_pointer_array_t);
    opal_pointer_array_init(&found_apps, 16, INT_MAX, 16);

    /* cycle thru the directory */
    while (NULL != (dir_entry = readdir(dirp))) {
        /* Skip the obvious - note that the first comparison only looks
         * at one character, so every name beginning with '.' is skipped
         */
        if (0 == strncmp(dir_entry->d_name, ".", strlen(".")) ||
            0 == strncmp(dir_entry->d_name, "..", strlen(".."))) {
            continue;
        }
        /* Skip editor-related files */
        if (NULL != strstr(dir_entry->d_name, ".swp") ||
            NULL != strstr(dir_entry->d_name, ".swx") ||
            NULL != strchr(dir_entry->d_name, '~')) {
            continue;
        }
        if ('#' == dir_entry->d_name[0]) {
            continue;
        }

        /* parse the file, adding all found apps to the array - a file
         * that fails to parse is reported (if verbose) and otherwise ignored
         */
        fullpath = opal_os_path(false, mca_orcm_cfgi_file_component.dir, dir_entry->d_name, NULL);
        if (ORCM_SUCCESS != (rc = parse_file(fullpath, &found_apps))) {
            OPAL_OUTPUT_VERBOSE((1, orcm_cfgi_base.output,
                                 "%s CANNOT PARSE FILE %s: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 dir_entry->d_name, ORTE_ERROR_NAME(rc)));
        }
        free(fullpath);
    }
    closedir(dirp);

    /* cycle thru the installed apps */
    for (i=0; i < orcm_cfgi_base.installed_apps.size; i++) {
        if (NULL == (app = (orcm_cfgi_app_t*)opal_pointer_array_get_item(&orcm_cfgi_base.installed_apps, i))) {
            continue;
        }
        app->modified = false;
        /* is this app present in the found apps? */
        app2 = NULL;
        for (j=0; j < found_apps.size; j++) {
            if (NULL == (aptr = (orcm_cfgi_app_t*)opal_pointer_array_get_item(&found_apps, j))) {
                continue;
            }
            if (0 == strcmp(app->application, aptr->application)) {
                app2 = aptr;
                /* remove it from the found_apps array as we will now process it */
                opal_pointer_array_set_item(&found_apps, j, NULL);
                break;
            }
        }
        if (NULL == app2) {
            OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                 "%s APP %s IS NO LONGER INSTALLED",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->application));
            /* no longer present - remove this object from the installed array */
            opal_pointer_array_set_item(&orcm_cfgi_base.installed_apps, app->idx, NULL);
            /* find all instances */
            for (j=0; j < app->instances.size; j++) {
                if (NULL == (run = (orcm_cfgi_run_t*)opal_pointer_array_get_item(&app->instances, j))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s IS NO LONGER INSTALLED - KILLING INSTANCE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, run->instance));
                /* remove it from the array */
                opal_pointer_array_set_item(&app->instances, j, NULL);
                run->app = NULL;
                run->app_idx = -1;
                /* delink all the binaries */
                for (k=0; k < run->binaries.size; k++) {
                    if (NULL == (bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, k))) {
                        continue;
                    }
                    bin->vers = NULL;
                    bin->exec = NULL;
                }
                /* kill the associated executing job, if any */
                caddy = OBJ_NEW(orcm_cfgi_caddy_t);
                caddy->cmd = ORCM_CFGI_KILL_JOB;
                /* retain the run object as it has -not- been removed from
                 * the running config
                 */
                OBJ_RETAIN(run);
                caddy->run = run;
                /* send it off to be processed - only the pointer value
                 * is written to the pipe
                 */
                opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
            }
            /* release it */
            OBJ_RELEASE(app);
            continue;
        }
        /* app was present - did we modify it */
        if (app->max_instances != app2->max_instances) {
            app->max_instances = app2->max_instances;
            app->modified = true;
        }
        /* did we remove any executables? */
        for (j=0; j < app->executables.size; j++) {
            if (NULL == (exec = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app->executables, j))) {
                continue;
            }
            /* is it present in the found apps */
            exec2 = NULL;
            for (k=0; k < app2->executables.size; k++) {
                if (NULL == (eptr = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app2->executables, k))) {
                    continue;
                }
                if (0 == strcmp(exec->appname, eptr->appname)) {
                    exec2 = eptr;
                    break;
                }
            }
            if (NULL == exec2) {
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s EXECUTABLE %s IS NO LONGER INSTALLED",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, exec->appname));
                /* this executable has been removed */
                opal_pointer_array_set_item(&app->executables, j, NULL);
                /* find all instances
                 * that use this executable and kill associated binaries
                 */
                for (k=0; k < app->instances.size; k++) {
                    if (NULL == (run = (orcm_cfgi_run_t*)opal_pointer_array_get_item(&app->instances, k))) {
                        continue;
                    }
                    /* search the binaries to see if they include this executable */
                    for (n=0; n < run->binaries.size; n++) {
                        if (NULL == (bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, n))) {
                            continue;
                        }
                        if (0 == strcmp(bin->appname, exec->appname)) {
                            OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                                 "%s APP %s EXECUTABLE %s IS NO LONGER INSTALLED - KILLING BINARY %s",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 app->application, exec->appname, bin->binary));
                            exec->total_procs -= bin->num_procs;
                            /* ensure we know it is no longer pointing to an installed exec/version */
                            bin->vers = NULL;
                            bin->exec = NULL;
                            /* kill the associated executing exec, if any */
                            caddy = OBJ_NEW(orcm_cfgi_caddy_t);
                            caddy->cmd = ORCM_CFGI_KILL_EXE;
                            /* retain the run object as it has -not- been removed from
                             * the running config
                             */
                            OBJ_RETAIN(run);
                            caddy->run = run;
                            /* send it off to be processed */
                            opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
                            /* only the first matching binary of each
                             * instance is handled
                             */
                            break;
                        }
                    }
                }
                OBJ_RELEASE(exec);
                continue;
            }
            /* kept the exec, but was it modified */
            if (exec->process_limit != exec2->process_limit) {
                exec->process_limit = exec2->process_limit;
                app->modified = true;
            }
            /* did we remove any versions */
            for (k=0; k < exec->versions.size; k++) {
                if (NULL == (vers = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec->versions, k))) {
                    continue;
                }
                /* is it present in the found app/exec */
                vers2 = NULL;
                for (n=0; n < exec2->versions.size; n++) {
                    if (NULL == (vptr = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec2->versions, n))) {
                        continue;
                    }
                    if (0 == strcmp(vptr->version, vers->version)) {
                        vers2 = vptr;
                        /* since we have this version, we can remove it from
                         * the found app
                         */
                        opal_pointer_array_set_item(&exec2->versions, n, NULL);
                        break;
                    }
                }
                if (NULL != vers2) {
                    /* still installed - the found copy (already delinked
                     * from exec2 above) is no longer needed
                     */
                    OBJ_RELEASE(vers2);
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s EXEC %s VERSION %s IS NO LONGER INSTALLED",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, exec->appname, vers->version));
                /* nope - been removed, so take it out of the array */
                opal_pointer_array_set_item(&exec->versions, k, NULL);
                /* find all instances and kill this version */
                for (m=0; m < app->instances.size; m++) {
                    if (NULL == (run = (orcm_cfgi_run_t*)opal_pointer_array_get_item(&app->instances, m))) {
                        continue;
                    }
                    /* search the binaries to see if they include this version */
                    for (n=0; n < run->binaries.size; n++) {
                        if (NULL == (bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, n))) {
                            continue;
                        }
                        if (0 == strcmp(bin->appname, exec->appname) &&
                            0 == strcmp(bin->version, vers->version)) {
                            OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                                 "%s APP %s EXECUTABLE %s VERSION %s IS NO LONGER INSTALLED - KILLING BINARY %s",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 app->application, exec->appname, vers->version, bin->binary));
                            exec->total_procs -= bin->num_procs;
                            /* ensure we know it is no longer pointing to an installed version */
                            bin->vers = NULL;
                            /* kill the associated executing exec, if any */
                            caddy = OBJ_NEW(orcm_cfgi_caddy_t);
                            caddy->cmd = ORCM_CFGI_KILL_EXE;
                            /* retain the run object as it has -not- been removed from
                             * the running config
                             */
                            OBJ_RETAIN(run);
                            caddy->run = run;
                            /* send it off to be processed */
                            opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
                            break;
                        }
                    }
                }
                /* cleanup */
                OBJ_RELEASE(vers);
            }
        }
        /* did we add any executables or versions */
        for (k=0; k < app2->executables.size; k++) {
            if (NULL == (exec2 = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app2->executables, k))) {
                continue;
            }
            exec = NULL;
            for (j=0; j < app->executables.size; j++) {
                if (NULL == (eptr = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app->executables, j))) {
                    continue;
                }
                if (0 == strcmp(eptr->appname, exec2->appname)) {
                    exec = eptr;
                    break;
                }
            }
            if (NULL == exec) {
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s ADDING EXECUTABLE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, exec2->appname));
                /* added this exec - just move it across */
                opal_pointer_array_set_item(&app2->executables, k, NULL);
                exec2->idx = opal_pointer_array_add(&app->executables, exec2);
                app->modified = true;
                continue;
            }
            /* exec already present, and we dealt with mods above - so
             * see if any versions were added.
             */
            for (j=0; j < exec2->versions.size; j++) {
                if (NULL == (vers = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec2->versions, j))) {
                    continue;
                }
                /* if already present, ignore */
                vers2 = NULL;
                for (n=0; n < exec->versions.size; n++) {
                    if (NULL == (vptr = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec->versions, n))) {
                        continue;
                    }
                    if (0 == strcmp(vptr->version, vers->version)) {
                        vers2 = vptr;
                        break;
                    }
                }
                if (NULL == vers2) {
                    OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                         "%s APP %s ADDING EXECUTABLE %s VERSION %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         app->application, exec2->appname, vers->version));
                    opal_pointer_array_set_item(&exec2->versions, j, NULL);
                    vers->exec = exec;
                    vers->idx = opal_pointer_array_add(&exec->versions, vers);
                    app->modified = true;
                } else {
                    /* duplicate of a version that is already installed.
                     * vers2 is the installed object and is still stored
                     * in exec->versions, so it must NOT be released here;
                     * drop the found copy instead, exactly as the
                     * "removed versions" pass above does
                     */
                    opal_pointer_array_set_item(&exec2->versions, j, NULL);
                    OBJ_RELEASE(vers);
                }
            }
        }
        /* done with this entry */
        OBJ_RELEASE(app2);
    }

    /* any added applications get handled now - anything still in found_apps
     * would have been added
     */
    for (j=0; j < found_apps.size; j++) {
        if (NULL == (aptr = (orcm_cfgi_app_t*)opal_pointer_array_get_item(&found_apps, j))) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                             "%s ADDING APP %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             aptr->application));
        /* just shift the entry to the installed_apps array */
        aptr->idx = opal_pointer_array_add(&orcm_cfgi_base.installed_apps, aptr);
        /* mark it as modified so it will be handled below */
        aptr->modified = true;
    }
    OBJ_DESTRUCT(&found_apps);

    /* check installed vs configd for anything needing starting,
     * but only check modified apps
     */
    check_installed(false);

 restart:
#ifdef HAVE_SYS_INOTIFY_H
    if (dir_found) {
        if (timer_in_use) {
            /* redefine the event to use inotify now
             * that the dir has been found
             */
            if (0 > (watch = inotify_add_watch(notifier, mca_orcm_cfgi_file_component.dir,
                                               IN_DELETE | IN_MODIFY | IN_MOVE))) {
                /* could not set the watch - fall back to polling.
                 * NOTE(review): notifier is closed here but timer_in_use
                 * stays true, so the next pass will call
                 * inotify_add_watch on the closed descriptor - confirm
                 * this is intended
                 */
                close(notifier);
                opal_event_evtimer_add(probe_ev, &probe_time);
            } else {
                opal_event_del(probe_ev);
                opal_event_set(opal_event_base, probe_ev, notifier,
                               OPAL_EV_READ|OPAL_EV_PERSIST, inotify_handler, NULL);
                opal_event_add(probe_ev, 0);
                timer_in_use = false;
            }
        } else {
            /* reset the event */
            opal_event_add(probe_ev, 0);
        }
    } else {
        /* restart the timer so we keep looking for it */
        opal_event_evtimer_add(probe_ev, &probe_time);
    }
#else
    /* restart the timer */
    opal_event_evtimer_add(probe_ev, &probe_time);
#endif
    /* release control */
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
}
Exemple #19
0
/*
 * Error-manager state update handler.
 *
 * job        jobid the update refers to (only used in the debug output)
 * jobstate   job state; consulted only when proc is NULL
 * proc       process the update refers to, or NULL for a job-level update
 * state      process state being reported
 * pid        process id (only used in the debug output)
 * exit_code  recorded on the daemon object when a heartbeat failure is
 *            processed
 *
 * Handled cases:
 *   - proc == NULL with ORTE_JOB_STATE_PROCS_MIGRATING: every job in that
 *     state is reset, re-mapped, and a launch message is sent to the daemons
 *   - ORTE_PROC_STATE_COMM_FAILED: ignored
 *   - ORTE_PROC_STATE_HEARTBEAT_FAILED: the daemon is marked dead, apps are
 *     notified of the procs on its node, and those procs are either
 *     recovered or removed
 *   - ORTE_PROC_STATE_RESTARTED: apps are notified and the procs on the
 *     restarted daemon's node are removed
 *
 * Returns ORTE_SUCCESS, ORTE_ERROR (unexpected job or proc state),
 * ORTE_ERR_NOT_FOUND (missing daemon/node/triplet/source), or the error
 * code from opal_dss.pack.
 *
 * The module-level ctl lock is held for the duration of the call and is
 * released on every return path.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    int rc=ORTE_SUCCESS, i;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_proc_t *pptr, *daemon;
    opal_buffer_t *notify;
    orcm_triplet_t *trp;
    orcm_source_t *src;
    bool procs_recovered;
    orte_job_t *jdt;
    uint16_t jfam;
    bool send_msg;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:update_state for job %s proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* protect against threads */
    ORTE_ACQUIRE_THREAD(&ctl);

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }
    
    /***   UPDATE COMMAND FOR A JOB   ***/
    if (NULL == proc) {
        /* should only get this if a daemon restarted and we need
         * to check for procs waiting to migrate
         */
        if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) {
            /* we should never get this situation */
            opal_output(0, "%s UNKNOWN JOB ERROR %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_job_state_to_str(jobstate));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERROR;
        }
        /* cycle thru all known jobs looking for those with procs
         * awaiting resources to migrate
         */
        for (i=0; i < orte_job_data->size; i++) {
            if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
                continue;
            }
            if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) {
                continue;
            }
            /* reset the job */
            orte_plm_base_reset_job(jdt);

            /* map the job again */
            if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) {
                ORTE_ERROR_LOG(rc);
                continue;
            }
            /* launch any procs that could be mapped - note that not
             * all procs that were waiting for migration may have
             * been successfully mapped, so this could in fact
             * result in no action by the daemons
             */
            notify = OBJ_NEW(opal_buffer_t);
            /* indicate the target DVM */
            jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
            opal_dss.pack(notify, &jfam, 1, OPAL_UINT16);

            /* get the launch data - a failure here is logged and stops
             * the scan, but is still reported to the caller as success
             */
            if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return ORTE_SUCCESS;
            }
            /* send it to the daemons */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                                                         NULL, ORCM_PNP_TAG_COMMAND,
                                                         NULL, 0, notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }


    /**** DEAL WITH INDIVIDUAL PROCS ****/

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:sched got state %s for proc %s pid %d exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid, exit_code));
 
    /* if this was a failed comm or heartbeat */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* ignore this */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* ensure that the heartbeat system knows to ignore this proc
         * from this point forward
         */
        daemon->beat = 0;
        /* if we have already heard about this proc, ignore repeats */
        if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) {
            /* already heard */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        }
#if 0
        /* delete the route */
        orte_routed.delete_route(proc);
        /* purge the oob */
        orte_rml.purge(proc);
#endif
        /* get the triplet/source and mark this source as "dead" - both
         * objects have their ctl released below once we are done with them
         */
        if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) {
            opal_output(0, "%s CANNOT FIND DAEMON TRIPLET",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        if (NULL == (src = orcm_get_source(trp, proc, false))) {
            opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            ORTE_RELEASE_THREAD(&trp->ctl);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        src->alive = false;
        ORTE_RELEASE_THREAD(&src->ctl);
        ORTE_RELEASE_THREAD(&trp->ctl);

        /* notify all apps immediately */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* the buffer was never handed off - don't leak it */
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* reset the proc stats */
            OBJ_DESTRUCT(&pptr->stats);
            OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t);
            /* since we added something, need to send msg */
            send_msg = true;
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR, NULL, 0,
                                                         notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* record that the daemon died */
        daemon->state = state;
        daemon->exit_code = exit_code;
        daemon->pid = 0;
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* from here on work with the node recorded on the daemon object */
        node = daemon->node;
        if (NULL == node) {
            opal_output(0, "%s Detected failure of daemon %s on unknown node",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            /* can't do anything further */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;            
        } else {
            opal_output(0, "%s Detected failure of daemon %s on node %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        (NULL == node->name) ? "UNKNOWN" : node->name);
        }
        /* see if any usable daemons are left alive - the scan starts at
         * index 2 (NOTE(review): presumably the first two entries are
         * never eligible to host recovered procs - confirm)
         */
        procs_recovered = false;
        for (i=2; i < daemon_job->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) {
                continue;
            }
            /* at least one alive! recover procs from the failed one */
            recover_procs(proc);
            procs_recovered = true;
            break;
        }
        if (!procs_recovered) {
            daemon->node = NULL;
            node->state = ORTE_NODE_STATE_DOWN;
            node->daemon = NULL;
            /* mark all procs on this node as having terminated */
            for (i=0; i < node->procs->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                /* get the job data object for this process */
                if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                    /* major problem */
                    opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&pptr->name), i,
                                orte_proc_state_to_str(pptr->state));
                    continue;
                }
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING PROC %s FROM NODE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pptr->name), node->name));
                app->num_procs--;
                /* one release for the job's procs array ... */
                opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
                OBJ_RELEASE(pptr);
                /* clean it off the node */
                opal_pointer_array_set_item(node->procs, i, NULL);
                node->num_procs--;
                /* maintain acctg - ... and one for the node's procs array */
                OBJ_RELEASE(pptr);
                /* see if job is empty */
                jdt->num_terminated++;
                if (jdt->num_procs <= jdt->num_terminated) {
                    OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                         "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_JOBID_PRINT(jdt->jobid)));
                    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                    OBJ_RELEASE(jdt);
                }
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_RESTARTED == state) {
        OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                             "%s RESTART OF DAEMON %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if apps were on that node, notify all apps immediately that
         * those procs have failed
         */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* the buffer was never handed off - don't leak it */
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* since we added something, we need to send msg */
            send_msg = true;
            /* remove the proc from the app so that it will get
             * restarted when we re-activate the config
             */
            if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                continue;
            }
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                 "%s REMOVING PROC %s FROM NODE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pptr->name), node->name));
            app->num_procs--;
            opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
            OBJ_RELEASE(pptr);
            /* clean it off the node */
            opal_pointer_array_set_item(node->procs, i, NULL);
            node->num_procs--;
            /* maintain acctg */
            OBJ_RELEASE(pptr);
            /* see if job is empty */
            jdt->num_terminated++;
            if (jdt->num_procs <= jdt->num_terminated) {
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdt->jobid)));
                opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                OBJ_RELEASE(jdt);
            }
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR, NULL, 0,
                                                         notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* don't restart procs - we'll do that later after
         * we allow time for multiple daemons to restart
         */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /* to arrive here is an error */
    opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                orte_proc_state_to_str(state),
                ORTE_NAME_PRINT(proc));
    /* the lock taken at the top must be dropped on this path as well */
    ORTE_RELEASE_THREAD(&ctl);
    return ORTE_ERROR;

}
Exemple #20
0
/*
 * Open a multicast channel, reusing an existing one when possible.
 *
 * The list orte_rmcast_base.channels is searched (under main_ctl) for an
 * entry whose channel number equals "channel" or whose name matches
 * "name" case-insensitively. If one is found, setup_channel() is called
 * on it for the requested direction. Otherwise a new channel object is
 * created, filled in, appended to the list, recorded as my input and/or
 * output channel if "direction" requests it, and then set up.
 *
 * channel    channel number to open
 * name       name of the channel (duplicated into the new channel object)
 * network    network tuple to parse, or NULL to use
 *            orte_rmcast_base.xmit_network
 * port       port to use; a negative value selects a port from the
 *            configured port ranges based on the channel number
 * interface  interface tuple to parse, or NULL to use
 *            orte_rmcast_base.interface
 * direction  bit flags (ORTE_RMCAST_RECV / XMIT / MY_INPUT / MY_OUTPUT)
 *
 * Returns ORTE_SUCCESS on success, ORTE_ERR_SILENT if the network or
 * interface tuple cannot be parsed (a help message is shown), ORTE_ERROR
 * if no port can be assigned, or the error returned by setup_channel().
 */
static int open_channel(orte_rmcast_channel_t channel, char *name,
                        char *network, int port, char *interface, uint8_t direction)
{
    opal_list_item_t *item;
    rmcast_base_channel_t *nchan, *chan;
    uint32_t netaddr=0, netmask=0, intr=0;
    int rc;
    unsigned int i, n, start, end, range;
    bool port_assigned;
    
    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s opening channel %d for %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), channel, name));

    /* parse the network, if provided */
    if (NULL != network) {
        if (ORTE_SUCCESS != (rc = opal_iftupletoaddr(network, &netaddr, &netmask))) {
            orte_show_help("help-rmcast-base.txt", "invalid-net-mask", true, network, ORTE_ERROR_NAME(rc));
            return ORTE_ERR_SILENT;
        }        
    }
    
    /* parse the interface, if provided */
    if (NULL != interface) {
        if (ORTE_SUCCESS != (rc = opal_iftupletoaddr(interface, &intr, NULL))) {
            orte_show_help("help-rmcast-base.txt", "invalid-net-mask", true, interface, ORTE_ERROR_NAME(rc));
            return ORTE_ERR_SILENT;
        }        
    }
    
    /* see if this name has already been assigned a channel on the specified network */
    OPAL_OUTPUT_VERBOSE((7, orte_rmcast_base.rmcast_output,
                         "%s open_channel: searching for %s:%d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), name, channel));
                        
    chan = NULL;
    /* the channel list is protected by main_ctl - every return path
     * below this point must release it exactly once
     */
    ORTE_ACQUIRE_THREAD(&orte_rmcast_base.main_ctl);
    for (item = opal_list_get_first(&orte_rmcast_base.channels);
         item != opal_list_get_end(&orte_rmcast_base.channels);
         item = opal_list_get_next(item)) {
        nchan = (rmcast_base_channel_t*)item;
        
        /* report the entry being examined, not the one requested */
        OPAL_OUTPUT_VERBOSE((7, orte_rmcast_base.rmcast_output,
                             "%s open_channel: channel %s:%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             nchan->name, nchan->channel));

        if (nchan->channel == channel ||
            0 == strcasecmp(nchan->name, name)) {
            chan = nchan;
            break;
        }
    }
    
    if (NULL != chan) {
        /* already exists - check that the requested
         * sockets are setup
         */
        OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                             "%s rmcast:udp using existing channel %s:%d network %03d.%03d.%03d.%03d port %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             chan->name, chan->channel,
                             OPAL_IF_FORMAT_ADDR(chan->network),
                             (int)chan->port));
        
        if (ORTE_SUCCESS != (rc = setup_channel(chan, direction))) {
            ORTE_ERROR_LOG(rc);
            ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
            return rc;
        }
        ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
        return ORTE_SUCCESS;
    }
    
    /* we didn't find an existing match, so create a new channel */
    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s creating new channel %s for %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_rmcast_base_print_channel(channel), name));

    chan = OBJ_NEW(rmcast_base_channel_t);
    chan->name = strdup(name);
    chan->channel = channel;
    /* if we were not given a network, use the default */
    if (NULL == network) {
        chan->network = orte_rmcast_base.xmit_network;
    } else {
        chan->network = netaddr;
    }
    /* if we were not given an interface, use the default */
    if (NULL == interface) {
        chan->interface = orte_rmcast_base.interface;
    } else {
        chan->interface = intr;
    }
    /* if we were not given a port, use a default one */
    if (port < 0) {
        /* cycle thru the port ranges until we find the
         * port corresponding to this channel number
         */
        n=0;
        port_assigned = false;
        for (i=0; NULL != orte_rmcast_base.ports.start[i]; i++) {
            /* how many ports are in this range? */
            start = strtol(orte_rmcast_base.ports.start[i], NULL, 10);
            end = strtol(orte_rmcast_base.ports.end[i], NULL, 10);
            range = end - start + 1;
            if (chan->channel < (n + range)) {
                /* take the corresponding port */
                chan->port = start + (chan->channel - n);
                port_assigned = true;
                break;
            }
            n += range;
        }
        if (!port_assigned) {
            opal_output(0, "%s CANNOT ASSIGN PORT TO CHANNEL %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        orte_rmcast_base_print_channel(chan->channel));
            /* the new channel was never added to the list, so drop it
             * here, and give back the lock so later callers don't hang
             */
            OBJ_RELEASE(chan);
            ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
            return ORTE_ERROR;
        }
    } else {
        chan->port = port;
    }
    opal_list_append(&orte_rmcast_base.channels, &chan->item);
    ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
    
    /* if this is my input, set that value */
    if (ORTE_RMCAST_MY_INPUT & direction) {
        orte_rmcast_base.my_input_channel = chan;
    }

    /* if this is my output, set that value */
    if (ORTE_RMCAST_MY_OUTPUT & direction) {
        orte_rmcast_base.my_output_channel = chan;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:udp opening new channel %s:%s network %03d.%03d.%03d.%03d port %d for%s%s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         chan->name, orte_rmcast_base_print_channel(chan->channel),
                         OPAL_IF_FORMAT_ADDR(chan->network),
                         (int)chan->port,
                         (ORTE_RMCAST_RECV & direction) ? " RECV" : " ",
                         (ORTE_RMCAST_XMIT & direction) ? " XMIT" : " "));

    if (ORTE_SUCCESS != (rc = setup_channel(chan, direction))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    return ORTE_SUCCESS;
}
Exemple #21
0
/*
 * Handle a command message received from a tool.
 *
 * The buffer is expected to carry a job family (uint16), a tool command
 * flag, and - for start/stop commands - a packed job object. A response
 * buffer holding the command flag followed by the result code is always
 * sent back to the sender so the tool does not hang, even when unpacking
 * or processing fails.
 *
 * START: the job is unpacked, checked, and handed to the launch pipe in a
 *        caddy marked ORCM_CFGI_SPAWN.
 * STOP:  the job description is unpacked and matched against the entries
 *        of orte_job_data by instance (when both sides have one) or
 *        otherwise by name; a match is handed to the launch pipe in a
 *        caddy marked ORCM_CFGI_KILL_JOB.
 *
 * status, tag, msg, count and cbdata are not used by this function.
 */
static void tool_messages(int status,
                          orte_process_name_t *sender,
                          orcm_pnp_tag_t tag,
                          struct iovec *msg,
                          int count,
                          opal_buffer_t *buffer,
                          void *cbdata)
{
    int32_t rc=ORCM_SUCCESS, cnt, idx;
    orte_job_t *jdata, *request, *candidate;
    uint16_t jfam;
    orcm_tool_cmd_t flag=ORCM_TOOL_ILLEGAL_CMD;
    opal_buffer_t *response;
    orcm_cfgi_caddy_t *caddy;
    const char *have, *want;

    /* wait for any existing action to complete */
    ORTE_ACQUIRE_THREAD(&orcm_cfgi_base.ctl);

    OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                         "%s cfgi:tool released to process cmd",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* the response is sent no matter what happens below */
    response = OBJ_NEW(opal_buffer_t);

    /* pull the job family, then the cmd - stop at the first failure */
    cnt = 1;
    rc = opal_dss.unpack(buffer, &jfam, &cnt, OPAL_UINT16);
    if (ORTE_SUCCESS == rc) {
        cnt = 1;
        rc = opal_dss.unpack(buffer, &flag, &cnt, ORCM_TOOL_CMD_T);
    }
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        opal_dss.pack(response, &flag, 1, ORCM_TOOL_CMD_T);
        goto cleanup;
    }

    /* echo the cmd flag back to the tool */
    opal_dss.pack(response, &flag, 1, ORCM_TOOL_CMD_T);

    /* commands aimed at a different DVM are rejected */
    if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != jfam) {
        opal_output(0, "%s cfgi:tool CMD NOT FOR ME!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        rc = ORTE_ERROR;
        goto cleanup;
    }

    if (flag == ORCM_TOOL_START_CMD) {
        OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                             "%s spawn cmd from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));

        /* extract the job to be started */
        cnt = 1;
        rc = opal_dss.unpack(buffer, &jdata, &cnt, ORTE_JOB);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* validate it before asking for a launch */
        rc = orcm_cfgi_base_check_job(jdata);
        if (ORCM_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* queue the spawn request - jdata is deliberately not retained,
         * the base functions decide whether to keep it
         */
        caddy = OBJ_NEW(orcm_cfgi_caddy_t);
        caddy->jdata = jdata;
        caddy->cmd = ORCM_CFGI_SPAWN;
        opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);

    } else if (flag == ORCM_TOOL_STOP_CMD) {
        /* extract the description of the job to be stopped */
        cnt = 1;
        rc = opal_dss.unpack(buffer, &request, &cnt, ORTE_JOB);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* look for the job - the request must carry an instance or a
         * name, otherwise there is nothing to match on
         */
        jdata = NULL;
        if (NULL != request->instance || NULL != request->name) {
            for (idx = 0; NULL == jdata && idx < orte_job_data->size; idx++) {
                candidate = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, idx);
                if (NULL == candidate) {
                    continue;
                }
                /* instance takes precedence over name when both sides have one */
                if (NULL != candidate->instance && NULL != request->instance) {
                    have = candidate->instance;
                    want = request->instance;
                } else if (NULL != candidate->name && NULL != request->name) {
                    have = candidate->name;
                    want = request->name;
                } else {
                    continue;
                }
                if (0 == strcmp(have, want)) {
                    jdata = candidate;
                }
            }
        }
        if (NULL == jdata) {
            /* nothing to match on, or no such job */
            rc = ORTE_ERR_BAD_PARAM;
            OBJ_RELEASE(request);
            goto cleanup;
        }

        /* queue the termination request */
        caddy = OBJ_NEW(orcm_cfgi_caddy_t);
        caddy->jdata = jdata;
        caddy->cmd = ORCM_CFGI_KILL_JOB;
        opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);

        /* the unpacked description is no longer needed */
        OBJ_RELEASE(request);

    } else {
        opal_output(0, "%s: UNKNOWN TOOL CMD FLAG %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)flag);
    }

 cleanup:
    /* append the result of the cmd */
    opal_dss.pack(response, &rc, 1, OPAL_INT);
    /* let the next action proceed */
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
    rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                            sender, ORCM_PNP_TAG_TOOL,
                            NULL, 0, response, cbfunc, NULL);
    if (ORCM_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(response);
    }
}