Exemplo n.º 1
0
static void
rml_oob_recv_route_callback(int status,
                            struct orte_process_name_t* peer,
                            struct iovec* iov,
                            int count,
                            orte_rml_tag_t tag,
                            void *cbdata)
{
    orte_rml_oob_msg_header_t *hdr = 
        (orte_rml_oob_msg_header_t*) iov[0].iov_base;
    int real_tag;
    int ret;
    orte_process_name_t next, origin;
    struct iovec *new_iov;

    /* BWB -- propogate errors here... */
    assert(status >= 0);

    ORTE_RML_OOB_MSG_HEADER_NTOH(*hdr);

    origin = hdr->origin;

    next = orte_routed.get_route(&hdr->destination);
    if (next.vpid == ORTE_VPID_INVALID) {
        opal_output(0, "%s:route_callback tried routing message from %s to %s:%d, can't find route",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&origin),
                    ORTE_NAME_PRINT(&hdr->destination),
                    hdr->tag);
        opal_backtrace_print(stderr);
        orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        return;
    }

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
        opal_output(0, "%s:route_callback trying to get message from %s to %s:%d, routing loop",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&origin),
                    ORTE_NAME_PRINT(&hdr->destination),
                    hdr->tag);
        opal_backtrace_print(stderr);
        orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
    }

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, &hdr->destination)) {
        real_tag = hdr->tag;
    } else {
        real_tag = ORTE_RML_TAG_RML_ROUTE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                         "%s routing message from %s for %s to %s (tag: %d)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&hdr->origin),
                         ORTE_NAME_PRINT(&hdr->destination),
                         ORTE_NAME_PRINT(&next),
                         hdr->tag));

    ORTE_RML_OOB_MSG_HEADER_HTON(*hdr);

    /* NTH: fix potential race condition. oob may modify iov before
       the oob send completes */
    new_iov = (struct iovec*) calloc (count, sizeof (struct iovec));
    if (NULL == new_iov) {
	opal_output (0, "%s:route_callback malloc error!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        opal_backtrace_print(stderr);
        orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);    
    }

    memcpy (new_iov, iov, count * sizeof (struct iovec));

    ret = orte_rml_oob_module.active_oob->oob_send_nb(&next,
                                                      &origin,
                                                      new_iov,
                                                      count,
                                                      real_tag,
                                                      0,
                                                      rml_oob_recv_route_send_callback,
                                                      NULL);

    if (ORTE_SUCCESS != ret) {
        if (ORTE_ERR_ADDRESSEE_UNKNOWN == ret) {
            /* no route -- queue and hope we find a route */
            orte_rml_oob_queued_msg_t *qmsg = OBJ_NEW(orte_rml_oob_queued_msg_t);
            OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                                 "%s: no OOB information for %s.  Queuing for later.",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&next)));
            ORTE_RML_OOB_MSG_HEADER_NTOH(*hdr);
            qmsg->payload[0].iov_base = (IOVBASE_TYPE*) malloc(iov[0].iov_len);
            if (NULL == qmsg->payload[0].iov_base) abort();
            qmsg->payload[0].iov_len = iov[0].iov_len;
            memcpy(qmsg->payload[0].iov_base, iov[0].iov_base, iov[0].iov_len);
            OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
            opal_list_append(&orte_rml_oob_module.queued_routing_messages,
                             &qmsg->super);
            if (1 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
                opal_event_evtimer_add(orte_rml_oob_module.timer_event,
                                       &orte_rml_oob_module.timeout);
            }
            OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
        } else {
            opal_output(0,
                        "%s failed to send message to %s: %s (rc = %d)",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&next),
                        opal_strerror(ret),
                        ret);
            orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        }
    }
}
Exemplo n.º 2
0
int
orte_rml_oob_send(orte_process_name_t* peer,
                  struct iovec *iov,
                  int count,
                  int tag,
                  int flags)
{
    orte_rml_oob_msg_t *msg = OBJ_NEW(orte_rml_oob_msg_t);
    int ret;
    orte_process_name_t next;
    int real_tag;
    int i;
    int bytes = 0;

    if (ORTE_RML_TAG_INVALID == tag) {
        /* cannot send to an invalid tag */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }
    
    msg->msg_type = ORTE_RML_BLOCKING_SEND;
    flags |= ORTE_RML_FLAG_RECURSIVE_CALLBACK;

    next = orte_routed.get_route(peer);
    if (next.vpid == ORTE_VPID_INVALID) {
        ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
        opal_output(0, "%s could not get route to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer));
        return ORTE_ERR_ADDRESSEE_UNKNOWN;
    }
    msg->msg_data = (struct iovec *) malloc(sizeof(struct iovec) * (count + 1));
    msg->msg_data[0].iov_base = (ompi_iov_base_ptr_t)&msg->msg_header;
    msg->msg_data[0].iov_len = sizeof(orte_rml_oob_msg_header_t);
    bytes += msg->msg_data[0].iov_len;
    for (i = 0 ; i < count ; ++i) {
        msg->msg_data[i + 1].iov_base = iov[i].iov_base;
        msg->msg_data[i + 1].iov_len = iov[i].iov_len;
        bytes += msg->msg_data[i + 1].iov_len;
    }

    msg->msg_header.origin = *ORTE_PROC_MY_NAME;
    msg->msg_header.destination = *peer;
    msg->msg_header.tag = tag;
    ORTE_RML_OOB_MSG_HEADER_HTON(msg->msg_header);

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, peer)) {
        real_tag = tag;
    } else {
        real_tag = ORTE_RML_TAG_RML_ROUTE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                         "rml_send %s -> %s (router %s, tag %d, %d)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(peer),
                         ORTE_NAME_PRINT(&next),
                         tag,
                         real_tag));
    ret = orte_rml_oob_module.active_oob->oob_send_nb(&next, 
                                                      ORTE_PROC_MY_NAME,
                                                      msg->msg_data,
                                                      count + 1,
                                                      real_tag, 
                                                      flags,
                                                      orte_rml_send_msg_callback,
                                                      msg);
    if (ret < 0) {
        ORTE_ERROR_LOG(ret);
        opal_output(0, "%s attempted to send to %s: tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                             ORTE_NAME_PRINT(&next), (int)real_tag);
        goto cleanup;
    }

    OPAL_THREAD_LOCK(&msg->msg_lock);
    while (!msg->msg_complete) {
        opal_condition_wait(&msg->msg_cond, &msg->msg_lock);
    }
    ret = msg->msg_status;
    OPAL_THREAD_UNLOCK(&msg->msg_lock);

 cleanup:
    OBJ_RELEASE(msg);

    return ret;
}
Exemplo n.º 3
0
static void 
rml_oob_queued_progress(int fd, short event, void *arg)
{
    orte_rml_oob_queued_msg_t *qmsg;
    orte_rml_oob_msg_header_t *hdr;
    int real_tag;
    int ret;
    orte_process_name_t next, origin;

    while (true) {
        OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
        qmsg = (orte_rml_oob_queued_msg_t*) opal_list_remove_first(&orte_rml_oob_module.queued_routing_messages);
        OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
        if (NULL == qmsg) break;

        hdr = (orte_rml_oob_msg_header_t*) qmsg->payload[0].iov_base;
        origin = hdr->origin;

        next = orte_routed.get_route(&hdr->destination);
        if (next.vpid == ORTE_VPID_INVALID) {
            opal_output(0,
                        "%s:queued progress tried routing message from %s to %s:%d, can't find route",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&hdr->origin),
                        ORTE_NAME_PRINT(&hdr->destination),
                        hdr->tag);
            opal_backtrace_print(stderr);
            orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        }

        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, ORTE_PROC_MY_NAME)) {
            opal_output(0, "%s:queued progress trying to get message from %s to %s:%d, routing loop",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&hdr->origin),
                        ORTE_NAME_PRINT(&hdr->destination),
                        hdr->tag);
            opal_backtrace_print(stderr);
            orte_errmgr.abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
        }

        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, &hdr->destination)) {
            real_tag = hdr->tag;
        } else {
            real_tag = ORTE_RML_TAG_RML_ROUTE;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                             "%s routing message from %s for %s to %s (tag: %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&hdr->origin),
                             ORTE_NAME_PRINT(&hdr->destination),
                             ORTE_NAME_PRINT(&next),
                             hdr->tag));

        ORTE_RML_OOB_MSG_HEADER_HTON(*hdr);

        ret = orte_rml_oob_module.active_oob->oob_send_nb(&next,
                                                          &origin,
                                                          qmsg->payload,
                                                          1,
                                                          real_tag,
                                                          0,
                                                          rml_oob_recv_route_queued_send_callback,
                                                          qmsg);

        if (ORTE_SUCCESS != ret) {
            if (ORTE_ERR_ADDRESSEE_UNKNOWN == ret) {
                /* still no route -- try again */
                ORTE_RML_OOB_MSG_HEADER_NTOH(*hdr);
                OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock);
                opal_list_append(&orte_rml_oob_module.queued_routing_messages,
                                 &qmsg->super);
                if (1 == opal_list_get_size(&orte_rml_oob_module.queued_routing_messages)) {
                    opal_event_evtimer_add(orte_rml_oob_module.timer_event,
                                           &orte_rml_oob_module.timeout);
                }
                OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock);
            } else {
                opal_output(0,
                            "%s failed to send message from %s to %s:%d %s (rc = %d)",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&next),
                            ORTE_NAME_PRINT(&origin),
                            real_tag,
                            ORTE_ERROR_NAME(ret),
                            ret);
                abort();
            }
        }
    }
}
Exemplo n.º 4
0
int
orte_rml_oob_send_buffer_nb(orte_process_name_t* peer,
                            opal_buffer_t* buffer,
                            orte_rml_tag_t tag,
                            int flags,
                            orte_rml_buffer_callback_fn_t cbfunc,
                            void* cbdata)
{
    orte_rml_oob_msg_t *msg = OBJ_NEW(orte_rml_oob_msg_t);
    void *dataptr;
    orte_std_cntr_t datalen;
    int ret;
    int real_tag;
    orte_process_name_t next;
    int bytes = 0;

    if (ORTE_RML_TAG_INVALID == tag) {
        /* cannot send to an invalid tag */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }
    
    /* first build iovec from buffer information */
    ret = opal_dss.unload(buffer, &dataptr, &datalen);
    if (ORTE_SUCCESS != ret) {
        OBJ_RELEASE(msg);
        return ret;
    }
    opal_dss.load(buffer, dataptr, datalen);

    msg->msg_type = ORTE_RML_NONBLOCKING_BUFFER_SEND;
    msg->msg_cbfunc.buffer = cbfunc;
    msg->msg_cbdata = cbdata;
    msg->user_buffer = buffer;

    msg->msg_data = (struct iovec *) malloc(sizeof(struct iovec) * 2);

    next = orte_routed.get_route(peer);
    if (next.vpid == ORTE_VPID_INVALID) {
        ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
        opal_output(0, "%s unable to find address for %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(peer));
        return ORTE_ERR_ADDRESSEE_UNKNOWN;
    }

    msg->msg_data[0].iov_base = (ompi_iov_base_ptr_t)&msg->msg_header;
    msg->msg_data[0].iov_len = sizeof(orte_rml_oob_msg_header_t);
    bytes += msg->msg_data[0].iov_len;

    msg->msg_data[1].iov_base = (IOVBASE_TYPE*)dataptr;
    msg->msg_data[1].iov_len  = datalen;
    
    bytes += msg->msg_data[1].iov_len;

    msg->msg_header.origin = *ORTE_PROC_MY_NAME;
    msg->msg_header.destination = *peer;
    msg->msg_header.tag = tag;
    ORTE_RML_OOB_MSG_HEADER_HTON(msg->msg_header);

    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, peer)) {
        real_tag = tag;
    } else {
        real_tag = ORTE_RML_TAG_RML_ROUTE;
    }

    OBJ_RETAIN(buffer);

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output,
                         "rml_send_buffer_nb %s -> %s (router %s, tag %d, %d)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(peer),
                         ORTE_NAME_PRINT(&next),
                         tag, real_tag));

    ret = orte_rml_oob_module.active_oob->oob_send_nb(&next,
                                                      ORTE_PROC_MY_NAME,
                                                      msg->msg_data,
                                                      2,
                                                      real_tag,
                                                      flags,
                                                      orte_rml_send_msg_callback,
                                                      msg);

    if (ret < 0) {
        ORTE_ERROR_LOG(ret);
        opal_output(0, "%s attempted to send to %s: tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                             ORTE_NAME_PRINT(&next), (int)real_tag);
        OBJ_RELEASE(msg);
        OBJ_RELEASE(buffer);
    }

    return ret;
}