Example #1
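mca_oob_ud_req_complete() from ORTE's InfiniBand UD OOB transport (Open MPI): it releases the request's queue pair and registered memory, records the send status, and for receives either posts the data to the RML (when this process is the target) or wraps it in an orte_rml_send_t and hands it back to the OOB for routing.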
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc)
{
    int i;
    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:req_complete %s request %p completed with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (req->type == MCA_OOB_UD_REQ_SEND) ? "SEND":"RECV", (void *) req, rc);

    if (NULL != req->req_qp) {
        (void) mca_oob_ud_qp_data_release (req->req_qp);
        req->req_qp = NULL;
    }

    /* deregister memory *before* handing it to the callback */
    MCA_OOB_UD_REQ_DEREG_MR(req);

    switch (req->type) {
    case MCA_OOB_UD_REQ_SEND:
        if (req->req_data_type != MCA_OOB_UD_REQ_TR) {
            req->rml_msg->status = rc;
        }
        break;
    case MCA_OOB_UD_REQ_RECV:
        if ((req->req_target.jobid == ORTE_PROC_MY_NAME->jobid) &&
            (req->req_target.vpid == ORTE_PROC_MY_NAME->vpid)) {
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                "%s DELIVERING TO RML",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                /* size the buffer from the summed iovec lengths - sizing it
                 * from the iovec count would under-allocate for large entries */
                int datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                char *data = (char *) malloc (datalen);
                datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num, data, datalen);
                free(data);
            } else {
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num,
                                      req->req_data.buf.p, req->req_data.buf.size);
            }
        } else {
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s UD PROMOTING ROUTED MESSAGE FOR %s TO OOB",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&req->req_target));

            orte_rml_send_t *snd = OBJ_NEW(orte_rml_send_t);
            snd->dst = req->req_target;
            snd->origin =  req->req_origin;
            snd->tag = req->req_tag;
            snd->seq_num = req->req_seq_num;
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                /* same pattern as above: size from the summed iovec lengths */
                int datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                char *data = (char *) malloc (datalen);
                datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base, req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                snd->data = data;
                snd->count = datalen;
            } else {
                char *data = (char *)calloc(req->req_data.buf.size, sizeof(char));
                memcpy (data, req->req_data.buf.p, req->req_data.buf.size);
                snd->data = data;
                snd->count = req->req_data.buf.size;
            }
            snd->cbfunc.iov = NULL;
            snd->cbdata = NULL;
            /* activate the OOB send state */
            ORTE_OOB_SEND(snd);
        }
        break;
    default:
        break;
    }

    mca_oob_ud_req_return (req);
}
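Both data paths above flatten the request's iovec array into one contiguous buffer before handing it off. A minimal standalone sketch of that pattern (flatten_iov is a hypothetical helper, not part of ORTE); the key point is that the allocation is sized from the summed iov_len values:

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Flatten an iovec array into a single contiguous buffer.
 * Returns 0 on success; the caller owns and frees *out. */
static int flatten_iov (const struct iovec *uiov, int count,
                        char **out, size_t *out_len)
{
    size_t total = 0;
    int i;

    /* first pass: compute the total payload size */
    for (i = 0 ; i < count ; ++i) {
        total += uiov[i].iov_len;
    }
    if (NULL == (*out = (char *) malloc (total ? total : 1))) {
        return -1;
    }
    /* second pass: copy each segment in order */
    *out_len = 0;
    for (i = 0 ; i < count ; ++i) {
        memcpy (*out + *out_len, uiov[i].iov_base, uiov[i].iov_len);
        *out_len += uiov[i].iov_len;
    }
    return 0;
}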
Example #2
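mca_oob_tcp_recv_handler() from ORTE's TCP OOB transport: the libevent read callback for a peer socket. It completes the connect handshake, then receives each message in two stages (fixed-size header, then payload) and either delivers it to the RML locally or relays it toward the next hop returned by the routed framework.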
void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
    int rc;
    orte_process_name_t hop;
    mca_oob_tcp_peer_t *relay;
    uint64_t ui64;

    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:tcp:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_TCP_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_tcp_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:tcp:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_TCP_CONNECTED;
        } else {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;
    case MCA_OOB_TCP_CONNECTED:
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:tcp:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:tcp:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_tcp_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_tcp_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:tcp:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                OPAL_TIMING_EVENT((&tm,"Header received from %s",
                                   ORTE_NAME_PRINT(&peer->name)));
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* convert the header */
                MCA_OOB_TCP_HDR_NTOH(&peer->recv_msg->hdr);
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                } else {
                    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:tcp:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:tcp:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_tcp_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message
             */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {

                OPAL_TIMING_EVENT((&tm,"Msg received from %s",
                                   ORTE_NAME_PRINT(&peer->name)));


                /* we recvd all of the message */
                opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s (ORIGIN %s) OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->name),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient (header was already converted back to host order)? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s DELIVERING TO RML",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    OBJ_RELEASE(peer->recv_msg);
                } else {
                    /* no - find the next hop in the route */
                    hop = orte_routed.get_route(&peer->recv_msg->hdr.dst);
                    if (hop.jobid == ORTE_JOBID_INVALID ||
                        hop.vpid == ORTE_VPID_INVALID) {
                        /* no hop known - post the error to the component
                         * and let the OOB see if there is another way
                         * to get there from here
                         */
                        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                            "%s NO ROUTE TO %s FROM HERE",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&peer->name));
                        /* let the component know about the problem */
                        ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_no_route);
                        /* cleanup */
                        OBJ_RELEASE(peer->recv_msg);
                        return;
                    } else {
                        /* do we know how to reach the next hop? */
                        memcpy(&ui64, (char*)&hop, sizeof(uint64_t));
                        if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_module.peers, ui64, (void**)&relay)) {
                            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                                "%s ADDRESS OF NEXT HOP %s TO %s IS UNKNOWN",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&hop),
                                                ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst));
                            /* let the component know about the problem */
                            ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_hop_unknown);
                            /* cleanup */
                            OBJ_RELEASE(peer->recv_msg);
                            return;
                        }
                        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                            "%s ROUTING TO %s FROM HERE",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&relay->name));
                        /* if this came from a different job family, then ensure
                         * we know how to return
                         */
                        if (ORTE_JOB_FAMILY(peer->recv_msg->hdr.origin.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
                            orte_routed.update_route(&(peer->recv_msg->hdr.origin), &peer->name);
                        }
                        /* post the message for retransmission */
                        MCA_OOB_TCP_QUEUE_RELAY(peer->recv_msg, relay);
                        OBJ_RELEASE(peer->recv_msg);
                    }
                }
                peer->recv_msg = NULL;
                return;
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;
    default: 
        opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)", 
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_tcp_peer_close(peer);
        break;
    }
}
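The handler above leans on read_bytes() resuming a partially-read block across recv events: peer->recv_msg->rdptr and rdbytes act as a cursor that survives between callbacks. read_bytes() itself is not shown on this page; a minimal sketch of the pattern under the same cursor convention (read_block and its return codes are assumptions, not ORTE's actual helper):

#include <errno.h>
#include <unistd.h>

/* Consume from fd into the (*rdptr, *rdbytes) cursor, advancing it so a
 * later call resumes where this one stopped. Returns 0 once the block is
 * complete, 1 if the socket would block, -1 on a hard error or EOF. */
static int read_block (int fd, char **rdptr, size_t *rdbytes)
{
    while (*rdbytes > 0) {
        ssize_t n = read (fd, *rdptr, *rdbytes);
        if (n < 0) {
            if (EAGAIN == errno || EWOULDBLOCK == errno) {
                return 1;   /* not an error - retry on the next recv event */
            }
            if (EINTR == errno) {
                continue;   /* interrupted - just retry */
            }
            return -1;      /* hard error - caller closes the peer */
        }
        if (0 == n) {
            return -1;      /* peer closed the connection */
        }
        *rdptr += (size_t) n;
        *rdbytes -= (size_t) n;
    }
    return 0;               /* block fully read */
}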
Example #3
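xcast_recv() from the grpcomm/direct component: handles an incoming xcast by first copying the still-compressed payload for relay down the routing tree, then (if needed) decompressing its own copy, peeking at daemon commands such as exit and add_local_procs to apply nidmap/wireup updates, and finally posting the remaining message to itself via the RML.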
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer, orte_rml_tag_t tg,
                       void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay=NULL, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup, datbuf, *data;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;
    char *rtmod, *nidmap;
    size_t inlen, cmplen;
    uint8_t *packed_data, *cmpdata;
    int32_t nvals, i;
    opal_value_t kv, *kval;
    orte_process_name_t dmn;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children - we leave it
     * as compressed data */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);
    OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* unpack the flag to see if this payload is compressed */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        return;
    }
    if (flag) {
        /* unpack the data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &inlen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the uncompressed data size */
        cnt=1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &cmplen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* allocate the space */
        packed_data = (uint8_t*)malloc(inlen);
        /* unpack the data blob */
        cnt = inlen;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, packed_data, &cnt, OPAL_UINT8))) {
            ORTE_ERROR_LOG(ret);
            free(packed_data);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* decompress the data */
        if (orte_util_uncompress_block(&cmpdata, cmplen,
                                       packed_data, inlen)) {
            /* the data has been uncompressed */
            opal_dss.load(&datbuf, cmpdata, cmplen);
            data = &datbuf;
        } else {
            data = buffer;
        }
        free(packed_data);
    } else {
        data = buffer;
    }

    /* unpack the signature - xcast itself doesn't need it, so release it below */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* get our conduit's routed module name */
    rtmod = orte_rml.get_routed(orte_coll_conduit);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(data, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is an exit cmd, then flag that we are quitting so we will properly
             * handle connection losses from our downstream peers */
            if (ORTE_DAEMON_EXIT_CMD == command ||
                ORTE_DAEMON_HALT_VM_CMD == command) {
                orte_orteds_term_ordered = true;
                if (ORTE_DAEMON_HALT_VM_CMD == command) {
                    /* this is an abnormal termination */
                    orte_abnormal_term_ordered = true;
                }
                /* copy the msg for relay to ourselves */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                opal_dss.copy_payload(relay, data);
            } else if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                       ORTE_DAEMON_DVM_NIDMAP_CMD == command ||
                       ORTE_DAEMON_DVM_ADD_PROCS == command) {
                /* setup our internal relay buffer */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* unpack the nidmap string - may be NULL */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &nidmap, &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (NULL != nidmap) {
                    if (ORTE_SUCCESS != (ret = orte_regx.nidmap_parse(nidmap))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    free(nidmap);
                }
                /* see if they included info on node capabilities */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 != flag) {
                    /* update our local nidmap, if required - the decode function
                     * knows what to do
                     */
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

                    if (ORTE_SUCCESS != (ret = orte_regx.decode_daemon_nodemap(data))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }

                    if (!ORTE_PROC_IS_HNP) {
                        /* update the routing plan - the HNP already did
                         * it when it computed the VM, so don't waste time
                         * re-doing it here */
                        orte_routed.update_routing_plan(rtmod);
                    }
                    /* routing is now possible */
                    orte_routed_base.routing_enabled = true;

                    /* unpack the byte object */
                    cnt=1;
                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 < bo->size) {
                        /* load it into a buffer */
                        OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                        opal_dss.load(&wireup, bo->bytes, bo->size);
                        /* decode it, pushing the info into our database */
                        if (opal_pmix.legacy_get()) {
                            OBJ_CONSTRUCT(&kv, opal_value_t);
                            kv.key = OPAL_PMIX_PROC_URI;
                            kv.type = OPAL_STRING;
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kv.data.string, &cnt, OPAL_STRING))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, &kv))) {
                                    ORTE_ERROR_LOG(ret);
                                    free(kv.data.string);
                                    break;
                                }
                                free(kv.data.string);
                                kv.data.string = NULL;
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        } else {
                            cnt=1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &nvals, &cnt, OPAL_INT32))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                for (i=0; i < nvals; i++) {
                                    cnt = 1;
                                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kval, &cnt, OPAL_VALUE))) {
                                        ORTE_ERROR_LOG(ret);
                                        break;
                                    }
                                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                                         "%s STORING MODEX DATA FOR PROC %s KEY %s",
                                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                         ORTE_NAME_PRINT(&dmn), kval->key));
                                    if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, kval))) {
                                        ORTE_ERROR_LOG(ret);
                                        OBJ_RELEASE(kval);
                                        break;
                                    }
                                    OBJ_RELEASE(kval);
                                }
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        }
                        /* done with the wireup buffer - dump it */
                        OBJ_DESTRUCT(&wireup);
                    }
                    free(bo);
                }
                /* copy the remainder of the payload - we don't pass wiring info
                 * to the odls */
                opal_dss.copy_payload(relay, data);
            } else {
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* copy the msg for relay to ourselves */
                opal_dss.copy_payload(relay, data);
            }
        } else {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    } else {
        /* copy the msg for relay to ourselves */
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(relay, data);
    }

  relay:
    if (!orte_do_not_launch) {
        /* get the list of next recipients from the routed module */
        orte_routed.get_routing_list(rtmod, &coll);

        /* if list is empty, no relay is required */
        if (opal_list_is_empty(&coll)) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay - recipient list is empty!",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto CLEANUP;
        }

        /* send the message to each recipient on list, deconstructing it as we go */
        while (NULL != (item = opal_list_remove_first(&coll))) {
            nm = (orte_namelist_t*)item;

            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                                 ORTE_NAME_PRINT(&nm->name)));
            OBJ_RETAIN(rly);
            /* check the state of the recipient - no point
             * sending to someone not alive
             */
            jdata = orte_get_job_data_object(nm->name.jobid);
            /* jdata may be NULL if the job is already gone */
            if (NULL == jdata ||
                NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if ((ORTE_PROC_STATE_RUNNING < rec->state &&
                ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
                !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay: %s ",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name),
                                ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE) ? orte_proc_state_to_str(rec->state) : "NOT ALIVE");
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                               &nm->name, rly, ORTE_RML_TAG_XCAST,
                                                               orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            OBJ_RELEASE(item);
        }
    }

 CLEANUP:
    /* cleanup */
    OPAL_LIST_DESTRUCT(&coll);
    OBJ_RELEASE(rly);  // retain accounting

    /* now pass the relay buffer to myself for processing - don't
     * inject it into the RML system via send as that will compete
     * with the relay messages down in the OOB. Instead, pass it
     * directly to the RML message processor */
    /* relay may be NULL if we bailed out before constructing it */
    if (NULL != relay && ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
        ORTE_RML_POST_MESSAGE(ORTE_PROC_MY_NAME, tag, 1,
                              relay->base_ptr, relay->bytes_used);
        relay->base_ptr = NULL;
        relay->bytes_used = 0;
    }
    if (NULL != relay) {
        OBJ_RELEASE(relay);
    }
    OBJ_DESTRUCT(&datbuf);
}
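The unpack sequence at the top of xcast_recv() implies the wire framing of a compressed xcast: an int8 flag, the compressed size ("inlen"), the uncompressed size ("cmplen"), then the raw blob, followed by the signature, tag, and message body. A sketch of the matching sender side under those assumptions (pack_xcast_payload is hypothetical; the real packing lives in the grpcomm xcast send path and may differ):

#include <stdint.h>
#include "opal/dss/dss.h"

/* Frame an already-compressed payload the way xcast_recv() above
 * expects to unpack it. */
static int pack_xcast_payload (opal_buffer_t *buf,
                               uint8_t *packed, size_t packed_len,
                               size_t orig_len)
{
    int8_t flag = 1;   /* payload is compressed */
    int rc;

    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &flag, 1, OPAL_INT8))) {
        return rc;
    }
    /* compressed size first ("inlen" on the receive side)... */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &packed_len, 1, OPAL_SIZE))) {
        return rc;
    }
    /* ...then the uncompressed size ("cmplen" on the receive side)... */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &orig_len, 1, OPAL_SIZE))) {
        return rc;
    }
    /* ...then the compressed bytes themselves */
    return opal_dss.pack(buf, packed, (int32_t) packed_len, OPAL_UINT8);
}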
Example #4
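mca_oob_usock_recv_handler() from the Unix-domain-socket OOB transport: structurally the same two-stage header/payload receive as the TCP handler in Example #2, but with no routing of its own.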
void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    int rc;
    orte_rml_send_t *snd;

    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:usock:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_USOCK_CONNECTED;
        } else {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            peer->recv_ev_active = false;
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:usock:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                    peer->recv_msg->rdptr = NULL;
                    peer->recv_msg->rdbytes = 0;
                } else {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:usock:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:usock:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_usock_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message
             */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* we recvd all of the message */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                       "%s DELIVERING TO RML",
                                       ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag,
                                          peer->recv_msg->hdr.seq_num,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    OBJ_RELEASE(peer->recv_msg);
                } else {
                    /* no - we don't route things, so we promote this
                     * back to the OOB and let another transport move
                     * it along. If we are a daemon and it is intended
                     * for another of our local procs, it will just come
                     * back to us and be handled then
                     */
                    snd = OBJ_NEW(orte_rml_send_t);
                    snd->dst = peer->recv_msg->hdr.dst;
                    snd->origin = peer->recv_msg->hdr.origin;
                    snd->tag = peer->recv_msg->hdr.tag;
                    snd->data = peer->recv_msg->data;
                    snd->seq_num = peer->recv_msg->hdr.seq_num;
                    snd->count = peer->recv_msg->hdr.nbytes;
                    snd->cbfunc.iov = NULL;
                    snd->cbdata = NULL;
                    /* activate the OOB send state */
                    ORTE_OOB_SEND(snd);
                    /* protect the data */
                    peer->recv_msg->data = NULL;
                    /* cleanup */
                    OBJ_RELEASE(peer->recv_msg);
                    return;
                }
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_usock_peer_close(peer);
        break;
    }
}
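A detail worth noting in the non-local branch above: instead of relaying via the routed framework as the TCP handler does, this handler transfers ownership of the receive buffer back to the OOB. snd->data takes the pointer, and peer->recv_msg->data is set to NULL before OBJ_RELEASE, so destroying the recv object cannot free the payload now owned by the pending OOB send.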