/*
 * Non blocking RML recv callback for the XOOB connection protocol.
 *
 * Reads the incoming connect data from `buffer` into a temporary
 * rem_info, looks up the endpoint the message is addressed to and then
 * acts according to the message type:
 *
 *   CONNECT_REQUEST          - post receives and reply with our QP info
 *   CONNECT_XRC_REQUEST      - try to attach to an existing XRC recv QP and
 *                              answer with XRC_RESPONSE, or with
 *                              XRC_NR_RESPONSE when that fails
 *   CONNECT_RESPONSE         - store the remote info, connect the send QP
 *                              and complete the connection
 *   CONNECT_XRC_RESPONSE     - store the remote info and complete the
 *                              connection
 *   CONNECT_XRC_NR_RESPONSE  - restart the connection from scratch
 *
 * process_name identifies the sending peer.  status, tag and cbdata are
 * not used by this function.
 *
 * Every exit after a successful init_rem_info() goes through the
 * `cleanup` label so that the temporary rem_info is always released.
 */
static void xoob_rml_recv_cb(int status, ompi_process_name_t* process_name,
                        opal_buffer_t* buffer, ompi_rml_tag_t tag,
                        void* cbdata)
{
    int rc;
    uint8_t message_type;
    uint16_t requested_lid = 0;
    mca_btl_openib_rem_info_t rem_info;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;

    /* Nothing to release yet if the initialization itself failed. */
    if ( OMPI_SUCCESS != init_rem_info(&rem_info)) {
        return;
    }

    /* Get data. */
    if ( OMPI_SUCCESS != xoob_receive_connect_data(&rem_info, &requested_lid, &message_type, buffer)) {
        BTL_ERROR(("Failed to read data\n"));
        mca_btl_openib_endpoint_invoke_error(NULL);
        /* NOTE(review): assumes xoob_receive_connect_data() leaves rem_info
         * owned by the caller on failure - confirm against its definition. */
        goto cleanup;
    }

    /* Processing message */
    switch (message_type) {
        case ENDPOINT_XOOB_CONNECT_REQUEST:
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_REQUEST: lid %d, sid %" PRIx64 ", rlid %d\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id,
                        requested_lid));
            ib_endpoint = xoob_find_endpoint(process_name,rem_info.rem_subnet_id,
                    requested_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_REQUEST."
                           " Failed to find endpoint with subnet %" PRIx64
                           " and LID %d",
                           rem_info.rem_subnet_id,requested_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
            /* prepost data on receiver site */
            if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
                BTL_ERROR(("Failed to post on XRC SRQs"));
                mca_btl_openib_endpoint_invoke_error(NULL);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                goto cleanup;
            }
            /* we should create qp and send the info + srq to requestor */
            rc = xoob_reply_first_connect(ib_endpoint, &rem_info);
            if (OMPI_SUCCESS != rc) {
                BTL_ERROR(("error in endpoint reply start connect"));
                mca_btl_openib_endpoint_invoke_error(NULL);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                goto cleanup;
            }
            /* enable pooling for this btl */
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            break;
        case ENDPOINT_XOOB_CONNECT_XRC_REQUEST:
            /* pasha we don't need the remote lid here ??*/
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_REQUEST: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name,rem_info.rem_subnet_id,
                    requested_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_REQUEST."
                            " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                            rem_info.rem_subnet_id,requested_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            if (OMPI_SUCCESS == xoob_recv_qp_connect(ib_endpoint, &rem_info)) {
                /* The endpoint lock is not held yet at this point. */
                if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
                    BTL_ERROR(("Failed to post on XRC SRQs"));
                    mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                    goto cleanup;
                }
                OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
                rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_RESPONSE);
                if (OMPI_SUCCESS != rc) {
                    BTL_ERROR(("error in endpoint reply start connect"));
                    mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                    OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                    goto cleanup;
                }
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            } else {
                /* The XRC recv qp was destroyed */
                OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
                rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE);
                if (OMPI_SUCCESS != rc) {
                    BTL_ERROR(("error in endpoint reply start connect"));
                    mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                    OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                    goto cleanup;
                }
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            }
            /* enable pooling for this btl */
            break;
        case ENDPOINT_XOOB_CONNECT_RESPONSE:
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_RESPONSE: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
                    rem_info.rem_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_RESPONSE."
                            " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                            rem_info.rem_subnet_id,rem_info.rem_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
            /* we got all the data srq. switch the endpoint to connect mode */
            XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
            /* update ib_addr with remote qp number */
            ib_endpoint->ib_addr->remote_xrc_rcv_qp_num =
                ib_endpoint->rem_info.rem_qps->rem_qp_num;
            BTL_VERBOSE(("rem_info: lid %d, sid %" PRIx64
                         " ep %d %" PRIx64 "\n",
                         rem_info.rem_lid,
                         rem_info.rem_subnet_id,
                         ib_endpoint->rem_info.rem_lid,
                         ib_endpoint->rem_info.rem_subnet_id));
            if (OMPI_SUCCESS != xoob_send_qp_connect(ib_endpoint, &rem_info)) {
                BTL_ERROR(("Failed to connect  endpoint\n"));
                mca_btl_openib_endpoint_invoke_error(NULL);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                goto cleanup;
            }
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;
        case ENDPOINT_XOOB_CONNECT_XRC_RESPONSE:
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_RESPONSE: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
                    rem_info.rem_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_RESPONSE."
                            " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                            rem_info.rem_subnet_id,rem_info.rem_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
            /* we got srq numbers on our request */
            XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;
        case ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE:
            /* The XRC recv site already was destroyed so we need
             * start to bringup the connection from scratch  */
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
                    rem_info.rem_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE."
                            " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                            rem_info.rem_subnet_id,rem_info.rem_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            xoob_restart_connect(ib_endpoint);
            break;
        default :
            BTL_ERROR(("Invalid message type %d", message_type));
    }

cleanup:
    /* Single release point for the temporary remote info, reached from
     * both the normal path and every error path above. */
    free_rem_info(&rem_info);
}
/*
 * Non blocking RML recv callback for the OOB connection protocol.
 *
 * Unpacks the message type, subnet id and (depending on the message
 * type) the local QP/LID the message refers to and the peer's QP
 * information from `buffer`.  It then walks the component's ib_procs
 * list for the proc matching process_name, selects the endpoint the
 * message belongs to and advances that endpoint's connection state
 * machine: reply with our QP info, connect the QPs, send an ACK, or
 * complete the connection.
 *
 * status, tag and cbdata are not used by this function.
 *
 * Memory: rem_info.rem_qps is allocated here for every message that is
 * not an ACK.  It is released on every path that leaves before the
 * endpoint state machine is entered (unpack failure, request ignored,
 * no suitable endpoint, no matching proc).
 * NOTE(review): once the state machine runs, &rem_info is passed to
 * reply_start_connect()/set_remote_info(); whether those take ownership
 * of rem_qps is not visible here, so it is deliberately not freed on
 * those paths - confirm against their definitions.
 */
static void rml_recv_cb(int status, orte_process_name_t* process_name, 
                        opal_buffer_t* buffer, orte_rml_tag_t tag, 
                        void* cbdata)
{
    mca_btl_openib_proc_t *ib_proc;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;
    int endpoint_state;
    int rc;
    uint32_t i, lcl_qp = 0;
    uint16_t lcl_lid = 0;
    int32_t cnt = 1;
    /* Zero-initialized: ACK messages carry no QP/LID data, yet the fields
       are read below (verbose output) and the struct may be passed on. */
    mca_btl_openib_rem_info_t rem_info = {0};
    uint8_t message_type;
    bool master;
    /* Set once the endpoint state machine is entered for this message. */
    bool dispatched = false;
    
    /* start by unpacking data first so we know who is knocking at 
       our door */ 
    BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8));
    rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto unpack_failed;
    }
    
    BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT64));
    rc = opal_dss.unpack(buffer, &rem_info.rem_subnet_id, &cnt, OPAL_UINT64);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto unpack_failed;
    }
    
    /* Replies additionally carry the local QP number and LID they are
       addressed to; these are used below to pick the endpoint. */
    if (ENDPOINT_CONNECT_REQUEST != message_type) {
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
        rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto unpack_failed;
        }
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
        rc = opal_dss.unpack(buffer, &lcl_lid, &cnt, OPAL_UINT16);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto unpack_failed;
        }
    }
    if (ENDPOINT_CONNECT_ACK != message_type) {
        int qp; 
        /* get ready for the data */
        rem_info.rem_qps = 
            (mca_btl_openib_rem_qp_info_t*) malloc(sizeof(mca_btl_openib_rem_qp_info_t) * 
                                                   mca_btl_openib_component.num_qps);
        if (NULL == rem_info.rem_qps) {
            BTL_ERROR(("Failed to allocate memory for remote QP data"));
            mca_btl_openib_endpoint_invoke_error(NULL);
            return;
        }
        
        /* unpack all the qp info */
        for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { 
            BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
            rc = opal_dss.unpack(buffer, &rem_info.rem_qps[qp].rem_qp_num, &cnt,
                                 OPAL_UINT32);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                goto unpack_failed;
            }
            BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
            rc = opal_dss.unpack(buffer, &rem_info.rem_qps[qp].rem_psn, &cnt,
                                 OPAL_UINT32);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                goto unpack_failed;
            }
        }
        
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
        rc = opal_dss.unpack(buffer, &rem_info.rem_lid, &cnt, OPAL_UINT16);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto unpack_failed;
        }
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
        rc = opal_dss.unpack(buffer, &rem_info.rem_mtu, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto unpack_failed;
        }
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
        rc = opal_dss.unpack(buffer, &rem_info.rem_index, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto unpack_failed;
        }
    }
    
    /* rem_subnet_id is 64 bit wide, so it needs the PRIx64 conversion. */
    BTL_VERBOSE(("Received QP Info,  LID = %d, SUBNET = %016" PRIx64 "\n",
                 rem_info.rem_lid, 
                 rem_info.rem_subnet_id));
    
    /* The side whose process name compares greater acts as master. */
    master = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME,
                                    process_name) > 0 ? true : false;
    
    /* Need to protect the ib_procs list */
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);

    for (ib_proc = (mca_btl_openib_proc_t*)
            opal_list_get_first(&mca_btl_openib_component.ib_procs);
        ib_proc != (mca_btl_openib_proc_t*)
            opal_list_get_end(&mca_btl_openib_component.ib_procs);
        ib_proc  = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
        bool found = false;
        
        if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                   &ib_proc->proc_guid, process_name) != OPAL_EQUAL) {
            continue;
        }
        
        if (ENDPOINT_CONNECT_REQUEST != message_type) {
            /* This is a reply message. Try to get the endpoint
               instance the reply belongs to */
            for (i = 0; i < ib_proc->proc_endpoint_count; i++) { 
                ib_endpoint = ib_proc->proc_endpoints[i];
                if (ib_endpoint->qps[0].qp->lcl_qp != NULL &&
                    lcl_lid == ib_endpoint->endpoint_btl->lid &&
                    lcl_qp == ib_endpoint->qps[0].qp->lcl_qp->qp_num &&
                    rem_info.rem_subnet_id == ib_endpoint->subnet_id) {
                    found = true;
                    break;
                }
            }
        } else {
            /* This is new connection request. If this is master try
               to find endpoint in a connecting state. If this is
               slave try to find  endpoint in closed state and
               initiate connection back */
            mca_btl_openib_endpoint_t *ib_endpoint_found = NULL;
            for (i = 0; i < ib_proc->proc_endpoint_count; i++) { 
                ib_endpoint = ib_proc->proc_endpoints[i];
                if (ib_endpoint->subnet_id != rem_info.rem_subnet_id ||
                   (ib_endpoint->endpoint_state != MCA_BTL_IB_CONNECTING
                    && ib_endpoint->endpoint_state != MCA_BTL_IB_CLOSED))
                    continue;
                found = true;
                ib_endpoint_found = ib_endpoint;
                if ((master &&
                     MCA_BTL_IB_CONNECTING == ib_endpoint->endpoint_state) ||
                    (!master &&
                     MCA_BTL_IB_CLOSED == ib_endpoint->endpoint_state))
                    break; /* Found one. No point to continue */
            }
            ib_endpoint = ib_endpoint_found;
            
            /* if this is slave and there is no endpoints in closed
               state then all connection are already in progress so
               just ignore this connection request */
            if (found && !master &&
                MCA_BTL_IB_CLOSED != ib_endpoint->endpoint_state) {
                OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
                /* The request is dropped: nobody else holds the QP data. */
                free(rem_info.rem_qps);
                return;
            }
        }
        
        if (!found) {
            BTL_ERROR(("can't find suitable endpoint for this peer\n")); 
            mca_btl_openib_endpoint_invoke_error(NULL);
            OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
            /* No endpoint received the QP data, release it here. */
            free(rem_info.rem_qps);
            return; 
        }
        
        dispatched = true;
        OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
        endpoint_state = ib_endpoint->endpoint_state;
        
        /* Update status */
        switch (endpoint_state) {
        case MCA_BTL_IB_CLOSED :
            /* We had this connection closed before.  The endpoint is
               trying to connect. Move the status of this connection
               to CONNECTING, and then reply with our QP
               information */
            if (master) {
                rc = reply_start_connect(ib_endpoint, &rem_info);
            } else {
                rc = oob_module_start_connect(ib_endpoint->endpoint_local_cpc, 
                                              ib_endpoint);
            }
            
            if (OMPI_SUCCESS != rc) {
                BTL_ERROR(("error in endpoint reply start connect"));
                mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                break;
            }
            
            /* As long as we expect a message from the peer (in order
               to setup the connection) let the event engine pool the
               RML events. Note: we increment it once peer active
               connection. */
            opal_progress_event_users_increment();
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            break;
             
        case MCA_BTL_IB_CONNECTING :
            set_remote_info(ib_endpoint, &rem_info);
            if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) {
                BTL_ERROR(("endpoint connect error: %d", rc)); 
                mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                break;
            }
           
            if (master) {
                ib_endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK;

                /* Send him an ACK */
                send_connect_data(ib_endpoint, ENDPOINT_CONNECT_RESPONSE);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            } else {
                send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
                /* Tell main BTL that we're done */
                mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
                /* cpc complete unlock the endpoint */
             }
            break;
            
        case MCA_BTL_IB_WAITING_ACK:
            /* Tell main BTL that we're done */
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;
            
        case MCA_BTL_IB_CONNECT_ACK:
            send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
            /* Tell main BTL that we're done */
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;
            
        case MCA_BTL_IB_CONNECTED:
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            break;

        default :
            BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
            mca_btl_openib_endpoint_invoke_error(ib_endpoint);
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
        }
        /* Only the first proc matching process_name is processed. */
        break;
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);

    /* No proc matched the sender: the QP data was never handed to an
       endpoint, so it has to be released here. */
    if (!dispatched) {
        free(rem_info.rem_qps);
    }
    return;

unpack_failed:
    /* Reached only before any lock is taken.  rem_qps is either NULL
       (nothing allocated yet) or still owned by this function. */
    mca_btl_openib_endpoint_invoke_error(NULL);
    free(rem_info.rem_qps);
}