Пример #1
0
int clear_bit(opal_bitmap_t *bm, int bit)
{
    int err = opal_bitmap_clear_bit(bm, bit);
    if ((err != 0) || opal_bitmap_is_set_bit(bm, bit)) {
	fprintf(error_out, "ERROR: clear_bit for bit = %d \n\n", bit);
	return ERR_CODE;
    }

    return 0;
}
Пример #2
0
int clear_bit(opal_bitmap_t *bm, int bit)
{
    int err = opal_bitmap_clear_bit(bm, bit);
    if ((err != 0)
	|| (bm->bitmap[bit/SIZE_OF_CHAR] & (1 << bit % SIZE_OF_CHAR))) {
	fprintf(error_out, "ERROR: clear_bit for bit = %d \n\n", bit);
	return ERR_CODE;
    }

    return 0;
}
Пример #3
0
void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
{
    mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata;
    uint64_t ui64;
    orte_oob_base_peer_t *bpr;
    int rc;

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s tcp:lost connection called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&pop->peer));

    /* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */
    if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
        goto cleanup;
    }

    /* Mark that we no longer support this peer */
    memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t));
    if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
                                                         ui64, (void**)&bpr) || NULL == bpr) {
        bpr = OBJ_NEW(orte_oob_base_peer_t);
    }
    opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx);
   if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers,
                                                               ui64, NULL))) {
        ORTE_ERROR_LOG(rc);
    }

 cleanup:
    /* activate the proc state */
    if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
        ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
    } else {
        ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
    }

    OBJ_RELEASE(pop);
}
Пример #4
0
void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
{
    mca_oob_tcp_msg_error_t *mop = (mca_oob_tcp_msg_error_t*)cbdata;
    uint64_t ui64;
    int rc;
    orte_oob_base_peer_t *bpr;

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s tcp:no route called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&mop->hop));

    /* mark that we cannot reach this hop */
    memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
    if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
                                                         ui64, (void**)&bpr) || NULL == bpr) {
        bpr = OBJ_NEW(orte_oob_base_peer_t);
    }
    opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx);
    if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers,
                                                               ui64, NULL))) {
        ORTE_ERROR_LOG(rc);
    }

    /* report the error back to the OOB and let it try other components
     * or declare a problem
     */
    if (!orte_finalizing && !orte_abnormal_term_ordered) {
        /* if this was a lifeline, then alert */
        if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
            ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
        } else {
            ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
        }
    }

    OBJ_RELEASE(mop);
}
Пример #5
0
void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
{
    mca_oob_tcp_msg_error_t *mop = (mca_oob_tcp_msg_error_t*)cbdata;
    uint64_t ui64;
    orte_rml_send_t *snd;
    orte_oob_base_peer_t *bpr;

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s tcp:unknown hop called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&mop->hop));

    if (orte_finalizing || orte_abnormal_term_ordered) {
        /* just ignore the problem */
        OBJ_RELEASE(mop);
        return;
    }

   /* mark that this component cannot reach this hop */
    memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
    if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
                                                         ui64, (void**)&bpr) ||
        NULL == bpr) {
        /* the overall OOB has no knowledge of this hop. Only
         * way this could happen is if the peer contacted us
         * via this component, and it wasn't entered into the
         * OOB framework hash table. We have no way of knowing
         * what to do next, so just output an error message and
         * abort */
        opal_output(0, "%s ERROR: message to %s requires routing and the OOB has no knowledge of the reqd hop %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&mop->snd->hdr.dst),
                    ORTE_NAME_PRINT(&mop->hop));
        ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
        OBJ_RELEASE(mop);
        return;
    }
    opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx);

    /* mark that this component cannot reach this destination either */
    memcpy(&ui64, (char*)&(mop->snd->hdr.dst), sizeof(uint64_t));
    if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
                                                         ui64, (void**)&bpr) ||
        NULL == bpr) {
        opal_output(0, "%s ERROR: message to %s requires routing and the OOB has no knowledge of this process",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&mop->snd->hdr.dst));
        ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
        OBJ_RELEASE(mop);
        return;
    }
    opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx);

    /* post the message to the OOB so it can see
     * if another component can transfer it
     */
    MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
    snd = OBJ_NEW(orte_rml_send_t);
    snd->dst = mop->snd->hdr.dst;
    snd->origin = mop->snd->hdr.origin;
    snd->tag = mop->snd->hdr.tag;
    snd->data = mop->snd->data;
    snd->count = mop->snd->hdr.nbytes;
    snd->cbfunc.iov = NULL;
    snd->cbdata = NULL;
    /* activate the OOB send state */
    ORTE_OOB_SEND(snd);
    /* protect the data */
    mop->snd->data = NULL;

    OBJ_RELEASE(mop);
}