예제 #1
0
/*
 * Callback run when the flow-control fan-out step completes.
 *
 * Clears the flowctl_active flag, re-enables the receive portal table
 * entry, sleeps briefly if this recovery follows the previous one too
 * closely, records the current time as the new reference point, and then
 * drives the pending list.
 *
 * ev               - triggering event (not used by this function)
 * ptl_base_request - request tied to the event (not used by this function)
 *
 * Returns OMPI_SUCCESS, or the PtlPTEnable() return code when re-enabling
 * the portal table entry fails.
 */
static int
flowctl_fanout_callback(ptl_event_t *ev,
                        ompi_mtl_portals4_base_request_t *ptl_base_request)
{
    int ret;
    struct timeval tv;
    long long elapsed_usec;

    ompi_mtl_portals4.flowctl.flowctl_active = false;
    ret = PtlPTEnable(ompi_mtl_portals4.ni_h, ompi_mtl_portals4.recv_idx);
    if (OPAL_UNLIKELY(PTL_OK != ret)) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: PtlPTEnable failed: %d\n",
                            __FILE__, __LINE__, ret);
        return ret;
    }

    gettimeofday(&tv, NULL);

    /* Take the difference of the two timestamps before scaling to
     * microseconds, and do the scaling in long long.  Multiplying an
     * absolute tv_sec by 1000000 overflows where time_t is 32 bits wide. */
    elapsed_usec =
        (long long) (tv.tv_sec - ompi_mtl_portals4.flowctl.tv.tv_sec) * 1000000LL
        + (long long) (tv.tv_usec - ompi_mtl_portals4.flowctl.tv.tv_usec);

    if (elapsed_usec < 1000000LL * ompi_mtl_portals4.flowctl.backoff_count) {
        /* Less than backoff_count seconds since the previous recovery:
         * bump the counter and sleep that many microseconds. */
        usleep(++ompi_mtl_portals4.flowctl.backoff_count);
    } else {
        ompi_mtl_portals4.flowctl.backoff_count = 0;
    }
    ompi_mtl_portals4.flowctl.tv = tv;

    ompi_mtl_portals4_pending_list_progress();

    OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                         "Exiting flowctl_fanout_callback %ld",
                         ompi_mtl_portals4.flowctl.epoch_counter));

    return OMPI_SUCCESS;
}
예제 #2
0
파일: rptl.c 프로젝트: zhanglt/mpich
/*
 * Drive forward progress for every local RPTL and every known target.
 *
 * Pass 1 walks rptl_info.rptl_list.  An RPTL that is awaiting pause ACKs,
 * whose pause_ack_counter has reached world_size - 1 and whose
 * data.ob_curr_count does not exceed data.ob_max_count, gets its data PT
 * re-enabled, is marked ACTIVE, and has an UNPAUSE control message sent to
 * every other rank.
 *
 * Pass 2 walks rptl_info.target_list.  A target in RECEIVED_PAUSE state is
 * skipped while any of its data ops is still ISSUED; once none is, a
 * PAUSE_ACK is sent and the target becomes PAUSE_ACKED.  For all other
 * targets the queued control PUTs are issued first and then, unless the
 * target is DISABLED or PAUSE_ACKED, the queued data PUTs/GETs, stopping
 * as soon as origin events or the per-target threshold run out.
 *
 * Returns the value held in ret at exit: PTL_OK unless a call stored
 * something else.
 * NOTE(review): presumably RPTLU_ERR_POP jumps to fn_fail when ret is not
 * PTL_OK -- confirm against the macro definition.
 */
static int poke_progress(void)
{
    int ret = PTL_OK;
    struct rptl_target *target;
    struct rptl_op *op;
    struct rptl *rptl;
    int i;
    int mpi_errno = MPI_SUCCESS;
    ptl_process_t id;
    ptl_pt_index_t data_pt, control_pt;
    MPIDI_STATE_DECL(MPID_STATE_POKE_PROGRESS);

    MPIDI_FUNC_ENTER(MPID_STATE_POKE_PROGRESS);

    /* make progress on local RPTLs */
    for (rptl = rptl_info.rptl_list; rptl; rptl = rptl->next) {
        /* if the local state is active, there's nothing to do */
        if (rptl->local_state == RPTL_LOCAL_STATE_ACTIVE)
            continue;

        /* if we are in a local AWAITING PAUSE ACKS state, see if we
         * can send out the unpause message */
        if (rptl->local_state == RPTL_LOCAL_STATE_AWAITING_PAUSE_ACKS &&
            rptl->pause_ack_counter == rptl_info.world_size - 1) {
            /* if we are over the max count limit, do not send an
             * unpause message yet */
            if (rptl->data.ob_curr_count > rptl->data.ob_max_count)
                continue;

            ret = PtlPTEnable(rptl->ni, rptl->data.pt);
            RPTLU_ERR_POP(ret, "Error returned while reenabling PT\n");

            /* the state flips to ACTIVE before the unpause messages go
             * out, so a failure in the loop below leaves it ACTIVE */
            rptl->local_state = RPTL_LOCAL_STATE_ACTIVE;

            for (i = 0; i < rptl_info.world_size; i++) {
                /* no unpause message to ourselves */
                if (i == MPIDI_Process.my_pg_rank)
                    continue;
                mpi_errno = rptl_info.get_target_info(i, &id, rptl->data.pt, &data_pt, &control_pt);
                if (mpi_errno) {
                    /* translate the MPI error into a Portals failure code */
                    ret = PTL_FAIL;
                    RPTLU_ERR_POP(ret, "Error getting target info\n");
                }

                /* make sure the user setup a control portal */
                assert(control_pt != PTL_PT_ANY);

                ret = rptl_put(rptl->md, 0, 0, PTL_NO_ACK_REQ, id, control_pt,
                               0, 0, NULL, RPTL_CONTROL_MSG_UNPAUSE, RPTL_PT_CONTROL);
                RPTLU_ERR_POP(ret, "Error sending unpause message\n");
            }
        }
    }

    /* make progress on targets */
    for (target = rptl_info.target_list; target; target = target->next) {
        if (target->state == RPTL_TARGET_STATE_RECEIVED_PAUSE) {
            /* look for a data op that is still ISSUED; while one exists
             * the pause ack is deferred to a later call */
            for (op = target->data_op_list; op; op = op->next)
                if (op->state == RPTL_OP_STATE_ISSUED)
                    break;
            if (op)
                continue;

            /* send a pause ack message */
            assert(target->rptl);
            /* NOTE(review): if no rank's id matches target->id, id and
             * control_pt keep the values from the last iteration (or stay
             * unset when the loop body never runs) -- confirm this cannot
             * happen. */
            for (i = 0; i < rptl_info.world_size; i++) {
                if (i == MPIDI_Process.my_pg_rank)
                    continue;
                /* find the target that has this target id and get the
                 * control portal information for it */
                mpi_errno = rptl_info.get_target_info(i, &id, target->rptl->data.pt, &data_pt, &control_pt);
                if (mpi_errno) {
                    ret = PTL_FAIL;
                    RPTLU_ERR_POP(ret, "Error getting target info\n");
                }
                if (IDS_ARE_EQUAL(id, target->id))
                    break;
            }

            /* make sure the user setup a control portal */
            assert(control_pt != PTL_PT_ANY);

            /* the state is updated before the put is attempted */
            target->state = RPTL_TARGET_STATE_PAUSE_ACKED;

            ret = rptl_put(target->rptl->md, 0, 0, PTL_NO_ACK_REQ, id, control_pt, 0,
                           0, NULL, RPTL_CONTROL_MSG_PAUSE_ACK, RPTL_PT_CONTROL);
            RPTLU_ERR_POP(ret, "Error sending pause ack message\n");

            continue;
        }

        /* issue out all the control messages first */
        for (op = target->control_op_list; op; op = op->next) {
            assert(op->op_type == RPTL_OP_PUT);

            /* skip all the issued ops */
            if (op->state == RPTL_OP_STATE_ISSUED)
                continue;

            /* we should not get any NACKs on the control portal */
            assert(op->state != RPTL_OP_STATE_NACKED);

            /* NOTE(review): control ops are throttled by, and counted in,
             * issued_data_ops -- confirm that sharing the data-op counter
             * is intended. */
            if (rptl_info.origin_events_left < 2 || target->issued_data_ops > PER_TARGET_THRESHOLD) {
                /* too few origin events left.  we can't issue this op
                 * or any following op to this target in order to
                 * maintain ordering */
                break;
            }

            /* a PUT reserves two origin events here; a GET (below)
             * reserves one */
            rptl_info.origin_events_left -= 2;
            target->issued_data_ops++;

            /* force request for an ACK even if the user didn't ask
             * for it.  replace the user pointer with the OP id. */
            ret = PtlPut(op->u.put.md_handle, op->u.put.local_offset, op->u.put.length,
                         PTL_ACK_REQ, op->u.put.target_id, op->u.put.pt_index,
                         op->u.put.match_bits, op->u.put.remote_offset, op,
                         op->u.put.hdr_data);
            RPTLU_ERR_POP(ret, "Error issuing PUT\n");

            op->state = RPTL_OP_STATE_ISSUED;
        }

        /* data ops are held back for targets in these two states; only
         * the control ops above were eligible */
        if (target->state == RPTL_TARGET_STATE_DISABLED || target->state == RPTL_TARGET_STATE_PAUSE_ACKED)
            continue;

        /* then issue out all the data messages */
        for (op = target->data_op_list; op; op = op->next) {
            if (op->op_type == RPTL_OP_PUT) {
                /* skip all the issued ops */
                if (op->state == RPTL_OP_STATE_ISSUED)
                    continue;

                /* if an op has been nacked, don't issue anything else
                 * to this target */
                if (op->state == RPTL_OP_STATE_NACKED)
                    break;

                if (rptl_info.origin_events_left < 2 || target->issued_data_ops > PER_TARGET_THRESHOLD) {
                    /* too few origin events left.  we can't issue
                     * this op or any following op to this target in
                     * order to maintain ordering */
                    break;
                }

                rptl_info.origin_events_left -= 2;
                target->issued_data_ops++;

                /* force request for an ACK even if the user didn't
                 * ask for it.  replace the user pointer with the OP
                 * id. */
                ret = PtlPut(op->u.put.md_handle, op->u.put.local_offset, op->u.put.length,
                             PTL_ACK_REQ, op->u.put.target_id, op->u.put.pt_index,
                             op->u.put.match_bits, op->u.put.remote_offset, op,
                             op->u.put.hdr_data);
                RPTLU_ERR_POP(ret, "Error issuing PUT\n");
            }
            else if (op->op_type == RPTL_OP_GET) {
                /* skip all the issued ops */
                if (op->state == RPTL_OP_STATE_ISSUED)
                    continue;

                /* if an op has been nacked, don't issue anything else
                 * to this target */
                if (op->state == RPTL_OP_STATE_NACKED)
                    break;

                if (rptl_info.origin_events_left < 1 || target->issued_data_ops > PER_TARGET_THRESHOLD) {
                    /* too few origin events left.  we can't issue
                     * this op or any following op to this target in
                     * order to maintain ordering */
                    break;
                }

                rptl_info.origin_events_left--;
                target->issued_data_ops++;

                ret = PtlGet(op->u.get.md_handle, op->u.get.local_offset, op->u.get.length,
                             op->u.get.target_id, op->u.get.pt_index, op->u.get.match_bits,
                             op->u.get.remote_offset, op);
                RPTLU_ERR_POP(ret, "Error issuing GET\n");
            }

            /* reached for PUTs and GETs that were just issued, and also
             * for an op whose type is neither PUT nor GET */
            op->state = RPTL_OP_STATE_ISSUED;
        }
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_POKE_PROGRESS);
    return ret;

  fn_fail:
    /* no extra cleanup on failure; leave through the common exit path
     * with ret as it was set */
    goto fn_exit;
}
예제 #3
0
/*
 * Flow-control test driver.
 *
 * Rank 0 allocates an event queue sized (num_procs - 1) * ITERS / 2, a
 * PTL_PT_FLOWCTRL portal table entry at index 5 with one list entry, and a
 * second entry at index 6 whose list entry counts arrivals on a counting
 * event.  Every other rank binds a zero-length MD and:
 *   1. issues ITERS zero-length puts with PTL_ACK_REQ to index 5 on rank 0;
 *   2. collects ITERS results, each either an ACK or a PTL_NI_PT_DISABLED
 *      failure, and counts the failures;
 *   3. sends one put to index 6 as a signal;
 *   4. keeps sending puts to index 5 until it has received as many ACKs as
 *      it counted failures in step 2.
 * Rank 0 waits for the counter to reach num_procs - 1, then drains its
 * event queue, re-enabling the portal table entry each time it sees
 * PTL_EVENT_PT_DISABLED.
 *
 * Returns 0 on success, 77 when fewer than two processes are available,
 * and 1 on an unexpected result (including rank 0 never seeing a
 * PT_DISABLED event).
 */
int main(int   argc,
         char *argv[])
{
    ptl_handle_ni_t ni_handle;
    ptl_process_t   *procs;
    int             rank;
    ptl_pt_index_t  pt_index, signal_pt_index;
    HANDLE_T        value_e_handle, signal_e_handle;
    int             num_procs;
    ptl_handle_eq_t eq_handle;
    ptl_handle_ct_t ct_handle;
    ptl_handle_md_t md_handle;

    CHECK_RETURNVAL(PtlInit());

    CHECK_RETURNVAL(libtest_init());

    rank = libtest_get_rank();
    num_procs = libtest_get_size();
    if (num_procs < 2) {
        fprintf(stderr, "test_flowctl_noeq requires at least two processes\n");
        return 77;
    }

    CHECK_RETURNVAL(PtlNIInit(PTL_IFACE_DEFAULT, NI_TYPE | PTL_NI_LOGICAL,
                              PTL_PID_ANY, NULL, NULL, &ni_handle));
    procs = libtest_get_mapping(ni_handle);
    CHECK_RETURNVAL(PtlSetMap(ni_handle, num_procs, procs));


    if (0 == rank) {
        ENTRY_T         value_e;

        /* create data ME */
        CHECK_RETURNVAL(PtlEQAlloc(ni_handle, (num_procs - 1) * ITERS / 2, &eq_handle));
        CHECK_RETURNVAL(PtlPTAlloc(ni_handle, PTL_PT_FLOWCTRL, eq_handle, 5,
                                   &pt_index));
        value_e.start = NULL;
        value_e.length = 0;
        value_e.ct_handle = PTL_CT_NONE;
        value_e.uid = PTL_UID_ANY;
        value_e.options = OPTIONS;
#if INTERFACE == 1
        value_e.match_id.rank = PTL_RANK_ANY;
        value_e.match_bits = 0;
        value_e.ignore_bits = 0;
#endif
        CHECK_RETURNVAL(APPEND(ni_handle, 5, &value_e, PTL_PRIORITY_LIST, NULL, &value_e_handle));

        /* create signal ME */
        CHECK_RETURNVAL(PtlCTAlloc(ni_handle, &ct_handle));
        CHECK_RETURNVAL(PtlPTAlloc(ni_handle, 0, PTL_EQ_NONE, 6,
                                   &signal_pt_index));
        value_e.start = NULL;
        value_e.length = 0;
        value_e.ct_handle = ct_handle;
        value_e.uid = PTL_UID_ANY;
        value_e.options = OPTIONS | PTL_LE_EVENT_SUCCESS_DISABLE | PTL_LE_EVENT_CT_COMM;
#if INTERFACE == 1
        value_e.match_id.rank = PTL_RANK_ANY;
        value_e.match_bits = 0;
        value_e.ignore_bits = 0;
#endif
        CHECK_RETURNVAL(APPEND(ni_handle, 6, &value_e, PTL_PRIORITY_LIST, NULL, &signal_e_handle));
    } else {
        ptl_md_t        md;

        /* 16 extra just in case... */
        CHECK_RETURNVAL(PtlEQAlloc(ni_handle, ITERS * 2 + 16, &eq_handle));

        md.start = NULL;
        md.length = 0;
        md.options = 0;
        md.eq_handle = eq_handle;
        md.ct_handle = PTL_CT_NONE;

        CHECK_RETURNVAL(PtlMDBind(ni_handle, &md, &md_handle));
    }

    libtest_barrier();

    if (0 == rank) {
        ptl_ct_event_t  ct;
        ptl_event_t ev;
        int ret, count = 0, saw_dropped = 0, saw_flowctl = 0;

        /* wait for signal counts */
        CHECK_RETURNVAL(PtlCTWait(ct_handle, num_procs - 1, &ct));
        if (ct.success != num_procs - 1 || ct.failure != 0) {
            return 1;
        }

        /* wait for event entries */
        while (count < ITERS * (num_procs - 1)) {
            ret = PtlEQWait(eq_handle, &ev);
            if (PTL_OK == ret) {
                ;
            } else if (PTL_EQ_DROPPED == ret) {
                /* events were lost, so the expected count can no longer
                 * be reached; handle this event and stop draining */
                saw_dropped++;
                if (ev.type == PTL_EVENT_PT_DISABLED){
                    saw_flowctl++;
                    CHECK_RETURNVAL(PtlPTEnable(ni_handle, pt_index));
                }
                break;
            } else {
                fprintf(stderr, "0: Unexpected return code from EQWait: %d\n", ret);
                return 1;
            }

            if (ev.type == PTL_EVENT_PT_DISABLED) {
                CHECK_RETURNVAL(PtlPTEnable(ni_handle, pt_index));
                saw_flowctl++;
            } else {
                count++;
            }
        }

        fprintf(stderr, "0: Saw %d dropped, %d flowctl\n", saw_dropped, saw_flowctl);
        if (saw_flowctl == 0) {
            return 1;
        }
    } else {
        ptl_process_t target;
        ptl_event_t ev;
        int ret, count = 0, fails = 0;
        int i;
        int *fail_seen;

        /* one zero-initialised slot per iteration, used to detect a
         * PT_DISABLED failure reported twice for the same put */
        fail_seen = calloc(ITERS, sizeof *fail_seen);
        if (NULL == fail_seen) {
             fprintf(stderr, "%d: malloc failed\n", rank);
             return 1;
        }

        target.rank = 0;
        for (i = 0 ; i < ITERS ; ++i) {
            /* the iteration number rides along as the user pointer so a
             * later failure event can be matched to its put */
            CHECK_RETURNVAL(PtlPut(md_handle,
                                   0,
                                   0,
                                   PTL_ACK_REQ,
                                   target,
                                   5,
                                   0,
                                   0,
                                   (void*)(size_t)i,
                                   0));
            usleep(100);
        }

        while (count < ITERS) {
            ret = PtlEQGet(eq_handle, &ev);
            if (PTL_EQ_EMPTY == ret) {
                continue;
            } else if (PTL_OK != ret) {
                fprintf(stderr, "%d: PtlEQGet returned %d\n", rank, ret);
                free(fail_seen);
                return 1;
            }

            if (ev.ni_fail_type == PTL_NI_OK) {
                if (ev.type == PTL_EVENT_SEND) {
                    continue;
                } else if (ev.type == PTL_EVENT_ACK) {
                    count++;
                } else {
                    fprintf(stderr, "%d: Unexpected event type %d\n", rank, ev.type);
                }
            } else if (ev.ni_fail_type == PTL_NI_PT_DISABLED) {
                int iter = (size_t) ev.user_ptr;
                if (fail_seen[iter]++ > 0) {
                    fprintf(stderr, "%d: Double report of PT_DISABLED for "
                            "iteration %d\n", rank, iter);
                    free(fail_seen);
                    return 1;
                }
                count++;
                fails++;
            } else {
                fprintf(stderr, "%d: Unexpected fail type: %d\n", rank, ev.ni_fail_type);
                free(fail_seen);
                return 1;
            }
        }

        /* the per-iteration bookkeeping is not needed past this point */
        free(fail_seen);
        fail_seen = NULL;

        fprintf(stderr, "%d: Saw %d of %d events as fails\n", rank, fails, count);

        CHECK_RETURNVAL(PtlPut(md_handle,
                               0,
                               0,
                               PTL_NO_ACK_REQ,
                               target,
                               6,
                               0,
                               0,
                               NULL,
                               0));
        /* wait for the send event on the last put */
        CHECK_RETURNVAL(PtlEQWait(eq_handle, &ev));

        /* retry until as many puts have been ACKed as failed earlier; a
         * put that fails again with PT_DISABLED is simply retried */
        while (fails > 0) {
            CHECK_RETURNVAL(PtlPut(md_handle,
                                   0,
                                   0,
                                   PTL_ACK_REQ,
                                   target,
                                   5,
                                   0,
                                   0,
                                   NULL,
                                   0));
            while (1) {
                ret = PtlEQWait(eq_handle, &ev);
                if (PTL_OK != ret) {
                    fprintf(stderr, "%d: PtlEQWait returned %d\n", rank, ret);
                    return 1;
                }

                if (ev.ni_fail_type == PTL_NI_OK) {
                    if (ev.type == PTL_EVENT_SEND) {
                        continue;
                    } else if (ev.type == PTL_EVENT_ACK) {
                        fails--;
                        break;
                    } else {
                        fprintf(stderr, "%d: Unexpected event type %d\n", rank, ev.type);
                    }
                } else if (ev.ni_fail_type == PTL_NI_PT_DISABLED) {
                    break;
                } else {
                    fprintf(stderr, "%d: Unexpected fail type: %d\n", rank, ev.ni_fail_type);
                    return 1;
                }
            }
        }
    }

    libtest_barrier();

    if (0 == rank) {
        CHECK_RETURNVAL(UNLINK(signal_e_handle));
        CHECK_RETURNVAL(PtlPTFree(ni_handle, signal_pt_index));
        CHECK_RETURNVAL(PtlCTFree(ct_handle));
        CHECK_RETURNVAL(UNLINK(value_e_handle));
        CHECK_RETURNVAL(PtlPTFree(ni_handle, pt_index));
        CHECK_RETURNVAL(PtlEQFree(eq_handle));
    } else {
        CHECK_RETURNVAL(PtlMDRelease(md_handle));
        CHECK_RETURNVAL(PtlEQFree(eq_handle));
    }

    CHECK_RETURNVAL(PtlNIFini(ni_handle));
    CHECK_RETURNVAL(libtest_fini());
    PtlFini();

    return 0;
}