Example No. 1
/* returns TRUE iff the request was sent on the vc */
static inline int req_uses_vc(const MPID_Request* req, const MPIDI_VC_t *vc)
{
    MPIDI_VC_t *vc1;
    
    MPIDI_Comm_get_vc(req->comm, req->dev.match.parts.rank, &vc1);
    return vc == vc1;
}
Example No. 2
/* returns TRUE iff the vc is part of the comm */
static inline int is_vc_in_comm(const MPIDI_VC_t *vc, const MPID_Comm *comm)
{
    int i;

    for (i = 0; i < comm->remote_size; ++i) {
        MPIDI_VC_t *vc1;
        MPIDI_Comm_get_vc(comm, i, &vc1);
        if (vc == vc1)
            return TRUE;
    }
    return FALSE;
}
Example No. 3
static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPI_Aint last;
    void *buf;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);

    dequeue_req(e);

    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_unpack_complete(e);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    if (e->type == PTL_EVENT_PUT_OVERFLOW)
        buf = e->start;
    else
        buf = REQ_PTL(rreq)->chunk_buffer[0];

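    /* NPTL_LARGE is set, so only the first PTL_LARGE_THRESHOLD bytes arrived
     * with this event: unpack them, then allocate a temporary buffer and fetch
     * the remaining data from the sender with a get. */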
    MPIU_Assert(e->mlength == PTL_LARGE_THRESHOLD);
    last = PTL_LARGE_THRESHOLD;
    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf);
    MPIU_Assert(last == PTL_LARGE_THRESHOLD);
    rreq->dev.segment_first += PTL_LARGE_THRESHOLD;
    MPIU_Free(REQ_PTL(rreq)->chunk_buffer[0]);

    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}
Example No. 4
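/* Sets *flag to TRUE iff the given group shares at least one process with comm. */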
static int nonempty_intersection(MPID_Comm *comm, MPID_Group *group, int *flag)
{
    int mpi_errno = MPI_SUCCESS;
    int i_g, i_c;
    MPIDI_VC_t *vc_g, *vc_c;
    MPIDI_STATE_DECL(MPID_STATE_NONEMPTY_INTERSECTION);

    MPIDI_FUNC_ENTER(MPID_STATE_NONEMPTY_INTERSECTION);

    /* handle common case fast */
    if (comm == MPIR_Process.comm_world || comm == MPIR_Process.icomm_world) {
        *flag = TRUE;
        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "comm is comm_world or icomm_world");
        goto fn_exit;
    }
    *flag = FALSE;
    
    /* FIXME: This algorithm assumes that the number of processes in group is
       very small (like 1).  So doing a linear search for them in comm is better
       than sorting the procs in comm and group then doing a binary search */

    for (i_g = 0; i_g < group->size; ++i_g) {
        /* FIXME: This won't work for dynamic procs */
        MPIDI_PG_Get_vc(MPIDI_Process.my_pg, group->lrank_to_lpid[i_g].lpid, &vc_g);
        for (i_c = 0; i_c < comm->remote_size; ++i_c) {
            MPIDI_Comm_get_vc(comm, i_c, &vc_c);
            if (vc_g == vc_c) {
                *flag = TRUE;
                goto fn_exit;
            }
        }
    }
    
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_NONEMPTY_INTERSECTION);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example No. 5
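/* Aggressively completes and frees existing targets on the window until a new
 * target element can be allocated; the freshly allocated element is returned
 * through *target.  If the window is still in the LOCK_ALL_CALLED state, it is
 * first switched to the window-wide (LOCK_ALL_ISSUED) protocol. */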
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPIR_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed ATTRIBUTE((unused)) = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    (*target) = NULL;

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;
            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            if (orig_vc->node_id != target_vc->node_id) {
                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                if (curr_target == NULL) {
                    win_ptr->outstanding_locks++;
                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIR_ERR_POP(mpi_errno);
                }
            }
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        for (i = 0; i < win_ptr->num_slots; i++)
            if (win_ptr->slots[i].target_list_head != NULL)
                break;
        curr_target = win_ptr->slots[i].target_list_head;
        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
        }

        /* Issue all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        do {
            MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed, remote_completed);

            if (!remote_completed) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        } while (!remote_completed);

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* check if we got a target */
        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while ((*target) == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Example No. 6
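/* Device-level implementation of MPI_Compare_and_swap: the operation is applied
 * directly through shared memory when the target is the calling process or is
 * reachable through shared memory, otherwise a CAS_IMMED packet is queued on the
 * window's RMA ops queue and progress is made on that target. */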
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;

    MPIDI_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* If the target is local and shared memory is allocated on the window,
         * perform this operation directly on the shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
        MPI_Aint type_size;
        void *src = NULL, *dest = NULL;

        /* Append this operation to the RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = 1;
        op_ptr->origin_datatype = datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_datatype = datatype;
        op_ptr->compare_addr = (void *) compare_addr;
        op_ptr->compare_datatype = datatype;
        op_ptr->target_rank = target_rank;
        op_ptr->piggyback_lock_candidate = 1;   /* CAS is always able to piggyback LOCK */

        /************** Setting packet struct areas in operation ****************/

        cas_pkt = &(op_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS_IMMED);
        cas_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        cas_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        /* REQUIRE: All datatype arguments must be of the same, builtin
         * type and counts must be 1. */
        MPID_Datatype_get_size_macro(datatype, type_size);
        MPIU_Assert(type_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

        src = (void *) origin_addr, dest = (void *) (&(cas_pkt->origin_data));
        mpi_errno = immed_copy(src, dest, type_size);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        src = (void *) compare_addr, dest = (void *) (&(cas_pkt->compare_data));
        mpi_errno = immed_copy(src, dest, type_size);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Example No. 7
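/* Device-level get-accumulate: applied directly through shared memory when the
 * target is local, otherwise a GET_ACCUM (or GET_ACCUM_IMMED) packet is built,
 * queued on the window's RMA ops queue, and progress is made on that target. */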
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);

    if (target_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* If the target is local and shared memory is allocated on the window,
         * perform this operation directly on the shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        MPI_Aint target_type_size;
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty */
        if (op == MPI_NO_OP)
            is_empty_origin = TRUE;

        /* Append the operation to the window's RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPID_Datatype_get_size_macro(target_datatype, target_type_size);

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPID_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
        }
        else {
            MPIU_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPID_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

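        /* Take one reference per stream unit on any derived datatypes involved;
         * each issued stream unit is expected to drop its references when it
         * completes. */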
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
            if (result_dtp != NULL) {
                MPID_Datatype_add_ref(result_dtp);
            }
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
        MPID_Datatype_is_contig(result_datatype, &is_result_contig);

        /* Judge if we can use IMMED data packet */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_accum_pkt = &(op_ptr->pkt.get_accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }

        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Example No. 8
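/* Device-level put: performed directly through shared memory when the target is
 * local, otherwise a PUT (or PUT_IMMED) packet is queued on the window's RMA ops
 * queue and progress is made on that target. */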
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPID_Datatype *dtp;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDI_msg_sz_t data_sz;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* If the target is local and shared memory is allocated on the window,
         * perform this operation directly on the shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
         * first, which is only set to TRUE when the SHM region is allocated in nemesis.
         * In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the put is a local operation, do it here */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
        int use_immed_pkt = FALSE;
        int is_origin_contig, is_target_contig;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        /* FIXME: For contig and very short operations, use a streamlined op */
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }

        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

        /* Judge if we can use IMMED data packet */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        put_pkt = &(op_ptr->pkt.put);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->info.dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
            mpi_errno = immed_copy(src, dest, data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Example No. 9
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    (*target) = NULL;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER, "**rmanotarget");

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;
            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            if (orig_vc->node_id != target_vc->node_id) {
                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
                if (mpi_errno)
                    MPIU_ERR_POP(mpi_errno);
                if (curr_target == NULL) {
                    win_ptr->outstanding_locks++;
                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        for (i = 0; i < win_ptr->num_slots; i++)
            if (win_ptr->slots[i].target_list != NULL)
                break;
        curr_target = win_ptr->slots[i].target_list;
        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
            curr_target->sync.have_remote_incomplete_ops = 0;
            curr_target->sync.outstanding_acks++;
        }

        /* Issue all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        do {
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
                                                          &local_completed, &remote_completed);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (!remote_completed) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        } while (!remote_completed);

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* check if we got a target */
        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while ((*target) == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Example No. 10
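/* Finds and dequeues a matching request from the unexpected-message queue; if no
 * match is found, allocates a new receive request and appends it to the posted
 * queue.  *foundp reports whether an unexpected match was found. */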
MPID_Request * MPIDI_CH3U_Recvq_FDU_or_AEP(int source, int tag, 
                                           int context_id, MPID_Comm *comm, void *user_buf,
                                           int user_count, MPI_Datatype datatype, int * foundp)
{
    MPID_Time_t timer_start;
    int found;
    MPID_Request *rreq, *prev_rreq;
    MPIDI_Message_match match;
    MPIDI_Message_match mask;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIU_THREAD_CS_ASSERT_HELD(MSGQUEUE);

    /* Store how much time is spent traversing the queue */
    MPIR_T_START_TIMER(RECVQ_STATISTICS, timer_start);

    /* Optimize this loop for an empty unexpected receive queue */
    rreq = recvq_unexpected_head;
    if (rreq) {
	prev_rreq = NULL;

	match.parts.context_id = context_id;
	match.parts.tag = tag;
	match.parts.rank = source;

	if (tag != MPI_ANY_TAG && source != MPI_ANY_SOURCE) {
	    do {
                MPIR_T_INC(RECVQ_STATISTICS, unexpected_recvq_match_attempts);
		if (MATCH_WITH_NO_MASK(rreq->dev.match, match)) {
		    if (prev_rreq != NULL) {
			prev_rreq->dev.next = rreq->dev.next;
		    }
		    else {
			recvq_unexpected_head = rreq->dev.next;
		    }

		    if (rreq->dev.next == NULL) {
			recvq_unexpected_tail = prev_rreq;
		    }
                    MPIR_T_DEC(RECVQ_STATISTICS, unexpected_qlen);

                    if (MPIDI_Request_get_msg_type(rreq) == MPIDI_REQUEST_EAGER_MSG)
                        MPIR_T_SUBTRACT(RECVQ_STATISTICS, MPIDI_CH3I_unexpected_recvq_buffer_size, rreq->dev.tmpbuf_sz);

		    rreq->comm = comm;
		    MPIR_Comm_add_ref(comm);
		    rreq->dev.user_buf = user_buf;
		    rreq->dev.user_count = user_count;
		    rreq->dev.datatype = datatype;
		    found = TRUE;
		    goto lock_exit;
		}
		prev_rreq = rreq;
		rreq      = rreq->dev.next;
	    } while (rreq);
	}
	else {
	    mask.parts.context_id = mask.parts.rank = mask.parts.tag = ~0;
	    if (tag == MPI_ANY_TAG)
		match.parts.tag = mask.parts.tag = 0;
	    if (source == MPI_ANY_SOURCE)
		match.parts.rank = mask.parts.rank = 0;

	    do {
                MPIR_T_INC(RECVQ_STATISTICS, unexpected_recvq_match_attempts);
		if (MATCH_WITH_LEFT_MASK(rreq->dev.match, match, mask)) {
		    if (prev_rreq != NULL) {
			prev_rreq->dev.next = rreq->dev.next;
		    }
		    else {
			recvq_unexpected_head = rreq->dev.next;
		    }
		    if (rreq->dev.next == NULL) {
			recvq_unexpected_tail = prev_rreq;
		    }
                    MPIR_T_DEC(RECVQ_STATISTICS, unexpected_qlen);

                    if (MPIDI_Request_get_msg_type(rreq) == MPIDI_REQUEST_EAGER_MSG)
                        MPIR_T_SUBTRACT(RECVQ_STATISTICS, MPIDI_CH3I_unexpected_recvq_buffer_size, rreq->dev.tmpbuf_sz);

		    rreq->comm                 = comm;
		    MPIR_Comm_add_ref(comm);
		    rreq->dev.user_buf         = user_buf;
		    rreq->dev.user_count       = user_count;
		    rreq->dev.datatype         = datatype;
		    found = TRUE;
		    goto lock_exit;
		}
		prev_rreq = rreq;
		rreq = rreq->dev.next;
	    } while (rreq);
	}
    }
    MPIR_T_END_TIMER(RECVQ_STATISTICS, timer_start, time_matching_unexpectedq);

    /* A matching request was not found in the unexpected queue, so we 
       need to allocate a new request and add it to the posted queue */
    {
	int mpi_errno = MPI_SUCCESS;

        found = FALSE;

	MPIDI_Request_create_rreq( rreq, mpi_errno, goto lock_exit );
	rreq->dev.match.parts.tag	   = tag;
	rreq->dev.match.parts.rank	   = source;
	rreq->dev.match.parts.context_id   = context_id;

	/* Added a mask for faster search on 64-bit capable
	 * platforms */
	rreq->dev.mask.parts.context_id = ~0;
	if (rreq->dev.match.parts.rank == MPI_ANY_SOURCE)
	    rreq->dev.mask.parts.rank = 0;
	else
	    rreq->dev.mask.parts.rank = ~0;
	if (rreq->dev.match.parts.tag == MPI_ANY_TAG)
	    rreq->dev.mask.parts.tag = 0;
	else
	    rreq->dev.mask.parts.tag = ~0;

        rreq->comm                 = comm;
        MPIR_Comm_add_ref(comm);
        rreq->dev.user_buf         = user_buf;
        rreq->dev.user_count       = user_count;
        rreq->dev.datatype         = datatype;

        /* check whether VC has failed, or this is an ANY_SOURCE in a
           failed communicator */
        if (source != MPI_ANY_SOURCE) {
            MPIDI_VC_t *vc;
            MPIDI_Comm_get_vc(comm, source, &vc);
            if (vc->state == MPIDI_VC_STATE_MORIBUND) {
                MPIU_ERR_SET1(mpi_errno, MPIX_ERR_PROC_FAIL_STOP, "**comm_fail", "**comm_fail %d", vc->pg_rank);
                rreq->status.MPI_ERROR = mpi_errno;
                MPIDI_CH3U_Request_complete(rreq);
                goto lock_exit;
            }
        } else if (!MPIDI_CH3I_Comm_AS_enabled(comm)) {
            MPIU_ERR_SET(mpi_errno, MPIX_ERR_PROC_FAIL_STOP, "**comm_fail");
            rreq->status.MPI_ERROR = mpi_errno;
            MPIDI_CH3U_Request_complete(rreq);
            goto lock_exit;
        }
        
	rreq->dev.next = NULL;
	if (recvq_posted_tail != NULL) {
	    recvq_posted_tail->dev.next = rreq;
	}
	else {
	    recvq_posted_head = rreq;
	}
	recvq_posted_tail = rreq;
        MPIR_T_INC(RECVQ_STATISTICS, posted_qlen);
	MPIDI_POSTED_RECV_ENQUEUE_HOOK(rreq);
    }
    
  lock_exit:
    *foundp = found;

    /* If a match was not found, the timer was stopped after the traversal */
    if (found)
        MPIR_T_END_TIMER(RECVQ_STATISTICS, timer_start, time_matching_unexpectedq);
    
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);
    return rreq;
}
Example No. 11
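/* Portals event handler for a matched large receive: copies or unpacks any data
 * delivered with the event and, if the sender flagged a large transfer, issues a
 * get for the remainder of the message. */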
static int handler_recv_dequeue_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPID_nem_ptl_vc_area *vc_ptl;
    int ret;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPI_Aint last;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);
    vc_ptl = VC_PTL(vc);
    
    dequeue_req(e);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* unpack data from unexpected buffer first */
    if (e->type == PTL_EVENT_PUT_OVERFLOW) {
        if (dt_contig) {
            MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength);
        } else {
            last = e->mlength;
            MPID_Segment_unpack(rreq->dev.segment_ptr, 0, &last, e->start);
            MPIU_Assert(last == e->mlength);
            rreq->dev.segment_first = e->mlength;
        }
    }
    
    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_complete(e);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }
        
    MPIU_Assert (e->mlength == PTL_LARGE_THRESHOLD);

    /* we need to GET the rest of the data from the sender's buffer */
    if (dt_contig) {
        big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD,
                vc, e->match_bits, rreq);
        goto fn_exit;
    }

    /* noncontig recv buffer */
    
    last = rreq->dev.segment_size;
    rreq->dev.iov_count = MPL_IOV_LIMIT;
    MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count);

    if (last == rreq->dev.segment_size && rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
        /* Rest of message fits in one IOV */
        ptl_md_t md;

        md.start = rreq->dev.iov;
        md.length = rreq->dev.iov_count;
        md.options = PTL_IOVEC;
        md.eq_handle = MPIDI_nem_ptl_origin_eq;
        md.ct_handle = PTL_CT_NONE;
        ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md);
        MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret));

        REQ_PTL(rreq)->event_handler = handler_recv_complete;
        ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg,
                     e->match_bits, 0, rreq);
        MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
        goto fn_exit;
    }
        
    /* message won't fit in a single IOV, allocate buffer and unpack when received */
    /* FIXME: For now, allocate a single large buffer to hold entire message */
    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz - PTL_LARGE_THRESHOLD,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}
Example No. 12
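/* Builds the two-level communicator hierarchy for comm: an intra-node shmem_comm,
 * an inter-node leader_comm, and (when the rank layout allows it) a reordered
 * allgather_comm. */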
int create_2level_comm (MPI_Comm comm, int size, int my_rank)
{
    static const char FCNAME[] = "create_2level_comm";
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm* comm_ptr;
    MPID_Comm* comm_world_ptr;
    MPI_Group subgroup1, comm_group;
    MPID_Group *group_ptr=NULL;
    int leader_comm_size, my_local_size, my_local_id, input_flag =0, output_flag=0;
    int errflag = FALSE;
    int leader_group_size=0;
  
    MPIU_THREADPRIV_DECL;
    MPIU_THREADPRIV_GET;
    MPID_Comm_get_ptr( comm, comm_ptr );
    MPID_Comm_get_ptr( MPI_COMM_WORLD, comm_world_ptr );

    int* shmem_group = MPIU_Malloc(sizeof(int) * size);
    if (NULL == shmem_group){
        printf("Couldn't malloc shmem_group\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }

    /* Creating local shmem group */
    int i = 0;
    int local_rank = 0;
    int grp_index = 0;
    comm_ptr->ch.leader_comm=MPI_COMM_NULL;
    comm_ptr->ch.shmem_comm=MPI_COMM_NULL;

    MPIDI_VC_t* vc = NULL;
    for (; i < size ; ++i){
       MPIDI_Comm_get_vc(comm_ptr, i, &vc);
       if (my_rank == i || vc->smp.local_rank >= 0){
           shmem_group[grp_index] = i;
           if (my_rank == i){
               local_rank = grp_index;
           }
           ++grp_index;
       }  
    } 

    /* Creating leader group */
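    /* the node leader is the lowest-ranked process in the local shmem group */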
    int leader = 0;
    leader = shmem_group[0];


    /* Maps each process in comm to the rank of its node leader */
    comm_ptr->ch.leader_map = MPIU_Malloc(sizeof(int) * size);
    if (NULL == comm_ptr->ch.leader_map){
        printf("Couldn't malloc group\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }
    
    mpi_errno = MPIR_Allgather_impl (&leader, 1, MPI_INT , comm_ptr->ch.leader_map, 1, MPI_INT, comm_ptr, &errflag);
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }


    int* leader_group = MPIU_Malloc(sizeof(int) * size);
    if (NULL == leader_group){
        printf("Couldn't malloc leader_group\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }

    /* Gives the mapping from leader's rank in comm to 
     * leader's rank in leader_comm */
    comm_ptr->ch.leader_rank = MPIU_Malloc(sizeof(int) * size);
    if (NULL == comm_ptr->ch.leader_rank){
        printf("Couldn't malloc marker\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }

    for (i=0; i < size ; ++i){
         comm_ptr->ch.leader_rank[i] = -1;
    }
    int* group = comm_ptr->ch.leader_map;
    grp_index = 0;
    for (i=0; i < size ; ++i){
        if (comm_ptr->ch.leader_rank[(group[i])] == -1){
            comm_ptr->ch.leader_rank[(group[i])] = grp_index;
            leader_group[grp_index++] = group[i];
           
        }
    }
    leader_group_size = grp_index;
    comm_ptr->ch.leader_group_size = leader_group_size;

    mpi_errno = PMPI_Comm_group(comm, &comm_group);
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }
    
    mpi_errno = PMPI_Group_incl(comm_group, leader_group_size, leader_group, &subgroup1);
     if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }

    mpi_errno = PMPI_Comm_create(comm, subgroup1, &(comm_ptr->ch.leader_comm));
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }

    MPID_Comm *leader_ptr;
    MPID_Comm_get_ptr( comm_ptr->ch.leader_comm, leader_ptr );
       
    MPIU_Free(leader_group);
    MPID_Group_get_ptr( subgroup1, group_ptr );
    if(group_ptr != NULL) { 
       mpi_errno = PMPI_Group_free(&subgroup1);
       if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
       }
    } 

    mpi_errno = PMPI_Comm_split(comm, leader, local_rank, &(comm_ptr->ch.shmem_comm));
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }

    MPID_Comm *shmem_ptr;
    MPID_Comm_get_ptr(comm_ptr->ch.shmem_comm, shmem_ptr);


    mpi_errno = PMPI_Comm_rank(comm_ptr->ch.shmem_comm, &my_local_id);
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }
    mpi_errno = PMPI_Comm_size(comm_ptr->ch.shmem_comm, &my_local_size);
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }

    if(my_local_id == 0) { 
           int array_index=0;
           mpi_errno = PMPI_Comm_size(comm_ptr->ch.leader_comm, &leader_comm_size);
           if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
           }

           comm_ptr->ch.node_sizes = MPIU_Malloc(sizeof(int)*leader_comm_size);
           mpi_errno = PMPI_Allgather(&my_local_size, 1, MPI_INT,
				 comm_ptr->ch.node_sizes, 1, MPI_INT, comm_ptr->ch.leader_comm);
           if(mpi_errno) {
              MPIU_ERR_POP(mpi_errno);
           }
           comm_ptr->ch.is_uniform = 1; 
           for(array_index=0; array_index < leader_comm_size; array_index++) { 
                if(comm_ptr->ch.node_sizes[0] != comm_ptr->ch.node_sizes[array_index]) {
                     comm_ptr->ch.is_uniform = 0; 
                     break;
                }
           }
     } 
    
    comm_ptr->ch.is_global_block = 0; 
    /* We need to check whether the ranks are in block order. Each node leader
     * gets the global ranks of all of its children processes and scans through
     * this array to see if the ranks are in block order. The node leaders then
     * do an allreduce to see if all the other nodes are also in block order.
     * This is followed by an intra-node bcast to let the children processes
     * know the result of this step. */
    if(my_local_id == 0) {
            int is_local_block = 1; 
            int index = 1; 
            
            while( index < my_local_size) { 
                    if( (shmem_group[index] - 1) != 
                         shmem_group[index - 1]) { 
                            is_local_block = 0; 
                            break; 
                    }
            index++;  
            }  

            comm_ptr->ch.shmem_coll_ok = 0; /* to prevent Allreduce from taking the shmem route */
            mpi_errno = MPIR_Allreduce_impl(&(is_local_block), 
                                      &(comm_ptr->ch.is_global_block), 1, 
                                      MPI_INT, MPI_LAND, leader_ptr, &errflag);
            if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
            } 
            mpi_errno = MPIR_Bcast_impl(&(comm_ptr->ch.is_global_block),1, MPI_INT, 0,
                                   shmem_ptr, &errflag); 
            if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
            } 
    } else { 
            mpi_errno = MPIR_Bcast_impl(&(comm_ptr->ch.is_global_block),1, MPI_INT, 0,
                                   shmem_ptr, &errflag); 
            if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
            } 
    }      
                              

    if (my_local_id == 0){
        lock_shmem_region();
        increment_shmem_comm_count();
        shmem_comm_count = get_shmem_comm_count();
        unlock_shmem_region();
    }
    
    shmem_ptr->ch.shmem_coll_ok = 0;
    /* to prevent Bcast from taking the knomial_2level_bcast route */
    mpi_errno = MPIR_Bcast_impl (&shmem_comm_count, 1, MPI_INT, 0, shmem_ptr, &errflag);
     if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }


    if (shmem_comm_count <= g_shmem_coll_blocks){
        shmem_ptr->ch.shmem_comm_rank = shmem_comm_count-1;
        input_flag = 1;
    } else{
        input_flag = 0;
    }
    comm_ptr->ch.shmem_coll_ok = 0; /* to prevent Allreduce from taking the shmem route */
    mpi_errno = MPIR_Allreduce_impl(&input_flag, &output_flag, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag);
    if(mpi_errno) {
       MPIU_ERR_POP(mpi_errno);
    }
    comm_ptr->ch.allgather_comm_ok = 0;
    if (allgather_ranking){
        int is_contig =1, check_leader =1, check_size=1, is_local_ok=0,is_block=0;
        int PPN;
        int shmem_grp_size = my_local_size;
        int leader_rank; 
        MPI_Group allgather_group; 
        comm_ptr->ch.allgather_comm=MPI_COMM_NULL; 
        comm_ptr->ch.allgather_new_ranks=NULL;

        if(comm_ptr->ch.leader_comm != MPI_COMM_NULL) { 
            PMPI_Comm_rank(comm_ptr->ch.leader_comm, &leader_rank); 
        } 

        mpi_errno=MPIR_Bcast_impl(&leader_rank, 1, MPI_INT, 0, shmem_ptr, &errflag);
        if(mpi_errno) {
        	MPIU_ERR_POP(mpi_errno);
        } 

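        /* The reordered allgather communicator is only usable when the local ranks
         * are contiguous, the leader is the first rank of its node block, and every
         * node hosts the same number of processes; these conditions are checked
         * below and combined across all processes with an allreduce. */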
        for (i=1; i < shmem_grp_size; i++ ){
            if (shmem_group[i] != shmem_group[i-1]+1){
                is_contig =0; 
                break;
            }
        }

        if (leader != (shmem_grp_size*leader_rank)){
            check_leader=0;
        }

        if (shmem_grp_size != (size/leader_group_size)){
            check_size=0;
        }

        is_local_ok = is_contig && check_leader && check_size;

        mpi_errno = MPIR_Allreduce_impl(&is_local_ok, &is_block, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        if (is_block){
            int counter=0,j;
            comm_ptr->ch.allgather_new_ranks = MPIU_Malloc(sizeof(int)*size);
            if (NULL == comm_ptr->ch.allgather_new_ranks){
                    mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, 
                                                  FCNAME, __LINE__, MPI_ERR_OTHER, 
                                                  "**nomem", 0 );
                    return mpi_errno;
            }
   
            PPN = shmem_grp_size;
            
            for (j=0; j < PPN; j++){
                for (i=0; i < leader_group_size; i++){
                    comm_ptr->ch.allgather_new_ranks[counter] = j + i*PPN;
                    counter++;
                }
            }

            mpi_errno = PMPI_Group_incl(comm_group, size, comm_ptr->ch.allgather_new_ranks, &allgather_group);
            if(mpi_errno) {
                 MPIU_ERR_POP(mpi_errno);
            }  
            mpi_errno = PMPI_Comm_create(comm_ptr->handle, allgather_group, &(comm_ptr->ch.allgather_comm));
            if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
            }
            comm_ptr->ch.allgather_comm_ok = 1;
            
            mpi_errno=PMPI_Group_free(&allgather_group);
            if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
            }
        }
    }
    mpi_errno=PMPI_Group_free(&comm_group);
    if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
    }

    if (output_flag == 1){
        comm_ptr->ch.shmem_coll_ok = 1;
        comm_registry[comm_registered++] = comm_ptr->context_id;
    } else{
        comm_ptr->ch.shmem_coll_ok = 0;
        MPID_Group_get_ptr( subgroup1, group_ptr );
        if(group_ptr != NULL) { 
             mpi_errno = PMPI_Group_free(&subgroup1);
             if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
             }
        }
        MPID_Group_get_ptr( comm_group, group_ptr );
        if(group_ptr != NULL) { 
             mpi_errno = PMPI_Group_free(&comm_group);
             if(mpi_errno) {
               MPIU_ERR_POP(mpi_errno);
             }
        }
        free_2level_comm(comm_ptr);
        comm_ptr->ch.shmem_comm = MPI_COMM_NULL; 
        comm_ptr->ch.leader_comm = MPI_COMM_NULL; 
    }
    ++comm_count;
    MPIU_Free(shmem_group);

    fn_fail: 
       MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
    
    return (mpi_errno);


}
Example No. 13
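/* Device-level accumulate: applied directly through shared memory when the target
 * is local, otherwise an accumulate element is appended to the window's RMA ops
 * list for later issuing. */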
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                    origin_datatype, int target_rank, MPI_Aint target_disp,
                    int target_count, MPI_Datatype target_datatype, MPI_Op op,
                    MPID_Win *win_ptr)
{
    int mpi_errno=MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc, *target_vc;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
    
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype,
			    dt_contig, data_sz, dtp, dt_true_lb);  
    
    if (data_sz == 0) {
	goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;
    
    if (win_ptr->shm_allocated == TRUE && target_rank != rank && win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* If the target is local and shared memory is allocated on the window,
           perform this operation directly on the shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
           the same node. However, in ch3:sock, even if origin and target are on the same node, they are
           not within the same SHM region. Here we filter out ch3:sock by checking the shm_allocated flag
           first, which is only set to TRUE when the SHM region is allocated in nemesis.
           In the future we need to figure out a way to check if origin and target are in the same "SHM comm".
        */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id))
    {
	mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
					  target_rank, target_disp, target_count, target_datatype,
					  op, win_ptr);
	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    else
    {
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
        MPIDI_RMA_Op_t *new_ptr = NULL;

	/* queue it up */
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr);
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

	/* If predefined and contiguous, use a simplified element */
	if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) {
	    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
	    new_ptr->type = MPIDI_RMA_ACC_CONTIG;
	    /* Only the information needed for the contig/predefined acc */
            /* Cast away const'ness for origin_addr as
             * MPIDI_RMA_Op_t contains both PUT- and GET-like ops */
	    new_ptr->origin_addr = (void *) origin_addr;
	    new_ptr->origin_count = origin_count;
	    new_ptr->origin_datatype = origin_datatype;
	    new_ptr->target_rank = target_rank;
	    new_ptr->target_disp = target_disp;
	    new_ptr->target_count = target_count;
	    new_ptr->target_datatype = target_datatype;
	    new_ptr->op = op;
	    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
	    goto fn_exit;
	}

	MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
	new_ptr->type = MPIDI_RMA_ACCUMULATE;
        /* Cast away const'ness for origin_addr as MPIDI_RMA_Op_t
         * contains both PUT- and GET-like ops */
	new_ptr->origin_addr = (void *) origin_addr;
	new_ptr->origin_count = origin_count;
	new_ptr->origin_datatype = origin_datatype;
	new_ptr->target_rank = target_rank;
	new_ptr->target_disp = target_disp;
	new_ptr->target_count = target_count;
	new_ptr->target_datatype = target_datatype;
	new_ptr->op = op;
	MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
	
	/* if source or target datatypes are derived, increment their
	   reference counts */
	if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype))
	{
	    MPID_Datatype_get_ptr(origin_datatype, dtp);
	    MPID_Datatype_add_ref(dtp);
	}
	if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype))
	{
	    MPID_Datatype_get_ptr(target_datatype, dtp);
	    MPID_Datatype_add_ref(dtp);
	}
    }

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Example No. 14
MPID_Request * MPIDI_CH3U_Recvq_FDU_or_AEP(int source, int tag, 
                                           int context_id, MPID_Comm *comm, void *user_buf,
                                           int user_count, MPI_Datatype datatype, int * foundp)
{
    int found;
    MPID_Request *rreq, *prev_rreq;
    MPIDI_Message_match match;
    MPIDI_Message_match mask;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIU_THREAD_CS_ASSERT_HELD(MSGQUEUE);

    /* Optimize this loop for an empty unexpected receive queue */
    rreq = recvq_unexpected_head;
    if (rreq) {
	prev_rreq = NULL;

	match.parts.context_id = context_id;
	match.parts.tag = tag;
	match.parts.rank = source;

	if (tag != MPI_ANY_TAG && source != MPI_ANY_SOURCE) {
	    do {
		if (MATCH_WITH_NO_MASK(rreq->dev.match, match)) {
		    if (prev_rreq != NULL) {
			prev_rreq->dev.next = rreq->dev.next;
		    }
		    else {
			recvq_unexpected_head = rreq->dev.next;
		    }

		    if (rreq->dev.next == NULL) {
			recvq_unexpected_tail = prev_rreq;
		    }

		    rreq->comm = comm;
		    MPIR_Comm_add_ref(comm);
		    rreq->dev.user_buf = user_buf;
		    rreq->dev.user_count = user_count;
		    rreq->dev.datatype = datatype;
		    found = TRUE;
		    goto lock_exit;
		}
		prev_rreq = rreq;
		rreq      = rreq->dev.next;
	    } while (rreq);
	}
	else {
	    mask.parts.context_id = mask.parts.rank = mask.parts.tag = ~0;
	    if (tag == MPI_ANY_TAG)
		match.parts.tag = mask.parts.tag = 0;
	    if (source == MPI_ANY_SOURCE)
		match.parts.rank = mask.parts.rank = 0;

	    do {
		if (MATCH_WITH_LEFT_MASK(rreq->dev.match, match, mask)) {
		    if (prev_rreq != NULL) {
			prev_rreq->dev.next = rreq->dev.next;
		    }
		    else {
			recvq_unexpected_head = rreq->dev.next;
		    }
		    if (rreq->dev.next == NULL) {
			recvq_unexpected_tail = prev_rreq;
		    }
		    rreq->comm                 = comm;
		    MPIR_Comm_add_ref(comm);
		    rreq->dev.user_buf         = user_buf;
		    rreq->dev.user_count       = user_count;
		    rreq->dev.datatype         = datatype;
		    found = TRUE;
		    goto lock_exit;
		}
		prev_rreq = rreq;
		rreq = rreq->dev.next;
	    } while (rreq);
	}
    }
    
    /* A matching request was not found in the unexpected queue, so we 
       need to allocate a new request and add it to the posted queue */
    {
	int mpi_errno = MPI_SUCCESS;

        found = FALSE;

	MPIDI_Request_create_rreq( rreq, mpi_errno, goto lock_exit );
	rreq->dev.match.parts.tag	   = tag;
	rreq->dev.match.parts.rank	   = source;
	rreq->dev.match.parts.context_id   = context_id;

	/* Added a mask for faster search on 64-bit capable
	 * platforms */
	rreq->dev.mask.parts.context_id = ~0;
	if (rreq->dev.match.parts.rank == MPI_ANY_SOURCE)
	    rreq->dev.mask.parts.rank = 0;
	else
	    rreq->dev.mask.parts.rank = ~0;
	if (rreq->dev.match.parts.tag == MPI_ANY_TAG)
	    rreq->dev.mask.parts.tag = 0;
	else
	    rreq->dev.mask.parts.tag = ~0;

        rreq->comm                 = comm;
        MPIR_Comm_add_ref(comm);
        rreq->dev.user_buf         = user_buf;
        rreq->dev.user_count       = user_count;
        rreq->dev.datatype         = datatype;

        /* check whether VC has failed, or this is an ANY_SOURCE in a
           failed communicator */
        if (source != MPI_ANY_SOURCE) {
            MPIDI_VC_t *vc;
            MPIDI_Comm_get_vc(comm, source, &vc);
            if (vc->state == MPIDI_VC_STATE_MORIBUND) {
                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
                rreq->status.MPI_ERROR = mpi_errno;
                MPIDI_CH3U_Request_complete(rreq);
                goto lock_exit;
            }
        } else if (MPID_VCRT_Contains_failed_vc(comm->vcrt)) {
            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
            rreq->status.MPI_ERROR = mpi_errno;
            MPIDI_CH3U_Request_complete(rreq);
            goto lock_exit;
        }
        
	rreq->dev.next = NULL;
	if (recvq_posted_tail != NULL) {
	    recvq_posted_tail->dev.next = rreq;
	}
	else {
	    recvq_posted_head = rreq;
	}
	recvq_posted_tail = rreq;
	MPIDI_POSTED_RECV_ENQUEUE_HOOK(rreq);
    }
    
  lock_exit:

    *foundp = found;
    
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);
    return rreq;
}