Beispiel #1
0
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;

    MPIDI_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
        MPI_Aint type_size;
        void *src = NULL, *dest = NULL;

        /* Append this operation to the RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = 1;
        op_ptr->origin_datatype = datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_datatype = datatype;
        op_ptr->compare_addr = (void *) compare_addr;
        op_ptr->compare_datatype = datatype;
        op_ptr->target_rank = target_rank;
        op_ptr->piggyback_lock_candidate = 1;   /* CAS is always able to piggyback LOCK */

        /************** Setting packet struct areas in operation ****************/

        cas_pkt = &(op_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS_IMMED);
        cas_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        cas_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        /* REQUIRE: All datatype arguments must be of the same, builtin
         * type and counts must be 1. */
        MPID_Datatype_get_size_macro(datatype, type_size);
        MPIU_Assert(type_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

        src = (void *) origin_addr, dest = (void *) (&(cas_pkt->origin_data));
        mpi_errno = immed_copy(src, dest, type_size);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        src = (void *) compare_addr, dest = (void *) (&(cas_pkt->compare_data));
        mpi_errno = immed_copy(src, dest, type_size);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Beispiel #2
0
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPID_Win * win_ptr, MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);

    if (target_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        MPI_Aint target_type_size;
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty */
        if (op == MPI_NO_OP)
            is_empty_origin = TRUE;

        /* Append the operation to the window's RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPID_Datatype_get_size_macro(target_datatype, target_type_size);

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPID_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
        }
        else {
            MPIU_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPID_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
            if (result_dtp != NULL) {
                MPID_Datatype_add_ref(result_dtp);
            }
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
        MPID_Datatype_is_contig(result_datatype, &is_result_contig);

        /* Judge if we can use IMMED data packet */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_accum_pkt = &(op_ptr->pkt.get_accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }

        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Beispiel #3
0
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr,
                   MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPID_Datatype *dtp;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDI_msg_sz_t data_sz;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the put is a local operation, do it here */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
        int use_immed_pkt = FALSE;
        int is_origin_contig, is_target_contig;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        /* FIXME: For contig and very short operations, use a streamlined op */
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }

        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

        /* Judge if we can use IMMED data packet */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is an piggyback candidate */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        put_pkt = &(op_ptr->pkt.put);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->info.dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
            mpi_errno = immed_copy(src, dest, data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Beispiel #4
0
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                    origin_datatype, int target_rank, MPI_Aint target_disp,
                    int target_count, MPI_Datatype target_datatype, MPI_Op op,
                    MPID_Win *win_ptr)
{
    int mpi_errno=MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc, *target_vc;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);
    
    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype,
			    dt_contig, data_sz, dtp, dt_true_lb);  
    
    if (data_sz == 0) {
	goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;
    
    if (win_ptr->shm_allocated == TRUE && target_rank != rank && win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
           if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
           the same node. However, in ch3:sock, even if origin and target are on the same node, they do
           not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
           which is only set to TRUE when SHM region is allocated in nemesis.
           In future we need to figure out a way to check if origin and target are in the same "SHM comm".
        */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id))
    {
	mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
					  target_rank, target_disp, target_count, target_datatype,
					  op, win_ptr);
	if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    else
    {
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
        MPIDI_RMA_Op_t *new_ptr = NULL;

	/* queue it up */
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr);
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

	/* If predefined and contiguous, use a simplified element */
	if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) {
	    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
	    new_ptr->type = MPIDI_RMA_ACC_CONTIG;
	    /* Only the information needed for the contig/predefined acc */
            /* Cast away const'ness for origin_address as
             * MPIDI_RMA_Op_t contain both PUT and GET like ops */
	    new_ptr->origin_addr = (void *) origin_addr;
	    new_ptr->origin_count = origin_count;
	    new_ptr->origin_datatype = origin_datatype;
	    new_ptr->target_rank = target_rank;
	    new_ptr->target_disp = target_disp;
	    new_ptr->target_count = target_count;
	    new_ptr->target_datatype = target_datatype;
	    new_ptr->op = op;
	    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
	    goto fn_exit;
	}

	MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
	new_ptr->type = MPIDI_RMA_ACCUMULATE;
        /* Cast away const'ness for origin_address as MPIDI_RMA_Op_t
         * contain both PUT and GET like ops */
	new_ptr->origin_addr = (void *) origin_addr;
	new_ptr->origin_count = origin_count;
	new_ptr->origin_datatype = origin_datatype;
	new_ptr->target_rank = target_rank;
	new_ptr->target_disp = target_disp;
	new_ptr->target_count = target_count;
	new_ptr->target_datatype = target_datatype;
	new_ptr->op = op;
	MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
	
	/* if source or target datatypes are derived, increment their
	   reference counts */
	if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype))
	{
	    MPID_Datatype_get_ptr(origin_datatype, dtp);
	    MPID_Datatype_add_ref(dtp);
	}
	if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype))
	{
	    MPID_Datatype_get_ptr(target_datatype, dtp);
	    MPID_Datatype_add_ref(dtp);
	}
    }

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Beispiel #5
0
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info,
                                       MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void **base_pp = (void **) base_ptr;
    int i, node_size, node_rank;
    MPIR_Comm *node_comm_ptr;
    MPI_Aint *node_sizes;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int noncontig = FALSE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
        goto fn_exit;
    }

    /* see if we can allocate all windows contiguously */
    noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;

    (*win_ptr)->shm_allocated = TRUE;

    /* When allocating shared memory region segment, we need comm of processes
     * that are on the same node as this process (node_comm).
     * If node_comm == NULL, this process is the only one on this node, therefore
     * we use comm_self as node comm. */
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* get the sizes of the windows and window objectsof
     * all processes.  allocate temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
                        "node_sizes");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    node_sizes[node_rank] = (MPI_Aint) size;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                    node_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    (*win_ptr)->shm_segment_len = 0;

    for (i = 0; i < node_size; i++) {
        if (noncontig)
            /* Round up to next page size */
            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
        else
            (*win_ptr)->shm_segment_len += node_sizes[i];
    }

    if ((*win_ptr)->shm_segment_len == 0) {
        (*win_ptr)->base = NULL;
    }

    else {
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle,
                                                (*win_ptr)->shm_segment_len,
                                                (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
                                                    &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
                                          strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
                                     (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* Allocated the interprocess mutex segment. */
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
                                                sizeof(MPIDI_CH3I_SHM_MUTEX),
                                                (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
                                                    &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
                                          strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle,
                                     sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex,
                                     0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* compute the base addresses of each process within the shared memory segment */
        {
            char *cur_base;
            int cur_rank;

            cur_base = (*win_ptr)->shm_base_addr;
            cur_rank = 0;
            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
            for (i = 1; i < node_size; ++i) {
                if (node_sizes[i]) {
                    /* For the base addresses, we track the previous
                     * process that has allocated non-zero bytes of shared
                     * memory.  We can not simply use "i-1" for the
                     * previous process because rank "i-1" might not have
                     * allocated any memory. */
                    if (noncontig) {
                        ((*win_ptr)->shm_base_addrs)[i] =
                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                    }
                    else {
                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                    }
                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                    cur_rank = i;
                }
                else {
                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                }
            }
        }

        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
    }

    *base_pp = (*win_ptr)->base;

    /* gather window information among processes via shared memory region. */
    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* Cache SHM windows */
    MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Beispiel #6
0
int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
                               MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS, i, k, comm_size, rank;
    MPI_Aint *tmp_buf;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIU_CHKPMEM_DECL(1);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    comm_size = (*win_ptr)->comm_ptr->local_size;
    rank = (*win_ptr)->comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                        comm_size * sizeof(MPIDI_Win_basic_info_t),
                        mpi_errno, "(*win_ptr)->basic_info_table");

    /* get the addresses of the windows, window objects, and completion
     * counters of all processes.  allocate temp. buffer for communication */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
     * origin, we'd also need the window size. */
    tmp_buf[4 * rank] = MPIU_PtrToAint(base);
    tmp_buf[4 * rank + 1] = size;
    tmp_buf[4 * rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    k = 0;
    for (i = 0; i < comm_size; i++) {
        (*win_ptr)->basic_info_table[i].base_addr = MPIU_AintToPtr(tmp_buf[k++]);
        (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
        (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
        (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}