/* Device-level implementation of MPI_Get_accumulate for the ch3 channel.
 *
 * Fast path: when the target is this process, the window is a shared-memory
 * window, or the target is reachable through an allocated SHM region, the
 * operation is performed immediately via MPIDI_CH3I_Shm_get_acc_op and any
 * user request (ureq) is completed on the spot.
 *
 * Slow path: otherwise an RMA operation element is allocated, filled in
 * (including the get_accum packet header), enqueued on the window's op queue,
 * and the progress engine is poked.  Issuing may be throttled when the count
 * of active RMA requests exceeds MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD.
 *
 * Returns MPI_SUCCESS or an MPI error code (MPI_ERR_RMA_SYNC when the window
 * is not in an access epoch).  ureq may be NULL when no user request exists.
 */
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    intptr_t orig_data_sz, target_data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDU_Datatype*dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    /* RMA calls are only legal inside an access epoch. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* An operation targeting MPI_PROC_NULL is a no-op. */
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);

    /* Zero-size target data: nothing to transfer. */
    if (target_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        /* Local / shared-memory target: perform the operation immediately. */
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        /* Remote target: queue the operation for the progress engine. */
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        MPI_Aint target_type_size;
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPIDU_Datatype*origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty: MPI_NO_OP transfers no origin data. */
        if (op == MPI_NO_OP)
            is_empty_origin = TRUE;

        /* Append the operation to the window's RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIDU_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPIDU_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIDU_Datatype_get_ptr(target_datatype, target_dtp);
        }

        if (is_empty_origin == FALSE) {
            MPIDU_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIR_Assign_trunc(orig_data_sz, origin_count * origin_type_size, intptr_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPIDU_Datatype_get_size_macro(target_datatype, target_type_size);

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPIDU_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
        }
        else {
            MPIR_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPIDU_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPIDU_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
        }
        MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIR_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        /* One reference per stream unit is taken on each derived datatype;
         * presumably each unit releases its reference when issued — TODO confirm
         * against the issuing/completion code, which is outside this file view. */
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPIDU_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPIDU_Datatype_add_ref(target_dtp);
            }
            if (result_dtp != NULL) {
                MPIDU_Datatype_add_ref(result_dtp);
            }
        }

        if (is_empty_origin == FALSE) {
            MPIDU_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
        MPIDU_Datatype_is_contig(target_datatype, &is_target_contig);
        MPIDU_Datatype_is_contig(result_datatype, &is_result_contig);

        /* Judge if we can use IMMED data packet: all datatypes predefined and
         * contiguous, and the payload fits in the packet header. */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_accum_pkt = &(op_ptr->pkt.get_accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }
        /* Translate (target_rank, target_disp) into a target-window address. */
        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (use_immed_pkt) {
            /* Copy origin data directly into the packet header. */
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* Try to make progress toward the target immediately. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* Throttle: if too many RMA requests are active, spin in the progress
         * engine until the count drops below the threshold (a negative
         * threshold disables throttling). */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* MPIDU_Type_indexed - create an indexed datatype from blocklengths and
 * displacements over oldtype.
 *
 * Parameters:
 *   count              - number of blocks
 *   blocklength_array  - number of oldtype instances in each block (>= 0)
 *   displacement_array - per-block displacements; interpreted as an array of
 *                        MPI_Aint byte displacements when dispinbytes is
 *                        nonzero, otherwise as an array of int displacements
 *                        in units of oldtype's extent
 *   dispinbytes        - nonzero for hindexed (byte) semantics
 *   oldtype            - base datatype (builtin or derived)
 *   newtype            - output: handle of the newly created datatype
 *
 * Returns MPI_SUCCESS on success or an MPI error code (e.g. **nomem) on
 * allocation failure.  A zero count or all-zero blocklengths degenerates to a
 * zero-length type.
 */
int MPIDU_Type_indexed(int count, const int *blocklength_array, const void *displacement_array,
                       int dispinbytes, MPI_Datatype oldtype, MPI_Datatype *newtype)
{
    int mpi_errno = MPI_SUCCESS;
    int is_builtin, old_is_contig;
    int i;
    MPI_Aint contig_count;
    MPI_Aint el_sz, el_ct, old_ct, old_sz;
    MPI_Aint old_lb, old_ub, old_extent, old_true_lb, old_true_ub;
    MPI_Aint min_lb = 0, max_ub = 0, eff_disp;
    MPI_Datatype el_type;
    MPIDU_Datatype *new_dtp;

    if (count == 0)
        return MPIDU_Type_zerolen(newtype);

    /* sanity check that blocklens are all non-negative */
    for (i = 0; i < count; ++i) {
        DLOOP_Assert(blocklength_array[i] >= 0);
    }

    /* allocate new datatype object and handle */
    new_dtp = (MPIDU_Datatype *) MPIR_Handle_obj_alloc(&MPIDU_Datatype_mem);
    /* --BEGIN ERROR HANDLING-- */
    if (!new_dtp) {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                         "MPIDU_Type_indexed", __LINE__, MPI_ERR_OTHER,
                                         "**nomem", 0);
        return mpi_errno;
    }
    /* --END ERROR HANDLING-- */

    /* handle is filled in by MPIR_Handle_obj_alloc() */
    MPIR_Object_set_ref(new_dtp, 1);
    new_dtp->is_permanent = 0;
    new_dtp->is_committed = 0;
    new_dtp->attributes = NULL;
    new_dtp->cache_id = 0;
    new_dtp->name[0] = 0;
    new_dtp->contents = NULL;

    new_dtp->dataloop = NULL;
    new_dtp->dataloop_size = -1;
    new_dtp->dataloop_depth = -1;
    new_dtp->hetero_dloop = NULL;
    new_dtp->hetero_dloop_size = -1;
    new_dtp->hetero_dloop_depth = -1;

    is_builtin = (HANDLE_GET_KIND(oldtype) == HANDLE_KIND_BUILTIN);

    if (is_builtin) {
        /* builtins are handled differently than user-defined types because
         * they have no associated dataloop or datatype structure. */
        el_sz = MPIDU_Datatype_get_basic_size(oldtype);
        old_sz = el_sz;
        el_ct = 1;
        el_type = oldtype;

        old_lb = 0;
        old_true_lb = 0;
        old_ub = (MPI_Aint) el_sz;
        old_true_ub = (MPI_Aint) el_sz;
        old_extent = (MPI_Aint) el_sz;
        old_is_contig = 1;

        new_dtp->has_sticky_ub = 0;
        new_dtp->has_sticky_lb = 0;

        MPIR_Assign_trunc(new_dtp->alignsize, el_sz, MPI_Aint);
        new_dtp->builtin_element_size = el_sz;
        new_dtp->basic_type = el_type;

        new_dtp->max_contig_blocks = count;
    }
    else {
        /* user-defined base type (oldtype) */
        MPIDU_Datatype *old_dtp;

        MPIDU_Datatype_get_ptr(oldtype, old_dtp);

        /* Ensure that "builtin_element_size" fits into an int datatype. */
        MPIR_Ensure_Aint_fits_in_int(old_dtp->builtin_element_size);

        el_sz = old_dtp->builtin_element_size;
        old_sz = old_dtp->size;
        el_ct = old_dtp->n_builtin_elements;
        el_type = old_dtp->basic_type;

        old_lb = old_dtp->lb;
        old_true_lb = old_dtp->true_lb;
        old_ub = old_dtp->ub;
        old_true_ub = old_dtp->true_ub;
        old_extent = old_dtp->extent;
        old_is_contig = old_dtp->is_contig;

        new_dtp->has_sticky_lb = old_dtp->has_sticky_lb;
        new_dtp->has_sticky_ub = old_dtp->has_sticky_ub;
        new_dtp->builtin_element_size = (MPI_Aint) el_sz;
        new_dtp->basic_type = el_type;

        /* upper bound on the number of contig blocks in the new type */
        new_dtp->max_contig_blocks = 0;
        for (i = 0; i < count; i++)
            new_dtp->max_contig_blocks +=
                old_dtp->max_contig_blocks * ((MPI_Aint) blocklength_array[i]);
    }

    /* find the first nonzero blocklength element */
    i = 0;
    while (i < count && blocklength_array[i] == 0)
        i++;

    /* all blocklengths zero: the result is a zero-length type */
    if (i == count) {
        MPIR_Handle_obj_free(&MPIDU_Datatype_mem, new_dtp);
        return MPIDU_Type_zerolen(newtype);
    }

    /* priming for loop */
    old_ct = blocklength_array[i];
    eff_disp = (dispinbytes) ? ((MPI_Aint *) displacement_array)[i]
                             : (((MPI_Aint) ((int *) displacement_array)[i]) * old_extent);

    MPIDU_DATATYPE_BLOCK_LB_UB((MPI_Aint) blocklength_array[i], eff_disp, old_lb, old_ub,
                               old_extent, min_lb, max_ub);

    /* determine min lb, max ub, and count of old types in remaining
     * nonzero size blocks */
    for (i++; i < count; i++) {
        MPI_Aint tmp_lb, tmp_ub;

        if (blocklength_array[i] > 0) {
            old_ct += blocklength_array[i];     /* add more oldtypes */

            eff_disp = (dispinbytes) ? ((MPI_Aint *) displacement_array)[i]
                                     : (((MPI_Aint) ((int *) displacement_array)[i]) * old_extent);

            /* calculate ub and lb for this block */
            MPIDU_DATATYPE_BLOCK_LB_UB((MPI_Aint) (blocklength_array[i]), eff_disp, old_lb,
                                       old_ub, old_extent, tmp_lb, tmp_ub);

            if (tmp_lb < min_lb)
                min_lb = tmp_lb;
            if (tmp_ub > max_ub)
                max_ub = tmp_ub;
        }
    }

    new_dtp->size = old_ct * old_sz;

    new_dtp->lb = min_lb;
    new_dtp->ub = max_ub;
    new_dtp->true_lb = min_lb + (old_true_lb - old_lb);
    new_dtp->true_ub = max_ub + (old_true_ub - old_ub);
    new_dtp->extent = max_ub - min_lb;

    new_dtp->n_builtin_elements = old_ct * el_ct;

    /* new type is only contig for N types if it's all one big
     * block, its size and extent are the same, and the old type
     * was also contiguous. */
    new_dtp->is_contig = 0;
    if (old_is_contig) {
        MPI_Aint *blklens = MPL_malloc(count * sizeof(MPI_Aint));
        /* --BEGIN ERROR HANDLING-- */
        /* BUGFIX: the original code dereferenced blklens without checking the
         * allocation result (CERT MEM32-C). */
        if (!blklens) {
            MPIR_Handle_obj_free(&MPIDU_Datatype_mem, new_dtp);
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                             "MPIDU_Type_indexed", __LINE__, MPI_ERR_OTHER,
                                             "**nomem", 0);
            return mpi_errno;
        }
        /* --END ERROR HANDLING-- */
        for (i = 0; i < count; i++)
            blklens[i] = blocklength_array[i];
        contig_count = MPIDU_Type_indexed_count_contig(count, blklens, displacement_array,
                                                       dispinbytes, old_extent);
        new_dtp->max_contig_blocks = contig_count;
        if ((contig_count == 1) && ((MPI_Aint) new_dtp->size == new_dtp->extent)) {
            new_dtp->is_contig = 1;
        }
        MPL_free(blklens);
    }

    *newtype = new_dtp->handle;
    return mpi_errno;
}
/* Device-level implementation of MPI_Get for the ch3 channel.
 *
 * Fast path: when the target is this process, the window is a shared-memory
 * window, or the target is reachable through an allocated SHM region, the
 * get is performed immediately via MPIDI_CH3I_Shm_get_op and any user
 * request (ureq) is completed on the spot.
 *
 * Slow path: otherwise an RMA operation element is allocated, filled in
 * (including the get packet header), enqueued on the window's op queue, and
 * the progress engine is poked.  Issuing may be throttled when the count of
 * active RMA requests exceeds MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD.
 *
 * Returns MPI_SUCCESS or an MPI error code (MPI_ERR_RMA_SYNC when the window
 * is not in an access epoch).  ureq may be NULL when no user request exists.
 */
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
                   int target_rank, MPI_Aint target_disp, int target_count,
                   MPI_Datatype target_datatype, MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    intptr_t orig_data_sz, target_data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDU_Datatype*dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET);

    /* RMA calls are only legal inside an access epoch. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* An operation targeting MPI_PROC_NULL is a no-op. */
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, orig_data_sz, dtp,
                            dt_true_lb);

    /* Zero-size origin buffer: nothing to fetch. */
    if (orig_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the get is a local operation, do it here */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count,
                                          target_datatype, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        /* Remote target: queue the operation for the progress engine. */
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_t *get_pkt = NULL;
        MPI_Aint target_type_size;
        int use_immed_resp_pkt = FALSE;
        int is_origin_contig, is_target_contig;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        /* FIXME: For contig and very short operations, use a streamlined op */
        op_ptr->origin_addr = origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIDU_Datatype_get_ptr(origin_datatype, dtp);
            MPIDU_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIDU_Datatype_get_ptr(target_datatype, dtp);
            MPIDU_Datatype_add_ref(dtp);
        }

        MPIDU_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPIDU_Datatype_is_contig(target_datatype, &is_target_contig);

        MPIDU_Datatype_get_size_macro(target_datatype, target_type_size);
        MPIR_Assign_trunc(target_data_sz, target_count * target_type_size, intptr_t);

        /* Judge if we can use IMMED data response packet: both datatypes
         * predefined and contiguous, and the response fits in the header. */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_resp_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate. */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_pkt = &(op_ptr->pkt.get);
        MPIDI_Pkt_init(get_pkt, MPIDI_CH3_PKT_GET);
        /* Translate (target_rank, target_disp) into a target-window address. */
        get_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_pkt->count = target_count;
        get_pkt->datatype = target_datatype;
        get_pkt->info.dataloop_size = 0;
        get_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_resp_pkt)
            get_pkt->flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* Try to make progress toward the target immediately. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* Throttle: if too many RMA requests are active, spin in the progress
         * engine until the count drops below the threshold (a negative
         * threshold disables throttling). */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}