/*
 * MPIDI_Get_accumulate - copy the current target data into the result buffer
 * and then combine the origin data into the target with `op`.
 *
 * Parameters:
 *   origin_addr/origin_count/origin_datatype - data to be combined into the
 *       target (not inspected for predefined-ness when op is MPI_NO_OP)
 *   result_addr/result_count/result_datatype - receives the target contents
 *       as they were before the accumulate
 *   target_rank     - rank owning the target memory; MPI_PROC_NULL is a no-op
 *   target_disp     - displacement, scaled by the window's disp_unit
 *   target_count/target_datatype - layout of the target data
 *   op              - MPI_NO_OP, MPI_REPLACE, or a builtin reduction op
 *   win_ptr         - window object
 *
 * Returns the value of mpi_errno (MPI_SUCCESS unless an error path sets it).
 *
 * NOTE(review): this definition is cut off inside the local-target branch;
 * the code for other targets and the fn_exit/fn_fail labels referenced below
 * are not visible here.
 */
int MPIDI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int rank, origin_predefined, result_predefined, target_predefined;
    int shm_locked = 0;         /* 1 while this call holds the window's SHM mutex */
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIU_CHKLMEM_DECL(2);       /* two local allocations below: tmp_buf and dloop_vec */
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET_ACCUMULATE);

    /* Nothing to do for a null target */
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* A previously issued fence opens a fence epoch on the first RMA call */
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    /* Calling an RMA operation with no epoch open is a synchronization error */
    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* The transfer size is derived from the target count/datatype */
    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->myrank;

    origin_predefined = TRUE; /* quiet uninitialized warnings (b/c goto) */
    if (op != MPI_NO_OP) {
        MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
    }
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(result_datatype, result_predefined);
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        MPI_User_function *uop;
        void *base;
        int disp_unit;

        /* Shared-flavor windows address the target's segment directly and
         * take the SHM mutex; otherwise the target is this process's own
         * window memory. */
        if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
            base = win_ptr->shm_base_addrs[target_rank];
            disp_unit = win_ptr->disp_units[target_rank];
            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
            shm_locked = 1;
        } else {
            base = win_ptr->base;
            disp_unit = win_ptr->disp_unit;
        }

        /* Perform the local get first, then the accumulate */
        mpi_errno = MPIR_Localcopy((char *) base + disp_unit * target_disp, target_count, target_datatype, result_addr, result_count, result_datatype);
        /* NOTE(review): the error jumps in this branch leave shm_locked set;
         * presumably the fn_fail handler (not visible here) releases the
         * mutex - confirm. */
        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

        /* NO_OP: Don't perform the accumulate */
        if (op == MPI_NO_OP) {
            if (shm_locked) {
                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
                shm_locked = 0;
            }
            goto fn_exit;
        }

        /* REPLACE: overwrite the target with the origin data */
        if (op == MPI_REPLACE) {
            mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) base + disp_unit * target_disp, target_count, target_datatype);
            if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
            if (shm_locked) {
                MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
                shm_locked = 0;
            }
            goto fn_exit;
        }

        /* Only builtin ops are accepted beyond this point */
        MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op );

        /* get the function by indexing into the op table */
        uop = MPIR_OP_HDL_TO_FN(op);

        if (origin_predefined && target_predefined) {
            /* Cast away const'ness for origin_address in order to
             * avoid changing the prototype for MPI_User_function */
            (*uop)((void *) origin_addr, (char *) base + disp_unit*target_disp, &target_count, &target_datatype);
        } else {
            /* derived datatype */
            MPID_Segment *segp;
            DLOOP_VECTOR *dloop_vec;
            MPI_Aint first, last;
            int vec_len, i, type_size, count;
            MPI_Datatype type;
            MPI_Aint true_lb, true_extent, extent;
            void *tmp_buf=NULL, *target_buf;
            const void *source_buf;

            if (origin_datatype != target_datatype) {
                /* first copy the data into a temporary buffer with
                 * the same datatype as the target. Then do the
                 * accumulate operation. */
                MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
                MPID_Datatype_get_extent_macro(target_datatype, extent);

                MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer");
                /* adjust for potential negative lower bound in datatype */
                tmp_buf = (void *)((char*)tmp_buf - true_lb);

                mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype);
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
            }

            if (target_predefined) {
                /* target predefined type, origin derived datatype */
                (*uop)(tmp_buf, (char *) base + disp_unit * target_disp, &target_count, &target_datatype);
            } else {
                /* Derived target type: flatten it into a vector of blocks
                 * and apply the op block by block on the element type. */
                segp = MPID_Segment_alloc();
                MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc");
                /* The segment is built on a NULL base, so the vector entries
                 * are used below as offsets added to both buffers. */
                MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
                first = 0;
                last = SEGMENT_IGNORE_LAST;

                MPID_Datatype_get_ptr(target_datatype, dtp);
                vec_len = dtp->max_contig_blocks * target_count + 1;
                /* +1 needed because Rob says so */
                MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");

                /* vec_len is passed by address and bounds the loop below */
                MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);

                /* Use the converted copy when one was made, else the origin */
                source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
                target_buf = (char *) base + disp_unit * target_disp;
                type = dtp->eltype;
                type_size = MPID_Datatype_get_basic_size(type);

                for (i=0; i<vec_len; i++) {
                    /* block length in bytes -> number of basic elements */
                    count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
                    (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type);
                }

                MPID_Segment_free(segp);
            }
        }
        /* NOTE(review): the definition ends here in this view; the closing
         * of this branch and of the function body is not visible. */
/*
 * MPIDI_Accumulate - combine origin data into target window memory with `op`.
 *
 * Parameters:
 *   origin_addr/origin_count/origin_datatype - data to be combined
 *   target_rank     - rank owning the target memory; MPI_PROC_NULL is a no-op
 *   target_disp     - displacement, scaled by win_ptr->disp_unit
 *   target_count/target_datatype - layout of the target data
 *   op              - MPI_REPLACE or a builtin reduction op
 *   win_ptr         - window object
 *
 * Returns the value of mpi_errno (MPI_SUCCESS unless an error path sets it).
 *
 * NOTE(review): this definition is cut off inside the target_rank == rank
 * branch; the handling of other targets and the fn_exit/fn_fail labels
 * referenced below are not visible here.
 */
int MPIDI_Accumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr)
{
    int mpi_errno=MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig, rank, origin_predefined, target_predefined;
    MPI_Aint dt_true_lb;
    /* NOTE(review): new_ptr and the CHKPMEM declaration are not used in the
     * visible part of the function; presumably the non-local path uses them. */
    MPIDI_RMA_ops *new_ptr;
    MPID_Datatype *dtp;
    MPIU_CHKLMEM_DECL(2);       /* two local allocations below: tmp_buf and dloop_vec */
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

    /* The transfer size is derived from the origin count/datatype */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    /* Empty transfers and null targets are no-ops */
    if ((data_sz == 0) || (target_rank == MPI_PROC_NULL)) {
        goto fn_exit;
    }

    rank = win_ptr->myrank;

    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, origin_predefined);
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, target_predefined);

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank) {
        MPI_User_function *uop;

        /* REPLACE: overwrite the target with the origin data.
         * NOTE(review): the result of MPIR_Localcopy is returned through
         * fn_exit without passing through the fn_fail path used elsewhere
         * in this function. */
        if (op == MPI_REPLACE) {
            mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, target_count, target_datatype);
            goto fn_exit;
        }

        /* Only builtin ops are accepted beyond this point */
        MPIU_ERR_CHKANDJUMP1((HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN), mpi_errno, MPI_ERR_OP, "**opnotpredefined", "**opnotpredefined %d", op );

        /* get the function by indexing into the op table */
        uop = MPIR_Op_table[((op)&0xf) - 1];

        if (origin_predefined && target_predefined) {
            /* Both sides are predefined types: apply the op in one call */
            (*uop)(origin_addr, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype);
        } else {
            /* derived datatype */
            MPID_Segment *segp;
            DLOOP_VECTOR *dloop_vec;
            MPI_Aint first, last;
            int vec_len, i, type_size, count;
            MPI_Datatype type;
            MPI_Aint true_lb, true_extent, extent;
            void *tmp_buf=NULL, *source_buf, *target_buf;

            if (origin_datatype != target_datatype) {
                /* first copy the data into a temporary buffer with
                 * the same datatype as the target. Then do the
                 * accumulate operation. */
                MPIR_Type_get_true_extent_impl(target_datatype, &true_lb, &true_extent);
                MPID_Datatype_get_extent_macro(target_datatype, extent);

                MPIU_CHKLMEM_MALLOC(tmp_buf, void *, target_count * (MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer");
                /* adjust for potential negative lower bound in datatype */
                tmp_buf = (void *)((char*)tmp_buf - true_lb);

                mpi_errno = MPIR_Localcopy(origin_addr, origin_count, origin_datatype, tmp_buf, target_count, target_datatype);
                if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
            }

            if (target_predefined) {
                /* target predefined type, origin derived datatype */
                (*uop)(tmp_buf, (char *) win_ptr->base + win_ptr->disp_unit * target_disp, &target_count, &target_datatype);
            } else {
                /* Derived target type: flatten it into a vector of blocks
                 * and apply the op block by block on the element type. */
                segp = MPID_Segment_alloc();
                MPIU_ERR_CHKANDJUMP1((!segp), mpi_errno, MPI_ERR_OTHER, "**nomem","**nomem %s","MPID_Segment_alloc");
                /* The segment is built on a NULL base, so the vector entries
                 * are used below as offsets added to both buffers. */
                MPID_Segment_init(NULL, target_count, target_datatype, segp, 0);
                first = 0;
                last = SEGMENT_IGNORE_LAST;

                MPID_Datatype_get_ptr(target_datatype, dtp);
                vec_len = dtp->max_contig_blocks * target_count + 1;
                /* +1 needed because Rob says so */
                MPIU_CHKLMEM_MALLOC(dloop_vec, DLOOP_VECTOR *, vec_len * sizeof(DLOOP_VECTOR), mpi_errno, "dloop vector");

                /* vec_len is passed by address and bounds the loop below */
                MPID_Segment_pack_vector(segp, first, &last, dloop_vec, &vec_len);

                /* Use the converted copy when one was made, else the origin */
                source_buf = (tmp_buf != NULL) ? tmp_buf : origin_addr;
                target_buf = (char *) win_ptr->base + win_ptr->disp_unit * target_disp;
                type = dtp->eltype;
                type_size = MPID_Datatype_get_basic_size(type);

                for (i=0; i<vec_len; i++) {
                    /* block length in bytes -> number of basic elements */
                    count = (dloop_vec[i].DLOOP_VECTOR_LEN)/type_size;
                    (*uop)((char *)source_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), (char *)target_buf + MPIU_PtrToAint(dloop_vec[i].DLOOP_VECTOR_BUF), &count, &type);
                }

                MPID_Segment_free(segp);
            }
        }
        /* NOTE(review): the definition ends here in this view; the closing
         * of this branch and of the function body is not visible. */
/*
 * MPIDI_CH3U_Win_create_gather - apply the info hints to a window and
 * exchange every process's window description over the window communicator.
 *
 * Parameters:
 *   base       - local window base address (published to all processes)
 *   size       - local window size (published to all processes)
 *   disp_unit  - local displacement unit (published to all processes)
 *   info       - info object handed to the window's Win_set_info handler
 *   comm_ptr   - not used here; the exchange runs on (*win_ptr)->comm_ptr
 *   win_ptr    - window object; its comm_ptr, handle and RMAFns must already
 *                be set up by the caller
 *
 * On success the per-rank tables base_addrs, sizes, disp_units and
 * all_win_handles are filled in and pt_rma_puts_accs is zeroed.
 *
 * Returns MPI_SUCCESS or the error code of the failing step.  On failure the
 * tables allocated here are released again through MPIU_CHKPMEM_REAP.
 */
int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit, MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win **win_ptr )
{
    int mpi_errno=MPI_SUCCESS, i, k, comm_size, rank;
    MPI_Aint *tmp_buf;
    int errflag = FALSE;
    MPIU_CHKPMEM_DECL(5);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);

    comm_size = (*win_ptr)->comm_ptr->local_size;
    rank      = (*win_ptr)->comm_ptr->rank;

    /* RMA handlers should be set before calling this function */
    mpi_errno = (*win_ptr)->RMAFns.Win_set_info(*win_ptr, info);
    /* Propagate a failure right away; without this check the code would be
     * overwritten by the allgather result further down and silently lost. */
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

    MPIU_INSTR_DURATION_START(wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
       completion counters of all processes */
    MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
                        comm_size*sizeof(void *),
                        mpi_errno, "(*win_ptr)->base_addrs");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *,
                        comm_size*sizeof(MPI_Aint),
                        mpi_errno, "(*win_ptr)->sizes");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->disp_units");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
                        comm_size*sizeof(MPI_Win),
                        mpi_errno, "(*win_ptr)->all_win_handles");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->pt_rma_puts_accs, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->pt_rma_puts_accs");
    for (i=0; i<comm_size; i++) {
        (*win_ptr)->pt_rma_puts_accs[i] = 0;
    }

    /* get the addresses of the windows, window objects, and completion
       counters of all processes.  allocate temp. buffer for communication */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4*comm_size*sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
       origin, we'd also need the window size. */
    /* Each rank contributes one record of four MPI_Aint values, written in
       place at its own slot: base address, size, disp_unit, window handle */
    tmp_buf[4*rank]   = MPIU_PtrToAint(base);
    tmp_buf[4*rank+1] = size;
    tmp_buf[4*rank+2] = (MPI_Aint) disp_unit;
    tmp_buf[4*rank+3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    MPIU_INSTR_DURATION_END(wincreate_allgather);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Unpack the gathered records, in the same field order as above */
    k = 0;
    for (i=0; i<comm_size; i++) {
        (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
        (*win_ptr)->sizes[i] = tmp_buf[k++];
        (*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
        (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
    }

 fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_Win_create - allocate and initialize a window object over the
 * caller's memory and exchange window descriptions with all processes.
 *
 * Parameters:
 *   base       - local window base address
 *   size       - local window size (stored locally, not exchanged)
 *   disp_unit  - local displacement unit
 *   info       - currently unused
 *   comm_ptr   - communicator the window is created over; it is duplicated
 *                and the duplicate is stored in the window
 *   win_ptr    - out: the newly allocated window object
 *
 * On the first call (while initRMAoptions is set) the RMA options are read:
 * the environment variable MPICH_RMA_ACC_IMMED selects the immediate
 * accumulate feature, which is enabled by default.
 *
 * Returns MPI_SUCCESS or the error code of the failing step.  On failure the
 * per-rank tables allocated here are released through MPIU_CHKPMEM_REAP.
 */
int MPIDI_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win **win_ptr )
{
    int mpi_errno=MPI_SUCCESS, i, k, comm_size, rank;
    MPI_Aint *tmp_buf;
    MPID_Comm *win_comm_ptr;
    int errflag = FALSE;
    MPIU_CHKPMEM_DECL(4);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_CREATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_CREATE);

    /* FIXME: There should be no unreferenced args */
    MPIU_UNREFERENCED_ARG(info);

    if(initRMAoptions) {
        /* Default is to enable the use of the immediate accumulate feature.
         * rc starts out at that default so it is never read uninitialized. */
        int rc = 1;
        MPIU_THREADSAFE_INIT_BLOCK_BEGIN(initRMAoptions);
        /* Only a positive return is taken as "rc now holds the user's
         * choice".  The original test (!ret) fell through with rc unset
         * when the call reported an error with a negative value.
         * NOTE(review): assumes MPL_env2bool returns 0 when the variable is
         * unset and a negative value when it is set but malformed - confirm
         * against the MPL implementation in use. */
        if (MPL_env2bool( "MPICH_RMA_ACC_IMMED", &rc ) <= 0) {
            rc = 1;
        }
        MPIDI_CH3_RMA_SetAccImmed(rc);
#ifdef USE_MPIU_INSTR
        /* Define all instrumentation handle used in the CH3 RMA here*/
        MPIU_INSTR_DURATION_INIT(wincreate_allgather,0,"WIN_CREATE:Allgather");
        MPIU_INSTR_DURATION_INIT(winfree_rs,0,"WIN_FREE:ReduceScatterBlock");
        MPIU_INSTR_DURATION_INIT(winfree_complete,0,"WIN_FREE:Complete");
        MPIU_INSTR_DURATION_INIT(rmaqueue_alloc,0,"Allocate RMA Queue element");
        MPIDI_CH3_RMA_InitInstr();
#endif
        MPIU_THREADSAFE_INIT_CLEAR(initRMAoptions);
        MPIU_THREADSAFE_INIT_BLOCK_END(initRMAoptions);
    }

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    *win_ptr = (MPID_Win *)MPIU_Handle_obj_alloc( &MPID_Win_mem );
    MPIU_ERR_CHKANDJUMP1(!(*win_ptr),mpi_errno,MPI_ERR_OTHER,"**nomem",
                         "**nomem %s","MPID_Win_mem");

    MPIU_Object_set_ref(*win_ptr, 1);

    /* Start from a clean synchronization/lock state */
    (*win_ptr)->fence_cnt = 0;
    (*win_ptr)->base = base;
    (*win_ptr)->size = size;
    (*win_ptr)->disp_unit = disp_unit;
    (*win_ptr)->start_group_ptr = NULL;
    (*win_ptr)->start_assert = 0;
    (*win_ptr)->attributes = NULL;
    (*win_ptr)->rma_ops_list_head = NULL;
    (*win_ptr)->rma_ops_list_tail = NULL;
    (*win_ptr)->lock_granted = 0;
    (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
    (*win_ptr)->shared_lock_ref_cnt = 0;
    (*win_ptr)->lock_queue = NULL;
    (*win_ptr)->my_counter = 0;
    (*win_ptr)->my_pt_rma_puts_accs = 0;

    /* The window keeps its own duplicate of the communicator */
    mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    (*win_ptr)->comm_ptr = win_comm_ptr;
    (*win_ptr)->myrank = rank;

    MPIU_INSTR_DURATION_START(wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
       completion counters of all processes */
    MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
                        comm_size*sizeof(void *),
                        mpi_errno, "(*win_ptr)->base_addrs");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->disp_units");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
                        comm_size*sizeof(MPI_Win),
                        mpi_errno, "(*win_ptr)->all_win_handles");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->pt_rma_puts_accs, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->pt_rma_puts_accs");
    for (i=0; i<comm_size; i++) {
        (*win_ptr)->pt_rma_puts_accs[i] = 0;
    }

    /* get the addresses of the windows, window objects, and completion
       counters of all processes.  allocate temp. buffer for communication */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 3*comm_size*sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* Each rank contributes one record of three MPI_Aint values, written in
       place at its own slot: base address, disp_unit, window handle */
    tmp_buf[3*rank]   = MPIU_PtrToAint(base);
    tmp_buf[3*rank+1] = (MPI_Aint) disp_unit;
    tmp_buf[3*rank+2] = (MPI_Aint) (*win_ptr)->handle;

    /* The records are exchanged as raw bytes over the caller's communicator */
    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 3 * sizeof(MPI_Aint), MPI_BYTE,
                                    comm_ptr, &errflag);
    MPIU_INSTR_DURATION_END(wincreate_allgather);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Unpack the gathered records, in the same field order as above */
    k = 0;
    for (i=0; i<comm_size; i++) {
        (*win_ptr)->base_addrs[i] = MPIU_AintToPtr(tmp_buf[k++]);
        (*win_ptr)->disp_units[i] = (int) tmp_buf[k++];
        (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
    }

 fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_CREATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3U_Win_gather_info - collect every process's basic window
 * description into (*win_ptr)->basic_info_table.
 *
 * Parameters:
 *   base       - local window base address
 *   size       - local window size
 *   disp_unit  - local displacement unit
 *   info       - not used here
 *   comm_ptr   - not used here; the exchange runs on (*win_ptr)->comm_ptr
 *   win_ptr    - window object whose comm_ptr and handle are already set
 *
 * Each process publishes four MPI_Aint values (base address, size,
 * disp_unit, window handle); after the allgather, entry i of
 * basic_info_table describes the window of rank i.
 *
 * Returns MPI_SUCCESS or the error code of the failing step.  On failure the
 * table allocated here is released through MPIU_CHKPMEM_REAP.
 */
int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPID_Info * info, MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int nprocs;
    int self;
    int peer;
    MPI_Aint *tmp_buf;
    MPI_Aint *record;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIU_CHKPMEM_DECL(1);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    nprocs = (*win_ptr)->comm_ptr->local_size;
    self = (*win_ptr)->comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* One table entry per process in the window communicator */
    MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                        nprocs * sizeof(MPIDI_Win_basic_info_t),
                        mpi_errno, "(*win_ptr)->basic_info_table");

    /* Scratch buffer for the exchange: four MPI_Aint values per process */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * nprocs * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
     * origin, we'd also need the window size. */

    /* Fill in this process's own record in place before the allgather */
    record = &tmp_buf[4 * self];
    record[0] = MPIU_PtrToAint(base);
    record[1] = size;
    record[2] = (MPI_Aint) disp_unit;
    record[3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Decode one four-value record per rank into the info table */
    for (peer = 0; peer < nprocs; peer++) {
        record = &tmp_buf[4 * peer];
        (*win_ptr)->basic_info_table[peer].base_addr = MPIU_AintToPtr(record[0]);
        (*win_ptr)->basic_info_table[peer].size = record[1];
        (*win_ptr)->basic_info_table[peer].disp_unit = (int) record[2];
        (*win_ptr)->basic_info_table[peer].win_handle = (MPI_Win) record[3];
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}